orc-order-v2/app/services/ocr_service.py
houhuan 4a8169ff63 ## v1.5.3 (2024-03-21)
- 优化了完整流程处理逻辑:
  - 修改了OCR处理逻辑,当遇到已处理的图片时自动跳过并继续执行
  - 改进了错误处理,避免因图片已处理而中断流程
  - 优化了日志提示信息,提供更清晰的处理状态反馈
- 改进了OCRService的process_image方法:
  - 添加了文件存在性检查
  - 添加了文件类型验证
  - 添加了已处理文件检查
  - 优化了错误处理和日志记录
2025-05-10 12:58:28 +08:00

130 lines
4.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
OCR服务模块
---------
提供OCR识别服务协调OCR流程。
"""
from typing import Dict, List, Optional, Tuple, Union, Any
import os
from ..config.settings import ConfigManager
from ..core.utils.log_utils import get_logger
from ..core.ocr.table_ocr import OCRProcessor
logger = get_logger(__name__)
class OCRService:
"""
OCR识别服务协调OCR流程
"""
def __init__(self, config: Optional[ConfigManager] = None):
"""
初始化OCR服务
Args:
config: 配置管理器如果为None则创建新的
"""
logger.info("初始化OCRService")
self.config = config or ConfigManager()
# 创建OCR处理器
self.ocr_processor = OCRProcessor(self.config)
logger.info("OCRService初始化完成")
def get_unprocessed_images(self) -> List[str]:
"""
获取待处理的图片列表
Returns:
待处理图片路径列表
"""
return self.ocr_processor.get_unprocessed_images()
def process_image(self, image_path: str) -> Optional[str]:
"""
处理单个图片文件
Args:
image_path: 图片文件路径
Returns:
生成的Excel文件路径如果处理失败则返回None
"""
try:
# 检查文件是否存在
if not os.path.exists(image_path):
logger.error(f"文件不存在: {image_path}")
return None
# 检查文件类型
if not self._is_valid_image(image_path):
logger.error(f"不支持的文件类型: {image_path}")
return None
# 检查是否已处理
excel_file = self._get_excel_path(image_path)
if os.path.exists(excel_file):
logger.info(f"文件已处理过跳过OCR识别: {image_path}")
return excel_file
# 执行OCR识别
result = self.ocr_processor.process_image(image_path)
if not result:
logger.error(f"OCR识别失败: {image_path}")
return None
# 生成Excel文件
excel_file = self._generate_excel(result, image_path)
if not excel_file:
logger.error(f"生成Excel文件失败: {image_path}")
return None
logger.info(f"处理完成: {image_path} -> {excel_file}")
return excel_file
except Exception as e:
logger.error(f"处理图片时发生错误: {e}", exc_info=True)
return None
def process_images_batch(self, batch_size: int = None, max_workers: int = None) -> Tuple[int, int]:
"""
批量处理图片
Args:
batch_size: 批处理大小
max_workers: 最大线程数
Returns:
(总处理数, 成功处理数)元组
"""
logger.info(f"OCRService开始批量处理图片, batch_size={batch_size}, max_workers={max_workers}")
return self.ocr_processor.process_images_batch(batch_size, max_workers)
# 添加batch_process作为process_images_batch的别名确保兼容性
def batch_process(self, batch_size: int = None, max_workers: int = None) -> Tuple[int, int]:
"""
批量处理图片别名方法与process_images_batch功能相同
Args:
batch_size: 批处理大小
max_workers: 最大线程数
Returns:
(总处理数, 成功处理数)元组
"""
logger.info(f"OCRService.batch_process被调用转发到process_images_batch")
return self.process_images_batch(batch_size, max_workers)
def validate_image(self, image_path: str) -> bool:
"""
验证图片是否有效
Args:
image_path: 图片路径
Returns:
图片是否有效
"""
return self.ocr_processor.validate_image(image_path)