- 优化了完整流程处理逻辑: - 修改了OCR处理逻辑,当遇到已处理的图片时自动跳过并继续执行 - 改进了错误处理,避免因图片已处理而中断流程 - 优化了日志提示信息,提供更清晰的处理状态反馈 - 改进了OCRService的process_image方法: - 添加了文件存在性检查 - 添加了文件类型验证 - 添加了已处理文件检查 - 优化了错误处理和日志记录
130 lines
4.1 KiB
Python
130 lines
4.1 KiB
Python
"""
|
||
OCR服务模块
|
||
---------
|
||
提供OCR识别服务,协调OCR流程。
|
||
"""
|
||
|
||
from typing import Dict, List, Optional, Tuple, Union, Any
|
||
import os
|
||
|
||
from ..config.settings import ConfigManager
|
||
from ..core.utils.log_utils import get_logger
|
||
from ..core.ocr.table_ocr import OCRProcessor
|
||
|
||
# Module-level logger, requested from the project's logging helper using this
# module's name; shared by every method of OCRService below.
logger = get_logger(__name__)
|
||
|
||
class OCRService:
    """OCR recognition service that coordinates the OCR workflow.

    The service owns a configuration object and an ``OCRProcessor`` built
    from it. Apart from ``process_image``, every public method is a thin
    wrapper that forwards to the processor.
    """

    def __init__(self, config: Optional[ConfigManager] = None):
        """Initialise the OCR service.

        Args:
            config: Configuration manager. When ``None`` (or otherwise
                falsy) a new ``ConfigManager`` is created.
        """
        logger.info("初始化OCRService")
        self.config = config or ConfigManager()

        # Create the OCR processor that the rest of the methods delegate to.
        self.ocr_processor = OCRProcessor(self.config)

        logger.info("OCRService初始化完成")

    def get_unprocessed_images(self) -> List[str]:
        """Return the list of images that still need processing.

        Returns:
            Whatever ``OCRProcessor.get_unprocessed_images`` returns
            (annotated as a list of image paths).
        """
        return self.ocr_processor.get_unprocessed_images()

    def process_image(self, image_path: str) -> Optional[str]:
        """Process a single image file.

        The steps are, in order: check that the file exists, check its
        type, return early if the corresponding Excel file already exists,
        run OCR through the processor, then generate the Excel file.

        Args:
            image_path: Path of the image file.

        Returns:
            Path of the Excel file (either the pre-existing one or the
            newly generated one), or ``None`` if any step fails. Exceptions
            are not propagated; they are logged and ``None`` is returned.
        """
        # NOTE(review): ``_is_valid_image``, ``_get_excel_path`` and
        # ``_generate_excel`` are not defined on this class in the file as
        # shown. Unless they are supplied elsewhere, the first call raises
        # AttributeError, which the broad ``except`` below catches, so this
        # method would log an error and return None for every existing file.
        # Confirm where these helpers are meant to live before relying on it.
        try:
            # Make sure the file exists.
            if not os.path.exists(image_path):
                logger.error(f"文件不存在: {image_path}")
                return None

            # Make sure the file type is supported.
            if not self._is_valid_image(image_path):
                logger.error(f"不支持的文件类型: {image_path}")
                return None

            # Skip OCR when the output for this image already exists, so a
            # previously processed image does not interrupt the caller's flow.
            excel_file = self._get_excel_path(image_path)
            if os.path.exists(excel_file):
                logger.info(f"文件已处理过,跳过OCR识别: {image_path}")
                return excel_file

            # Run OCR recognition.
            result = self.ocr_processor.process_image(image_path)
            if not result:
                logger.error(f"OCR识别失败: {image_path}")
                return None

            # Generate the Excel file from the OCR result.
            excel_file = self._generate_excel(result, image_path)
            if not excel_file:
                logger.error(f"生成Excel文件失败: {image_path}")
                return None

            logger.info(f"处理完成: {image_path} -> {excel_file}")
            return excel_file

        except Exception as e:
            # Service boundary: log with traceback and report failure as None.
            logger.error(f"处理图片时发生错误: {e}", exc_info=True)
            return None

    def process_images_batch(
        self,
        batch_size: Optional[int] = None,
        max_workers: Optional[int] = None,
    ) -> Tuple[int, int]:
        """Process images in batch.

        Args:
            batch_size: Batch size; forwarded unchanged to the processor.
            max_workers: Maximum number of worker threads; forwarded
                unchanged to the processor.

        Returns:
            A ``(total processed, successfully processed)`` tuple, as
            returned by ``OCRProcessor.process_images_batch``.
        """
        logger.info(f"OCRService开始批量处理图片, batch_size={batch_size}, max_workers={max_workers}")
        return self.ocr_processor.process_images_batch(batch_size, max_workers)

    # batch_process is kept as an alias of process_images_batch for
    # compatibility with callers that use the older name.
    def batch_process(
        self,
        batch_size: Optional[int] = None,
        max_workers: Optional[int] = None,
    ) -> Tuple[int, int]:
        """Process images in batch (alias of ``process_images_batch``).

        Args:
            batch_size: Batch size.
            max_workers: Maximum number of worker threads.

        Returns:
            A ``(total processed, successfully processed)`` tuple.
        """
        logger.info("OCRService.batch_process被调用,转发到process_images_batch")
        return self.process_images_batch(batch_size, max_workers)

    def validate_image(self, image_path: str) -> bool:
        """Check whether an image is valid.

        Args:
            image_path: Path of the image.

        Returns:
            The result of ``OCRProcessor.validate_image`` for this path.
        """
        return self.ocr_processor.validate_image(image_path)