新增条码映射编辑功能图形化界面
This commit is contained in:
Binary file not shown.
Binary file not shown.
+41
-22
@@ -105,34 +105,53 @@ class BaiduOCRClient:
|
||||
百度OCR API客户端
|
||||
"""
|
||||
|
||||
def __init__(self, config: Optional[ConfigManager] = None):
|
||||
def __init__(self, config):
|
||||
"""
|
||||
初始化百度OCR客户端
|
||||
|
||||
Args:
|
||||
config: 配置管理器,如果为None则创建新的
|
||||
config: 配置信息
|
||||
"""
|
||||
self.config = config or ConfigManager()
|
||||
self.config = config
|
||||
|
||||
# 获取配置
|
||||
self.api_key = self.config.get('API', 'api_key')
|
||||
self.secret_key = self.config.get('API', 'secret_key')
|
||||
self.timeout = self.config.getint('API', 'timeout', 30)
|
||||
self.max_retries = self.config.getint('API', 'max_retries', 3)
|
||||
self.retry_delay = self.config.getint('API', 'retry_delay', 2)
|
||||
self.api_url = self.config.get('API', 'api_url', 'https://aip.baidubce.com/rest/2.0/ocr/v1/table')
|
||||
|
||||
# 创建令牌管理器
|
||||
self.token_manager = TokenManager(
|
||||
self.api_key,
|
||||
self.secret_key,
|
||||
self.max_retries,
|
||||
self.retry_delay
|
||||
)
|
||||
|
||||
# 验证API配置
|
||||
if not self.api_key or not self.secret_key:
|
||||
logger.warning("API密钥未设置,请在配置文件中设置API密钥")
|
||||
# 从配置中读取API信息
|
||||
try:
|
||||
# 修复getint调用方式
|
||||
self.timeout = config.get('API', 'timeout', fallback=30)
|
||||
if isinstance(self.timeout, str):
|
||||
self.timeout = int(self.timeout)
|
||||
|
||||
self.api_key = config.get('API', 'api_key', fallback='')
|
||||
self.secret_key = config.get('API', 'secret_key', fallback='')
|
||||
|
||||
# 使用fallback而不是位置参数
|
||||
try:
|
||||
self.max_retries = config.getint('API', 'max_retries', fallback=3)
|
||||
except (TypeError, AttributeError):
|
||||
# 如果getint不支持fallback,则使用get再转换
|
||||
self.max_retries = int(config.get('API', 'max_retries', fallback='3'))
|
||||
|
||||
try:
|
||||
self.retry_delay = config.getint('API', 'retry_delay', fallback=2)
|
||||
except (TypeError, AttributeError):
|
||||
# 如果getint不支持fallback,则使用get再转换
|
||||
self.retry_delay = int(config.get('API', 'retry_delay', fallback='2'))
|
||||
|
||||
self.api_url = config.get('API', 'api_url', fallback='https://aip.baidubce.com/rest/2.0/ocr/v1/table')
|
||||
|
||||
# 创建令牌管理器
|
||||
self.token_manager = TokenManager(
|
||||
self.api_key,
|
||||
self.secret_key,
|
||||
self.max_retries,
|
||||
self.retry_delay
|
||||
)
|
||||
|
||||
# 验证API配置
|
||||
if not self.api_key or not self.secret_key:
|
||||
logger.warning("API密钥未设置,请在配置文件中设置API密钥")
|
||||
except Exception as e:
|
||||
logger.error(f"初始化失败: {e}")
|
||||
|
||||
def read_image(self, image_path: str) -> Optional[bytes]:
|
||||
"""
|
||||
|
||||
+108
-61
@@ -103,51 +103,65 @@ class ProcessedRecordManager:
|
||||
|
||||
class OCRProcessor:
|
||||
"""
|
||||
OCR处理器,用于表格识别与处理
|
||||
OCR处理器,负责协调OCR识别和结果处理
|
||||
"""
|
||||
|
||||
def __init__(self, config: Optional[ConfigManager] = None):
|
||||
def __init__(self, config):
|
||||
"""
|
||||
初始化OCR处理器
|
||||
|
||||
Args:
|
||||
config: 配置管理器,如果为None则创建新的
|
||||
config: 配置信息
|
||||
"""
|
||||
self.config = config or ConfigManager()
|
||||
self.config = config
|
||||
|
||||
# 创建百度OCR客户端
|
||||
self.ocr_client = BaiduOCRClient(self.config)
|
||||
# 修复ConfigParser对象没有get_path方法的问题
|
||||
try:
|
||||
# 获取输入和输出目录
|
||||
self.input_folder = config.get('Paths', 'input_folder', fallback='data/input')
|
||||
self.output_folder = config.get('Paths', 'output_folder', fallback='data/output')
|
||||
self.temp_folder = config.get('Paths', 'temp_folder', fallback='data/temp')
|
||||
|
||||
# 确保目录存在
|
||||
os.makedirs(self.input_folder, exist_ok=True)
|
||||
os.makedirs(self.output_folder, exist_ok=True)
|
||||
os.makedirs(self.temp_folder, exist_ok=True)
|
||||
|
||||
# 获取文件类型列表
|
||||
allowed_extensions_str = config.get('File', 'allowed_extensions', fallback='.jpg,.jpeg,.png,.bmp')
|
||||
self.file_types = [ext.strip() for ext in allowed_extensions_str.split(',') if ext.strip()]
|
||||
if not self.file_types:
|
||||
self.file_types = ['.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tif', '.tiff']
|
||||
|
||||
# 初始化OCR客户端
|
||||
self.ocr_client = BaiduOCRClient(self.config)
|
||||
|
||||
# 记录实际路径
|
||||
logger.info(f"使用输入目录: {os.path.abspath(self.input_folder)}")
|
||||
logger.info(f"使用输出目录: {os.path.abspath(self.output_folder)}")
|
||||
logger.info(f"使用临时目录: {os.path.abspath(self.temp_folder)}")
|
||||
logger.info(f"允许的文件类型: {self.file_types}")
|
||||
|
||||
# 初始化processed_files_json和record_manager
|
||||
self.processed_files_json = os.path.join(self.output_folder, 'processed_files.json')
|
||||
self.record_manager = ProcessedRecordManager(self.processed_files_json)
|
||||
|
||||
# 加载已处理文件记录
|
||||
self.processed_files = self._load_processed_files()
|
||||
|
||||
logger.info(f"初始化OCRProcessor完成:输入目录={self.input_folder}, 输出目录={self.output_folder}")
|
||||
except Exception as e:
|
||||
logger.error(f"初始化OCRProcessor失败: {e}")
|
||||
raise
|
||||
|
||||
def _load_processed_files(self) -> Dict[str, str]:
|
||||
"""
|
||||
加载已处理的文件记录
|
||||
|
||||
# 获取配置
|
||||
self.input_folder = self.config.get_path('Paths', 'input_folder', 'data/input', create=True)
|
||||
self.output_folder = self.config.get_path('Paths', 'output_folder', 'data/output', create=True)
|
||||
self.temp_folder = self.config.get_path('Paths', 'temp_folder', 'data/temp', create=True)
|
||||
|
||||
# 确保目录结构正确
|
||||
for folder in [self.input_folder, self.output_folder, self.temp_folder]:
|
||||
if not os.path.exists(folder):
|
||||
os.makedirs(folder, exist_ok=True)
|
||||
logger.info(f"创建目录: {folder}")
|
||||
|
||||
# 记录实际路径
|
||||
logger.info(f"使用输入目录: {os.path.abspath(self.input_folder)}")
|
||||
logger.info(f"使用输出目录: {os.path.abspath(self.output_folder)}")
|
||||
logger.info(f"使用临时目录: {os.path.abspath(self.temp_folder)}")
|
||||
|
||||
self.allowed_extensions = self.config.get_list('File', 'allowed_extensions', '.jpg,.jpeg,.png,.bmp')
|
||||
self.max_file_size_mb = self.config.getfloat('File', 'max_file_size_mb', 4.0)
|
||||
self.excel_extension = self.config.get('File', 'excel_extension', '.xlsx')
|
||||
|
||||
# 处理性能配置
|
||||
self.max_workers = self.config.getint('Performance', 'max_workers', 4)
|
||||
self.batch_size = self.config.getint('Performance', 'batch_size', 5)
|
||||
self.skip_existing = self.config.getboolean('Performance', 'skip_existing', True)
|
||||
|
||||
# 初始化处理记录管理器
|
||||
record_file = self.config.get('Paths', 'processed_record', 'data/processed_files.json')
|
||||
self.record_manager = ProcessedRecordManager(record_file)
|
||||
|
||||
logger.info(f"OCR处理器初始化完成,输入目录: {self.input_folder}, 输出目录: {self.output_folder}")
|
||||
Returns:
|
||||
已处理的文件记录字典,键为输入文件路径,值为输出文件路径
|
||||
"""
|
||||
return load_json(self.processed_files_json, {})
|
||||
|
||||
def get_unprocessed_images(self) -> List[str]:
|
||||
"""
|
||||
@@ -157,10 +171,16 @@ class OCRProcessor:
|
||||
未处理的图片文件路径列表
|
||||
"""
|
||||
# 获取所有图片文件
|
||||
image_files = get_files_by_extensions(self.input_folder, self.allowed_extensions)
|
||||
image_files = get_files_by_extensions(self.input_folder, self.file_types)
|
||||
|
||||
# 如果需要跳过已存在的文件
|
||||
if self.skip_existing:
|
||||
skip_existing = True
|
||||
try:
|
||||
skip_existing = self.config.getboolean('Performance', 'skip_existing', fallback=True)
|
||||
except:
|
||||
pass
|
||||
|
||||
if skip_existing:
|
||||
# 过滤已处理的文件
|
||||
unprocessed_files = self.record_manager.get_unprocessed_files(image_files)
|
||||
logger.info(f"找到 {len(image_files)} 个图片文件,其中 {len(unprocessed_files)} 个未处理")
|
||||
@@ -186,13 +206,19 @@ class OCRProcessor:
|
||||
|
||||
# 检查文件扩展名
|
||||
ext = get_file_extension(image_path)
|
||||
if ext not in self.allowed_extensions:
|
||||
if ext not in self.file_types:
|
||||
logger.warning(f"不支持的文件类型: {ext}, 文件: {image_path}")
|
||||
return False
|
||||
|
||||
# 检查文件大小
|
||||
if not is_file_size_valid(image_path, self.max_file_size_mb):
|
||||
logger.warning(f"文件大小超过限制 ({self.max_file_size_mb}MB): {image_path}")
|
||||
max_size_mb = 4.0
|
||||
try:
|
||||
max_size_mb = float(self.config.get('File', 'max_file_size_mb', fallback='4.0'))
|
||||
except:
|
||||
pass
|
||||
|
||||
if not is_file_size_valid(image_path, max_size_mb):
|
||||
logger.warning(f"文件大小超过限制 ({max_size_mb}MB): {image_path}")
|
||||
return False
|
||||
|
||||
return True
|
||||
@@ -211,8 +237,15 @@ class OCRProcessor:
|
||||
if not self.validate_image(image_path):
|
||||
return None
|
||||
|
||||
# 获取是否跳过已处理文件的配置
|
||||
skip_existing = True
|
||||
try:
|
||||
skip_existing = self.config.getboolean('Performance', 'skip_existing', fallback=True)
|
||||
except:
|
||||
pass
|
||||
|
||||
# 如果需要跳过已处理的文件
|
||||
if self.skip_existing and self.record_manager.is_processed(image_path):
|
||||
if skip_existing and self.record_manager.is_processed(image_path):
|
||||
output_file = self.record_manager.get_output_file(image_path)
|
||||
logger.info(f"图片已处理,跳过: {image_path}, 输出文件: {output_file}")
|
||||
return output_file
|
||||
@@ -220,12 +253,19 @@ class OCRProcessor:
|
||||
logger.info(f"开始处理图片: {image_path}")
|
||||
|
||||
try:
|
||||
# 获取Excel扩展名
|
||||
excel_extension = '.xlsx'
|
||||
try:
|
||||
excel_extension = self.config.get('File', 'excel_extension', fallback='.xlsx')
|
||||
except:
|
||||
pass
|
||||
|
||||
# 生成输出文件路径
|
||||
file_name = os.path.splitext(os.path.basename(image_path))[0]
|
||||
output_file = os.path.join(self.output_folder, f"{file_name}{self.excel_extension}")
|
||||
output_file = os.path.join(self.output_folder, f"{file_name}{excel_extension}")
|
||||
|
||||
# 检查是否已存在对应的Excel文件
|
||||
if os.path.exists(output_file) and self.skip_existing:
|
||||
if os.path.exists(output_file) and skip_existing:
|
||||
logger.info(f"已存在对应的Excel文件,跳过处理: {os.path.basename(image_path)} -> {os.path.basename(output_file)}")
|
||||
# 记录处理结果
|
||||
self.record_manager.mark_as_processed(image_path, output_file)
|
||||
@@ -304,31 +344,38 @@ class OCRProcessor:
|
||||
(总处理数, 成功处理数)元组
|
||||
"""
|
||||
# 使用配置值或参数值
|
||||
batch_size = batch_size or self.batch_size
|
||||
max_workers = max_workers or self.max_workers
|
||||
|
||||
if batch_size is None:
|
||||
try:
|
||||
batch_size = self.config.getint('Performance', 'batch_size', fallback=5)
|
||||
except:
|
||||
batch_size = 5
|
||||
|
||||
if max_workers is None:
|
||||
try:
|
||||
max_workers = self.config.getint('Performance', 'max_workers', fallback=4)
|
||||
except:
|
||||
max_workers = 4
|
||||
|
||||
# 获取未处理的图片
|
||||
unprocessed_images = self.get_unprocessed_images()
|
||||
if not unprocessed_images:
|
||||
logger.warning("没有需要处理的图片")
|
||||
return 0, 0
|
||||
|
||||
|
||||
total = len(unprocessed_images)
|
||||
success = 0
|
||||
|
||||
success_count = 0
|
||||
|
||||
# 按批次处理
|
||||
for i in range(0, total, batch_size):
|
||||
batch = unprocessed_images[i:i + batch_size]
|
||||
logger.info(f"处理批次 {i//batch_size + 1}/{(total-1)//batch_size + 1}, 大小: {len(batch)}")
|
||||
|
||||
# 使用线程池并行处理
|
||||
batch = unprocessed_images[i:i+batch_size]
|
||||
logger.info(f"处理批次 {i//batch_size+1}/{(total+batch_size-1)//batch_size}: {len(batch)} 个文件")
|
||||
|
||||
# 使用多线程处理批次
|
||||
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||
results = list(executor.map(self.process_image, batch))
|
||||
|
||||
# 统计成功数
|
||||
success += sum(1 for result in results if result is not None)
|
||||
|
||||
logger.info(f"批次处理完成, 成功: {sum(1 for result in results if result is not None)}/{len(batch)}")
|
||||
|
||||
logger.info(f"所有图片处理完成, 总计: {total}, 成功: {success}")
|
||||
return total, success
|
||||
# 统计成功数
|
||||
success_count += sum(1 for result in results if result is not None)
|
||||
|
||||
logger.info(f"所有图片处理完成, 总计: {total}, 成功: {success_count}")
|
||||
return total, success_count
|
||||
|
||||
Reference in New Issue
Block a user