orc-order-v2/app/services/processor_service.py
2025-11-15 18:46:03 +08:00

298 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
处理器调度服务
负责管理和调度各种文件处理器,实现智能文件类型检测和处理器选择
"""
import logging
from typing import Dict, Any, Optional, List
from pathlib import Path
from ..core.processors.base import BaseProcessor
from ..core.processors.tobacco_processor import TobaccoProcessor
from ..core.processors.ocr_processor import OCRProcessor
from ..core.utils.log_utils import get_logger
logger = get_logger(__name__)
class ProcessorService:
"""处理器调度服务
负责管理所有处理器实例,提供统一的文件处理接口
"""
def __init__(self, config: Dict[str, Any]):
"""初始化处理器服务
Args:
config: 系统配置字典
"""
self.config = config
self.processors: List[BaseProcessor] = []
self._load_processors()
logger.info(f"处理器服务初始化完成,加载了{len(self.processors)}个处理器")
def _load_processors(self):
"""加载所有处理器"""
try:
self.processors = [
TobaccoProcessor(self.config),
OCRProcessor(self.config),
]
supplier_configs = []
try:
import json
from pathlib import Path
# 优先从`config/suppliers_config.json`加载
config_path = Path("config/suppliers_config.json")
if not config_path.exists():
# 兼容其它路径
config_path = Path("./suppliers_config.json")
if config_path.exists():
with open(config_path, 'r', encoding='utf-8') as f:
data = json.load(f)
ok, errs, supplier_configs = self._validate_suppliers_config(data)
if not ok:
logger.error("供应商配置校验失败:\n" + "\n".join([f"- {e}" for e in errs]))
else:
logger.info(f"{config_path} 加载供应商配置,共 {len(supplier_configs)}")
else:
logger.info("未找到供应商配置文件,跳过供应商处理器加载")
except Exception as e:
logger.error(f"读取供应商配置失败: {e}")
for supplier_config in supplier_configs:
try:
from ..core.processors.supplier_processors.generic_supplier_processor import GenericSupplierProcessor
processor = GenericSupplierProcessor(self.config, supplier_config)
self.processors.append(processor)
logger.info(f"加载供应商处理器: {processor.name}")
except Exception as e:
logger.error(f"加载供应商处理器失败: {e}")
logger.info(f"成功加载{len(self.processors)}个处理器")
except Exception as e:
logger.error(f"加载处理器时出错: {e}", exc_info=True)
self.processors = [
TobaccoProcessor(self.config),
OCRProcessor(self.config),
]
def _validate_suppliers_config(self, data):
try:
suppliers = data.get('suppliers')
errors = []
valid = []
if not isinstance(suppliers, list) or not suppliers:
errors.append('suppliers必须是非空数组')
return False, errors, []
for idx, s in enumerate(suppliers):
e = self._validate_single_supplier(s, idx)
if e:
errors.extend(e)
else:
valid.append(s)
return len(errors) == 0, errors, valid
except Exception as e:
return False, [f'配置解析异常: {e}'], []
def _validate_single_supplier(self, s, idx):
errs = []
prefix = f'suppliers[{idx}]'
name = s.get('name')
if not name or not isinstance(name, str):
errs.append(f'{prefix}.name 必须为字符串')
fp = s.get('filename_patterns', [])
ci = s.get('content_indicators', [])
if not fp and not ci:
errs.append(f'{prefix} 必须至少提供 filename_patterns 或 content_indicators 之一')
cm = s.get('column_mapping', {})
if cm and not isinstance(cm, dict):
errs.append(f'{prefix}.column_mapping 必须为对象')
cr = s.get('cleaning_rules', [])
if cr and not isinstance(cr, list):
errs.append(f'{prefix}.cleaning_rules 必须为数组')
else:
for i, rule in enumerate(cr):
rtype = rule.get('type')
if rtype not in ('remove_rows','fill_na','convert_type'):
errs.append(f'{prefix}.cleaning_rules[{i}].type 非法: {rtype}')
if rtype == 'remove_rows' and not rule.get('condition'):
errs.append(f'{prefix}.cleaning_rules[{i}].condition 必填')
if rtype in ('fill_na','convert_type'):
if not rule.get('columns') and not rule.get('column'):
errs.append(f'{prefix}.cleaning_rules[{i}] 需提供 columns 或 column')
calc = s.get('calculations', [])
if calc and not isinstance(calc, list):
errs.append(f'{prefix}.calculations 必须为数组')
else:
for i, c in enumerate(calc):
ctype = c.get('type')
if ctype not in ('multiply','divide','formula'):
errs.append(f'{prefix}.calculations[{i}].type 非法: {ctype}')
if ctype in ('multiply','divide'):
if not c.get('source_column') or not c.get('target_column'):
errs.append(f'{prefix}.calculations[{i}] 需提供 source_column 与 target_column')
if ctype == 'formula' and (not c.get('formula') or not c.get('target_column')):
errs.append(f'{prefix}.calculations[{i}] 需提供 formula 与 target_column')
return errs
def process_file(self, input_file: Path, output_dir: Path,
preferred_processor: Optional[str] = None) -> Optional[Path]:
"""处理文件 - 自动选择合适的处理器
Args:
input_file: 输入文件路径
output_dir: 输出目录路径
preferred_processor: 优先使用的处理器名称(可选)
Returns:
输出文件路径处理失败返回None
"""
if not input_file.exists():
logger.error(f"输入文件不存在: {input_file}")
return None
if not output_dir.exists():
output_dir.mkdir(parents=True, exist_ok=True)
try:
# 如果指定了优先处理器,先尝试使用它
if preferred_processor:
processor = self._get_processor_by_name(preferred_processor)
if processor and processor.can_process(input_file):
logger.info(f"使用指定的处理器: {processor.name}")
return processor.process(input_file, output_dir)
else:
logger.warning(f"指定的处理器不可用或无法处理该文件: {preferred_processor}")
# 自动选择合适的处理器
suitable_processors = [p for p in self.processors if p.can_process(input_file)]
if not suitable_processors:
logger.warning(f"未找到适合处理文件的处理器: {input_file}")
logger.info(f"支持的文件类型: {self.get_supported_types()}")
return None
# 使用第一个合适的处理器
processor = suitable_processors[0]
logger.info(f"使用处理器 {processor.name} 处理文件: {input_file}")
return processor.process(input_file, output_dir)
except Exception as e:
logger.error(f"处理文件时出错: {e}", exc_info=True)
return None
def _get_processor_by_name(self, name: str) -> Optional[BaseProcessor]:
"""根据名称获取处理器
Args:
name: 处理器名称
Returns:
处理器实例或None
"""
for processor in self.processors:
if processor.name == name or processor.__class__.__name__ == name:
return processor
return None
def get_supported_types(self) -> List[Dict[str, Any]]:
"""获取支持的文件类型信息
Returns:
处理器类型信息列表
"""
return [
{
'name': processor.name,
'description': processor.description,
'extensions': processor.get_supported_extensions(),
'class_name': processor.__class__.__name__
}
for processor in self.processors
]
def get_processor_info(self) -> List[Dict[str, Any]]:
"""获取处理器详细信息
Returns:
处理器详细信息列表
"""
return [
{
'name': processor.name,
'description': processor.description,
'extensions': processor.get_supported_extensions(),
'required_columns': processor.get_required_columns(),
'class_name': processor.__class__.__name__,
'module': processor.__class__.__module__
}
for processor in self.processors
]
def can_process_file(self, file_path: Path) -> bool:
"""检查是否有处理器能处理该文件
Args:
file_path: 文件路径
Returns:
是否有处理器能处理
"""
if not file_path.exists():
return False
return any(processor.can_process(file_path) for processor in self.processors)
def get_suitable_processors(self, file_path: Path) -> List[BaseProcessor]:
"""获取能处理该文件的所有处理器
Args:
file_path: 文件路径
Returns:
合适的处理器列表
"""
if not file_path.exists():
return []
return [p for p in self.processors if p.can_process(file_path)]
def reload_processors(self):
"""重新加载处理器"""
logger.info("重新加载处理器...")
self.processors.clear()
self._load_processors()
logger.info(f"重新加载完成,共{len(self.processors)}个处理器")
def add_processor(self, processor: BaseProcessor):
"""添加处理器
Args:
processor: 处理器实例
"""
self.processors.append(processor)
logger.info(f"添加处理器: {processor.name}")
def remove_processor(self, processor_name: str) -> bool:
"""移除处理器
Args:
processor_name: 处理器名称
Returns:
是否成功移除
"""
for i, processor in enumerate(self.processors):
if processor.name == processor_name or processor.__class__.__name__ == processor_name:
del self.processors[i]
logger.info(f"移除处理器: {processor_name}")
return True
logger.warning(f"未找到要移除的处理器: {processor_name}")
return False