298 lines
12 KiB
Python
298 lines
12 KiB
Python
"""
|
||
处理器调度服务
|
||
|
||
负责管理和调度各种文件处理器,实现智能文件类型检测和处理器选择
|
||
"""
|
||
|
||
import logging
|
||
from typing import Dict, Any, Optional, List
|
||
from pathlib import Path
|
||
|
||
from ..core.processors.base import BaseProcessor
|
||
from ..core.processors.tobacco_processor import TobaccoProcessor
|
||
from ..core.processors.ocr_processor import OCRProcessor
|
||
from ..core.utils.log_utils import get_logger
|
||
|
||
logger = get_logger(__name__)
|
||
|
||
|
||
class ProcessorService:
|
||
"""处理器调度服务
|
||
|
||
负责管理所有处理器实例,提供统一的文件处理接口
|
||
"""
|
||
|
||
def __init__(self, config: Dict[str, Any]):
|
||
"""初始化处理器服务
|
||
|
||
Args:
|
||
config: 系统配置字典
|
||
"""
|
||
self.config = config
|
||
self.processors: List[BaseProcessor] = []
|
||
self._load_processors()
|
||
logger.info(f"处理器服务初始化完成,加载了{len(self.processors)}个处理器")
|
||
|
||
def _load_processors(self):
|
||
"""加载所有处理器"""
|
||
try:
|
||
self.processors = [
|
||
TobaccoProcessor(self.config),
|
||
OCRProcessor(self.config),
|
||
]
|
||
|
||
supplier_configs = []
|
||
try:
|
||
import json
|
||
from pathlib import Path
|
||
# 优先从`config/suppliers_config.json`加载
|
||
config_path = Path("config/suppliers_config.json")
|
||
if not config_path.exists():
|
||
# 兼容其它路径
|
||
config_path = Path("./suppliers_config.json")
|
||
if config_path.exists():
|
||
with open(config_path, 'r', encoding='utf-8') as f:
|
||
data = json.load(f)
|
||
ok, errs, supplier_configs = self._validate_suppliers_config(data)
|
||
if not ok:
|
||
logger.error("供应商配置校验失败:\n" + "\n".join([f"- {e}" for e in errs]))
|
||
else:
|
||
logger.info(f"从 {config_path} 加载供应商配置,共 {len(supplier_configs)} 项")
|
||
else:
|
||
logger.info("未找到供应商配置文件,跳过供应商处理器加载")
|
||
except Exception as e:
|
||
logger.error(f"读取供应商配置失败: {e}")
|
||
|
||
for supplier_config in supplier_configs:
|
||
try:
|
||
from ..core.processors.supplier_processors.generic_supplier_processor import GenericSupplierProcessor
|
||
processor = GenericSupplierProcessor(self.config, supplier_config)
|
||
self.processors.append(processor)
|
||
logger.info(f"加载供应商处理器: {processor.name}")
|
||
except Exception as e:
|
||
logger.error(f"加载供应商处理器失败: {e}")
|
||
|
||
logger.info(f"成功加载{len(self.processors)}个处理器")
|
||
|
||
except Exception as e:
|
||
logger.error(f"加载处理器时出错: {e}", exc_info=True)
|
||
self.processors = [
|
||
TobaccoProcessor(self.config),
|
||
OCRProcessor(self.config),
|
||
]
|
||
|
||
def _validate_suppliers_config(self, data):
|
||
try:
|
||
suppliers = data.get('suppliers')
|
||
errors = []
|
||
valid = []
|
||
if not isinstance(suppliers, list) or not suppliers:
|
||
errors.append('suppliers必须是非空数组')
|
||
return False, errors, []
|
||
for idx, s in enumerate(suppliers):
|
||
e = self._validate_single_supplier(s, idx)
|
||
if e:
|
||
errors.extend(e)
|
||
else:
|
||
valid.append(s)
|
||
return len(errors) == 0, errors, valid
|
||
except Exception as e:
|
||
return False, [f'配置解析异常: {e}'], []
|
||
|
||
def _validate_single_supplier(self, s, idx):
|
||
errs = []
|
||
prefix = f'suppliers[{idx}]'
|
||
name = s.get('name')
|
||
if not name or not isinstance(name, str):
|
||
errs.append(f'{prefix}.name 必须为字符串')
|
||
fp = s.get('filename_patterns', [])
|
||
ci = s.get('content_indicators', [])
|
||
if not fp and not ci:
|
||
errs.append(f'{prefix} 必须至少提供 filename_patterns 或 content_indicators 之一')
|
||
cm = s.get('column_mapping', {})
|
||
if cm and not isinstance(cm, dict):
|
||
errs.append(f'{prefix}.column_mapping 必须为对象')
|
||
cr = s.get('cleaning_rules', [])
|
||
if cr and not isinstance(cr, list):
|
||
errs.append(f'{prefix}.cleaning_rules 必须为数组')
|
||
else:
|
||
for i, rule in enumerate(cr):
|
||
rtype = rule.get('type')
|
||
if rtype not in ('remove_rows','fill_na','convert_type'):
|
||
errs.append(f'{prefix}.cleaning_rules[{i}].type 非法: {rtype}')
|
||
if rtype == 'remove_rows' and not rule.get('condition'):
|
||
errs.append(f'{prefix}.cleaning_rules[{i}].condition 必填')
|
||
if rtype in ('fill_na','convert_type'):
|
||
if not rule.get('columns') and not rule.get('column'):
|
||
errs.append(f'{prefix}.cleaning_rules[{i}] 需提供 columns 或 column')
|
||
calc = s.get('calculations', [])
|
||
if calc and not isinstance(calc, list):
|
||
errs.append(f'{prefix}.calculations 必须为数组')
|
||
else:
|
||
for i, c in enumerate(calc):
|
||
ctype = c.get('type')
|
||
if ctype not in ('multiply','divide','formula'):
|
||
errs.append(f'{prefix}.calculations[{i}].type 非法: {ctype}')
|
||
if ctype in ('multiply','divide'):
|
||
if not c.get('source_column') or not c.get('target_column'):
|
||
errs.append(f'{prefix}.calculations[{i}] 需提供 source_column 与 target_column')
|
||
if ctype == 'formula' and (not c.get('formula') or not c.get('target_column')):
|
||
errs.append(f'{prefix}.calculations[{i}] 需提供 formula 与 target_column')
|
||
return errs
|
||
|
||
def process_file(self, input_file: Path, output_dir: Path,
|
||
preferred_processor: Optional[str] = None) -> Optional[Path]:
|
||
"""处理文件 - 自动选择合适的处理器
|
||
|
||
Args:
|
||
input_file: 输入文件路径
|
||
output_dir: 输出目录路径
|
||
preferred_processor: 优先使用的处理器名称(可选)
|
||
|
||
Returns:
|
||
输出文件路径,处理失败返回None
|
||
"""
|
||
if not input_file.exists():
|
||
logger.error(f"输入文件不存在: {input_file}")
|
||
return None
|
||
|
||
if not output_dir.exists():
|
||
output_dir.mkdir(parents=True, exist_ok=True)
|
||
|
||
try:
|
||
# 如果指定了优先处理器,先尝试使用它
|
||
if preferred_processor:
|
||
processor = self._get_processor_by_name(preferred_processor)
|
||
if processor and processor.can_process(input_file):
|
||
logger.info(f"使用指定的处理器: {processor.name}")
|
||
return processor.process(input_file, output_dir)
|
||
else:
|
||
logger.warning(f"指定的处理器不可用或无法处理该文件: {preferred_processor}")
|
||
|
||
# 自动选择合适的处理器
|
||
suitable_processors = [p for p in self.processors if p.can_process(input_file)]
|
||
|
||
if not suitable_processors:
|
||
logger.warning(f"未找到适合处理文件的处理器: {input_file}")
|
||
logger.info(f"支持的文件类型: {self.get_supported_types()}")
|
||
return None
|
||
|
||
# 使用第一个合适的处理器
|
||
processor = suitable_processors[0]
|
||
logger.info(f"使用处理器 {processor.name} 处理文件: {input_file}")
|
||
|
||
return processor.process(input_file, output_dir)
|
||
|
||
except Exception as e:
|
||
logger.error(f"处理文件时出错: {e}", exc_info=True)
|
||
return None
|
||
|
||
def _get_processor_by_name(self, name: str) -> Optional[BaseProcessor]:
|
||
"""根据名称获取处理器
|
||
|
||
Args:
|
||
name: 处理器名称
|
||
|
||
Returns:
|
||
处理器实例或None
|
||
"""
|
||
for processor in self.processors:
|
||
if processor.name == name or processor.__class__.__name__ == name:
|
||
return processor
|
||
return None
|
||
|
||
def get_supported_types(self) -> List[Dict[str, Any]]:
|
||
"""获取支持的文件类型信息
|
||
|
||
Returns:
|
||
处理器类型信息列表
|
||
"""
|
||
return [
|
||
{
|
||
'name': processor.name,
|
||
'description': processor.description,
|
||
'extensions': processor.get_supported_extensions(),
|
||
'class_name': processor.__class__.__name__
|
||
}
|
||
for processor in self.processors
|
||
]
|
||
|
||
def get_processor_info(self) -> List[Dict[str, Any]]:
|
||
"""获取处理器详细信息
|
||
|
||
Returns:
|
||
处理器详细信息列表
|
||
"""
|
||
return [
|
||
{
|
||
'name': processor.name,
|
||
'description': processor.description,
|
||
'extensions': processor.get_supported_extensions(),
|
||
'required_columns': processor.get_required_columns(),
|
||
'class_name': processor.__class__.__name__,
|
||
'module': processor.__class__.__module__
|
||
}
|
||
for processor in self.processors
|
||
]
|
||
|
||
def can_process_file(self, file_path: Path) -> bool:
|
||
"""检查是否有处理器能处理该文件
|
||
|
||
Args:
|
||
file_path: 文件路径
|
||
|
||
Returns:
|
||
是否有处理器能处理
|
||
"""
|
||
if not file_path.exists():
|
||
return False
|
||
|
||
return any(processor.can_process(file_path) for processor in self.processors)
|
||
|
||
def get_suitable_processors(self, file_path: Path) -> List[BaseProcessor]:
|
||
"""获取能处理该文件的所有处理器
|
||
|
||
Args:
|
||
file_path: 文件路径
|
||
|
||
Returns:
|
||
合适的处理器列表
|
||
"""
|
||
if not file_path.exists():
|
||
return []
|
||
|
||
return [p for p in self.processors if p.can_process(file_path)]
|
||
|
||
def reload_processors(self):
|
||
"""重新加载处理器"""
|
||
logger.info("重新加载处理器...")
|
||
self.processors.clear()
|
||
self._load_processors()
|
||
logger.info(f"重新加载完成,共{len(self.processors)}个处理器")
|
||
|
||
def add_processor(self, processor: BaseProcessor):
|
||
"""添加处理器
|
||
|
||
Args:
|
||
processor: 处理器实例
|
||
"""
|
||
self.processors.append(processor)
|
||
logger.info(f"添加处理器: {processor.name}")
|
||
|
||
def remove_processor(self, processor_name: str) -> bool:
|
||
"""移除处理器
|
||
|
||
Args:
|
||
processor_name: 处理器名称
|
||
|
||
Returns:
|
||
是否成功移除
|
||
"""
|
||
for i, processor in enumerate(self.processors):
|
||
if processor.name == processor_name or processor.__class__.__name__ == processor_name:
|
||
del self.processors[i]
|
||
logger.info(f"移除处理器: {processor_name}")
|
||
return True
|
||
logger.warning(f"未找到要移除的处理器: {processor_name}")
|
||
return False
|