""" 处理器调度服务 负责管理和调度各种文件处理器,实现智能文件类型检测和处理器选择 """ import logging from typing import Dict, Any, Optional, List from pathlib import Path from ..core.processors.base import BaseProcessor from ..core.processors.tobacco_processor import TobaccoProcessor from ..core.processors.ocr_processor import OCRProcessor from ..core.utils.log_utils import get_logger logger = get_logger(__name__) class ProcessorService: """处理器调度服务 负责管理所有处理器实例,提供统一的文件处理接口 """ def __init__(self, config: Dict[str, Any]): """初始化处理器服务 Args: config: 系统配置字典 """ self.config = config self.processors: List[BaseProcessor] = [] self._load_processors() logger.info(f"处理器服务初始化完成,加载了{len(self.processors)}个处理器") def _load_processors(self): """加载所有处理器""" try: self.processors = [ TobaccoProcessor(self.config), OCRProcessor(self.config), ] supplier_configs = [] try: import json from pathlib import Path # 优先从`config/suppliers_config.json`加载 config_path = Path("config/suppliers_config.json") if not config_path.exists(): # 兼容其它路径 config_path = Path("./suppliers_config.json") if config_path.exists(): with open(config_path, 'r', encoding='utf-8') as f: data = json.load(f) ok, errs, supplier_configs = self._validate_suppliers_config(data) if not ok: logger.error("供应商配置校验失败:\n" + "\n".join([f"- {e}" for e in errs])) else: logger.info(f"从 {config_path} 加载供应商配置,共 {len(supplier_configs)} 项") else: logger.info("未找到供应商配置文件,跳过供应商处理器加载") except Exception as e: logger.error(f"读取供应商配置失败: {e}") for supplier_config in supplier_configs: try: from ..core.processors.supplier_processors.generic_supplier_processor import GenericSupplierProcessor processor = GenericSupplierProcessor(self.config, supplier_config) self.processors.append(processor) logger.info(f"加载供应商处理器: {processor.name}") except Exception as e: logger.error(f"加载供应商处理器失败: {e}") logger.info(f"成功加载{len(self.processors)}个处理器") except Exception as e: logger.error(f"加载处理器时出错: {e}", exc_info=True) self.processors = [ TobaccoProcessor(self.config), OCRProcessor(self.config), ] def _validate_suppliers_config(self, data): try: suppliers = data.get('suppliers') errors = [] valid = [] if not isinstance(suppliers, list) or not suppliers: errors.append('suppliers必须是非空数组') return False, errors, [] for idx, s in enumerate(suppliers): e = self._validate_single_supplier(s, idx) if e: errors.extend(e) else: valid.append(s) return len(errors) == 0, errors, valid except Exception as e: return False, [f'配置解析异常: {e}'], [] def _validate_single_supplier(self, s, idx): errs = [] prefix = f'suppliers[{idx}]' name = s.get('name') if not name or not isinstance(name, str): errs.append(f'{prefix}.name 必须为字符串') fp = s.get('filename_patterns', []) ci = s.get('content_indicators', []) if not fp and not ci: errs.append(f'{prefix} 必须至少提供 filename_patterns 或 content_indicators 之一') cm = s.get('column_mapping', {}) if cm and not isinstance(cm, dict): errs.append(f'{prefix}.column_mapping 必须为对象') cr = s.get('cleaning_rules', []) if cr and not isinstance(cr, list): errs.append(f'{prefix}.cleaning_rules 必须为数组') else: for i, rule in enumerate(cr): rtype = rule.get('type') if rtype not in ('remove_rows','fill_na','convert_type'): errs.append(f'{prefix}.cleaning_rules[{i}].type 非法: {rtype}') if rtype == 'remove_rows' and not rule.get('condition'): errs.append(f'{prefix}.cleaning_rules[{i}].condition 必填') if rtype in ('fill_na','convert_type'): if not rule.get('columns') and not rule.get('column'): errs.append(f'{prefix}.cleaning_rules[{i}] 需提供 columns 或 column') calc = s.get('calculations', []) if calc and not isinstance(calc, list): errs.append(f'{prefix}.calculations 必须为数组') else: for i, c in enumerate(calc): ctype = c.get('type') if ctype not in ('multiply','divide','formula'): errs.append(f'{prefix}.calculations[{i}].type 非法: {ctype}') if ctype in ('multiply','divide'): if not c.get('source_column') or not c.get('target_column'): errs.append(f'{prefix}.calculations[{i}] 需提供 source_column 与 target_column') if ctype == 'formula' and (not c.get('formula') or not c.get('target_column')): errs.append(f'{prefix}.calculations[{i}] 需提供 formula 与 target_column') return errs def process_file(self, input_file: Path, output_dir: Path, preferred_processor: Optional[str] = None) -> Optional[Path]: """处理文件 - 自动选择合适的处理器 Args: input_file: 输入文件路径 output_dir: 输出目录路径 preferred_processor: 优先使用的处理器名称(可选) Returns: 输出文件路径,处理失败返回None """ if not input_file.exists(): logger.error(f"输入文件不存在: {input_file}") return None if not output_dir.exists(): output_dir.mkdir(parents=True, exist_ok=True) try: # 如果指定了优先处理器,先尝试使用它 if preferred_processor: processor = self._get_processor_by_name(preferred_processor) if processor and processor.can_process(input_file): logger.info(f"使用指定的处理器: {processor.name}") return processor.process(input_file, output_dir) else: logger.warning(f"指定的处理器不可用或无法处理该文件: {preferred_processor}") # 自动选择合适的处理器 suitable_processors = [p for p in self.processors if p.can_process(input_file)] if not suitable_processors: logger.warning(f"未找到适合处理文件的处理器: {input_file}") logger.info(f"支持的文件类型: {self.get_supported_types()}") return None # 使用第一个合适的处理器 processor = suitable_processors[0] logger.info(f"使用处理器 {processor.name} 处理文件: {input_file}") return processor.process(input_file, output_dir) except Exception as e: logger.error(f"处理文件时出错: {e}", exc_info=True) return None def _get_processor_by_name(self, name: str) -> Optional[BaseProcessor]: """根据名称获取处理器 Args: name: 处理器名称 Returns: 处理器实例或None """ for processor in self.processors: if processor.name == name or processor.__class__.__name__ == name: return processor return None def get_supported_types(self) -> List[Dict[str, Any]]: """获取支持的文件类型信息 Returns: 处理器类型信息列表 """ return [ { 'name': processor.name, 'description': processor.description, 'extensions': processor.get_supported_extensions(), 'class_name': processor.__class__.__name__ } for processor in self.processors ] def get_processor_info(self) -> List[Dict[str, Any]]: """获取处理器详细信息 Returns: 处理器详细信息列表 """ return [ { 'name': processor.name, 'description': processor.description, 'extensions': processor.get_supported_extensions(), 'required_columns': processor.get_required_columns(), 'class_name': processor.__class__.__name__, 'module': processor.__class__.__module__ } for processor in self.processors ] def can_process_file(self, file_path: Path) -> bool: """检查是否有处理器能处理该文件 Args: file_path: 文件路径 Returns: 是否有处理器能处理 """ if not file_path.exists(): return False return any(processor.can_process(file_path) for processor in self.processors) def get_suitable_processors(self, file_path: Path) -> List[BaseProcessor]: """获取能处理该文件的所有处理器 Args: file_path: 文件路径 Returns: 合适的处理器列表 """ if not file_path.exists(): return [] return [p for p in self.processors if p.can_process(file_path)] def reload_processors(self): """重新加载处理器""" logger.info("重新加载处理器...") self.processors.clear() self._load_processors() logger.info(f"重新加载完成,共{len(self.processors)}个处理器") def add_processor(self, processor: BaseProcessor): """添加处理器 Args: processor: 处理器实例 """ self.processors.append(processor) logger.info(f"添加处理器: {processor.name}") def remove_processor(self, processor_name: str) -> bool: """移除处理器 Args: processor_name: 处理器名称 Returns: 是否成功移除 """ for i, processor in enumerate(self.processors): if processor.name == processor_name or processor.__class__.__name__ == processor_name: del self.processors[i] logger.info(f"移除处理器: {processor_name}") return True logger.warning(f"未找到要移除的处理器: {processor_name}") return False