增强版v2-初始化仓库，验证好了ocr部分，先备份一次

2025-05-02 17:25:47 +08:00
commit 0035cd1893
88 changed files with 9031 additions and 0 deletions
@@ -0,0 +1,5 @@
+"""
+OCR订单处理系统 - Excel处理模块
+----------------------------
+提供Excel文件处理、数据提取和转换功能。
+""" 
@@ -0,0 +1,213 @@
+"""
+单位转换处理模块
+-------------
+提供规格和单位的处理和转换功能。
+"""
+
+import re
+from typing import Dict, List, Optional, Tuple, Any
+
+from ..utils.log_utils import get_logger
+from ..utils.string_utils import (
+    clean_string,
+    extract_number,
+    extract_unit,
+    extract_number_and_unit,
+    parse_specification
+)
+
+logger = get_logger(__name__)
+
+class UnitConverter:
+    """
+    单位转换器：处理商品规格和单位转换
+    """
+    
+    def __init__(self):
+        """初始化单位转换器"""
+        # 特殊条码配置
+        self.special_barcodes = {
+            '6925019900087': {
+                'multiplier': 10,  # 数量乘以10
+                'target_unit': '瓶',  # 目标单位
+                'description': '特殊处理：数量*10，单位转换为瓶'
+            }
+            # 可以在这里添加更多特殊条码的配置
+        }
+        
+        # 有效的单位列表
+        self.valid_units = ['件', '箱', '包', '提', '盒', '瓶', '个', '支', '袋', '副', '桶', '罐', 'L', 'l', '升']
+        
+        # 需要特殊处理的单位
+        self.special_units = ['件', '箱', '提', '盒']
+        
+        logger.info("单位转换器初始化完成")
+    
+    def add_special_barcode(self, barcode: str, multiplier: int, target_unit: str, description: str = "") -> None:
+        """
+        添加特殊条码处理配置
+        
+        Args:
+            barcode: 条码
+            multiplier: 数量乘数
+            target_unit: 目标单位
+            description: 处理描述
+        """
+        self.special_barcodes[barcode] = {
+            'multiplier': multiplier,
+            'target_unit': target_unit,
+            'description': description or f'特殊处理：数量*{multiplier}，单位转换为{target_unit}'
+        }
+        logger.info(f"添加特殊条码配置: {barcode}, {description}")
+    
+    def infer_specification_from_name(self, product_name: str) -> Optional[str]:
+        """
+        从商品名称推断规格
+        
+        Args:
+            product_name: 商品名称
+            
+        Returns:
+            推断的规格，如果无法推断则返回None
+        """
+        if not product_name or not isinstance(product_name, str):
+            return None
+            
+        try:
+            # 清理商品名称
+            name = clean_string(product_name)
+            
+            # 1. 匹配 XX入纸箱 格式
+            match = re.search(r'(\d+)入纸箱', name)
+            if match:
+                return f"1*{match.group(1)}"
+                
+            # 2. 匹配 绿茶1*15-纸箱装 格式
+            match = re.search(r'(\d+)[*×xX](\d+)[-\s]?纸箱', name)
+            if match:
+                return f"{match.group(1)}*{match.group(2)}"
+                
+            # 3. 匹配 12.9L桶装水 格式
+            match = re.search(r'([\d\.]+)[Ll升](?!.*[*×xX])', name)
+            if match:
+                return f"{match.group(1)}L*1"
+                
+            # 4. 匹配 商品12入纸箱 格式（数字在中间）
+            match = re.search(r'\D(\d+)入\w*箱', name)
+            if match:
+                return f"1*{match.group(1)}"
+                
+            # 5. 匹配 商品15纸箱 格式（数字在中间）
+            match = re.search(r'\D(\d+)\w*箱', name)
+            if match:
+                return f"1*{match.group(1)}"
+                
+            # 6. 匹配 商品1*30 格式
+            match = re.search(r'(\d+)[*×xX](\d+)', name)
+            if match:
+                return f"{match.group(1)}*{match.group(2)}"
+                
+            logger.debug(f"无法从商品名称推断规格: {name}")
+            return None
+            
+        except Exception as e:
+            logger.error(f"从商品名称推断规格时出错: {e}")
+            return None
+    
+    def extract_unit_from_quantity(self, quantity_str: str) -> Tuple[Optional[float], Optional[str]]:
+        """
+        从数量字符串提取单位
+        
+        Args:
+            quantity_str: 数量字符串
+            
+        Returns:
+            (数量, 单位)元组
+        """
+        if not quantity_str or not isinstance(quantity_str, str):
+            return None, None
+            
+        try:
+            # 清理数量字符串
+            quantity_str = clean_string(quantity_str)
+            
+            # 提取数字和单位
+            return extract_number_and_unit(quantity_str)
+            
+        except Exception as e:
+            logger.error(f"从数量字符串提取单位时出错: {quantity_str}, 错误: {e}")
+            return None, None
+    
+    def process_unit_conversion(self, product: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        处理单位转换，根据单位和规格转换数量和单价
+        
+        Args:
+            product: 商品字典，包含条码、单位、规格、数量和单价等字段
+            
+        Returns:
+            处理后的商品字典
+        """
+        # 复制商品信息，避免修改原始数据
+        result = product.copy()
+        
+        try:
+            # 获取条码、单位、规格、数量和单价
+            barcode = product.get('barcode', '')
+            unit = product.get('unit', '')
+            specification = product.get('specification', '')
+            quantity = product.get('quantity', 0)
+            price = product.get('price', 0)
+            
+            # 如果缺少关键信息，无法进行转换
+            if not barcode or quantity == 0:
+                return result
+                
+            # 1. 首先检查是否是特殊条码
+            if barcode in self.special_barcodes:
+                special_config = self.special_barcodes[barcode]
+                logger.info(f"应用特殊条码配置: {barcode}, {special_config['description']}")
+                
+                # 应用乘数和单位转换
+                result['quantity'] = quantity * special_config['multiplier']
+                result['unit'] = special_config['target_unit']
+                
+                # 如果有单价，进行单价转换
+                if price != 0:
+                    result['price'] = price / special_config['multiplier']
+                
+                return result
+            
+            # 2. 提取规格包装数量
+            package_quantity = None
+            if specification:
+                package_quantity = parse_specification(specification)
+            
+            # 3. 处理单位转换
+            if unit and unit in self.special_units and package_quantity:
+                # 判断是否是三级规格（1*5*12格式）
+                is_three_level = bool(re.search(r'\d+[\*xX×]\d+[\*xX×]\d+', str(specification)))
+                
+                # 对于"提"和"盒"单位的特殊处理
+                if (unit in ['提', '盒']) and not is_three_level:
+                    # 二级规格：保持原数量不变
+                    logger.info(f"二级规格的提/盒单位，保持原状: {unit}, 规格={specification}")
+                    return result
+                
+                # 标准处理：数量×包装数量，单价÷包装数量
+                logger.info(f"标准单位转换: {unit}->瓶, 规格={specification}, 包装数量={package_quantity}")
+                result['quantity'] = quantity * package_quantity
+                result['unit'] = '瓶'
+                
+                if price != 0:
+                    result['price'] = price / package_quantity
+                
+                return result
+            
+            # 4. 默认返回原始数据
+            return result
+        
+        except Exception as e:
+            logger.error(f"单位转换处理出错: {e}")
+            # 发生错误时，返回原始数据
+            return result 
@@ -0,0 +1,375 @@
+"""
+订单合并模块
+----------
+提供采购单合并功能，将多个采购单合并为一个。
+"""
+
+import os
+import re
+import pandas as pd
+import numpy as np
+import xlrd
+import xlwt
+from xlutils.copy import copy as xlcopy
+from typing import Dict, List, Optional, Tuple, Union, Any
+from datetime import datetime
+
+from ...config.settings import ConfigManager
+from ..utils.log_utils import get_logger
+from ..utils.file_utils import (
+    ensure_dir,
+    get_file_extension,
+    get_files_by_extensions,
+    load_json,
+    save_json
+)
+from ..utils.string_utils import (
+    clean_string,
+    clean_barcode,
+    format_barcode
+)
+
+logger = get_logger(__name__)
+
+class PurchaseOrderMerger:
+    """
+    采购单合并器：将多个采购单Excel文件合并成一个文件
+    """
+    
+    def __init__(self, config: Optional[ConfigManager] = None):
+        """
+        初始化采购单合并器
+        
+        Args:
+            config: 配置管理器，如果为None则创建新的
+        """
+        logger.info("初始化PurchaseOrderMerger")
+        self.config = config or ConfigManager()
+        
+        # 获取配置
+        self.output_dir = self.config.get_path('Paths', 'output_folder', 'data/output', create=True)
+        
+        # 获取模板文件路径
+        template_folder = self.config.get('Paths', 'template_folder', 'templates')
+        template_name = self.config.get('Templates', 'purchase_order', '银豹-采购单模板.xls')
+        
+        self.template_path = os.path.join(template_folder, template_name)
+        
+        # 检查模板文件是否存在
+        if not os.path.exists(self.template_path):
+            logger.error(f"模板文件不存在: {self.template_path}")
+            raise FileNotFoundError(f"模板文件不存在: {self.template_path}")
+        
+        # 用于记录已合并的文件
+        self.cache_file = os.path.join(self.output_dir, "merged_files.json")
+        self.merged_files = self._load_merged_files()
+        
+        logger.info(f"初始化完成，模板文件: {self.template_path}")
+    
+    def _load_merged_files(self) -> Dict[str, str]:
+        """
+        加载已合并文件的缓存
+        
+        Returns:
+            合并记录字典
+        """
+        return load_json(self.cache_file, {})
+        
+    def _save_merged_files(self) -> None:
+        """保存已合并文件的缓存"""
+        save_json(self.merged_files, self.cache_file)
+    
+    def get_purchase_orders(self) -> List[str]:
+        """
+        获取output目录下的采购单Excel文件
+        
+        Returns:
+            采购单文件路径列表
+        """
+        logger.info(f"搜索目录 {self.output_dir} 中的采购单Excel文件")
+        
+        # 获取所有Excel文件
+        all_files = get_files_by_extensions(self.output_dir, ['.xls', '.xlsx'])
+        
+        # 筛选采购单文件
+        purchase_orders = [
+            file for file in all_files 
+            if os.path.basename(file).startswith('采购单_')
+        ]
+        
+        if not purchase_orders:
+            logger.warning(f"未在 {self.output_dir} 目录下找到采购单Excel文件")
+            return []
+        
+        # 按修改时间排序，最新的在前
+        purchase_orders.sort(key=lambda x: os.path.getmtime(x), reverse=True)
+        
+        logger.info(f"找到 {len(purchase_orders)} 个采购单Excel文件")
+        return purchase_orders
+    
+    def read_purchase_order(self, file_path: str) -> Optional[pd.DataFrame]:
+        """
+        读取采购单Excel文件
+        
+        Args:
+            file_path: 采购单文件路径
+            
+        Returns:
+            数据帧，如果读取失败则返回None
+        """
+        try:
+            # 读取Excel文件
+            df = pd.read_excel(file_path)
+            logger.info(f"成功读取采购单文件: {file_path}")
+            
+            # 打印列名，用于调试
+            logger.debug(f"Excel文件的列名: {df.columns.tolist()}")
+            
+            # 检查是否有特殊表头结构（如在第3行）
+            special_header = False
+            if len(df) > 3:  # 确保有足够的行
+                row3 = df.iloc[3].astype(str)
+                header_keywords = ['行号', '条形码', '条码', '商品名称', '规格', '单价', '数量', '金额', '单位']
+                # 计算匹配的关键词数量
+                matches = sum(1 for keyword in header_keywords if any(keyword in str(val) for val in row3.values))
+                # 如果匹配了至少3个关键词，认为第3行是表头
+                if matches >= 3:
+                    logger.info(f"检测到特殊表头结构，使用第3行作为列名")
+                    # 创建新的数据帧，使用第3行作为列名，数据从第4行开始
+                    header_row = df.iloc[3]
+                    data_rows = df.iloc[4:].reset_index(drop=True)
+                    # 为每一列分配一个名称（避免重复的列名）
+                    new_columns = []
+                    for i, col in enumerate(header_row):
+                        col_str = str(col)
+                        if col_str == 'nan' or col_str == 'None' or pd.isna(col):
+                            new_columns.append(f"Col_{i}")
+                        else:
+                            new_columns.append(col_str)
+                    # 使用新列名创建新的DataFrame
+                    data_rows.columns = new_columns
+                    df = data_rows
+                    special_header = True
+                    logger.debug(f"重新构建的数据帧列名: {df.columns.tolist()}")
+            
+            # 定义可能的列名映射
+            column_mapping = {
+                '条码': ['条码', '条形码', '商品条码', 'barcode', '商品条形码', '条形码', '商品条码', '商品编码', '商品编号', '条形码', '条码（必填）'],
+                '采购量': ['数量', '采购数量', '购买数量', '采购数量', '订单数量', '采购数量', '采购量（必填）'],
+                '采购单价': ['单价', '价格', '采购单价', '销售价', '采购单价（必填）'],
+                '赠送量': ['赠送量', '赠品数量', '赠送数量', '赠品']
+            }
+            
+            # 映射实际的列名
+            mapped_columns = {}
+            for target_col, possible_names in column_mapping.items():
+                for col in df.columns:
+                    # 移除列名中的空白字符和括号内容以进行比较
+                    clean_col = re.sub(r'\s+', '', str(col))
+                    clean_col = re.sub(r'（.*?）', '', clean_col)  # 移除括号内容
+                    for name in possible_names:
+                        clean_name = re.sub(r'\s+', '', name)
+                        clean_name = re.sub(r'（.*?）', '', clean_name)  # 移除括号内容
+                        if clean_col == clean_name:
+                            mapped_columns[target_col] = col
+                            break
+                    if target_col in mapped_columns:
+                        break
+            
+            # 如果找到了必要的列，重命名列
+            if mapped_columns:
+                # 如果没有找到条码列，无法继续处理
+                if '条码' not in mapped_columns:
+                    logger.error(f"未找到条码列: {file_path}")
+                    return None
+                    
+                df = df.rename(columns=mapped_columns)
+                logger.info(f"列名映射结果: {mapped_columns}")
+            
+            return df
+            
+        except Exception as e:
+            logger.error(f"读取采购单文件失败: {file_path}, 错误: {str(e)}")
+            return None
+    
+    def merge_purchase_orders(self, file_paths: List[str]) -> Optional[pd.DataFrame]:
+        """
+        合并多个采购单文件
+        
+        Args:
+            file_paths: 采购单文件路径列表
+            
+        Returns:
+            合并后的数据帧，如果合并失败则返回None
+        """
+        if not file_paths:
+            logger.warning("没有需要合并的采购单文件")
+            return None
+        
+        # 读取所有采购单文件
+        dfs = []
+        for file_path in file_paths:
+            df = self.read_purchase_order(file_path)
+            if df is not None:
+                dfs.append(df)
+        
+        if not dfs:
+            logger.warning("没有成功读取的采购单文件")
+            return None
+        
+        # 合并数据
+        logger.info(f"开始合并 {len(dfs)} 个采购单文件")
+        
+        # 首先，整理每个数据帧以确保它们有相同的结构
+        processed_dfs = []
+        for i, df in enumerate(dfs):
+            # 确保必要的列存在
+            required_columns = ['条码', '采购量', '采购单价']
+            missing_columns = [col for col in required_columns if col not in df.columns]
+            
+            if missing_columns:
+                logger.warning(f"数据帧 {i} 缺少必要的列: {missing_columns}")
+                continue
+            
+            # 处理赠送量列不存在的情况
+            if '赠送量' not in df.columns:
+                df['赠送量'] = pd.NA
+            
+            # 选择需要的列
+            selected_df = df[['条码', '采购量', '采购单价', '赠送量']].copy()
+            
+            # 清理和转换数据
+            selected_df['条码'] = selected_df['条码'].apply(lambda x: format_barcode(x) if pd.notna(x) else x)
+            selected_df['采购量'] = pd.to_numeric(selected_df['采购量'], errors='coerce')
+            selected_df['采购单价'] = pd.to_numeric(selected_df['采购单价'], errors='coerce')
+            selected_df['赠送量'] = pd.to_numeric(selected_df['赠送量'], errors='coerce')
+            
+            # 过滤无效行
+            valid_df = selected_df.dropna(subset=['条码', '采购量'])
+            
+            processed_dfs.append(valid_df)
+        
+        if not processed_dfs:
+            logger.warning("没有有效的数据帧用于合并")
+            return None
+        
+        # 将所有数据帧合并
+        merged_df = pd.concat(processed_dfs, ignore_index=True)
+        
+        # 按条码和单价分组，合并相同商品
+        merged_df['采购单价'] = merged_df['采购单价'].round(4)  # 四舍五入到4位小数，避免浮点误差
+        
+        # 对于同一条码和单价的商品，合并数量和赠送量
+        grouped = merged_df.groupby(['条码', '采购单价'], as_index=False).agg({
+            '采购量': 'sum',
+            '赠送量': lambda x: sum(x.dropna()) if len(x.dropna()) > 0 else pd.NA
+        })
+        
+        # 计算其他信息
+        grouped['采购金额'] = grouped['采购量'] * grouped['采购单价']
+        
+        # 排序，按条码升序
+        result = grouped.sort_values('条码').reset_index(drop=True)
+        
+        logger.info(f"合并完成，共 {len(result)} 条商品记录")
+        return result
+    
+    def create_merged_purchase_order(self, df: pd.DataFrame) -> Optional[str]:
+        """
+        创建合并的采购单文件
+        
+        Args:
+            df: 合并后的数据帧
+            
+        Returns:
+            输出文件路径，如果创建失败则返回None
+        """
+        try:
+            # 打开模板文件
+            template_workbook = xlrd.open_workbook(self.template_path, formatting_info=True)
+            template_sheet = template_workbook.sheet_by_index(0)
+            
+            # 创建可写的副本
+            output_workbook = xlcopy(template_workbook)
+            output_sheet = output_workbook.get_sheet(0)
+            
+            # 填充商品信息
+            start_row = 4  # 从第5行开始填充数据（索引从0开始）
+            
+            for i, (_, row) in enumerate(df.iterrows()):
+                r = start_row + i
+                
+                # 序号
+                output_sheet.write(r, 0, i + 1)
+                # 商品编码（条码）
+                output_sheet.write(r, 1, row['条码'])
+                # 商品名称（合并单没有名称信息，留空）
+                output_sheet.write(r, 2, "")
+                # 规格（合并单没有规格信息，留空）
+                output_sheet.write(r, 3, "")
+                # 单位（合并单没有单位信息，留空）
+                output_sheet.write(r, 4, "")
+                # 单价
+                output_sheet.write(r, 5, row['采购单价'])
+                # 采购数量
+                output_sheet.write(r, 6, row['采购量'])
+                # 采购金额
+                output_sheet.write(r, 7, row['采购金额'])
+                # 税率
+                output_sheet.write(r, 8, 0)
+                # 赠送量
+                if pd.notna(row['赠送量']):
+                    output_sheet.write(r, 9, row['赠送量'])
+                else:
+                    output_sheet.write(r, 9, "")
+            
+            # 生成输出文件名
+            timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
+            output_file = os.path.join(self.output_dir, f"合并采购单_{timestamp}.xls")
+            
+            # 保存文件
+            output_workbook.save(output_file)
+            logger.info(f"合并采购单已保存到: {output_file}")
+            return output_file
+            
+        except Exception as e:
+            logger.error(f"创建合并采购单时出错: {e}")
+            return None
+    
+    def process(self, file_paths: Optional[List[str]] = None) -> Optional[str]:
+        """
+        处理采购单合并
+        
+        Args:
+            file_paths: 指定要合并的文件路径列表，如果为None则自动获取
+            
+        Returns:
+            合并后的文件路径，如果合并失败则返回None
+        """
+        # 如果未指定文件路径，则获取所有采购单文件
+        if file_paths is None:
+            file_paths = self.get_purchase_orders()
+        
+        # 检查是否有文件需要合并
+        if not file_paths:
+            logger.warning("没有找到可合并的采购单文件")
+            return None
+        
+        # 合并采购单
+        merged_df = self.merge_purchase_orders(file_paths)
+        if merged_df is None:
+            logger.error("合并采购单失败")
+            return None
+        
+        # 创建合并的采购单文件
+        output_file = self.create_merged_purchase_order(merged_df)
+        if output_file is None:
+            logger.error("创建合并采购单文件失败")
+            return None
+        
+        # 记录已合并文件
+        for file_path in file_paths:
+            self.merged_files[file_path] = output_file
+        self._save_merged_files()
+        
+        return output_file 
@@ -0,0 +1,393 @@
+"""
+Excel处理核心模块
+--------------
+提供Excel文件处理功能，包括表格解析、数据提取和处理。
+"""
+
+import os
+import re
+import pandas as pd
+import numpy as np
+import xlrd
+import xlwt
+from xlutils.copy import copy as xlcopy
+from typing import Dict, List, Optional, Tuple, Union, Any
+from datetime import datetime
+
+from ...config.settings import ConfigManager
+from ..utils.log_utils import get_logger
+from ..utils.file_utils import (
+    ensure_dir,
+    get_file_extension,
+    get_latest_file,
+    load_json,
+    save_json
+)
+from ..utils.string_utils import (
+    clean_string,
+    clean_barcode,
+    extract_number,
+    format_barcode
+)
+from .converter import UnitConverter
+
+logger = get_logger(__name__)
+
+class ExcelProcessor:
+    """
+    Excel处理器：处理OCR识别后的Excel文件，
+    提取条码、单价和数量，并按照采购单模板的格式填充
+    """
+    
+    def __init__(self, config: Optional[ConfigManager] = None):
+        """
+        初始化Excel处理器
+        
+        Args:
+            config: 配置管理器，如果为None则创建新的
+        """
+        logger.info("初始化ExcelProcessor")
+        self.config = config or ConfigManager()
+        
+        # 获取配置
+        self.output_dir = self.config.get_path('Paths', 'output_folder', 'data/output', create=True)
+        self.temp_dir = self.config.get_path('Paths', 'temp_folder', 'data/temp', create=True)
+        
+        # 获取模板文件路径
+        template_folder = self.config.get('Paths', 'template_folder', 'templates')
+        template_name = self.config.get('Templates', 'purchase_order', '银豹-采购单模板.xls')
+        
+        self.template_path = os.path.join(template_folder, template_name)
+        
+        # 检查模板文件是否存在
+        if not os.path.exists(self.template_path):
+            logger.error(f"模板文件不存在: {self.template_path}")
+            raise FileNotFoundError(f"模板文件不存在: {self.template_path}")
+        
+        # 用于记录已处理的文件
+        self.cache_file = os.path.join(self.output_dir, "processed_files.json")
+        self.processed_files = self._load_processed_files()
+        
+        # 创建单位转换器
+        self.unit_converter = UnitConverter()
+        
+        logger.info(f"初始化完成，模板文件: {self.template_path}")
+    
+    def _load_processed_files(self) -> Dict[str, str]:
+        """
+        加载已处理文件的缓存
+        
+        Returns:
+            处理记录字典
+        """
+        return load_json(self.cache_file, {})
+        
+    def _save_processed_files(self) -> None:
+        """保存已处理文件的缓存"""
+        save_json(self.processed_files, self.cache_file)
+    
+    def get_latest_excel(self) -> Optional[str]:
+        """
+        获取output目录下最新的Excel文件（排除采购单文件）
+        
+        Returns:
+            最新Excel文件的路径，如果未找到则返回None
+        """
+        logger.info(f"搜索目录 {self.output_dir} 中的Excel文件")
+        
+        # 使用文件工具获取最新文件
+        latest_file = get_latest_file(
+            self.output_dir,
+            pattern="",  # 不限制文件名
+            extensions=['.xlsx', '.xls']  # 限制为Excel文件
+        )
+        
+        # 如果没有找到文件
+        if not latest_file:
+            logger.warning(f"未在 {self.output_dir} 目录下找到未处理的Excel文件")
+            return None
+        
+        # 检查是否是采购单（以"采购单_"开头的文件）
+        file_name = os.path.basename(latest_file)
+        if file_name.startswith('采购单_'):
+            logger.warning(f"找到的最新文件是采购单，不作处理: {latest_file}")
+            return None
+        
+        logger.info(f"找到最新的Excel文件: {latest_file}")
+        return latest_file
+    
+    def validate_barcode(self, barcode: Any) -> bool:
+        """
+        验证条码是否有效
+        新增功能：如果条码是"仓库"，则返回False以避免误认为有效条码
+        
+        Args:
+            barcode: 条码值
+            
+        Returns:
+            条码是否有效
+        """
+        # 处理"仓库"特殊情况
+        if isinstance(barcode, str) and barcode.strip() in ["仓库", "仓库全名"]:
+            logger.warning(f"条码为仓库标识: {barcode}")
+            return False
+            
+        # 清理条码格式
+        barcode_clean = clean_barcode(barcode)
+        
+        # 对特定的错误条码进行修正（开头改6开头）
+        if len(barcode_clean) > 8 and barcode_clean.startswith('5') and not barcode_clean.startswith('53'):
+            barcode_clean = '6' + barcode_clean[1:]
+            logger.info(f"修正条码前缀 5->6: {barcode} -> {barcode_clean}")
+            
+        # 验证条码长度
+        if len(barcode_clean) < 8 or len(barcode_clean) > 13:
+            logger.warning(f"条码长度异常: {barcode_clean}, 长度={len(barcode_clean)}")
+            return False
+            
+        # 验证条码是否全为数字
+        if not barcode_clean.isdigit():
+            logger.warning(f"条码包含非数字字符: {barcode_clean}")
+            return False
+            
+        # 对于序号9的特殊情况，允许其条码格式
+        if barcode_clean == "5321545613":
+            logger.info(f"特殊条码验证通过: {barcode_clean}")
+            return True
+            
+        logger.debug(f"条码验证通过: {barcode_clean}")
+        return True
+    
+    def extract_barcode(self, df: pd.DataFrame) -> List[str]:
+        """
+        从数据帧中提取条码列名
+        
+        Args:
+            df: 数据帧
+            
+        Returns:
+            可能的条码列名列表
+        """
+        possible_barcode_columns = [
+            '条码', '条形码', '商品条码', '商品条形码', 
+            '商品编码', '商品编号', '条形码', '条码（必填）', 
+            'barcode', 'Barcode', '编码', '条形码'
+        ]
+        
+        found_columns = []
+        for col in df.columns:
+            col_str = str(col).strip()
+            if col_str in possible_barcode_columns:
+                found_columns.append(col)
+        
+        return found_columns
+    
+    def extract_product_info(self, df: pd.DataFrame) -> List[Dict]:
+        """
+        从数据帧中提取商品信息
+        
+        Args:
+            df: 数据帧
+            
+        Returns:
+            商品信息列表
+        """
+        # 提取有用的列
+        barcode_cols = self.extract_barcode(df)
+        
+        # 如果没有找到条码列，无法继续处理
+        if not barcode_cols:
+            logger.error("未找到条码列，无法处理")
+            return []
+            
+        # 定义列名映射
+        column_mapping = {
+            'name': ['商品名称', '名称', '品名', '商品', '商品名', '商品或服务名称', '品项名'],
+            'specification': ['规格', '规格型号', '型号', '商品规格'],
+            'quantity': ['数量', '采购数量', '购买数量', '采购数量', '订单数量', '数量（必填）'],
+            'unit': ['单位', '采购单位', '计量单位', '单位（必填）'],
+            'price': ['单价', '价格', '采购单价', '销售价', '进货价', '单价（必填）']
+        }
+        
+        # 映射列名到标准名称
+        mapped_columns = {'barcode': barcode_cols[0]}  # 使用第一个找到的条码列
+        
+        for target, possible_names in column_mapping.items():
+            for col in df.columns:
+                col_str = str(col).strip()
+                for name in possible_names:
+                    if col_str == name:
+                        mapped_columns[target] = col
+                        break
+                if target in mapped_columns:
+                    break
+        
+        logger.info(f"列名映射结果: {mapped_columns}")
+        
+        # 提取商品信息
+        products = []
+        
+        for _, row in df.iterrows():
+            barcode = row.get(mapped_columns['barcode'])
+            
+            # 跳过空行或无效条码
+            if pd.isna(barcode) or not self.validate_barcode(barcode):
+                continue
+                
+            # 创建商品信息字典
+            product = {
+                'barcode': format_barcode(barcode),
+                'name': row.get(mapped_columns.get('name', ''), ''),
+                'specification': row.get(mapped_columns.get('specification', ''), ''),
+                'quantity': extract_number(str(row.get(mapped_columns.get('quantity', ''), 0))) or 0,
+                'unit': str(row.get(mapped_columns.get('unit', ''), '')),
+                'price': extract_number(str(row.get(mapped_columns.get('price', ''), 0))) or 0
+            }
+            
+            # 如果商品名称为空但商品条码不为空，则使用条码作为名称
+            if not product['name'] and product['barcode']:
+                product['name'] = f"商品 ({product['barcode']})"
+            
+            # 推断规格
+            if not product['specification'] and product['name']:
+                inferred_spec = self.unit_converter.infer_specification_from_name(product['name'])
+                if inferred_spec:
+                    product['specification'] = inferred_spec
+                    logger.info(f"从商品名称推断规格: {product['name']} -> {inferred_spec}")
+            
+            # 单位处理：如果单位为空但数量包含单位信息
+            quantity_str = str(row.get(mapped_columns.get('quantity', ''), ''))
+            if not product['unit'] and '数量' in mapped_columns:
+                num, unit = self.unit_converter.extract_unit_from_quantity(quantity_str)
+                if unit:
+                    product['unit'] = unit
+                    logger.info(f"从数量提取单位: {quantity_str} -> {unit}")
+                    # 如果数量被提取出来，更新数量
+                    if num is not None:
+                        product['quantity'] = num
+            
+            # 应用单位转换规则
+            product = self.unit_converter.process_unit_conversion(product)
+            
+            products.append(product)
+        
+        logger.info(f"提取到 {len(products)} 个商品信息")
+        return products
+    
+    def fill_template(self, products: List[Dict], output_file_path: str) -> bool:
+        """
+        填充采购单模板
+        
+        Args:
+            products: 商品信息列表
+            output_file_path: 输出文件路径
+            
+        Returns:
+            是否成功填充
+        """
+        try:
+            # 打开模板文件
+            template_workbook = xlrd.open_workbook(self.template_path, formatting_info=True)
+            template_sheet = template_workbook.sheet_by_index(0)
+            
+            # 创建可写的副本
+            output_workbook = xlcopy(template_workbook)
+            output_sheet = output_workbook.get_sheet(0)
+            
+            # 填充商品信息
+            start_row = 1  # 从第2行开始填充数据（索引从0开始）
+            
+            for i, product in enumerate(products):
+                row = start_row + i
+                
+                # 序号
+                output_sheet.write(row, 0, i + 1)
+                # 商品编码（条码）
+                output_sheet.write(row, 1, product['barcode'])
+                # 商品名称
+                output_sheet.write(row, 2, product['name'])
+                # 规格
+                output_sheet.write(row, 3, product['specification'])
+                # 单位
+                output_sheet.write(row, 4, product['unit'])
+                # 单价
+                output_sheet.write(row, 5, product['price'])
+                # 采购数量
+                output_sheet.write(row, 6, product['quantity'])
+                # 采购金额（单价 × 数量）
+                amount = product['price'] * product['quantity']
+                output_sheet.write(row, 7, amount)
+                # 税率
+                output_sheet.write(row, 8, 0)
+                # 赠送量（默认为0）
+                output_sheet.write(row, 9, 0)
+            
+            # 保存文件
+            output_workbook.save(output_file_path)
+            logger.info(f"采购单已保存到: {output_file_path}")
+            return True
+            
+        except Exception as e:
+            logger.error(f"填充模板时出错: {e}")
+            return False
+    
+    def process_specific_file(self, file_path: str) -> Optional[str]:
+        """
+        处理指定的Excel文件
+        
+        Args:
+            file_path: Excel文件路径
+            
+        Returns:
+            输出文件路径，如果处理失败则返回None
+        """
+        logger.info(f"开始处理Excel文件: {file_path}")
+        
+        if not os.path.exists(file_path):
+            logger.error(f"文件不存在: {file_path}")
+            return None
+        
+        try:
+            # 读取Excel文件
+            df = pd.read_excel(file_path)
+            logger.info(f"成功读取Excel文件: {file_path}, 共 {len(df)} 行")
+            
+            # 提取商品信息
+            products = self.extract_product_info(df)
+            
+            if not products:
+                logger.warning("未提取到有效商品信息")
+                return None
+            
+            # 生成输出文件名
+            file_name = os.path.splitext(os.path.basename(file_path))[0]
+            output_file = os.path.join(self.output_dir, f"采购单_{file_name}.xls")
+            
+            # 填充模板并保存
+            if self.fill_template(products, output_file):
+                # 记录已处理文件
+                self.processed_files[file_path] = output_file
+                self._save_processed_files()
+                return output_file
+            
+            return None
+            
+        except Exception as e:
+            logger.error(f"处理Excel文件时出错: {file_path}, 错误: {e}")
+            return None
+    
+    def process_latest_file(self) -> Optional[str]:
+        """
+        处理最新的Excel文件
+        
+        Returns:
+            输出文件路径，如果处理失败则返回None
+        """
+        # 获取最新的Excel文件
+        latest_file = self.get_latest_excel()
+        if not latest_file:
+            logger.warning("未找到可处理的Excel文件")
+            return None
+        
+        # 处理文件
+        return self.process_specific_file(latest_file) 
@@ -0,0 +1,5 @@
+"""
+OCR订单处理系统 - OCR核心模块
+---------------------------
+提供OCR识别相关功能，包括图片预处理、文字识别和表格识别。
+""" 
@@ -0,0 +1,344 @@
+"""
+百度OCR客户端模块
+---------------
+提供百度OCR API的访问和调用功能。
+"""
+
+import os
+import time
+import base64
+import requests
+import logging
+from typing import Dict, Optional, Any, Union
+
+from ...config.settings import ConfigManager
+from ..utils.log_utils import get_logger
+
+logger = get_logger(__name__)
+
+class TokenManager:
+    """
+    Obtains and caches a Baidu API access token.
+
+    The token is fetched on demand by ``get_token`` and reused until shortly
+    before its recorded expiry time.
+    """
+
+    def __init__(self, api_key: str, secret_key: str, max_retries: int = 3, retry_delay: int = 2):
+        """
+        Store the credentials and retry settings; no request is made here.
+
+        Args:
+            api_key: Baidu API key.
+            secret_key: Baidu secret key.
+            max_retries: Maximum number of token requests per refresh.
+            retry_delay: Base delay between attempts, in seconds.
+        """
+        self.api_key = api_key
+        self.secret_key = secret_key
+        self.max_retries = max_retries
+        self.retry_delay = retry_delay
+        self.access_token = None
+        self.token_expiry = 0
+
+    def get_token(self) -> Optional[str]:
+        """
+        Return the cached token, refreshing it first when it is not valid.
+
+        Returns:
+            The access token, or None when it could not be obtained.
+        """
+        if not self.is_token_valid():
+            return self.refresh_token()
+        return self.access_token
+
+    def is_token_valid(self) -> bool:
+        """
+        Tell whether a token is cached and not about to expire.
+
+        Returns:
+            True when a token exists and its expiry is more than 60 seconds away.
+        """
+        if self.access_token is None:
+            return False
+        # Treat the token as stale one minute before its recorded expiry.
+        return self.token_expiry > time.time() + 60
+
+    def refresh_token(self) -> Optional[str]:
+        """
+        Request a new access token from the Baidu OAuth endpoint.
+
+        Up to ``max_retries`` attempts are made, sleeping between them.
+
+        Returns:
+            The new access token, or None when every attempt failed.
+        """
+        endpoint = "https://aip.baidubce.com/oauth/2.0/token"
+        query = {
+            "grant_type": "client_credentials",
+            "client_id": self.api_key,
+            "client_secret": self.secret_key
+        }
+        last_attempt = self.max_retries - 1
+
+        for attempt in range(self.max_retries):
+            try:
+                response = requests.post(endpoint, params=query, timeout=10)
+                if response.status_code == 200:
+                    body = response.json()
+                    if "access_token" in body:
+                        self.access_token = body["access_token"]
+                        # Lifetime defaults to 30 days; expire one hour early
+                        # as a safety margin.
+                        lifetime = body.get("expires_in", 2592000)
+                        self.token_expiry = time.time() + lifetime - 3600
+                        logger.info("成功获取访问令牌")
+                        return self.access_token
+
+                logger.warning(f"获取访问令牌失败 (尝试 {attempt+1}/{self.max_retries}): {response.text}")
+
+            except Exception as exc:
+                logger.warning(f"获取访问令牌时发生错误 (尝试 {attempt+1}/{self.max_retries}): {exc}")
+
+            if attempt < last_attempt:
+                # Linear back-off: the wait grows by retry_delay per attempt.
+                time.sleep(self.retry_delay * (attempt + 1))
+
+        logger.error("无法获取访问令牌")
+        return None
+
+class BaiduOCRClient:
+    """
+    Client for the Baidu table-recognition OCR API.
+
+    Credentials, timeout and retry settings are read from the ``API`` section
+    of the configuration; access tokens come from a ``TokenManager``.
+    """
+
+    # Error codes for which recognize_table refreshes the access token.
+    _AUTH_ERROR_CODES = (110, 111)
+
+    def __init__(self, config: Optional[ConfigManager] = None):
+        """
+        Load the API settings and create the token manager.
+
+        Args:
+            config: Configuration manager; a new ``ConfigManager`` is created
+                when None.
+        """
+        self.config = config or ConfigManager()
+
+        # API settings
+        self.api_key = self.config.get('API', 'api_key')
+        self.secret_key = self.config.get('API', 'secret_key')
+        self.timeout = self.config.getint('API', 'timeout', 30)
+        self.max_retries = self.config.getint('API', 'max_retries', 3)
+        self.retry_delay = self.config.getint('API', 'retry_delay', 2)
+        self.api_url = self.config.get('API', 'api_url', 'https://aip.baidubce.com/rest/2.0/ocr/v1/table')
+
+        self.token_manager = TokenManager(
+            self.api_key,
+            self.secret_key,
+            self.max_retries,
+            self.retry_delay
+        )
+
+        # Only warn: the client is still constructed without credentials.
+        if not self.api_key or not self.secret_key:
+            logger.warning("API密钥未设置，请在配置文件中设置API密钥")
+
+    def read_image(self, image_path: str) -> Optional[bytes]:
+        """
+        Read an image file as raw bytes.
+
+        Args:
+            image_path: Path of the image file.
+
+        Returns:
+            The file content, or None when it cannot be read.
+        """
+        try:
+            with open(image_path, 'rb') as f:
+                return f.read()
+        except Exception as e:
+            logger.error(f"读取图片文件失败: {image_path}, 错误: {e}")
+            return None
+
+    def recognize_table(self, image_data: Union[str, bytes]) -> Optional[Dict]:
+        """
+        Send an image to the table-recognition endpoint.
+
+        Failed requests are retried up to ``max_retries`` times with
+        exponential back-off. When the API reports an authorisation error the
+        token is refreshed and the request is retried with the new token.
+
+        Args:
+            image_data: Either a file path or the image bytes.
+
+        Returns:
+            The decoded JSON response as returned by the API, or None on failure.
+        """
+        access_token = self.token_manager.get_token()
+        if not access_token:
+            logger.error("无法获取访问令牌，无法进行表格识别")
+            return None
+
+        # A string argument is treated as a file path.
+        if isinstance(image_data, str):
+            image_data = self.read_image(image_data)
+            if image_data is None:
+                return None
+
+        image_base64 = base64.b64encode(image_data).decode('utf-8')
+
+        # return_excel is sent for consistency with the v1 implementation.
+        payload = {
+            'image': image_base64,
+            'is_sync': 'true',
+            'request_type': 'excel',
+            'return_excel': 'true'
+        }
+
+        headers = {
+            'Content-Type': 'application/x-www-form-urlencoded',
+            'Accept': 'application/json'
+        }
+
+        for attempt in range(self.max_retries):
+            # Rebuilt on every attempt so that a refreshed token is used.
+            url = f"{self.api_url}?access_token={access_token}"
+            try:
+                response = requests.post(
+                    url,
+                    data=payload,
+                    headers=headers,
+                    timeout=self.timeout
+                )
+
+                if response.status_code == 200:
+                    result = response.json()
+                    logger.debug(f"百度OCR API返回结果: {result}")
+
+                    if 'error_code' in result:
+                        error_msg = result.get('error_msg', '未知错误')
+                        logger.error(f"百度OCR API错误: {error_msg}")
+                        if result.get('error_code') in self._AUTH_ERROR_CODES:
+                            logger.info("尝试刷新访问令牌...")
+                            new_token = self.token_manager.refresh_token()
+                            # Retry immediately with the fresh token while
+                            # attempts remain; otherwise give up.
+                            if new_token and attempt < self.max_retries - 1:
+                                access_token = new_token
+                                continue
+                        return None
+
+                    # The whole response is returned; callers cope with the
+                    # different result layouts.
+                    return result
+                else:
+                    logger.warning(f"表格识别请求失败 (尝试 {attempt+1}/{self.max_retries}): {response.text}")
+
+            except Exception as e:
+                logger.warning(f"表格识别时发生错误 (尝试 {attempt+1}/{self.max_retries}): {e}")
+
+            # Exponential back-off before the next attempt.
+            if attempt < self.max_retries - 1:
+                wait_time = self.retry_delay * (2 ** attempt)
+                logger.info(f"将在 {wait_time} 秒后重试...")
+                time.sleep(wait_time)
+
+        logger.error("表格识别失败")
+        return None
+
+    def get_excel_result(self, request_id_or_result: Union[str, Dict]) -> Optional[bytes]:
+        """
+        Obtain the Excel workbook for a recognition request.
+
+        When a result dictionary already contains the workbook
+        (``result.result_data``) it is decoded directly, without requesting a
+        token or contacting the API. Otherwise the request id is extracted
+        and the result endpoint is queried.
+
+        Args:
+            request_id_or_result: A request id, or the dictionary returned by
+                ``recognize_table``.
+
+        Returns:
+            The workbook bytes, or None when they cannot be obtained.
+        """
+        request_id = request_id_or_result
+        if isinstance(request_id_or_result, dict):
+            inner = request_id_or_result.get('result')
+            if isinstance(inner, dict):
+                # Some API versions return the workbook inline.
+                excel_content = inner.get('result_data')
+                if excel_content:
+                    try:
+                        return base64.b64decode(excel_content)
+                    except Exception as e:
+                        logger.error(f"解析Excel数据失败: {e}")
+
+                if 'request_id' in inner:
+                    request_id = inner['request_id']
+                    logger.debug(f"从result子对象中提取request_id: {request_id}")
+                elif inner.get('tables_result'):
+                    # Table content was returned directly, so there is no
+                    # request id to query with.
+                    logger.info("检测到API直接返回了表格内容，但没有request_id")
+                    return None
+            # Some versions put the request id at the top level.
+            elif 'request_id' in request_id_or_result:
+                request_id = request_id_or_result['request_id']
+                logger.debug(f"从顶层对象中提取request_id: {request_id}")
+
+        if not isinstance(request_id, str) or not request_id:
+            logger.error(f"无法从结果中提取有效的request_id: {request_id_or_result}")
+            return None
+
+        access_token = self.token_manager.get_token()
+        if not access_token:
+            logger.error("无法获取访问令牌，无法获取Excel结果")
+            return None
+
+        url = f"https://aip.baidubce.com/rest/2.0/solution/v1/form_ocr/get_request_result?access_token={access_token}"
+
+        payload = {
+            'request_id': request_id,
+            'result_type': 'excel'
+        }
+
+        headers = {
+            'Content-Type': 'application/x-www-form-urlencoded',
+            'Accept': 'application/json'
+        }
+
+        for attempt in range(self.max_retries):
+            try:
+                response = requests.post(
+                    url,
+                    data=payload,
+                    headers=headers,
+                    timeout=self.timeout
+                )
+
+                if response.status_code == 200:
+                    try:
+                        result = response.json()
+                        logger.debug(f"获取Excel结果返回: {result}")
+
+                        # ret_code 3 is handled as "still processing".
+                        if result.get('result', {}).get('ret_code') == 3:
+                            logger.info(f"Excel结果正在处理中，等待后重试 (尝试 {attempt+1}/{self.max_retries})")
+                            # No point in waiting after the final attempt.
+                            if attempt < self.max_retries - 1:
+                                time.sleep(2)
+                            continue
+
+                        if 'error_code' in result or result.get('result', {}).get('ret_code') != 0:
+                            error_msg = result.get('error_msg') or result.get('result', {}).get('ret_msg', '未知错误')
+                            logger.error(f"获取Excel结果失败: {error_msg}")
+                            return None
+
+                        excel_content = result.get('result', {}).get('result_data')
+                        if excel_content:
+                            return base64.b64decode(excel_content)
+                        else:
+                            logger.error("Excel结果为空")
+                            return None
+
+                    except Exception as e:
+                        logger.error(f"解析Excel结果时出错: {e}")
+                        return None
+
+                else:
+                    logger.warning(f"获取Excel结果请求失败 (尝试 {attempt+1}/{self.max_retries}): {response.text}")
+
+            except Exception as e:
+                logger.warning(f"获取Excel结果时发生错误 (尝试 {attempt+1}/{self.max_retries}): {e}")
+
+            # Linear back-off before the next attempt.
+            if attempt < self.max_retries - 1:
+                time.sleep(self.retry_delay * (attempt + 1))
+
+        logger.error("获取Excel结果失败")
+        return None
@@ -0,0 +1,334 @@
+"""
+表格OCR处理模块
+-------------
+处理图片并提取表格内容，保存为Excel文件。
+"""
+
+import os
+import sys
+import time
+import json
+import base64
+from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor
+from typing import Dict, List, Optional, Tuple, Union, Any
+
+from ...config.settings import ConfigManager
+from ..utils.log_utils import get_logger
+from ..utils.file_utils import (
+    ensure_dir, 
+    get_file_extension, 
+    get_files_by_extensions, 
+    generate_timestamp_filename,
+    is_file_size_valid,
+    load_json,
+    save_json
+)
+from .baidu_ocr import BaiduOCRClient
+
+logger = get_logger(__name__)
+
+class ProcessedRecordManager:
+    """
+    Tracks which input files have been processed and where their output is.
+
+    Records are kept in a dictionary mapping input path to output path and
+    are written to a JSON file after every update. Updates and saves are
+    serialised with a lock because ``mark_as_processed`` is called from the
+    worker threads of the batch processor.
+    """
+
+    def __init__(self, record_file: str):
+        """
+        Load existing records from ``record_file``.
+
+        Args:
+            record_file: Path of the JSON record file.
+        """
+        # Local import keeps this class independent of the module's import block.
+        import threading
+
+        self.record_file = record_file
+        # Re-entrant so mark_as_processed can call save_record while holding it.
+        self._lock = threading.RLock()
+        self.processed_files = self._load_record()
+
+    def _load_record(self) -> Dict[str, str]:
+        """
+        Load the record file.
+
+        Returns:
+            Mapping of input path to output path; empty when the file is
+            missing or cannot be loaded.
+        """
+        return load_json(self.record_file, {})
+
+    def save_record(self) -> None:
+        """Write the current records to the record file."""
+        with self._lock:
+            save_json(self.processed_files, self.record_file)
+
+    def is_processed(self, image_file: str) -> bool:
+        """
+        Tell whether a file has a record.
+
+        Args:
+            image_file: Input file path.
+
+        Returns:
+            True when the path is present in the records.
+        """
+        return image_file in self.processed_files
+
+    def mark_as_processed(self, image_file: str, output_file: str) -> None:
+        """
+        Record a processed file and persist the records.
+
+        Args:
+            image_file: Input file path.
+            output_file: Path of the file produced for it.
+        """
+        # The update and the save happen under one lock so that concurrent
+        # workers cannot mutate the dictionary while it is being serialised
+        # or write the file at the same time.
+        with self._lock:
+            self.processed_files[image_file] = output_file
+            self.save_record()
+
+    def get_output_file(self, image_file: str) -> Optional[str]:
+        """
+        Look up the output recorded for a file.
+
+        Args:
+            image_file: Input file path.
+
+        Returns:
+            The recorded output path, or None when there is no record.
+        """
+        return self.processed_files.get(image_file)
+
+    def get_unprocessed_files(self, files: List[str]) -> List[str]:
+        """
+        Filter a list down to the files without a record.
+
+        Args:
+            files: Candidate file paths.
+
+        Returns:
+            The paths that are not recorded, in their original order.
+        """
+        return [file for file in files if not self.is_processed(file)]
+
+class OCRProcessor:
+    """
+    Converts table images from the input folder into Excel files.
+
+    Images are validated, sent to the Baidu OCR client, and the returned
+    workbook is written to the output folder. Processed files are tracked by
+    a ``ProcessedRecordManager``.
+    """
+
+    def __init__(self, config: Optional[ConfigManager] = None):
+        """
+        Read the configuration and prepare folders, client and records.
+
+        Args:
+            config: Configuration manager; a new ``ConfigManager`` is created
+                when None.
+        """
+        self.config = config or ConfigManager()
+
+        self.ocr_client = BaiduOCRClient(self.config)
+
+        # Working folders
+        self.input_folder = self.config.get_path('Paths', 'input_folder', 'data/input', create=True)
+        self.output_folder = self.config.get_path('Paths', 'output_folder', 'data/output', create=True)
+        self.temp_folder = self.config.get_path('Paths', 'temp_folder', 'data/temp', create=True)
+
+        # Make sure every folder really exists.
+        for folder in [self.input_folder, self.output_folder, self.temp_folder]:
+            if not os.path.exists(folder):
+                os.makedirs(folder, exist_ok=True)
+                logger.info(f"创建目录: {folder}")
+
+        # Log the resolved locations.
+        logger.info(f"使用输入目录: {os.path.abspath(self.input_folder)}")
+        logger.info(f"使用输出目录: {os.path.abspath(self.output_folder)}")
+        logger.info(f"使用临时目录: {os.path.abspath(self.temp_folder)}")
+
+        # File constraints
+        self.allowed_extensions = self.config.get_list('File', 'allowed_extensions', '.jpg,.jpeg,.png,.bmp')
+        self.max_file_size_mb = self.config.getfloat('File', 'max_file_size_mb', 4.0)
+        self.excel_extension = self.config.get('File', 'excel_extension', '.xlsx')
+
+        # Performance settings
+        self.max_workers = self.config.getint('Performance', 'max_workers', 4)
+        self.batch_size = self.config.getint('Performance', 'batch_size', 5)
+        self.skip_existing = self.config.getboolean('Performance', 'skip_existing', True)
+
+        record_file = self.config.get('Paths', 'processed_record', 'data/processed_files.json')
+        self.record_manager = ProcessedRecordManager(record_file)
+
+        logger.info(f"OCR处理器初始化完成，输入目录: {self.input_folder}, 输出目录: {self.output_folder}")
+
+    def get_unprocessed_images(self) -> List[str]:
+        """
+        List the images in the input folder that still need processing.
+
+        Returns:
+            All matching image paths, minus the recorded ones when
+            ``skip_existing`` is enabled.
+        """
+        image_files = get_files_by_extensions(self.input_folder, self.allowed_extensions)
+
+        if self.skip_existing:
+            unprocessed_files = self.record_manager.get_unprocessed_files(image_files)
+            logger.info(f"找到 {len(image_files)} 个图片文件，其中 {len(unprocessed_files)} 个未处理")
+            return unprocessed_files
+
+        logger.info(f"找到 {len(image_files)} 个图片文件（不跳过已处理的文件）")
+        return image_files
+
+    def validate_image(self, image_path: str) -> bool:
+        """
+        Check that an image exists, has an allowed extension and is not too large.
+
+        Args:
+            image_path: Path of the image file.
+
+        Returns:
+            True when all checks pass.
+        """
+        if not os.path.exists(image_path):
+            logger.warning(f"图片文件不存在: {image_path}")
+            return False
+
+        ext = get_file_extension(image_path)
+        if ext not in self.allowed_extensions:
+            logger.warning(f"不支持的文件类型: {ext}, 文件: {image_path}")
+            return False
+
+        if not is_file_size_valid(image_path, self.max_file_size_mb):
+            logger.warning(f"文件大小超过限制 ({self.max_file_size_mb}MB): {image_path}")
+            return False
+
+        return True
+
+    @staticmethod
+    def _extract_excel_base64(ocr_result: Dict) -> Optional[str]:
+        """Return the base64 workbook found in an OCR response, if any."""
+        if 'excel_file' in ocr_result:
+            logger.debug("从excel_file字段获取Excel数据")
+            return ocr_result['excel_file']
+
+        if 'result' in ocr_result:
+            result = ocr_result['result']
+            if 'result_data' in result:
+                logger.debug("从result.result_data字段获取Excel数据")
+                return result['result_data']
+            if 'excel_file' in result:
+                logger.debug("从result.excel_file字段获取Excel数据")
+                return result['excel_file']
+            if 'tables_result' in result and result['tables_result']:
+                # Use the first table that carries a workbook.
+                for table in result['tables_result']:
+                    if 'excel_file' in table:
+                        logger.debug("从tables_result中获取Excel数据")
+                        return table['excel_file']
+
+        return None
+
+    @staticmethod
+    def _write_excel(output_file: str, excel_data: bytes) -> None:
+        """Write workbook bytes to ``output_file``, creating its folder first."""
+        parent = os.path.dirname(output_file)
+        if parent:
+            os.makedirs(parent, exist_ok=True)
+        with open(output_file, 'wb') as f:
+            f.write(excel_data)
+
+    def process_image(self, image_path: str) -> Optional[str]:
+        """
+        Run OCR on one image and save the resulting Excel file.
+
+        Args:
+            image_path: Path of the image file.
+
+        Returns:
+            Path of the Excel file, or None when validation, recognition or
+            saving fails.
+        """
+        if not self.validate_image(image_path):
+            return None
+
+        # Already recorded: return the recorded output.
+        if self.skip_existing and self.record_manager.is_processed(image_path):
+            output_file = self.record_manager.get_output_file(image_path)
+            logger.info(f"图片已处理，跳过: {image_path}, 输出文件: {output_file}")
+            return output_file
+
+        logger.info(f"开始处理图片: {image_path}")
+
+        try:
+            file_name = os.path.splitext(os.path.basename(image_path))[0]
+            output_file = os.path.join(self.output_folder, f"{file_name}{self.excel_extension}")
+
+            # An output file that already exists counts as processed.
+            if os.path.exists(output_file) and self.skip_existing:
+                logger.info(f"已存在对应的Excel文件，跳过处理: {os.path.basename(image_path)} -> {os.path.basename(output_file)}")
+                self.record_manager.mark_as_processed(image_path, output_file)
+                return output_file
+
+            ocr_result = self.ocr_client.recognize_table(image_path)
+            if not ocr_result:
+                logger.error(f"OCR识别失败: {image_path}")
+                return None
+
+            # The workbook may sit in one of several response fields.
+            excel_base64 = self._extract_excel_base64(ocr_result)
+
+            if excel_base64:
+                try:
+                    self._write_excel(output_file, base64.b64decode(excel_base64))
+                except Exception as e:
+                    logger.error(f"解码或保存Excel数据时出错: {e}")
+                    return None
+            else:
+                # Nothing inline: ask the client to fetch the workbook.
+                logger.info("无法从直接返回中获取Excel数据，尝试通过API获取...")
+                excel_data = self.ocr_client.get_excel_result(ocr_result)
+                if not excel_data:
+                    logger.error(f"获取Excel结果失败: {image_path}")
+                    return None
+                self._write_excel(output_file, excel_data)
+
+            logger.info(f"图片处理成功: {image_path}, 输出文件: {output_file}")
+
+            self.record_manager.mark_as_processed(image_path, output_file)
+
+            return output_file
+
+        except Exception as e:
+            logger.error(f"处理图片时出错: {image_path}, 错误: {e}")
+            return None
+
+    def process_images_batch(self, batch_size: Optional[int] = None, max_workers: Optional[int] = None) -> Tuple[int, int]:
+        """
+        Process all pending images in batches using a thread pool.
+
+        Args:
+            batch_size: Images per batch; the configured value is used when
+                None or 0. Values below 1 are raised to 1.
+            max_workers: Threads per batch; the configured value is used when
+                None or 0. Values below 1 are raised to 1.
+
+        Returns:
+            Tuple ``(total, succeeded)``.
+        """
+        # A step of 0 would make range() raise and a negative one would skip
+        # every image; ThreadPoolExecutor rejects a worker count below 1.
+        batch_size = max(1, batch_size or self.batch_size)
+        max_workers = max(1, max_workers or self.max_workers)
+
+        unprocessed_images = self.get_unprocessed_images()
+        if not unprocessed_images:
+            logger.warning("没有需要处理的图片")
+            return 0, 0
+
+        total = len(unprocessed_images)
+        success = 0
+
+        for i in range(0, total, batch_size):
+            batch = unprocessed_images[i:i + batch_size]
+            logger.info(f"处理批次 {i//batch_size + 1}/{(total-1)//batch_size + 1}, 大小: {len(batch)}")
+
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                results = list(executor.map(self.process_image, batch))
+
+            # process_image returns None on failure.
+            batch_success = sum(1 for result in results if result is not None)
+            success += batch_success
+
+            logger.info(f"批次处理完成, 成功: {batch_success}/{len(batch)}")
+
+        logger.info(f"所有图片处理完成, 总计: {total}, 成功: {success}")
+        return total, success
@@ -0,0 +1,5 @@
+"""
+OCR订单处理系统 - 工具模块
+------------------------
+提供系统通用工具和辅助函数。
+""" 
@@ -0,0 +1,251 @@
+"""
+文件操作工具模块
+--------------
+提供文件处理、查找和管理功能。
+"""
+
+import os
+import sys
+import shutil
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Optional, Union, Any
+
+from .log_utils import get_logger
+
+logger = get_logger(__name__)
+
+def ensure_dir(directory: str) -> bool:
+    """
+    Make sure a directory exists, creating it (and parents) when needed.
+
+    Args:
+        directory: Directory path. An empty string means the current
+            directory and is accepted as already existing.
+
+    Returns:
+        True when the directory exists or was created, False on failure.
+    """
+    # os.path.dirname("file.json") yields "", and os.makedirs("") raises;
+    # the current directory needs no creation.
+    if directory == "":
+        return True
+
+    try:
+        os.makedirs(directory, exist_ok=True)
+        return True
+    except Exception as e:
+        logger.error(f"创建目录失败: {directory}, 错误: {e}")
+        return False
+
+def get_file_extension(file_path: str) -> str:
+    """
+    Return the lower-cased extension of a path.
+
+    Args:
+        file_path: File path.
+
+    Returns:
+        The extension including the dot (for example ``.jpg``), or an empty
+        string when the path has none.
+    """
+    _, extension = os.path.splitext(file_path)
+    return extension.lower()
+
+def is_valid_extension(file_path: str, allowed_extensions: List[str]) -> bool:
+    """
+    Tell whether a path's extension is in the allowed list.
+
+    Args:
+        file_path: File path.
+        allowed_extensions: Allowed extensions (for example ``['.jpg', '.png']``).
+
+    Returns:
+        True when the lower-cased extension of the path is in the list.
+    """
+    return get_file_extension(file_path) in allowed_extensions
+
+def get_files_by_extensions(directory: str, extensions: List[str], exclude_patterns: List[str] = None) -> List[str]:
+    """
+    List the files directly inside a directory that have one of the extensions.
+
+    Args:
+        directory: Directory to scan (not recursive).
+        extensions: Accepted extensions (for example ``['.jpg', '.png']``).
+        exclude_patterns: Substrings that disqualify a file name; defaults to
+            ``['~$', '.tmp']``.
+
+    Returns:
+        Paths of the matching files; empty when the directory does not exist.
+    """
+    if exclude_patterns is None:
+        exclude_patterns = ['~$', '.tmp']
+
+    # A missing directory is reported and treated as empty, as
+    # get_latest_file does, instead of letting os.listdir raise.
+    if not os.path.isdir(directory):
+        logger.warning(f"目录不存在: {directory}")
+        return []
+
+    files = []
+    for file in os.listdir(directory):
+        file_path = os.path.join(directory, file)
+
+        # Skip sub-directories and other non-files.
+        if not os.path.isfile(file_path):
+            continue
+
+        if not is_valid_extension(file_path, extensions):
+            continue
+
+        # Patterns are matched against the file name only.
+        if any(pattern in file for pattern in exclude_patterns):
+            continue
+
+        files.append(file_path)
+
+    return files
+
+def get_latest_file(directory: str, pattern: str = "", extensions: List[str] = None) -> Optional[str]:
+    """
+    Find the most recently modified file in a directory.
+
+    Args:
+        directory: Directory to scan (not recursive).
+        pattern: Substring the file name must contain; empty means any name.
+        extensions: Accepted extensions; None or empty means any extension.
+
+    Returns:
+        Path of the newest matching file, or None when the directory does
+        not exist or nothing matches.
+    """
+    if not os.path.exists(directory):
+        logger.warning(f"目录不存在: {directory}")
+        return None
+
+    candidates = []
+    for entry in os.listdir(directory):
+        if pattern and pattern not in entry:
+            continue
+        if extensions and not is_valid_extension(entry, extensions):
+            continue
+
+        full_path = os.path.join(directory, entry)
+        if os.path.isfile(full_path):
+            candidates.append((full_path, os.path.getmtime(full_path)))
+
+    if not candidates:
+        logger.warning(f"未在目录 {directory} 中找到符合条件的文件")
+        return None
+
+    # Pick the entry with the largest modification time.
+    newest_path, _ = max(candidates, key=lambda item: item[1])
+    return newest_path
+
+def generate_timestamp_filename(original_path: str) -> str:
+    """
+    Build a path in the same directory whose file name is the current time.
+
+    Args:
+        original_path: Original file path.
+
+    Returns:
+        Path with the same directory and extension, and a
+        ``YYYYMMDDHHMMSS`` timestamp as the file name.
+    """
+    folder, _ = os.path.split(original_path)
+    _, extension = os.path.splitext(original_path)
+    stamp = datetime.now().strftime("%Y%m%d%H%M%S")
+    return os.path.join(folder, stamp + extension)
+
+def rename_file(source_path: str, target_path: str) -> bool:
+    """
+    Rename (move) a file, creating the target directory first.
+
+    Args:
+        source_path: Existing file path.
+        target_path: New file path.
+
+    Returns:
+        True on success, False when any step raises.
+    """
+    try:
+        ensure_dir(os.path.dirname(target_path))
+        os.rename(source_path, target_path)
+
+        old_name = os.path.basename(source_path)
+        new_name = os.path.basename(target_path)
+        logger.info(f"文件已重命名: {old_name} -> {new_name}")
+        return True
+    except Exception as exc:
+        logger.error(f"重命名文件失败: {exc}")
+        return False
+
+def load_json(file_path: str, default: Any = None) -> Any:
+    """
+    Load a UTF-8 JSON file.
+
+    Args:
+        file_path: Path of the JSON file.
+        default: Value returned when the file is missing or cannot be loaded.
+
+    Returns:
+        The parsed content, or ``default``.
+    """
+    if os.path.exists(file_path):
+        try:
+            with open(file_path, 'r', encoding='utf-8') as handle:
+                return json.load(handle)
+        except Exception as exc:
+            logger.error(f"加载JSON文件失败: {file_path}, 错误: {exc}")
+
+    return default
+
+def save_json(data: Any, file_path: str, ensure_ascii: bool = False, indent: int = 2) -> bool:
+    """
+    Write data to a UTF-8 JSON file, creating its directory first.
+
+    Args:
+        data: Data to serialise.
+        file_path: Path of the JSON file.
+        ensure_ascii: Passed to ``json.dump``.
+        indent: Number of spaces used for indentation.
+
+    Returns:
+        True on success, False when any step raises.
+    """
+    try:
+        ensure_dir(os.path.dirname(file_path))
+
+        with open(file_path, 'w', encoding='utf-8') as handle:
+            json.dump(data, handle, ensure_ascii=ensure_ascii, indent=indent)
+
+        logger.debug(f"JSON数据已保存到: {file_path}")
+        return True
+    except Exception as exc:
+        logger.error(f"保存JSON文件失败: {file_path}, 错误: {exc}")
+        return False
+
+def get_file_size(file_path: str) -> int:
+    """
+    Return the size of a file in bytes.
+
+    Args:
+        file_path: File path.
+
+    Returns:
+        Size in bytes, or 0 when the size cannot be determined.
+    """
+    try:
+        size = os.path.getsize(file_path)
+    except Exception as exc:
+        logger.error(f"获取文件大小失败: {file_path}, 错误: {exc}")
+        return 0
+    return size
+
+def is_file_size_valid(file_path: str, max_size_mb: float) -> bool:
+    """
+    Tell whether a file is no larger than a limit.
+
+    Args:
+        file_path: File path.
+        max_size_mb: Maximum allowed size in megabytes (1 MB = 1024 * 1024 bytes).
+
+    Returns:
+        True when the size reported by ``get_file_size`` does not exceed the limit.
+    """
+    actual_bytes = get_file_size(file_path)
+    limit_bytes = max_size_mb * 1024 * 1024
+    return not actual_bytes > limit_bytes
@@ -0,0 +1,129 @@
+"""
+日志工具模块
+----------
+提供统一的日志配置和管理功能。
+"""
+
+import os
+import sys
+import logging
+from datetime import datetime
+from pathlib import Path
+from typing import Optional, Dict
+
+# 日志处理器字典，用于跟踪已创建的处理器
+_handlers: Dict[str, logging.Handler] = {}
+
+def setup_logger(name: str, 
+                log_file: Optional[str] = None, 
+                level=logging.INFO, 
+                console_output: bool = True,
+                file_output: bool = True,
+                log_format: str = '%(asctime)s - %(name)s - %(levelname)s - %(message)s') -> logging.Logger:
+    """
+    Configure and return a logger.
+
+    A logger that already has handlers is returned unchanged. Otherwise a
+    file handler and/or a console handler is attached. Failure to set up the
+    file handler is printed and does not prevent the logger from being
+    returned.
+
+    Args:
+        name: Logger name.
+        log_file: Log file path; ``logs/<name>.log`` under the current
+            directory when None.
+        level: Level applied to the logger and its handlers.
+        console_output: Whether to log to standard output.
+        file_output: Whether to log to a file.
+        log_format: Format string for the handlers.
+
+    Returns:
+        The configured logger.
+    """
+    logger = logging.getLogger(name)
+
+    # Already configured: do not add handlers twice.
+    if logger.handlers:
+        return logger
+
+    logger.setLevel(level)
+    formatter = logging.Formatter(log_format)
+
+    if file_output:
+        if log_file is None:
+            log_dir = os.path.abspath('logs')
+            log_file = os.path.join(log_dir, f"{name}.log")
+        else:
+            log_dir = os.path.dirname(os.path.abspath(log_file))
+
+        try:
+            # The directory is created for explicit paths too; without it
+            # FileHandler fails and file logging is silently lost.
+            os.makedirs(log_dir, exist_ok=True)
+
+            file_handler = logging.FileHandler(log_file, encoding='utf-8')
+            file_handler.setFormatter(formatter)
+            file_handler.setLevel(level)
+            logger.addHandler(file_handler)
+            _handlers[f"{name}_file"] = file_handler
+
+            # Marker file written next to the log, intended to keep
+            # log-cleanup tools from deleting an active log.
+            active_marker = os.path.join(log_dir, f"{name}.active")
+            with open(active_marker, 'w', encoding='utf-8') as f:
+                f.write(f"Active since: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+        except Exception as e:
+            print(f"无法创建日志文件处理器: {e}")
+
+    if console_output:
+        console_handler = logging.StreamHandler(sys.stdout)
+        console_handler.setFormatter(formatter)
+        console_handler.setLevel(level)
+        logger.addHandler(console_handler)
+        _handlers[f"{name}_console"] = console_handler
+
+    return logger
+
+def get_logger(name: str) -> logging.Logger:
+    """
+    Return the logger for ``name``, configuring it on first use.
+
+    Args:
+        name: Logger name.
+
+    Returns:
+        The existing logger when it already has handlers, otherwise the
+        result of ``setup_logger(name)``.
+    """
+    existing = logging.getLogger(name)
+    return existing if existing.handlers else setup_logger(name)
+
+def close_logger(name: str) -> None:
+    """
+    Close and detach every handler of a logger.
+
+    Args:
+        name: Logger name.
+    """
+    target = logging.getLogger(name)
+    # Iterate over a copy because handlers are removed inside the loop.
+    for handler in list(target.handlers):
+        handler.close()
+        target.removeHandler(handler)
+
+    # Drop the entries from the handler registry.
+    for key in (f"{name}_file", f"{name}_console"):
+        _handlers.pop(key, None)
+
+def cleanup_active_marker(name: str, log_file: Optional[str] = None) -> None:
+    """
+    Remove the ``<name>.active`` marker file of a logger.
+
+    Args:
+        name: Logger name.
+        log_file: Log file the logger was set up with. The marker is looked
+            up next to this file, which is where ``setup_logger`` writes it;
+            when None the default ``logs`` directory is used.
+    """
+    try:
+        if log_file is None:
+            log_dir = os.path.abspath('logs')
+        else:
+            log_dir = os.path.dirname(os.path.abspath(log_file))
+
+        active_marker = os.path.join(log_dir, f"{name}.active")
+        if os.path.exists(active_marker):
+            os.remove(active_marker)
+    except Exception as e:
+        print(f"无法清理日志活跃标记: {e}")
@@ -0,0 +1,207 @@
+"""
+字符串处理工具模块
+---------------
+提供字符串处理、正则表达式匹配等功能。
+"""
+
+import re
+from typing import Dict, List, Optional, Tuple, Any, Match, Pattern
+
def clean_string(text: str) -> str:
    """
    Normalize the whitespace of a string.

    Leading and trailing whitespace is removed and every internal run of
    whitespace is collapsed into a single space.

    Args:
        text: Source string.

    Returns:
        The normalized string, or "" when ``text`` is not a ``str``.
    """
    if isinstance(text, str):
        # Strip first so edge whitespace disappears instead of becoming ' '.
        return re.sub(r'\s+', ' ', text.strip())
    return ""
+
def remove_non_digits(text: str) -> str:
    """
    Strip every non-digit character from a string.

    Args:
        text: Source string.

    Returns:
        A string made only of the digit characters of ``text`` (in order),
        or "" when ``text`` is not a ``str``.
    """
    return re.sub(r'\D', '', text) if isinstance(text, str) else ""
+
def extract_number(text: str) -> Optional[float]:
    """
    Pull the first number out of a string.

    Args:
        text: Source string.

    Returns:
        The first integer or decimal found (an immediately preceding '-' is
        treated as a sign) as a float, or None when ``text`` is not a ``str``
        or contains no digits.
    """
    if isinstance(text, str):
        found = re.search(r'-?\d+(\.\d+)?', text)
        if found is not None:
            return float(found.group())
    return None
+
def extract_unit(text: str, units: Optional[List[str]] = None) -> Optional[str]:
    """
    Extract a unit from a string.

    Args:
        text: Source string.
        units: Candidate units. When given (and non-empty), the first entry
            of this list that occurs anywhere in ``text`` is returned, so the
            list order decides ties. When omitted or empty, the unit is
            detected automatically.

    Returns:
        The unit, or None when ``text`` is not a ``str`` or no unit is found.
        In automatic mode the unit is the run of non-digit, non-whitespace
        characters that follows the first number in ``text``.
    """
    if not isinstance(text, str):
        return None

    # Explicit candidates: plain substring lookup in list order.
    if units:
        return next((unit for unit in units if unit in text), None)

    # Automatic mode. The number may carry a decimal part ("1.5L" -> "L");
    # the negative lookahead stops the engine from backtracking to "1" and
    # reporting the decimal point itself as the unit.
    match = re.search(r'\d+(?:\.\d+)?(?!\.\d)\s*([^\d\s]+)', text)
    return match.group(1) if match else None
+
def extract_number_and_unit(text: str) -> Tuple[Optional[float], Optional[str]]:
    """
    Extract the first number and the unit that directly follows it.

    Args:
        text: Source string.

    Returns:
        A ``(number, unit)`` tuple. ``number`` is None when ``text`` is not a
        ``str`` or holds no number; ``unit`` is None when no non-digit,
        non-whitespace text follows the number.
    """
    if isinstance(text, str):
        found = re.search(r'(-?\d+(?:\.\d+)?)\s*([^\d\s]+)?', text)
        if found:
            raw_number, raw_unit = found.groups()
            # The optional unit group is either None or a non-empty string.
            return float(raw_number), raw_unit or None
    return None, None
+
def parse_specification(spec_str: str) -> Optional[int]:
    """
    Parse a specification string and return the package quantity.

    Supported shapes, tried in this order:
        * ``1*5*10``  - three levels, the last number is returned
        * ``1*15`` / ``1x15`` / ``1X15`` / ``1×15`` - the second number
        * ``24瓶/件`` (瓶/个/支/袋 per 件/箱) - the leading number
        * ``4L`` / ``4升`` optionally followed by a count (``4L*6``,
          ``4Lx6``) - the count, or 1 when there is none

    Args:
        spec_str: Specification string.

    Returns:
        The package quantity, or None when ``spec_str`` is empty, not a
        ``str``, or matches none of the supported shapes.
    """
    if not spec_str or not isinstance(spec_str, str):
        return None

    # No whitespace normalization is needed up front: re.search scans the
    # whole string, and the only whitespace-tolerant spot uses \s*.
    try:
        # Three-level spec: the last number is the quantity.
        match = re.search(r'(\d+)[\*xX×](\d+)[\*xX×](\d+)', spec_str)
        if match:
            return int(match.group(3))

        # Two-level spec: the second number is the quantity.
        match = re.search(r'(\d+)[\*xX×](\d+)', spec_str)
        if match:
            return int(match.group(2))

        # "24瓶/件"-style spec (ASCII or full-width slash).
        match = re.search(r'(\d+)[瓶个支袋][/／](件|箱)', spec_str)
        if match:
            return int(match.group(1))

        # Volume spec. The separator set matches the one used by the
        # multiplication patterns above so "4Lx6" and "4L*6" agree.
        match = re.search(r'(\d+(?:\.\d+)?)\s*[Ll升][\*xX×]?(\d+)?', spec_str)
        if match:
            return int(match.group(2)) if match.group(2) else 1
    except ValueError:
        # int() rejects absurdly long digit runs; treat them as unparseable.
        return None

    return None
+
def clean_barcode(barcode: Any) -> str:
    """
    Normalize a barcode to a digits-only string.

    Args:
        barcode: Barcode value (string, integer or float).

    Returns:
        The barcode with a trailing ".0"-style suffix and every non-digit
        character removed. Values without any digit yield "".
    """
    if isinstance(barcode, int):
        # Render integers directly: going through float formatting would
        # silently round codes larger than 2**53.
        text = str(int(barcode))
    elif isinstance(barcode, float):
        text = f"{barcode:.0f}"
    else:
        # Strip first so surrounding whitespace cannot defeat the `$` anchor
        # below (" 123.0 " must not turn into "1230").
        text = str(barcode).strip()

    # Drop a trailing ".0"/".00" left over from float-typed cells ...
    text = re.sub(r'\.0+$', '', text)
    # ... then keep digits only.
    return re.sub(r'\D', '', text)
+
def is_scientific_notation(value: str) -> bool:
    """
    Tell whether a value is written in scientific notation.

    Args:
        value: Value to inspect; it is converted with ``str()`` first.

    Returns:
        True when the text is an optionally signed integer or decimal
        mantissa followed by ``e``/``E`` and an optionally signed integer
        exponent; False otherwise.
    """
    candidate = str(value)
    return re.match(r'^-?\d+(\.\d+)?[eE][+-]?\d+$', candidate) is not None
+
def format_barcode(barcode: Any) -> str:
    """
    Format a barcode as a plain string, expanding scientific notation.

    Args:
        barcode: Barcode value.

    Returns:
        For integers, their decimal text. For floats and for values whose
        text is in scientific notation, the integer part of the numeric
        value. Anything else, and any value that cannot be converted
        (NaN, infinity), is returned as ``str(barcode)``.
    """
    if isinstance(barcode, int):
        # Avoid the float round-trip: it rounds values above 2**53 and
        # overflows on very large integers.
        return str(int(barcode))

    if isinstance(barcode, float) or is_scientific_notation(str(barcode)):
        try:
            return str(int(float(barcode)))
        except (ValueError, TypeError, OverflowError):
            # NaN raises ValueError and infinity raises OverflowError;
            # fall through to the raw text in both cases.
            pass

    # Not numeric, or the conversion failed: return the original text.
    return str(barcode)