feat: 益选 OCR 订单处理系统初始提交

- 智能供应商识别（蓉城易购/烟草/杨碧月/通用）
- 百度 OCR 表格识别集成
- 规则引擎（列映射/数据清洗/单位转换/规格推断）
- 条码映射管理与云端同步（Gitea REST API）
- 云端同步支持：条码映射、供应商配置、商品资料、采购模板
- 拖拽一键处理（图片→OCR→Excel→合并）
- 191 个单元测试
- 移除无用的模板管理功能
- 清理 IDE 产物目录

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-04 19:51:13 +08:00
commit e4d62df7e3
78 changed files with 15257 additions and 0 deletions
@@ -0,0 +1,214 @@
+"""
+商品资料 SQLite 数据库
+
+将商品资料 (条码/名称/进货价/单位) 存储在 SQLite 中，
+支持从 Excel 自动导入和按条码快速查询。
+"""
+
+import os
+import sqlite3
+from datetime import datetime
+from typing import Dict, List, Optional
+
+import pandas as pd
+
+from ..utils.log_utils import get_logger
+from ..utils.file_utils import smart_read_excel
+from ...core.handlers.column_mapper import ColumnMapper
+
+logger = get_logger(__name__)
+
+
+class ProductDatabase:
+    """商品资料 SQLite 数据库"""
+
+    SCHEMA = """
+    CREATE TABLE IF NOT EXISTS products (
+        barcode TEXT PRIMARY KEY,
+        name TEXT DEFAULT '',
+        price REAL DEFAULT 0.0,
+        unit TEXT DEFAULT '',
+        updated_at TEXT
+    );
+    """
+
+    def __init__(self, db_path: str, excel_source: str):
+        """Remember both file locations and make sure the database exists.
+
+        If the SQLite file is not there yet it is created and, when the
+        Excel workbook is available, filled from it.
+
+        Args:
+            db_path: Path of the SQLite database file.
+            excel_source: Path of the product-catalogue Excel workbook.
+        """
+        self.db_path, self.excel_source = db_path, excel_source
+        self._ensure_db()
+
+    def _connect(self) -> sqlite3.Connection:
+        """Open a fresh connection to ``self.db_path``; the caller closes it."""
+        connection = sqlite3.connect(self.db_path)
+        return connection
+
+    def _ensure_db(self):
+        """Create the database on first use and import the Excel data if present.
+
+        Nothing happens when the database file already exists. Otherwise the
+        parent directory is created, the schema is installed, and the Excel
+        workbook is imported when it exists. A missing workbook leaves an
+        empty database behind.
+        """
+        if os.path.exists(self.db_path):
+            return
+
+        # sqlite3.connect() cannot create missing directories, so the parent
+        # must exist on both the "empty" and the "import" path. A bare file
+        # name has no parent, and os.makedirs('') would raise.
+        parent_dir = os.path.dirname(self.db_path)
+        if parent_dir:
+            os.makedirs(parent_dir, exist_ok=True)
+
+        self._create_empty_db()
+
+        if not os.path.exists(self.excel_source):
+            logger.warning(f"商品资料 Excel 不存在，跳过导入: {self.excel_source}")
+            return
+
+        logger.info(f"首次运行，从 Excel 导入商品资料: {self.excel_source}")
+        count = self.import_from_excel(self.excel_source)
+        logger.info(f"商品资料导入完成: {count} 条记录")
+
+    def _create_empty_db(self):
+        """Run the ``SCHEMA`` script against the database file and commit."""
+        connection = self._connect()
+        try:
+            connection.executescript(self.SCHEMA)
+            connection.commit()
+        finally:
+            # Always release the file handle, even if the script failed
+            connection.close()
+
+    def import_from_excel(self, excel_path: str) -> int:
+        """Import product records from an Excel workbook into ``products``.
+
+        Rows are upserted by barcode (``INSERT OR REPLACE``). Rows without a
+        usable barcode are skipped. A missing or unparsable price is stored
+        as 0.0. The name and unit columns are optional.
+
+        Args:
+            excel_path: Path of the Excel workbook to read.
+
+        Returns:
+            Number of rows written. 0 when the workbook is empty or could not
+            be read, has no barcode column, or yields no valid row.
+        """
+        df = smart_read_excel(excel_path)
+        if df is None or df.empty:
+            logger.warning(f"Excel 文件为空或读取失败: {excel_path}")
+            return 0
+
+        columns = list(df.columns)
+
+        # The barcode column is mandatory
+        barcode_col = ColumnMapper.find_column(columns, 'barcode')
+        if not barcode_col:
+            logger.error(f"Excel 中未找到条码列: {columns}")
+            return 0
+
+        # Purchase-price column. Fall back to any header containing "进货价"
+        # in case the column mapper has no alias for it.
+        price_col = ColumnMapper.find_column(columns, 'unit_price')
+        if not price_col:
+            price_col = next(
+                (col for col in df.columns if '进货价' in str(col).strip()),
+                None,
+            )
+
+        # Optional columns
+        name_col = ColumnMapper.find_column(columns, 'name')
+        unit_col = ColumnMapper.find_column(columns, 'unit')
+
+        now = datetime.now().isoformat()
+        rows = []
+        for _, row in df.iterrows():
+            barcode = self._normalize_barcode(row.get(barcode_col, ''))
+            if not barcode:
+                continue
+
+            price = self._parse_price(row.get(price_col)) if price_col else 0.0
+            name = self._cell_text(row.get(name_col, '')) if name_col else ''
+            unit = self._cell_text(row.get(unit_col, '')) if unit_col else ''
+            rows.append((barcode, name, price, unit, now))
+
+        if not rows:
+            logger.warning(f"Excel 中未解析出有效记录: {excel_path}")
+            return 0
+
+        conn = self._connect()
+        try:
+            conn.executemany(
+                "INSERT OR REPLACE INTO products (barcode, name, price, unit, updated_at) "
+                "VALUES (?, ?, ?, ?, ?)",
+                rows
+            )
+            conn.commit()
+        finally:
+            conn.close()
+
+        return len(rows)
+
+    @staticmethod
+    def _cell_text(value) -> str:
+        """Return one cell as stripped text; missing-value markers become ''."""
+        if value is None:
+            return ''
+        text = str(value).strip()
+        return '' if text in ('nan', 'None', 'NaT', '<NA>') else text
+
+    @staticmethod
+    def _normalize_barcode(value) -> str:
+        """Return a barcode cell as text without a float ".0" suffix."""
+        text = ProductDatabase._cell_text(value)
+        # A barcode that arrives as a float (presumably when the reader
+        # infers a numeric column -- verify against smart_read_excel) prints
+        # as "6925019900087.0" and would never match a lookup by
+        # "6925019900087".
+        if text.endswith('.0') and text[:-2].isdigit():
+            return text[:-2]
+        return text
+
+    @staticmethod
+    def _parse_price(value) -> float:
+        """Convert a price cell to float, using 0.0 for anything unusable."""
+        if not ProductDatabase._cell_text(value):
+            return 0.0
+        try:
+            price = float(value)
+        except (ValueError, TypeError):
+            return 0.0
+        # NaN is the only float that is not equal to itself
+        return 0.0 if price != price else price
+
+    def reimport(self) -> int:
+        """Replace the table contents with a fresh import of the Excel source.
+
+        Existing rows are deleted only after the Excel file is known to
+        exist, so a missing source file no longer wipes the catalogue.
+
+        Returns:
+            Number of records imported (0 when the source file is missing).
+        """
+        if not os.path.exists(self.excel_source):
+            logger.warning(f"商品资料 Excel 不存在，跳过导入: {self.excel_source}")
+            return 0
+
+        conn = self._connect()
+        try:
+            conn.execute("DELETE FROM products")
+            conn.commit()
+        finally:
+            conn.close()
+        return self.import_from_excel(self.excel_source)
+
+    def get_price(self, barcode: str) -> Optional[float]:
+        """Look up the purchase price stored for a single barcode.
+
+        Args:
+            barcode: Product barcode; it is converted to ``str`` and stripped
+                before the lookup.
+
+        Returns:
+            The stored price, or None when the barcode is not in the table.
+        """
+        key = str(barcode).strip()
+        connection = self._connect()
+        try:
+            hit = connection.execute(
+                "SELECT price FROM products WHERE barcode = ?",
+                (key,)
+            ).fetchone()
+        finally:
+            connection.close()
+
+        if hit:
+            return hit[0]
+        return None
+
+    def get_prices(self, barcodes: List[str]) -> Dict[str, float]:
+        """Look up the purchase prices of many barcodes at once.
+
+        Args:
+            barcodes: Barcodes to look up; each one is converted to ``str``
+                and stripped, and duplicates are queried only once.
+
+        Returns:
+            ``{barcode: price}`` for the barcodes found in the table.
+            Barcodes that are not stored are simply absent from the result.
+        """
+        if not barcodes:
+            return {}
+
+        # Normalise like get_price() and drop duplicates, keeping the order
+        keys = list(dict.fromkeys(str(b).strip() for b in barcodes))
+
+        # SQLite limits the number of "?" parameters per statement (999 by
+        # default before 3.32.0), so large lookups run in batches.
+        batch_size = 500
+        prices: Dict[str, float] = {}
+
+        conn = self._connect()
+        try:
+            for start in range(0, len(keys), batch_size):
+                batch = keys[start:start + batch_size]
+                placeholders = ','.join('?' * len(batch))
+                cursor = conn.execute(
+                    f"SELECT barcode, price FROM products WHERE barcode IN ({placeholders})",
+                    batch
+                )
+                # Each fetched row is a (barcode, price) pair
+                prices.update(cursor.fetchall())
+        finally:
+            conn.close()
+
+        return prices
+
+    def count(self) -> int:
+        """Return the number of rows in the ``products`` table."""
+        connection = self._connect()
+        try:
+            (total,) = connection.execute("SELECT COUNT(*) FROM products").fetchone()
+        finally:
+            connection.close()
+        return total
@@ -0,0 +1,5 @@
+"""
+OCR订单处理系统 - Excel处理模块
+----------------------------
+提供Excel文件处理、数据提取和转换功能。
+""" 
@@ -0,0 +1,535 @@
+"""
+单位转换模块
+----------
+提供单位转换功能，支持规格推断和单位自动提取。
+"""
+
+import re
+import logging
+import os
+import json
+from typing import Dict, Tuple, Optional, Any, List, Union
+
+from ..utils.log_utils import get_logger
+from .handlers.barcode_mapper import BarcodeMapper
+from .handlers.unit_converter_handlers import (
+    JianUnitHandler, BoxUnitHandler, TiHeUnitHandler, 
+    GiftUnitHandler, UnitHandler
+)
+from .validators import ProductValidator
+
+logger = get_logger(__name__)
+
+# Path of the JSON file that stores the barcode mappings. The path is
+# relative, so it is resolved against the process's current working directory.
+BARCODE_MAPPING_CONFIG = "config/barcode_mappings.json"
+
+class UnitConverter:
+    """
+    单位转换器：处理不同单位之间的转换，支持从商品名称推断规格
+    """
+    
+    def __init__(self):
+        """Load the special-barcode table and build the helper objects.
+
+        ``spec_patterns`` is an ordered list of ``(regex, template)`` pairs
+        consumed by ``extract_specification``; the first regex that matches
+        the text wins, so more specific patterns must come first.
+        """
+        # Special-barcode configuration (from the config file or the defaults)
+        self.special_barcodes = self.load_barcode_mappings()
+
+        self.spec_patterns = [
+            # Three-level forms such as 1*5*12 or 1x5x12. This entry must
+            # precede the two-level one: every three-level spec also matches
+            # the two-level regex, which would otherwise always win.
+            (r'(\d+)[*xX×](\d+)[*xX×](\d+)', r'\1*\2*\3'),
+            # Two-level forms such as 1*6, 1x12, 1X20
+            (r'(\d+)[*xX×](\d+)', r'\1*\2'),
+            # "N入" forms such as 12入, 24入
+            (r'(\d+)入', r'1*\1'),
+            # The four templates below were written as ``a if r'\2' else b``.
+            # A non-empty string literal is always truthy, so ``a`` was the
+            # only value ever used; it is now spelled out directly.
+            # "N L*M" / "N升*M"
+            (r'([\d\.]+)[L升][*xX×]?(\d+)?', r'\1L*\2'),
+            # "N kg*M" / "N公斤*M"
+            (r'([\d\.]+)(?:kg|公斤)[*xX×]?(\d+)?', r'\1kg*\2'),
+            # "N g*M" / "N克*M"
+            (r'([\d\.]+)(?:g|克)[*xX×]?(\d+)?', r'\1g*\2'),
+            # "N mL*M" / "N毫升*M"
+            (r'([\d\.]+)(?:mL|毫升)[*xX×]?(\d+)?', r'\1mL*\2'),
+        ]
+
+        # Barcode mapper and unit handlers
+        self._init_handlers()
+
+        # Product validator
+        self.validator = ProductValidator()
+
+    def _init_handlers(self):
+        """Create the barcode mapper and the ordered chain of unit handlers."""
+        # Mapper driven by the special-barcode configuration
+        self.barcode_mapper = BarcodeMapper(self.special_barcodes)
+
+        # Handlers are tried in list order, so earlier entries take priority
+        gift_handler = GiftUnitHandler()    # gifts are checked first
+        jian_handler = JianUnitHandler()    # unit "件"
+        box_handler = BoxUnitHandler()      # unit "箱"
+        ti_he_handler = TiHeUnitHandler()   # units "提" and "盒"
+        self.unit_handlers: List[UnitHandler] = [
+            gift_handler,
+            jian_handler,
+            box_handler,
+            ti_he_handler,
+        ]
+
+    def extract_unit_from_quantity(self, quantity_str: str) -> Tuple[Optional[float], Optional[str]]:
+        """
+        Split a quantity string into its number and its unit.
+
+        Examples:
+            "2箱"       -> (2.0, "箱")
+            "1.5提"     -> (1.5, "提")
+            "数量: 5盒"  -> (5.0, "盒")
+            "× 2瓶"     -> (2.0, "瓶")
+            "约5kg左右"  -> (5.0, "kg")
+
+        Args:
+            quantity_str: Quantity text such as "2箱" or "5件".
+
+        Returns:
+            ``(quantity, unit)``, or ``(None, None)`` when nothing can be
+            extracted or the argument is empty / not a string.
+        """
+        if not quantity_str or not isinstance(quantity_str, str):
+            return None, None
+
+        # Remove leading labels and multiplication signs only. A blanket
+        # str.replace() would also delete an "x" or "*" inside the unit
+        # (for example "2box" became "2bo").
+        cleaned_str = re.sub(
+            r'^(?:\s*(?:数量[:：]|[×xX*]))+\s*', '', quantity_str.strip()
+        ).strip()
+
+        # Basic form: the whole string is "<number><unit>"
+        basic_match = re.match(r'^([\d\.]+)\s*([^\d\s\.]+)$', cleaned_str)
+        if basic_match:
+            try:
+                num = float(basic_match.group(1))
+                unit = basic_match.group(2)
+                logger.info(f"从数量提取单位(基本格式): {quantity_str} -> 数量={num}, 单位={unit}")
+                return num, unit
+            except ValueError:
+                # e.g. "1.2.3箱": fall through to the looser pattern
+                pass
+
+        # Looser form: "<number><known unit>" anywhere in the text. This is
+        # an alternation, not a character class, so multi-character units
+        # are matched whole; longer units are listed before their prefixes.
+        complex_match = re.search(
+            r'([\d\.]+)\s*(毫升|kg|ml|箱|件|瓶|提|盒|袋|桶|包|升|个|g|L)',
+            cleaned_str
+        )
+        if complex_match:
+            try:
+                num = float(complex_match.group(1))
+                unit = complex_match.group(2)
+                logger.info(f"从数量提取单位(复杂格式): {quantity_str} -> 数量={num}, 单位={unit}")
+                return num, unit
+            except ValueError:
+                pass
+
+        return None, None
+    
+    def extract_specification(self, text: str) -> Optional[str]:
+        """
+        Extract a packaging specification from free text.
+
+        The text is checked, in this order, against the "N入白膜" form, the
+        three-level form ``a*b*c`` and then every entry of
+        ``self.spec_patterns``. The first match is rendered through its
+        template; an optional count that is absent defaults to ``1``
+        (for example "12.9L" -> "12.9L*1").
+
+        Args:
+            text: Text to search, typically a product name.
+
+        Returns:
+            The specification string only (never the surrounding text), or
+            None when nothing matches or the argument is empty / not a string.
+        """
+        if not text or not isinstance(text, str):
+            return None
+
+        # "N入白膜", e.g. "550纯净水24入白膜" -> "1*24"
+        match = re.search(r'.*?(\d+)入白膜', text)
+        if match:
+            result = f"1*{match.group(1)}"
+            logger.info(f"提取规格(入白膜): {text} -> {result}")
+            return result
+
+        # Three-level specs are tested explicitly first: they also match the
+        # two-level pattern, which would otherwise shadow them.
+        match = re.search(r'(\d+)[*xX×](\d+)[*xX×](\d+)', text)
+        if match:
+            result = '*'.join(match.groups())
+            logger.info(f"提取三级规格: {text} -> {result}")
+            return result
+
+        for pattern, replacement in self.spec_patterns:
+            match = re.search(pattern, text)
+            if not match:
+                continue
+            # Fill the template's \N references by hand so that an optional
+            # group that did not participate becomes "1" instead of ""/None.
+            result = re.sub(
+                r'\\(\d)',
+                lambda ref: match.group(int(ref.group(1))) or '1',
+                replacement
+            )
+            logger.info(f"提取规格: {text} -> {result}")
+            return result
+
+        # No pattern matched
+        return None
+    
+    def infer_specification_from_name(self, name: str) -> Optional[str]:
+        """
+        Infer the packaging specification from a product name.
+
+        The rules are tried in this order and the first match wins:
+        1. "<weight/volume>*N" (g, ml, 毫升, 克), e.g. "450g*15" -> 1*15
+        2. "N入白膜", e.g. "550纯净水24入白膜" -> 1*24
+        3. "N入纸箱", e.g. "15入纸箱" -> 1*15
+        4. an explicit spec, e.g. "1*15" -> 1*15 and "1*5*12" -> 1*5*12
+        5. "N纸箱", e.g. "15纸箱" -> 1*15
+        6. "N白膜", e.g. "12白膜" -> 1*12
+        7. "<volume>L*N", e.g. "1.8L*8瓶" -> 1.8L*8
+        8. "<volume>L", e.g. "12.9L桶装水" -> 12.9L*1
+        9. the generic patterns of ``extract_specification``
+
+        Args:
+            name: Product name.
+
+        Returns:
+            The inferred specification, or None when no rule applies or the
+            argument is empty / not a string.
+        """
+        if not name or not isinstance(name, str):
+            return None
+
+        def count_only(m):
+            # Only the trailing count matters; the outer level is always 1
+            return f"1*{m.group(1)}"
+
+        def explicit_levels(m):
+            # Keep the third level when present instead of truncating it
+            return '*'.join(g for g in m.groups() if g is not None)
+
+        # (log label, regex, result builder), in priority order
+        rules = [
+            ('重量/容量*数量', r'.*?\d+(?:g|ml|毫升|克)[*xX×](\d+)', count_only),
+            ('入白膜', r'.*?(\d+)入白膜', count_only),
+            ('入纸箱', r'.*?(\d+)入纸箱', count_only),
+            ('直接格式', r'.*?(\d+)[*xX×](\d+)(?:[*xX×](\d+))?.*', explicit_levels),
+            ('纸箱', r'.*?(\d+)纸箱', count_only),
+            ('白膜', r'.*?(\d+)白膜', count_only),
+            ('容量*数量', r'.*?([\d\.]+)[Ll升][*×xX](\d+).*',
+             lambda m: f"{m.group(1)}L*{m.group(2)}"),
+            ('简单容量', r'.*?([\d\.]+)[Ll升].*',
+             lambda m: f"{m.group(1)}L*1"),
+        ]
+
+        for label, pattern, build in rules:
+            match = re.search(pattern, name)
+            if match:
+                inferred_spec = build(match)
+                logger.info(f"从名称推断规格({label}): {name} -> {inferred_spec}")
+                return inferred_spec
+
+        # Last resort: the generic pattern list
+        spec = self.extract_specification(name)
+        if spec:
+            logger.info(f"从名称推断规格(通用模式): {name} -> {spec}")
+            return spec
+
+        return None
+        
+    def parse_specification(self, spec: str) -> Tuple[int, int, Optional[int]]:
+        """
+        Parse a specification string such as "1*12" or "1*5*12" into levels.
+
+        The string is normalised (whitespace removed, x/X/× turned into '*')
+        and then tested against a fixed sequence of patterns; the first one
+        that matches decides the result, so the order of the branches matters.
+
+        Args:
+            spec: Specification string.
+
+        Returns:
+            ``(level1, level2, level3)``. ``level3`` is None for anything
+            other than a three-level spec. ``(1, 1, None)`` is returned for
+            an empty or non-string argument, for a string no pattern
+            recognises, and when an unexpected exception occurs.
+        """
+        if not spec or not isinstance(spec, str):
+            return 1, 1, None
+
+        try:
+            # Normalise the string so the patterns below only see one format
+            spec = re.sub(r'\s+', '', spec)  # remove all whitespace
+            spec = re.sub(r'[xX×]', '*', spec)  # use '*' as the only separator
+
+            logger.debug(f"解析规格: {spec}")
+
+            # Equation form such as "1件=12桶": treated as 1*12
+            eq_match = re.match(r'(\d+(?:\.\d+)?)\s*(?:件|箱|提|盒)\s*[=＝]\s*(\d+)\s*(?:瓶|桶|盒|支|个|袋|罐|包|卷)', spec)
+            if eq_match:
+                try:
+                    level2 = int(eq_match.group(2))
+                    logger.info(f"解析等式规格: {spec} -> 1*{level2}")
+                    return 1, level2, None
+                except ValueError:
+                    pass
+
+            # Three-level packaging such as 1*5*12
+            three_level_match = re.match(r'(\d+)[*](\d+)[*](\d+)', spec)
+            if three_level_match:
+                try:
+                    level1 = int(three_level_match.group(1))
+                    level2 = int(three_level_match.group(2))
+                    level3 = int(three_level_match.group(3))
+                    logger.info(f"解析三级规格: {spec} -> {level1}*{level2}*{level3}")
+                    return level1, level2, level3
+                except ValueError:
+                    pass
+
+            # Weight-based specs such as 5kg*6 or 500g*12
+            weight_match = re.match(r'([\d\.]+)(?:kg|g|克|千克|公斤)[*](\d+)', spec, re.IGNORECASE)
+            if weight_match:
+                try:
+                    # The weight itself is ignored: level1 is 1 and the count
+                    # after '*' becomes level2
+                    level2 = int(weight_match.group(2))
+                    logger.info(f"解析重量规格: {spec} -> 1*{level2}")
+                    return 1, level2, None
+                except ValueError:
+                    pass
+
+            # Millilitre-based specs such as 500ml*15
+            ml_match = re.match(r'(\d+)(?:ml|毫升)[*](\d+)', spec, re.IGNORECASE)
+            if ml_match:
+                try:
+                    # Same rule as for weights: level1 is 1, the count is level2
+                    level2 = int(ml_match.group(2))
+                    logger.info(f"解析容量(ml)规格: {spec} -> 1*{level2}")
+                    return 1, level2, None
+                except ValueError:
+                    pass
+
+            # Litre-based specs such as 1L*12
+            l_match = re.match(r'(\d+(?:\.\d+)?)[Ll升][*](\d+)', spec)
+            if l_match:
+                try:
+                    # The number after '*' is the packaging count
+                    level2 = int(l_match.group(2))
+                    logger.info(f"解析容量(L)规格: {spec} -> 1*{level2}")
+                    return 1, level2, None
+                except ValueError:
+                    pass
+
+            # Plain two-level packaging such as 1*12
+            two_level_match = re.match(r'(\d+)[*](\d+)', spec)
+            if two_level_match:
+                try:
+                    level1 = int(two_level_match.group(1))
+                    level2 = int(two_level_match.group(2))
+                    logger.info(f"解析二级规格: {spec} -> {level1}*{level2}")
+                    return level1, level2, None
+                except ValueError:
+                    pass
+
+            # Litre specs with a looser number format, such as 12.5L*1.
+            # NOTE(review): separators were already normalised to '*' above,
+            # and most inputs are caught by the litre branch earlier, so this
+            # branch looks nearly unreachable -- confirm before relying on it.
+            volume_match = re.match(r'([\d\.]+)[L升][*xX×](\d+)', spec)
+            if volume_match:
+                try:
+                    volume = float(volume_match.group(1))
+                    quantity = int(volume_match.group(2))
+                    logger.info(f"解析容量规格: {spec} -> {volume}L*{quantity}")
+                    return 1, quantity, None
+                except ValueError:
+                    pass
+
+            # Irregular input such as "IL*12" or "6oo*12" (OCR noise): take
+            # the digits that follow a '*' as the packaging count.
+            irregular_match = re.search(r'[^0-9]*\*(\d+)', spec)
+            if irregular_match:
+                try:
+                    level2 = int(irregular_match.group(1))
+                    logger.info(f"解析不规范规格: {spec} -> 1*{level2}")
+                    return 1, level2, None
+                except ValueError:
+                    pass
+
+            # Nothing matched: fall back to 1*1
+            logger.warning(f"无法解析规格: {spec}，使用默认值1*1")
+            return 1, 1, None
+        except Exception as e:
+            logger.error(f"解析规格时出错: {e}")
+            return 1, 1, None
+        
+    def process_unit_conversion(self, product: Dict) -> Dict:
+        """
+        Convert a product's quantity, price and unit according to its spec.
+
+        Steps:
+        1. Validate the product with ``self.validator``.
+        2. Apply the special-barcode configuration (``self.barcode_mapper``).
+        3. Determine the specification: the one on the mapped product, else
+           one inferred from the product name. Without either, the product
+           is returned as it stands after step 2.
+        4. Hand the product to the first handler in ``self.unit_handlers``
+           whose ``can_handle`` accepts it. Products no handler accepts are
+           returned unchanged.
+
+        Args:
+            product: Product dictionary.
+
+        Returns:
+            The processed product dictionary.
+        """
+        # Validate first
+        product = self.validator.validate_product(product)
+
+        # Work on a copy so the validated dict is not modified here
+        result = product.copy()
+
+        # Products without a barcode are passed through untouched
+        if not result.get('barcode', ''):
+            return result
+
+        # Special-barcode handling comes before everything else
+        result = self.barcode_mapper.map_barcode(result)
+
+        # Read the specification only now: the mapper may have installed a
+        # fixed one, which must not be ignored or overwritten by a guess.
+        specification = result.get('specification', '')
+        if not specification:
+            name = result.get('name', '')
+            inferred_spec = self.infer_specification_from_name(name)
+            if not inferred_spec:
+                # No specification at all: no conversion is possible
+                return result
+            result['specification'] = inferred_spec
+            specification = inferred_spec
+            logger.info(f"从商品名称推断规格: {name} -> {inferred_spec}")
+
+        level1, level2, level3 = self.parse_specification(specification)
+
+        # The first handler that accepts the product performs the conversion
+        for handler in self.unit_handlers:
+            if handler.can_handle(result):
+                return handler.handle(result, level1, level2, level3)
+
+        # No handler applies: keep the product as it is
+        logger.info(f"其他单位处理: 保持原样 数量: {result.get('quantity', 0)}, 单价: {result.get('price', 0)}, 单位: {result.get('unit', '')}")
+        return result
+
+    def load_barcode_mappings(self) -> Dict[str, Dict[str, Any]]:
+        """
+        Load the barcode mappings from ``BARCODE_MAPPING_CONFIG``.
+
+        When the file does not exist, the built-in defaults are written to it
+        and returned. Any error while reading or creating the file is logged
+        and the defaults are returned as well.
+
+        Returns:
+            The barcode-mapping dictionary.
+        """
+        # Built-in defaults
+        default_mappings = {
+            '6925019900087': {
+                'multiplier': 10,
+                'target_unit': '瓶',
+                'description': '特殊处理：数量*10，单位转换为瓶'
+            },
+            '6921168593804': {
+                'multiplier': 30,
+                'target_unit': '瓶',
+                'description': 'NFC产品特殊处理：每箱30瓶'
+            },
+            '6901826888138': {
+                'multiplier': 30,
+                'target_unit': '瓶',
+                'fixed_price': 112/30,
+                'specification': '1*30',
+                'description': '特殊处理: 规格1*30，数量*30，单价=112/30'
+            },
+            # Plain barcode-to-barcode mappings
+            '6920584471055': {
+                'map_to': '6920584471017',
+                'description': '条码映射：6920584471055 -> 6920584471017'
+            },
+            '6925861571159': {
+                'map_to': '69021824',
+                'description': '条码映射：6925861571159 -> 69021824'
+            },
+            '6923644268923': {
+                'map_to': '6923644268480',
+                'description': '条码映射：6923644268923 -> 6923644268480'
+            },
+            # 6958620703716 carries both a fixed specification and a mapping
+            '6958620703716': {
+                'specification': '1*14',
+                'map_to': '6958620703907',
+                'description': '特殊处理: 规格1*14，同时映射到6958620703907'
+            }
+        }
+
+        try:
+            if not os.path.exists(BARCODE_MAPPING_CONFIG):
+                # No config yet: write the defaults out
+                self.save_barcode_mappings(default_mappings)
+                logger.info(f"创建默认条码映射配置，共{len(default_mappings)}项")
+                return default_mappings
+
+            with open(BARCODE_MAPPING_CONFIG, 'r', encoding='utf-8') as file:
+                mappings = json.load(file)
+            logger.info(f"成功加载条码映射配置，共{len(mappings)}项")
+            return mappings
+        except Exception as e:
+            logger.error(f"加载条码映射配置失败: {e}")
+            return default_mappings
+    
+    def save_barcode_mappings(self, mappings: Dict[str, Dict[str, Any]]) -> bool:
+        """
+        Write the barcode mappings to ``BARCODE_MAPPING_CONFIG`` as JSON.
+
+        Args:
+            mappings: Barcode-mapping dictionary to persist.
+
+        Returns:
+            True when the file was written, False when any error occurred
+            (the error is logged, not raised).
+        """
+        try:
+            # The configuration directory may not exist yet
+            config_dir = os.path.dirname(BARCODE_MAPPING_CONFIG)
+            os.makedirs(config_dir, exist_ok=True)
+
+            # Keep non-ASCII text readable in the file
+            with open(BARCODE_MAPPING_CONFIG, 'w', encoding='utf-8') as file:
+                json.dump(mappings, file, ensure_ascii=False, indent=2)
+
+            logger.info(f"条码映射配置保存成功，共{len(mappings)}项")
+        except Exception as e:
+            logger.error(f"保存条码映射配置失败: {e}")
+            return False
+        return True
+    
+    def update_barcode_mappings(self, new_mappings: Dict[str, Dict[str, Any]]) -> bool:
+        """
+        Replace the barcode mappings in memory and persist them.
+
+        Args:
+            new_mappings: The new barcode-mapping dictionary.
+
+        Returns:
+            True when the mappings were saved to the config file.
+        """
+        self.special_barcodes = new_mappings
+
+        # The mapper created in _init_handlers() holds its own reference to
+        # the previous dictionary; rebind it so the new mappings take effect
+        # for conversions immediately.
+        mapper = getattr(self, 'barcode_mapper', None)
+        if mapper is not None:
+            mapper.special_barcodes = new_mappings or {}
+
+        return self.save_barcode_mappings(new_mappings)
@@ -0,0 +1,11 @@
+"""
+单位转换处理程序包
+-----------------
+提供单位转换和条码处理的各种处理程序
+"""
+
+from typing import Dict, Any
+
+# 导出所有处理程序类
+from .barcode_mapper import BarcodeMapper
+from .unit_converter_handlers import JianUnitHandler, BoxUnitHandler, TiHeUnitHandler, GiftUnitHandler, UnitHandler 
@@ -0,0 +1,83 @@
+"""
+条码映射处理程序
+-------------
+处理特殊条码的映射和转换
+"""
+
+import logging
+from typing import Dict, Optional, Any
+
+from ...utils.log_utils import get_logger
+
+logger = get_logger(__name__)
+
+
+class BarcodeMapper:
+    """
+    Applies the special-barcode configuration to product dictionaries.
+
+    A configuration entry may contain any of:
+    - ``multiplier`` (with optional ``target_unit`` and ``fixed_price``):
+      quantity is multiplied and the unit price divided by it;
+    - ``specification``: a fixed specification for the product;
+    - ``map_to``: the barcode that replaces the original one.
+    """
+
+    def __init__(self, special_barcodes: Dict[str, Dict[str, Any]]):
+        """
+        Store the special-barcode configuration.
+
+        Args:
+            special_barcodes: Configuration keyed by barcode; None or an
+                empty value is treated as "no special barcodes".
+        """
+        self.special_barcodes = special_barcodes or {}
+
+    def map_barcode(self, product: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Apply the configuration entry of the product's barcode, if any.
+
+        Args:
+            product: Product dictionary containing a ``barcode`` key.
+
+        Returns:
+            A new dictionary; the argument itself is never modified. Products
+            whose barcode has no configuration entry come back as a plain copy.
+        """
+        result = product.copy()
+        barcode = result.get('barcode', '')
+
+        # Nothing to do for unknown barcodes
+        if not barcode or barcode not in self.special_barcodes:
+            return result
+
+        special_config = self.special_barcodes[barcode]
+
+        # Quantity / price / unit conversion
+        if 'multiplier' in special_config:
+            multiplier = special_config.get('multiplier', 1)
+            if multiplier:
+                target_unit = special_config.get('target_unit', '瓶')
+
+                quantity = result.get('quantity', 0)
+                new_quantity = quantity * multiplier
+
+                price = result.get('price', 0)
+                new_price = price / multiplier if price else 0
+
+                # A configured fixed price wins over the computed one
+                if 'fixed_price' in special_config:
+                    new_price = special_config['fixed_price']
+                    logger.info(f"特殊条码({barcode})使用固定单价: {new_price}")
+
+                logger.info(f"特殊条码处理: {barcode}, 数量: {quantity} -> {new_quantity}, 单价: {price} -> {new_price}, 单位: {result.get('unit', '')} -> {target_unit}")
+
+                result['quantity'] = new_quantity
+                result['price'] = new_price
+                result['unit'] = target_unit
+            else:
+                # A multiplier of 0 would zero the quantity and divide by zero
+                logger.warning(f"特殊条码({barcode})的倍数无效: {multiplier}，跳过数量转换")
+
+        # A fixed specification applies whether or not a multiplier is set,
+        # so entries that only carry a specification and a mapping get it too.
+        if 'specification' in special_config:
+            result['specification'] = special_config['specification']
+            logger.info(f"特殊条码({barcode})使用固定规格: {special_config['specification']}")
+
+        # The barcode is replaced last so the steps above can be combined with it
+        if 'map_to' in special_config:
+            new_barcode = special_config['map_to']
+            logger.info(f"条码映射: {barcode} -> {new_barcode}")
+            result['barcode'] = new_barcode
+
+        return result
@@ -0,0 +1,286 @@
+"""
+单位转换处理程序
+-------------
+处理不同单位的转换逻辑
+"""
+
+import logging
+from typing import Dict, Optional, Any, Tuple, Protocol
+from abc import ABC, abstractmethod
+
+from ...utils.log_utils import get_logger
+
+logger = get_logger(__name__)
+
+
+class UnitHandler(ABC):
+    """
+    Interface implemented by every unit-conversion handler.
+    """
+
+    @abstractmethod
+    def can_handle(self, product: Dict[str, Any]) -> bool:
+        """
+        Tell whether this handler is responsible for the product.
+
+        Args:
+            product: Product dictionary.
+
+        Returns:
+            True when the handler can process the product.
+        """
+
+    @abstractmethod
+    def handle(self, product: Dict[str, Any], level1: int, level2: int, level3: Optional[int]) -> Dict[str, Any]:
+        """
+        Perform the unit conversion.
+
+        Args:
+            product: Product dictionary.
+            level1: First packaging level of the specification.
+            level2: Second packaging level of the specification.
+            level3: Third packaging level, or None for two-level specs.
+
+        Returns:
+            The converted product dictionary.
+        """
+
+
+def _convert_to_bottles(product: Dict[str, Any], level2: int, level3: Optional[int],
+                        source_unit: str) -> Dict[str, Any]:
+    """
+    Convert a product counted in cases into one counted in single items.
+
+    The packaging count is ``level2 * level3`` (``level2`` alone for a
+    two-level spec). Quantity is multiplied by it, the unit price is divided
+    by it and the unit becomes "瓶". A packaging count that is not positive
+    cannot be converted; the product is then returned as an unchanged copy.
+
+    Args:
+        product: Product dictionary (left unmodified).
+        level2: Second packaging level.
+        level3: Third packaging level, or None.
+        source_unit: Original unit name, used in the log messages only.
+
+    Returns:
+        A new product dictionary.
+    """
+    result = product.copy()
+
+    quantity = result.get('quantity', 0)
+    price = result.get('price', 0)
+
+    packaging_count = level2 * (level3 or 1)
+    if packaging_count <= 0:
+        # e.g. a misread spec "1*0": multiplying would zero the quantity and
+        # dividing the price would raise ZeroDivisionError
+        logger.warning(f"{source_unit}单位处理: 包装数量无效({packaging_count})，保持原样")
+        return result
+
+    new_quantity = quantity * packaging_count
+    new_price = price / packaging_count if price else 0
+
+    logger.info(f"{source_unit}单位处理: 数量: {quantity} -> {new_quantity}, 单价: {price} -> {new_price}, 单位: {source_unit} -> 瓶")
+
+    result['quantity'] = new_quantity
+    result['price'] = new_price
+    result['unit'] = '瓶'
+
+    return result
+
+
+class JianUnitHandler(UnitHandler):
+    """
+    Converts products whose unit is "件".
+    """
+
+    def can_handle(self, product: Dict[str, Any]) -> bool:
+        """
+        Tell whether the product's unit is "件" or starts with it ("件装", ...).
+
+        Args:
+            product: Product dictionary.
+
+        Returns:
+            True when the handler can process the product.
+        """
+        unit = str(product.get('unit', '')).strip()
+        return unit.startswith('件')
+
+    def handle(self, product: Dict[str, Any], level1: int, level2: int, level3: Optional[int]) -> Dict[str, Any]:
+        """
+        Convert "件": quantity x packaging count, price / packaging count,
+        unit becomes "瓶".
+
+        Args:
+            product: Product dictionary.
+            level1: First packaging level (not used by this conversion).
+            level2: Second packaging level.
+            level3: Third packaging level, or None.
+
+        Returns:
+            The converted product dictionary.
+        """
+        return _convert_to_bottles(product, level2, level3, '件')
+
+
+class BoxUnitHandler(UnitHandler):
+    """
+    Converts products whose unit is "箱".
+    """
+
+    def can_handle(self, product: Dict[str, Any]) -> bool:
+        """
+        Tell whether the product's unit is "箱" or starts with it ("箱装", ...).
+
+        Args:
+            product: Product dictionary.
+
+        Returns:
+            True when the handler can process the product.
+        """
+        unit = str(product.get('unit', '')).strip()
+        return unit.startswith('箱')
+
+    def handle(self, product: Dict[str, Any], level1: int, level2: int, level3: Optional[int]) -> Dict[str, Any]:
+        """
+        Convert "箱": quantity x packaging count, price / packaging count,
+        unit becomes "瓶".
+
+        Args:
+            product: Product dictionary.
+            level1: First packaging level (not used by this conversion).
+            level2: Second packaging level.
+            level3: Third packaging level, or None.
+
+        Returns:
+            The converted product dictionary.
+        """
+        return _convert_to_bottles(product, level2, level3, '箱')
+
+
+class TiHeUnitHandler(UnitHandler):
+    """
+    处理"提"和"盒"单位的转换
+    """
+    
+    def can_handle(self, product: Dict[str, Any]) -> bool:
+        """
+        检查是否可以处理该商品（单位为"提"或"盒"）
+        
+        Args:
+            product: 商品信息字典
+            
+        Returns:
+            是否可以处理
+        """
+        unit = str(product.get('unit', '')).strip()
+        return unit in ['提', '盒'] or unit.startswith('提') or unit.startswith('盒')
+    
+    def handle(self, product: Dict[str, Any], level1: int, level2: int, level3: Optional[int]) -> Dict[str, Any]:
+        """
+        处理"提"和"盒"单位转换：
+        - 如果是三级规格，按件处理（数量×包装数量，单价÷包装数量，单位转为"瓶"）
+        - 如果是二级规格，保持不变
+        
+        Args:
+            product: 商品信息字典
+            level1: 一级包装数量
+            level2: 二级包装数量
+            level3: 三级包装数量，可能为None
+            
+        Returns:
+            处理后的商品信息字典
+        """
+        result = product.copy()
+        
+        quantity = result.get('quantity', 0)
+        price = result.get('price', 0)
+        unit = result.get('unit', '')
+        
+        # 如果是三级规格，按件处理
+        if level3 is not None:
+            # 计算包装数量 - 只乘以最后一级数量
+            packaging_count = level3
+            
+            # 数量×包装数量
+            new_quantity = quantity * packaging_count
+            
+            # 单价÷包装数量
+            new_price = price / packaging_count if price else 0
+            
+            logger.info(f"提/盒单位(三级规格)处理: 数量: {quantity} -> {new_quantity}, 单价: {price} -> {new_price}, 单位: {unit} -> 瓶")
+            
+            result['quantity'] = new_quantity
+            result['price'] = new_price
+            result['unit'] = '瓶'
+        else:
+            # 如果是二级规格，保持不变
+            logger.info(f"提/盒单位(二级规格)处理: 保持原样 数量: {quantity}, 单价: {price}, 单位: {unit}")
+        
+        return result
+
+
+class GiftUnitHandler(UnitHandler):
+    """
+    处理赠品的特殊情况
+    """
+    
+    def can_handle(self, product: Dict[str, Any]) -> bool:
+        """
+        检查是否可以处理该商品（是否为赠品）
+        
+        Args:
+            product: 商品信息字典
+            
+        Returns:
+            是否可以处理
+        """
+        return product.get('is_gift', False) is True
+    
+    def handle(self, product: Dict[str, Any], level1: int, level2: int, level3: Optional[int]) -> Dict[str, Any]:
+        """
+        处理赠品的单位转换：
+        - 对于件/箱单位，数量仍然需要转换，但赠品的单价保持为0
+        
+        Args:
+            product: 商品信息字典
+            level1: 一级包装数量
+            level2: 二级包装数量
+            level3: 三级包装数量，可能为None
+            
+        Returns:
+            处理后的商品信息字典
+        """
+        result = product.copy()
+        
+        unit = result.get('unit', '')
+        quantity = result.get('quantity', 0)
+        
+        # 根据单位类型选择适当的包装数计算
+        if unit in ['件', '箱']:
+            # 计算包装数量（二级*三级，如果无三级则仅二级）
+            packaging_count = level2 * (level3 or 1)
+            
+            # 数量×包装数量
+            new_quantity = quantity * packaging_count
+            
+            logger.info(f"赠品{unit}单位处理: 数量: {quantity} -> {new_quantity}, 单价: 0, 单位: {unit} -> 瓶")
+            
+            result['quantity'] = new_quantity
+            result['unit'] = '瓶'
+        elif unit in ['提', '盒'] and level3 is not None:
+            # 对于三级规格的提/盒，类似件处理
+            new_quantity = quantity * level3
+            
+            logger.info(f"赠品{unit}单位(三级规格)处理: 数量: {quantity} -> {new_quantity}, 单价: 0, 单位: {unit} -> 瓶")
+            
+            result['quantity'] = new_quantity
+            result['unit'] = '瓶'
+        else:
+            # 其他情况保持不变
+            logger.info(f"赠品{unit}单位处理: 保持原样 数量: {quantity}, 单价: 0, 单位: {unit}")
+        
+        # 确保单价为0
+        result['price'] = 0
+        
+        return result 
@@ -0,0 +1,423 @@
+"""
+订单合并模块
+----------
+提供采购单合并功能，将多个采购单合并为一个。
+"""
+
+import os
+import re
+import pandas as pd
+import numpy as np
+import xlrd
+import xlwt
+from xlutils.copy import copy as xlcopy
+from typing import Dict, List, Optional, Tuple, Union, Any, Callable
+from datetime import datetime
+
+from ...config.settings import ConfigManager
+from ..utils.log_utils import get_logger
+from ..handlers.column_mapper import ColumnMapper
+from ..utils.file_utils import (
+    ensure_dir,
+    get_file_extension,
+    get_files_by_extensions,
+    load_json,
+    save_json
+)
+from ..utils.string_utils import (
+    clean_string,
+    clean_barcode,
+    format_barcode
+)
+
+logger = get_logger(__name__)
+
+class PurchaseOrderMerger:
+    """
+    采购单合并器：将多个采购单Excel文件合并成一个文件
+    """
+    
+    def __init__(self, config):
+        """
+        初始化采购单合并器
+        
+        Args:
+            config: 配置信息
+        """
+        self.config = config
+        
+        # 修复ConfigParser对象没有get_path方法的问题
+        try:
+            # 获取输出目录
+            self.output_dir = config.get('Paths', 'output_folder', fallback='data/output')
+            
+            # 确保目录存在
+            os.makedirs(self.output_dir, exist_ok=True)
+            
+            # 记录实际路径
+            logger.info(f"使用输出目录: {os.path.abspath(self.output_dir)}")
+            
+            # 获取模板文件路径
+            template_folder = config.get('Paths', 'template_folder', fallback='templates')
+            template_name = config.get('Templates', 'purchase_order', fallback='银豹-采购单模板.xls')
+            
+            self.template_path = os.path.join(template_folder, template_name)
+            
+            # 检查模板文件是否存在
+            if not os.path.exists(self.template_path):
+                logger.warning(f"模板文件不存在: {self.template_path}")
+            
+            # 用于记录已合并的文件
+            self.merged_files_json = os.path.join(self.output_dir, "merged_files.json")
+            self.merged_files = self._load_merged_files()
+            
+            logger.info(f"初始化PurchaseOrderMerger完成，模板文件: {self.template_path}")
+        except Exception as e:
+            logger.error(f"初始化PurchaseOrderMerger失败: {e}")
+            raise
+    
+    def _load_merged_files(self) -> Dict[str, str]:
+        """
+        加载已合并文件的缓存
+        
+        Returns:
+            合并记录字典
+        """
+        return load_json(self.merged_files_json, {})
+        
+    def _save_merged_files(self) -> None:
+        """保存已合并文件的缓存"""
+        save_json(self.merged_files, self.merged_files_json)
+    
+    def get_purchase_orders(self) -> List[str]:
+        """
+        获取result目录下的采购单Excel文件
+        
+        Returns:
+            采购单文件路径列表
+        """
+        # 采购单文件保存在data/result目录
+        result_dir = "data/result"
+        logger.info(f"搜索目录 {result_dir} 中的采购单Excel文件")
+        
+        # 确保目录存在
+        os.makedirs(result_dir, exist_ok=True)
+        
+        # 获取所有Excel文件
+        all_files = get_files_by_extensions(result_dir, ['.xls', '.xlsx'])
+        
+        # 筛选采购单文件
+        purchase_orders = [
+            file for file in all_files 
+            if os.path.basename(file).startswith('采购单_')
+        ]
+        
+        if not purchase_orders:
+            logger.warning(f"未在 {result_dir} 目录下找到采购单Excel文件")
+            return []
+        
+        # 按修改时间排序，最新的在前
+        purchase_orders.sort(key=lambda x: os.path.getmtime(x), reverse=True)
+        
+        logger.info(f"找到 {len(purchase_orders)} 个采购单Excel文件")
+        return purchase_orders
+    
+    def read_purchase_order(self, file_path: str) -> Optional[pd.DataFrame]:
+        """
+        读取采购单Excel文件
+        
+        Args:
+            file_path: 采购单文件路径
+            
+        Returns:
+            数据帧，如果读取失败则返回None
+        """
+        try:
+            # 读取Excel文件
+            df = pd.read_excel(file_path)
+            logger.info(f"成功读取采购单文件: {file_path}")
+            
+            # 打印列名，用于调试
+            logger.debug(f"Excel文件的列名: {df.columns.tolist()}")
+            
+            # 处理特殊情况：检查是否需要读取指定行作为标题行
+            header_row_idx = ColumnMapper.detect_header_row(df, max_rows=5, min_matches=3)
+            if header_row_idx >= 0:
+                logger.info(f"检测到表头在第 {header_row_idx+1} 行")
+
+                # 使用此行作为列名，数据从下一行开始
+                header_row = df.iloc[header_row_idx].astype(str)
+                data_rows = df.iloc[header_row_idx+1:].reset_index(drop=True)
+
+                # 为每一列分配名称（避免重复的列名）
+                new_columns = []
+                for i, col in enumerate(header_row):
+                    col_str = str(col)
+                    if col_str == 'nan' or col_str == 'None' or pd.isna(col):
+                        new_columns.append(f"Col_{i}")
+                    else:
+                        new_columns.append(col_str)
+
+                # 使用新列名创建新的DataFrame
+                data_rows.columns = new_columns
+                df = data_rows
+                logger.debug(f"重新构建的数据帧列名: {df.columns.tolist()}")
+
+            # 使用 ColumnMapper 统一查找列名（保留中文键名以兼容下游代码）
+            all_columns = df.columns.tolist()
+            logger.info(f"列名: {all_columns}")
+
+            standard_to_chinese = {
+                'barcode': '条码',
+                'quantity': '采购量',
+                'unit_price': '采购单价',
+                'gift_quantity': '赠送量',
+            }
+
+            mapped_columns = {}
+            for std_name, chinese_name in standard_to_chinese.items():
+                matched = ColumnMapper.find_column(all_columns, std_name)
+                if matched:
+                    mapped_columns[chinese_name] = matched
+                    logger.info(f"列名映射: {matched} -> {chinese_name}")
+
+            # 如果找到了必要的列，重命名列
+            if mapped_columns:
+                rename_dict = {mapped_columns[key]: key for key in mapped_columns}
+                logger.info(f"列名重命名映射: {rename_dict}")
+                df = df.rename(columns=rename_dict)
+                logger.info(f"重命名后的列名: {df.columns.tolist()}")
+            else:
+                logger.warning(f"未找到可映射的列名: {file_path}")
+            
+            return df
+            
+        except Exception as e:
+            logger.error(f"读取采购单文件失败: {file_path}, 错误: {str(e)}")
+            return None
+    
+    def merge_purchase_orders(self, file_paths: List[str]) -> Optional[pd.DataFrame]:
+        """
+        合并多个采购单文件
+        
+        Args:
+            file_paths: 采购单文件路径列表
+            
+        Returns:
+            合并后的数据帧，如果合并失败则返回None
+        """
+        if not file_paths:
+            logger.warning("没有需要合并的采购单文件")
+            return None
+        
+        # 读取所有采购单文件
+        dfs = []
+        for file_path in file_paths:
+            df = self.read_purchase_order(file_path)
+            if df is not None:
+                dfs.append(df)
+        
+        if not dfs:
+            logger.warning("没有成功读取的采购单文件")
+            return None
+        
+        # 合并数据
+        logger.info(f"开始合并 {len(dfs)} 个采购单文件")
+        
+        # 首先，整理每个数据帧以确保它们有相同的结构
+        processed_dfs = []
+        for i, df in enumerate(dfs):
+            # 确保必要的列存在
+            required_columns = ['条码', '采购量', '采购单价']
+            missing_columns = [col for col in required_columns if col not in df.columns]
+            
+            if missing_columns:
+                logger.warning(f"数据帧 {i} 缺少必要的列: {missing_columns}")
+                continue
+            
+            # 处理赠送量列不存在的情况
+            if '赠送量' not in df.columns:
+                df['赠送量'] = 0
+            
+            # 选择并清理需要的列
+            cleaned_df = pd.DataFrame()
+            
+            # 清理条码 - 确保是字符串且无小数点
+            cleaned_df['条码'] = df['条码'].apply(lambda x: format_barcode(x) if pd.notna(x) else '')
+            
+            # 清理采购量 - 确保是数字
+            cleaned_df['采购量'] = pd.to_numeric(df['采购量'], errors='coerce').fillna(0)
+            
+            # 清理单价 - 确保是数字并保留4位小数
+            cleaned_df['采购单价'] = pd.to_numeric(df['采购单价'], errors='coerce').fillna(0).round(4)
+            
+            # 清理赠送量 - 确保是数字
+            cleaned_df['赠送量'] = pd.to_numeric(df['赠送量'], errors='coerce').fillna(0)
+            
+            # 过滤无效行 - 条码为空或采购量为0的行跳过
+            valid_df = cleaned_df[(cleaned_df['条码'] != '') & (cleaned_df['采购量'] > 0)]
+            
+            if len(valid_df) > 0:
+                processed_dfs.append(valid_df)
+                logger.info(f"处理文件 {i+1}: 有效记录 {len(valid_df)} 行")
+            else:
+                logger.warning(f"处理文件 {i+1}: 没有有效记录")
+        
+        if not processed_dfs:
+            logger.warning("没有有效的数据帧用于合并")
+            return None
+        
+        # 将所有数据帧合并
+        merged_df = pd.concat(processed_dfs, ignore_index=True)
+        
+        # 按条码和单价分组，合并相同商品
+        # 四舍五入到4位小数，避免浮点误差导致相同价格被当作不同价格
+        merged_df['采购单价'] = merged_df['采购单价'].round(4)  
+        
+        # 对于同一条码和单价的商品，合并数量和赠送量
+        result = merged_df.groupby(['条码', '采购单价'], as_index=False).agg({
+            '采购量': 'sum',
+            '赠送量': 'sum'
+        })
+        
+        # 排序，按条码升序
+        result = result.sort_values('条码').reset_index(drop=True)
+        
+        # 设置为0的赠送量设为空
+        result.loc[result['赠送量'] == 0, '赠送量'] = pd.NA
+        
+        logger.info(f"合并完成，共 {len(result)} 条商品记录")
+        return result
+    
+    def create_merged_purchase_order(self, df: pd.DataFrame) -> Optional[str]:
+        """
+        创建合并的采购单文件，完全按照银豹格式要求
+        
+        Args:
+            df: 合并后的数据帧
+            
+        Returns:
+            输出文件路径，如果创建失败则返回None
+        """
+        try:
+            # 打开模板文件
+            template_workbook = xlrd.open_workbook(self.template_path, formatting_info=True)
+            template_sheet = template_workbook.sheet_by_index(0)
+            
+            # 首先分析模板结构，确定关键列的位置
+            logger.info(f"分析模板结构")
+            for i in range(min(5, template_sheet.nrows)):
+                row_values = [str(cell.value).strip() for cell in template_sheet.row(i)]
+                logger.debug(f"模板第{i+1}行: {row_values}")
+            
+            # 银豹模板的标准列位置：
+            # 条码列(商品条码): B列(索引1)
+            barcode_col = 1
+            # 采购量列: C列(索引2)
+            quantity_col = 2 
+            # 赠送量列: D列(索引3)
+            gift_col = 3
+            # 采购单价列: E列(索引4)
+            price_col = 4
+            
+            # 找到数据开始行 - 通常是第二行(索引1)
+            data_start_row = 1
+            
+            # 创建可写的副本
+            output_workbook = xlcopy(template_workbook)
+            output_sheet = output_workbook.get_sheet(0)
+            
+            # 设置单价的格式样式（保留4位小数）
+            price_style = xlwt.XFStyle()
+            price_style.num_format_str = '0.0000'
+            
+            # 数量格式
+            quantity_style = xlwt.XFStyle()
+            quantity_style.num_format_str = '0'
+            
+            # 遍历数据并填充到Excel
+            for i, (_, row) in enumerate(df.iterrows()):
+                r = data_start_row + i
+                
+                # 只填充银豹采购单格式要求的4个列：条码、采购量、赠送量、采购单价
+                
+                # 条码（必填）- B列(1)
+                output_sheet.write(r, barcode_col, row['条码'])
+                
+                # 采购量（必填）- C列(2)
+                output_sheet.write(r, quantity_col, float(row['采购量']), quantity_style)
+                
+                # 赠送量 - D列(3)
+                if pd.notna(row['赠送量']) and float(row['赠送量']) > 0:
+                    output_sheet.write(r, gift_col, float(row['赠送量']), quantity_style)
+                
+                # 采购单价（必填）- E列(4)
+                output_sheet.write(r, price_col, float(row['采购单价']), price_style)
+            
+            # 生成输出文件名，保存到data/result目录
+            timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
+            result_dir = "data/result"
+            os.makedirs(result_dir, exist_ok=True)
+            output_file = os.path.join(result_dir, f"合并采购单_{timestamp}.xls")
+            
+            # 保存文件
+            output_workbook.save(output_file)
+            logger.info(f"合并采购单已保存到: {output_file}，共{len(df)}条记录")
+            return output_file
+            
+        except Exception as e:
+            logger.error(f"创建合并采购单时出错: {e}")
+            return None
+    
+    def process(self, file_paths: Optional[List[str]] = None, progress_cb: Optional[Callable[[int], None]] = None) -> Optional[str]:
+        """
+        处理采购单合并
+        
+        Args:
+            file_paths: 指定要合并的文件路径列表，如果为None则自动获取
+            
+        Returns:
+            合并后的文件路径，如果合并失败则返回None
+        """
+        # 如果未指定文件路径，则获取所有采购单文件
+        if file_paths is None:
+            file_paths = self.get_purchase_orders()
+            try:
+                if progress_cb:
+                    progress_cb(97)
+            except Exception:
+                pass
+        
+        # 检查是否有文件需要合并
+        if not file_paths:
+            logger.warning("没有找到可合并的采购单文件")
+            return None
+        
+        # 合并采购单
+        merged_df = self.merge_purchase_orders(file_paths)
+        if merged_df is None:
+            logger.error("合并采购单失败")
+            return None
+        try:
+            if progress_cb:
+                progress_cb(98)
+        except Exception:
+            pass
+        
+        # 创建合并的采购单文件
+        output_file = self.create_merged_purchase_order(merged_df)
+        if output_file is None:
+            logger.error("创建合并采购单文件失败")
+            return None
+        try:
+            if progress_cb:
+                progress_cb(100)
+        except Exception:
+            pass
+        
+        # 记录已合并文件
+        for file_path in file_paths:
+            self.merged_files[file_path] = output_file
+        self._save_merged_files()
+        
+        return output_file
@@ -0,0 +1,860 @@
+"""
+Excel处理核心模块
+--------------
+提供Excel文件处理功能，包括表格解析、数据提取和处理。
+"""
+
+import os
+import re
+import pandas as pd
+import numpy as np
+import xlrd
+import xlwt
+from xlutils.copy import copy as xlcopy
+from typing import Dict, List, Optional, Tuple, Union, Any, Callable
+from datetime import datetime
+
+from ...config.settings import ConfigManager
+from ..utils.log_utils import get_logger
+from ..utils.file_utils import (
+    ensure_dir,
+    get_file_extension,
+    get_latest_file,
+    load_json,
+    save_json
+)
+from ..utils.string_utils import (
+    clean_string,
+    extract_number,
+    format_barcode,
+    parse_monetary_string
+)
+from .converter import UnitConverter
+from ..handlers.column_mapper import ColumnMapper
+
+logger = get_logger(__name__)
+
+class ExcelProcessor:
+    """
+    Excel处理器：处理OCR识别后的Excel文件，
+    提取条码、单价和数量，并按照采购单模板的格式填充
+    """
+    
+    def __init__(self, config):
+        """
+        初始化Excel处理器
+        
+        Args:
+            config: 配置信息
+        """
+        self.config = config
+        
+        # 修复ConfigParser对象没有get_path方法的问题
+        try:
+            # 获取输入和输出目录
+            self.output_dir = config.get('Paths', 'output_folder', fallback='data/output')
+            self.temp_dir = config.get('Paths', 'temp_folder', fallback='data/temp')
+            
+            # 获取模板文件路径
+            self.template_path = config.get('Paths', 'template_file', fallback='templates/银豹-采购单模板.xls')
+            if not os.path.exists(self.template_path):
+                logger.warning(f"模板文件不存在: {self.template_path}")
+            
+            # 设置缓存文件路径
+            self.cache_file = os.path.join(self.output_dir, "processed_files.json")
+            self.processed_files = self._load_processed_files()
+            
+            # 确保目录存在
+            os.makedirs(self.output_dir, exist_ok=True)
+            os.makedirs(self.temp_dir, exist_ok=True)
+            
+            # 记录实际路径
+            logger.info(f"使用输出目录: {os.path.abspath(self.output_dir)}")
+            logger.info(f"使用临时目录: {os.path.abspath(self.temp_dir)}")
+            
+            # 加载单位转换器和配置
+            self.unit_converter = UnitConverter()
+            logger.info(f"初始化ExcelProcessor完成，模板文件: {self.template_path}")
+        except Exception as e:
+            logger.error(f"初始化ExcelProcessor失败: {e}")
+            raise
+    
+    def _load_processed_files(self) -> Dict[str, str]:
+        """
+        加载已处理文件的缓存
+        
+        Returns:
+            处理记录字典
+        """
+        return load_json(self.cache_file, {})
+        
+    def _save_processed_files(self) -> None:
+        """保存已处理文件的缓存"""
+        save_json(self.processed_files, self.cache_file)
+    
+    def get_latest_excel(self) -> Optional[str]:
+        """
+        获取output目录下最新的Excel文件（排除采购单文件）
+        
+        Returns:
+            最新Excel文件的路径，如果未找到则返回None
+        """
+        logger.info(f"搜索目录 {self.output_dir} 中的Excel文件")
+        
+        # 使用文件工具获取最新文件
+        latest_file = get_latest_file(
+            self.output_dir,
+            pattern="",  # 不限制文件名
+            extensions=['.xlsx', '.xls']  # 限制为Excel文件
+        )
+        
+        # 如果没有找到文件
+        if not latest_file:
+            logger.warning(f"未在 {self.output_dir} 目录下找到未处理的Excel文件")
+            return None
+        
+        # 检查是否是采购单（以"采购单_"开头的文件）
+        file_name = os.path.basename(latest_file)
+        if file_name.startswith('采购单_'):
+            logger.warning(f"找到的最新文件是采购单，不作处理: {latest_file}")
+            return None
+        
+        logger.info(f"找到最新的Excel文件: {latest_file}")
+        return latest_file
+    
+    def extract_barcode(self, df: pd.DataFrame) -> List[str]:
+        """
+        从数据帧中提取条码列名
+        
+        Args:
+            df: 数据帧
+            
+        Returns:
+            可能的条码列名列表
+        """
+        possible_barcode_columns = ColumnMapper.STANDARD_COLUMNS['barcode']
+        
+        found_columns = []
+        
+        # 检查精确匹配
+        for col in df.columns:
+            col_str = str(col).strip()
+            if col_str in possible_barcode_columns:
+                found_columns.append(col)
+                logger.info(f"找到精确匹配的条码列: {col_str}")
+        
+        # 如果找不到精确匹配，尝试部分匹配
+        if not found_columns:
+            for col in df.columns:
+                col_str = str(col).strip().lower()
+                for keyword in ['条码', '条形码', 'barcode', '编码']:
+                    if keyword.lower() in col_str:
+                        found_columns.append(col)
+                        logger.info(f"找到部分匹配的条码列: {col} (包含关键词: {keyword})")
+                        break
+        
+        # 如果仍然找不到，尝试使用数据特征识别
+        if not found_columns and len(df) > 0:
+            for col in df.columns:
+                # 检查此列数据是否符合条码特征
+                sample_values = df[col].dropna().astype(str).tolist()[:10]  # 取前10个非空值
+                
+                if sample_values and all(len(val) >= 8 and len(val) <= 14 for val in sample_values):
+                    # 大多数条码长度在8-14之间
+                    if all(val.isdigit() for val in sample_values):
+                        found_columns.append(col)
+                        logger.info(f"基于数据特征识别的可能条码列: {col}")
+        
+        return found_columns
+    
+    def extract_product_info(self, df: pd.DataFrame) -> List[Dict]:
+        """
+        从数据帧中提取商品信息
+        
+        Args:
+            df: 数据帧
+            
+        Returns:
+            商品信息列表
+        """
+        products = []
+        
+        # 检测列映射
+        column_mapping = self._detect_column_mapping(df)
+        logger.info(f"检测到列映射: {column_mapping}")
+        
+        # 处理每一行
+        for idx, row in df.iterrows():
+            try:
+                # 初始化商品信息
+                product = {
+                    'barcode': '',     # 条码
+                    'name': '',        # 商品名称
+                    'specification': '',  # 规格
+                    'quantity': 0,     # 数量
+                    'unit': '',        # 单位
+                    'price': 0,        # 单价
+                    'amount': 0,       # 金额
+                    'is_gift': False   # 是否为赠品
+                }
+                
+                # 提取条码
+                if '条码' in df.columns and not pd.isna(row['条码']):
+                    product['barcode'] = str(row['条码']).strip()
+                elif column_mapping.get('barcode') and not pd.isna(row[column_mapping['barcode']]):
+                    product['barcode'] = str(row[column_mapping['barcode']]).strip()
+                
+                # 跳过空条码行
+                if not product['barcode']:
+                    continue
+                
+                # 检查备注列，过滤换货、退货、作废等非采购行
+                skip_row = False
+                for col in df.columns:
+                    col_str = str(col)
+                    if any(k in col_str for k in ['备注', '说明', '类型', '备注1']):
+                        val = str(row[col]).strip()
+                        # 过滤常见的非采购关键字
+                        if any(k in val for k in ['换货', '退货', '作废', '减钱', '冲减', '赠品单', '补货']):
+                            logger.info(f"过滤非采购行: {product['barcode']} - {product.get('name', '')}, 原因: {col_str}包含 '{val}'")
+                            skip_row = True
+                            break
+                if skip_row:
+                    continue
+                    
+                # 提取商品名称
+                if '商品名称' in df.columns and not pd.isna(row['商品名称']):
+                    product['name'] = str(row['商品名称']).strip()
+                elif '名称' in df.columns and not pd.isna(row['名称']):
+                    product['name'] = str(row['名称']).strip()
+                elif column_mapping.get('name') and not pd.isna(row[column_mapping['name']]):
+                    product['name'] = str(row[column_mapping['name']]).strip()
+                    
+                # 提取单位
+                if '单位' in df.columns and not pd.isna(row['单位']):
+                    product['unit'] = str(row['单位']).strip()
+                elif column_mapping.get('unit') and not pd.isna(row[column_mapping['unit']]):
+                    product['unit'] = str(row[column_mapping['unit']]).strip()
+                
+                # 提取单价
+                if '单价' in df.columns and not pd.isna(row['单价']):
+                    product['price'] = row['单价']
+                elif column_mapping.get('price') and not pd.isna(row[column_mapping['price']]):
+                    product['price'] = row[column_mapping['price']]
+                
+                # 提取金额
+                if '金额' in df.columns and not pd.isna(row['金额']):
+                    product['amount'] = row['金额']
+                elif '小计' in df.columns and not pd.isna(row['小计']):
+                    product['amount'] = row['小计']
+                elif column_mapping.get('amount') and not pd.isna(row[column_mapping['amount']]):
+                    product['amount'] = row[column_mapping['amount']]
+                # 根据金额判断赠品：金额为0、为空、或为o/O
+                amt = product.get('amount', None)
+                try:
+                    is_amt_gift = False
+                    if amt is None:
+                        is_amt_gift = True
+                    elif isinstance(amt, str):
+                        parsed = parse_monetary_string(amt)
+                        is_amt_gift = (parsed is None or parsed == 0.0)
+                    else:
+                        parsed = parse_monetary_string(amt)
+                        is_amt_gift = (parsed is not None and parsed == 0.0)
+                    if is_amt_gift:
+                        product['is_gift'] = True
+                except Exception:
+                    pass
+                
+                # 提取数量
+                if '数量' in df.columns and not pd.isna(row['数量']):
+                    product['quantity'] = row['数量']
+                elif column_mapping.get('quantity') and not pd.isna(row[column_mapping['quantity']]):
+                    product['quantity'] = row[column_mapping['quantity']]
+                
+                # 处理可能的复合数量字段，例如"2箱"、"3件"
+                if isinstance(product['quantity'], str) and product['quantity']:
+                    num, unit = self.unit_converter.extract_unit_from_quantity(product['quantity'])
+                    if unit:
+                        product['unit'] = unit
+                        if num is not None:
+                            product['quantity'] = num
+                
+                # 提取规格并解析包装数量
+                if '规格' in df.columns and not pd.isna(row['规格']):
+                    product['specification'] = str(row['规格'])
+                    # 修正OCR误识别的4.51*4为4.5L*4
+                    product['specification'] = re.sub(r'(\d+\.\d+)1\*(\d+)', r'\1L*\2', product['specification'])
+                    package_quantity = self.parse_specification(product['specification'])
+                    if package_quantity:
+                        product['package_quantity'] = package_quantity
+                        logger.info(f"解析规格: {product['specification']} -> 包装数量={package_quantity}")
+                elif column_mapping.get('specification') and not pd.isna(row[column_mapping['specification']]):
+                    product['specification'] = str(row[column_mapping['specification']])
+                    # 修正OCR误识别的4.51*4为4.5L*4
+                    product['specification'] = re.sub(r'(\d+\.\d+)1\*(\d+)', r'\1L*\2', product['specification'])
+                    package_quantity = self.parse_specification(product['specification'])
+                    if package_quantity:
+                        product['package_quantity'] = package_quantity
+                        logger.info(f"从映射列解析规格: {product['specification']} -> 包装数量={package_quantity}")
+                else:
+                    # 只有在无法从Excel获取规格时，才尝试从商品名称推断规格
+                    if product['name']:
+                        # 特殊处理：优先检查名称中是否包含"容量*数量"格式
+                        container_pattern = r'.*?(\d+(?:\.\d+)?)\s*(?:ml|[mM][lL]|[lL]|升|毫升)[*×xX](\d+).*'
+                        match = re.search(container_pattern, product['name'])
+                        if match:
+                            # 容量单位*数量格式，如"1.8L*8瓶"，取数量部分作为包装数量
+                            volume = match.group(1)
+                            count = match.group(2)
+                            inferred_spec = f"{volume}L*{count}"
+                            inferred_qty = int(count)
+                            product['specification'] = inferred_spec
+                            product['package_quantity'] = inferred_qty
+                            logger.info(f"从商品名称提取容量*数量格式: {product['name']} -> {inferred_spec}, 包装数量={inferred_qty}")
+                        # 原来的重量/容量*数字格式处理逻辑
+                        else:
+                            weight_volume_pattern = r'.*?\d+(?:g|ml|毫升|克)[*xX×](\d+)'
+                            match = re.search(weight_volume_pattern, product['name'])
+                            if match:
+                                inferred_spec = f"1*{match.group(1)}"
+                                inferred_qty = int(match.group(1))
+                                product['specification'] = inferred_spec
+                                product['package_quantity'] = inferred_qty
+                                logger.info(f"从商品名称提取重量/容量规格: {product['name']} -> {inferred_spec}, 包装数量={inferred_qty}")
+                            else:
+                                # 一般情况的规格推断
+                                inferred_spec = self.unit_converter.infer_specification_from_name(product['name'])
+                                if inferred_spec:
+                                    product['specification'] = inferred_spec
+                                    package_quantity = self.parse_specification(inferred_spec)
+                                    if package_quantity:
+                                        product['package_quantity'] = package_quantity
+                                    logger.info(f"从商品名称推断规格: {product['name']} -> {inferred_spec}, 包装数量={package_quantity}")
+                
+                # 检查已设置的规格但未设置包装数量的情况
+                if product.get('specification') and not product.get('package_quantity'):
+                    package_quantity = self.parse_specification(product['specification'])
+                    if package_quantity:
+                        product['package_quantity'] = package_quantity
+                        logger.info(f"解析已设置的规格: {product['specification']} -> 包装数量={package_quantity}")
+                
+                # 新增逻辑：根据规格推断单位为"件"
+                if not product['unit'] and product.get('barcode') and product.get('specification') and product.get('quantity') and product.get('price') is not None:
+                    # 检查规格是否符合容量*数量格式
+                    volume_pattern = r'(\d+(?:\.\d+)?)\s*(?:ml|[mL]L|l|L|升|毫升)[*×xX](\d+)'
+                    match = re.search(volume_pattern, product['specification'])
+                    
+                    # 判断是否需要推断单位为"件"
+                    if match:
+                        product['unit'] = '件'
+                        logger.info(f"根据规格推断单位: {product['specification']} -> 单位=件")
+                    else:
+                        # 检查简单的数量*数量格式
+                        simple_pattern = r'(\d+)[*×xX](\d+)'
+                        match = re.search(simple_pattern, product['specification'])
+                        if match:
+                            product['unit'] = '件'
+                            logger.info(f"根据规格推断单位: {product['specification']} -> 单位=件")
+                
+                # 应用单位转换规则
+                product = self.unit_converter.process_unit_conversion(product)
+                
+                # 如果数量为0但单价和金额都存在，计算数量 = 金额/单价
+                if (product['quantity'] == 0 or product['quantity'] is None) and product['price'] > 0 and product['amount']:
+                    try:
+                        amount = parse_monetary_string(product['amount'])
+                        if amount is not None and amount > 0:
+                            quantity = amount / product['price']
+                            logger.info(f"数量为空或为0，通过金额({amount})和单价({product['price']})计算得出数量: {quantity}")
+                            product['quantity'] = quantity
+                    except Exception as e:
+                        logger.warning(f"通过金额和单价计算数量失败: {e}")
+                
+                products.append(product)
+            except Exception as e:
+                logger.error(f"提取第{idx+1}行商品信息时出错: {e}", exc_info=True)
+                continue
+                
+        logger.info(f"提取到 {len(products)} 个商品信息")
+        return products
+    
+    def fill_template(self, products: List[Dict], output_file_path: str) -> bool:
+        """
+        填充采购单模板
+        
+        Args:
+            products: 商品信息列表
+            output_file_path: 输出文件路径
+            
+        Returns:
+            是否成功填充
+        """
+        try:
+            # 打开模板文件
+            template_workbook = xlrd.open_workbook(self.template_path, formatting_info=True)
+            template_sheet = template_workbook.sheet_by_index(0)
+            
+            # 创建可写的副本
+            output_workbook = xlcopy(template_workbook)
+            output_sheet = output_workbook.get_sheet(0)
+            
+            # 先对产品按条码分组，区分正常商品和赠品
+            barcode_groups = {}
+            
+            # 遍历所有产品，按条码分组
+            logger.info(f"开始处理{len(products)} 个产品信息")
+            for product in products:
+                barcode = product.get('barcode', '')
+                # 确保条码是整数字符串
+                barcode = format_barcode(barcode)
+                
+                if not barcode:
+                    logger.warning(f"跳过无条码商品")
+                    continue
+                
+                # 获取数量和单价
+                quantity = product.get('quantity', 0)
+                price = product.get('price', 0)
+                amount = product.get('amount', 0)
+                
+                # 如果数量为0但单价和金额都存在，计算数量 = 金额/单价
+                if (quantity == 0 or quantity is None) and price > 0 and amount:
+                    try:
+                        amount = parse_monetary_string(amount)
+                        if amount is not None and amount > 0:
+                            quantity = amount / price
+                            logger.info(f"数量为空或为0，通过金额({amount})和单价({price})计算得出数量: {quantity}")
+                            product['quantity'] = quantity
+                    except Exception as e:
+                        logger.warning(f"通过金额和单价计算数量失败: {e}")
+                
+                # 判断是否为赠品（价格为0）
+                is_gift = bool(product.get('is_gift', False)) or (price == 0)
+                
+                logger.info(f"处理商品: 条码={barcode}, 数量={quantity}, 单价={price}, 是否赠品={is_gift}")
+                
+                if barcode not in barcode_groups:
+                    barcode_groups[barcode] = {
+                        'normal': None,  # 正常商品信息
+                        'gift_quantity': 0  # 赠品数量
+                    }
+                
+                if is_gift:
+                    # 是赠品，累加赠品数量
+                    barcode_groups[barcode]['gift_quantity'] += quantity
+                    logger.info(f"发现赠品：条码{barcode}, 数量={quantity}")
+                else:
+                    # 是正常商品
+                    if barcode_groups[barcode]['normal'] is None:
+                        barcode_groups[barcode]['normal'] = {
+                            'product': product,
+                            'quantity': quantity,
+                            'price': price
+                        }
+                        logger.info(f"发现正常商品：条码{barcode}, 数量={quantity}, 单价={price}")
+                    else:
+                        # 如果有多个正常商品记录，累加数量
+                        barcode_groups[barcode]['normal']['quantity'] += quantity
+                        logger.info(f"累加正常商品数量：条码{barcode}, 新增={quantity}, 累计={barcode_groups[barcode]['normal']['quantity']}")
+                        
+                        # 如果单价不同，取平均值
+                        if price != barcode_groups[barcode]['normal']['price']:
+                            avg_price = (barcode_groups[barcode]['normal']['price'] + price) / 2
+                            barcode_groups[barcode]['normal']['price'] = avg_price
+                            logger.info(f"调整单价(取平均值)：条码{barcode}, 原价={barcode_groups[barcode]['normal']['price']}, 新价={price}, 平均={avg_price}")
+            
+            # 输出调试信息
+            logger.info(f"分组后共{len(barcode_groups)} 个不同条码的商品")
+            for barcode, group in barcode_groups.items():
+                if group['normal'] is not None:
+                    logger.info(f"条码 {barcode} 处理结果：正常商品数量{group['normal']['quantity']}，单价{group['normal']['price']}，赠品数量{group['gift_quantity']}")
+                else:
+                    logger.info(f"条码 {barcode} 处理结果：只有赠品，数量={group['gift_quantity']}")
+            
+            # 准备填充数据
+            row_index = 1  # 从第2行开始填充（索引从0开始）
+            
+            for barcode, group in barcode_groups.items():
+                # 1. 列B(1): 条码（必填）
+                output_sheet.write(row_index, 1, barcode)
+                
+                if group['normal'] is not None:
+                    # 有正常商品
+                    product = group['normal']['product']
+                    
+                    # 2. 列C(2): 采购量（必填） 使用正常商品的采购量
+                    normal_quantity = group['normal']['quantity']
+                    output_sheet.write(row_index, 2, normal_quantity)
+                    
+                    # 3. 列D(3): 赠送量 - 添加赠品数量
+                    if group['gift_quantity'] > 0:
+                        output_sheet.write(row_index, 3, group['gift_quantity'])
+                        logger.info(f"条码 {barcode} 填充：采购量={normal_quantity}，赠品数量{group['gift_quantity']}")
+                    
+                    # 4. 列E(4): 采购单价（必填）
+                    purchase_price = group['normal']['price']
+                    style = xlwt.XFStyle()
+                    style.num_format_str = '0.0000'
+                    output_sheet.write(row_index, 4, round(purchase_price, 4), style)
+                else:
+                    # 只有赠品，没有正常商品
+                    # 采购量填0，赠送量填赠品数量
+                    output_sheet.write(row_index, 2, 0)  # 采购量为0
+                    output_sheet.write(row_index, 3, group['gift_quantity'])  # 赠送量
+                    output_sheet.write(row_index, 4, 0)  # 单价为0
+                    
+                    logger.info(f"条码 {barcode} 填充：仅有赠品，采购量=0，赠品数量={group['gift_quantity']}")
+                
+                # 移到下一行
+                row_index += 1
+            
+            # 保存文件
+            output_workbook.save(output_file_path)
+            logger.info(f"采购单已保存到: {output_file_path}")
+            return True
+            
+        except Exception as e:
+            logger.error(f"填充模板时出错: {e}")
+            return False
+    
+    def _find_header_row(self, df: pd.DataFrame) -> Optional[int]:
+        """自动识别表头行，委托给 ColumnMapper.detect_header_row"""
+        result = ColumnMapper.detect_header_row(df, max_rows=30)
+        if result >= 0:
+            logger.info(f"找到表头行: 第{result+1}行")
+            return result
+        # 回退：找第一个非空行
+        for row in range(len(df)):
+            if df.iloc[row].notna().sum() > 3:
+                logger.info(f"未找到明确表头，使用第一个有效行: 第{row+1}行")
+                return row
+        logger.warning("无法识别表头行")
+        return None
+    
+    def process_specific_file(self, file_path: str, progress_cb: Optional[Callable[[int], None]] = None) -> Optional[str]:
+        """
+        处理指定的Excel文件
+        
+        Args:
+            file_path: Excel文件路径
+            
+        Returns:
+            输出文件路径，如果处理失败则返回None
+        """
+        logger.info(f"开始处理Excel文件: {file_path}")
+        
+        if not os.path.exists(file_path):
+            logger.error(f"文件不存在: {file_path}")
+            return None
+        
+        try:
+            # 读取Excel文件时不立即指定表头
+            if progress_cb:
+                try:
+                    progress_cb(92)
+                except Exception:
+                    pass
+            df = pd.read_excel(file_path, header=None)
+            logger.info(f"成功读取Excel文件: {file_path}, 共 {len(df)} 行")
+            
+            # 自动识别表头行
+            header_row = self._find_header_row(df)
+            if header_row is None:
+                logger.error("无法识别表头行")
+                return None
+                
+            logger.info(f"识别到表头在第 {header_row+1} 行")
+            
+            # 重新设置表头，避免二次读取
+            if progress_cb:
+                try:
+                    progress_cb(94)
+                except Exception:
+                    pass
+            
+            # 使用识别到的表头行设置列名，并过滤掉表头之前的行
+            df.columns = df.iloc[header_row]
+            df = df.iloc[header_row + 1:].reset_index(drop=True)
+            
+            logger.info(f"重新整理数据结构，共 {len(df)} 行有效数据")
+            
+            # 提取商品信息
+            if progress_cb:
+                try:
+                    progress_cb(96)
+                except Exception:
+                    pass
+            products = self.extract_product_info(df)
+            
+            if not products:
+                logger.warning("未提取到有效商品信息")
+                return None
+            
+            # 生成输出文件名，保存到data/result目录
+            file_name = os.path.splitext(os.path.basename(file_path))[0]
+            result_dir = "data/result"
+            os.makedirs(result_dir, exist_ok=True)
+            output_file = os.path.join(result_dir, f"采购单_{file_name}.xls")
+            
+            # 填充模板并保存
+            if self.fill_template(products, output_file):
+                # 记录已处理文件
+                self.processed_files[file_path] = output_file
+                self._save_processed_files()
+                
+                # 不再自动打开输出目录
+                logger.info(f"采购单已保存到: {output_file}")
+                if progress_cb:
+                    try:
+                        progress_cb(100)
+                    except Exception:
+                        pass
+                
+                return output_file
+            
+            return None
+            
+        except Exception as e:
+            logger.error(f"处理Excel文件时出错: {file_path}, 错误: {e}")
+            return None
+    
+    def process_latest_file(self, progress_cb: Optional[Callable[[int], None]] = None) -> Optional[str]:
+        """
+        处理最新的Excel文件
+        
+        Returns:
+            输出文件路径，如果处理失败则返回None
+        """
+        # 获取最新的Excel文件
+        latest_file = self.get_latest_excel()
+        if not latest_file:
+            logger.warning("未找到可处理的Excel文件")
+            return None
+        
+        # 处理文件
+        return self.process_specific_file(latest_file, progress_cb=progress_cb)
+    
+    def _detect_column_mapping(self, df: pd.DataFrame) -> Dict[str, str]:
+        """
+        自动检测列名映射
+        
+        Args:
+            df: 数据框
+            
+        Returns:
+            列名映射字典，键为标准列名，值为实际列名
+        """
+        # 提取有用的列
+        barcode_cols = self.extract_barcode(df)
+
+        # 如果没有找到条码列，无法继续处理
+        if not barcode_cols:
+            logger.error("未找到条码列，无法处理")
+            return {}
+
+        # 使用 ColumnMapper 统一查找列名
+        mapped_columns = {'barcode': barcode_cols[0]}
+        logger.info(f"使用条码列: {mapped_columns['barcode']}")
+
+        # 内部键名 -> 标准列名映射 (processor.py 使用 price/amount 作为内部键名)
+        field_map = [
+            ('name', 'name'),
+            ('specification', 'specification'),
+            ('quantity', 'quantity'),
+            ('unit', 'unit'),
+            ('price', 'unit_price'),
+            ('amount', 'total_price'),
+        ]
+
+        for internal_key, standard_name in field_map:
+            matched = ColumnMapper.find_column(list(df.columns), standard_name)
+            if matched:
+                mapped_columns[internal_key] = matched
+                logger.info(f"找到{internal_key}列: {matched}")
+
+        return mapped_columns
+    
+    def infer_specification_from_name(self, product_name: str) -> Tuple[Optional[str], Optional[int]]:
+        """
+        从商品名称推断规格
+        根据特定的命名规则匹配规格信息
+        
+        Args:
+            product_name: 商品名称
+            
+        Returns:
+            规格字符串和包装数量的元组
+        """
+        if not product_name or not isinstance(product_name, str):
+            logger.warning(f"无效的商品名: {product_name}")
+            return None, None
+            
+        product_name = product_name.strip()
+        
+        # 特殊处理：重量/容量*数字格式
+        weight_volume_pattern = r'.*?\d+(?:g|ml|毫升|克)[*xX×](\d+)'
+        match = re.search(weight_volume_pattern, product_name)
+        if match:
+            inferred_spec = f"1*{match.group(1)}"
+            inferred_qty = int(match.group(1))
+            logger.info(f"从商品名称提取重量/容量规格: {product_name} -> {inferred_spec}, 包装数量={inferred_qty}")
+            return inferred_spec, inferred_qty
+        
+        # 使用单位转换器推断规格
+        inferred_spec = self.unit_converter.infer_specification_from_name(product_name)
+        if inferred_spec:
+            # 解析规格中的包装数量
+            package_quantity = self.parse_specification(inferred_spec)
+            if package_quantity:
+                logger.info(f"从商品名称推断规格: {product_name} -> {inferred_spec}, 包装数量={package_quantity}")
+                return inferred_spec, package_quantity
+        
+        # 特定商品规则匹配
+        spec_rules = [
+            # XX入白膜格式，如"550纯净水24入白膜"
+            (r'.*?(\d+)入白膜', lambda m: (f"1*{m.group(1)}", int(m.group(1)))),
+            
+            # 白膜格式，如"550水24白膜"
+            (r'.*?(\d+)白膜', lambda m: (f"1*{m.group(1)}", int(m.group(1)))),
+            
+            # 445水溶C系列
+            (r'445水溶C.*?(\d+)[入个]纸箱', lambda m: (f"1*{m.group(1)}", int(m.group(1)))),
+            
+            # 东方树叶系列
+            (r'东方树叶.*?(\d+\*\d+).*纸箱', lambda m: (m.group(1), int(m.group(1).split('*')[1]))),
+            
+            # 桶装
+            (r'(\d+\.?\d*L)桶装', lambda m: (f"{m.group(1)}*1", 1)),
+            
+            # 树叶茶系
+            (r'树叶.*?(\d+)[入个]纸箱', lambda m: (f"1*{m.group(1)}", int(m.group(1)))),
+            
+            # 茶π系列
+            (r'茶[πΠπ].*?(\d+)纸箱', lambda m: (f"1*{m.group(1)}", int(m.group(1)))),
+            
+            # 通用入数匹配
+            (r'.*?(\d+)[入个](?:纸箱|箱装|白膜)', lambda m: (f"1*{m.group(1)}", int(m.group(1)))),
+            
+            # 通用数字+纸箱格式
+            (r'.*?(\d+)纸箱', lambda m: (f"1*{m.group(1)}", int(m.group(1))))
+        ]
+        
+        # 尝试所有规则
+        for pattern, formatter in spec_rules:
+            match = re.search(pattern, product_name)
+            if match:
+                spec, qty = formatter(match)
+                logger.info(f"根据特定规则推断规格: {product_name} -> {spec}, 包装数量={qty}")
+                return spec, qty
+        
+        # 尝试直接从名称中提取数字*数字格式
+        match = re.search(r'(\d+\*\d+)', product_name)
+        if match:
+            spec = match.group(1)
+            package_quantity = self.parse_specification(spec)
+            if package_quantity:
+                logger.info(f"从名称中直接提取规格: {spec}, 包装数量={package_quantity}")
+                return spec, package_quantity
+        
+        # 最后尝试提取任何位置的数字，默认典型件装数
+        numbers = re.findall(r'\d+', product_name)
+        if numbers:
+            for num in numbers:
+                # 检查是否为典型的件装数(12/15/24/30)
+                if num in ['12', '15', '24', '30']:
+                    spec = f"1*{num}"
+                    logger.info(f"从名称中提取可能的件装数: {spec}, 包装数量={int(num)}")
+                    return spec, int(num)
+            
+        logger.warning(f"无法从商品名'{product_name}' 推断规格")
+        return None, None 
+    
+    def parse_specification(self, spec_str: str) -> Optional[int]:
+        """
+        Extract the package quantity from a specification string.
+
+        The string is cleaned, a few OCR misreads are repaired and the
+        multiplication signs are unified to ``*``. A fixed sequence of patterns
+        is then tried and the first one that matches decides the result, so
+        the order of the checks below matters.
+
+        Examples of shapes handled: ``1*15``, ``1x15``, ``1*5*10``, ``5kg*6``,
+        ``450g*15``, ``24瓶/件``, ``4L``, ``12瓶装``.
+
+        Args:
+            spec_str: Specification string.
+
+        Returns:
+            The package quantity, or None when the input is empty, not a
+            string, contains no number, or an error occurs while parsing
+            (errors are logged as warnings, not raised).
+        """
+        if not spec_str or not isinstance(spec_str, str):
+            return None
+        
+        try:
+            # Normalise the raw text (clean_string is defined outside this block).
+            spec_str = clean_string(spec_str)
+            
+            # Repair likely OCR misreads.
+            # An i/I/l/L at a word boundary that is directly followed by digits
+            # becomes "1", e.g. "l5" -> "15". A letter followed by another
+            # letter (such as "IL") is not touched by this pattern.
+            spec_str = re.sub(r'(\b|^)[iIlL](\d+)', r'1\2', spec_str)
+            # Digits followed by two or more o/O/0 become those digits plus
+            # exactly "00", e.g. "6oo" -> "600".
+            spec_str = re.sub(r'(\d+)[oO0]{2,}', lambda m: m.group(1) + '00', spec_str)
+            # Unify every multiplication sign to "*". Every "x"/"X" character
+            # is replaced, not only those between numbers.
+            spec_str = spec_str.replace('×', '*').replace('x', '*').replace('X', '*')
+            
+            logger.debug(f"清理后的规格字符串: {spec_str}")
+            
+            # Equation form such as "1件=12桶": the count on the right-hand
+            # side is the package quantity.
+            eq_match = re.search(r'(\d+(?:\.\d+)?)\s*(?:件|箱|提|盒)\s*[=＝]\s*(\d+)\s*(?:瓶|桶|盒|支|个|袋|罐|包|卷)', spec_str)
+            if eq_match:
+                return int(eq_match.group(2))
+
+            # Kilogram weight times count, e.g. "5kg*6" -> 6.
+            weight_pattern = r'(\d+(?:\.\d+)?)\s*(?:kg|KG|千克|公斤)[*×](\d+)'
+            match = re.search(weight_pattern, spec_str)
+            if match:
+                return int(match.group(2))
+            
+            # Gram / millilitre amount times count, e.g. "450g*15" or "450ml*15" -> 15.
+            match = re.search(r'\d+(?:\.\d+)?(?:g|G|ml|ML|mL|毫升|克)[*×](\d+)', spec_str)
+            if match:
+                return int(match.group(1))
+            
+            # Three-level spec such as "1*5*10": the third number is used.
+            match = re.search(r'(\d+(?:\.\d+)?)[*×](\d+(?:\.\d+)?)[*×](\d+(?:\.\d+)?)', spec_str)
+            if match:
+                # int(float(...)) truncates a decimal count.
+                return int(float(match.group(3)))
+            
+            # Two-level spec such as "1*15": the second number is used.
+            match = re.search(r'(\d+(?:\.\d+)?)[*×](\d+(?:\.\d+)?)', spec_str)
+            if match:
+                return int(float(match.group(2)))
+                
+            # "<n><unit>/件" or "<n><unit>/箱", e.g. "24瓶/件" -> 24.
+            match = re.search(r'(\d+(?:\.\d+)?)[瓶个支袋][/／](件|箱)', spec_str)
+            if match:
+                return int(float(match.group(1)))
+                
+            # Litre volume, e.g. "4L", optionally followed by a count.
+            match = re.search(r'(\d+(?:\.\d+)?)\s*[Ll升][*×]?(\d+(?:\.\d+)?)?', spec_str)
+            if match:
+                # Use the trailing count when present; a bare volume is a pack of 1.
+                return int(float(match.group(2))) if match.group(2) else 1
+            
+            # A number directly followed by a unit character, e.g. "12瓶装" -> 12.
+            match = re.search(r'(\d+(?:\.\d+)?)[瓶个支袋包盒罐箱](?:装|\/箱)?', spec_str)
+            if match:
+                return int(float(match.group(1)))
+            
+            # Fallback: look at every number in the string.
+            numbers = re.findall(r'\d+(?:\.\d+)?', spec_str)
+            if numbers and len(numbers) > 0:
+                # A single number is taken as the package quantity.
+                if len(numbers) == 1:
+                    return int(float(numbers[0]))
+                
+                # With several numbers, prefer the first one equal to 6, 12, 24 or 30.
+                for num in numbers:
+                    if float(num) in [6.0, 12.0, 24.0, 30.0]:
+                        return int(float(num))
+                
+                # Otherwise fall back to the last number.
+                return int(float(numbers[-1]))
+                
+        except Exception as e:
+            # Parsing is best-effort: log the problem and fall through to None.
+            logger.warning(f"解析规格'{spec_str}'时出错: {e}")
+            
+        return None
@@ -0,0 +1,259 @@
+"""
+数据验证器模块
+----------
+提供对商品数据的验证和修复功能
+"""
+
+import re
+import logging
+from typing import Dict, Any, Optional, List, Tuple, Union
+
+from ..utils.log_utils import get_logger
+from ..utils.string_utils import parse_monetary_string
+
+logger = get_logger(__name__)
+
+
+class ProductValidator:
+    """
+    商品数据验证器：验证和修复商品数据
+    """
+    
+    def __init__(self):
+        """
+        初始化商品数据验证器
+        """
+        # 仓库标识列表
+        self.warehouse_identifiers = ["仓库", "仓库全名", "warehouse"]
+        
+    def validate_barcode(self, barcode: Any) -> Tuple[bool, str, Optional[str]]:
+        """
+        验证并修复条码
+        
+        Args:
+            barcode: 原始条码值
+            
+        Returns:
+            (是否有效, 修复后的条码, 错误信息)元组
+        """
+        error_message = None
+        
+        # 处理空值
+        if barcode is None:
+            return False, "", "条码为空"
+            
+        # 转为字符串
+        barcode_str = str(barcode).strip()
+        
+        # 处理"仓库"特殊情况
+        if barcode_str in self.warehouse_identifiers:
+            return False, barcode_str, "条码为仓库标识"
+            
+        # 清理条码格式（移除非数字字符）
+        barcode_clean = re.sub(r'\D', '', barcode_str)
+        
+        # 如果清理后为空，无效
+        if not barcode_clean:
+            return False, barcode_str, "条码不包含数字"
+            
+        # 对特定的错误条码进行修正（5开头改为6开头）
+        if len(barcode_clean) > 8 and barcode_clean.startswith('5') and not barcode_clean.startswith('53'):
+            original_barcode = barcode_clean
+            barcode_clean = '6' + barcode_clean[1:]
+            logger.info(f"修正条码前缀 5->6: {original_barcode} -> {barcode_clean}")
+        
+        # 新增：处理14位条码，如果多余长度都是0，截断为13位
+        if len(barcode_clean) > 13:
+            original_length = len(barcode_clean)
+            # 检查多余部分是否都是0
+            if barcode_clean.endswith('0'):
+                # 从末尾开始移除0，直到条码长度为13位或不再以0结尾
+                while len(barcode_clean) > 13 and barcode_clean.endswith('0'):
+                    barcode_clean = barcode_clean[:-1]
+                logger.info(f"修正条码长度: 从{original_length}位截断到{len(barcode_clean)}位")
+            else:
+                error_message = f"条码长度异常: {barcode_clean}, 长度={len(barcode_clean)}"
+                logger.warning(error_message)
+                return False, barcode_clean, error_message
+            
+        # 验证条码长度
+        if len(barcode_clean) < 8 or len(barcode_clean) > 13:
+            error_message = f"条码长度异常: {barcode_clean}, 长度={len(barcode_clean)}"
+            logger.warning(error_message)
+            return False, barcode_clean, error_message
+            
+        # 验证条码是否全为数字
+        if not barcode_clean.isdigit():
+            error_message = f"条码包含非数字字符: {barcode_clean}"
+            logger.warning(error_message)
+            return False, barcode_clean, error_message
+            
+        # 对于序号9的特殊情况，允许其条码格式
+        if barcode_clean == "5321545613":
+            logger.info(f"特殊条码验证通过: {barcode_clean}")
+            return True, barcode_clean, None
+            
+        logger.debug(f"条码验证通过: {barcode_clean}")
+        return True, barcode_clean, None
+        
+    def validate_quantity(self, quantity: Any) -> Tuple[bool, float, Optional[str]]:
+        """
+        验证并修复数量
+        
+        Args:
+            quantity: 原始数量值
+            
+        Returns:
+            (是否有效, 修复后的数量, 错误信息)元组
+        """
+        # 处理空值
+        if quantity is None:
+            return False, 0.0, "数量为空"
+            
+        # 如果是字符串，尝试解析
+        if isinstance(quantity, str):
+            # 去除空白和非数字字符（保留小数点）
+            quantity_clean = re.sub(r'[^\d\.]', '', quantity.strip())
+            if not quantity_clean:
+                return False, 0.0, "数量不包含数字"
+                
+            try:
+                quantity_value = float(quantity_clean)
+            except ValueError:
+                return False, 0.0, f"无法将数量 '{quantity}' 转换为数字"
+        else:
+            # 尝试直接转换
+            try:
+                quantity_value = float(quantity)
+            except (ValueError, TypeError):
+                return False, 0.0, f"无法将数量 '{quantity}' 转换为数字"
+        
+        # 数量必须大于0
+        if quantity_value <= 0:
+            return False, 0.0, f"数量必须大于0，当前值: {quantity_value}"
+            
+        return True, quantity_value, None
+        
+    def validate_price(self, price: Any) -> Tuple[bool, float, bool, Optional[str]]:
+        """
+        验证并修复单价
+        
+        Args:
+            price: 原始单价值
+            
+        Returns:
+            (是否有效, 修复后的单价, 是否为赠品, 错误信息)元组
+        """
+        # 初始化不是赠品
+        is_gift = False
+        
+        # 处理空值
+        if price is None:
+            return False, 0.0, True, "单价为空，视为赠品"
+            
+        # 如果是字符串，检查赠品标识
+        if isinstance(price, str):
+            price_str = price.strip().lower()
+            if price_str in ["赠品", "gift", "赠送", "0", ""]:
+                return True, 0.0, True, None
+                
+            price_value = parse_monetary_string(price_str)
+            if price_value is None:
+                return False, 0.0, True, f"无法将单价 '{price}' 转换为数字，视为赠品"
+        else:
+            # 尝试直接转换
+            try:
+                price_value = float(price)
+            except (ValueError, TypeError):
+                return False, 0.0, True, f"无法将单价 '{price}' 转换为数字，视为赠品"
+        
+        # 单价为0视为赠品
+        if price_value == 0:
+            return True, 0.0, True, None
+            
+        # 单价必须大于0
+        if price_value < 0:
+            return False, 0.0, True, f"单价不能为负数: {price_value}，视为赠品"
+            
+        return True, price_value, False, None
+        
+    def validate_product(self, product: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        验证并修复商品数据
+        
+        Args:
+            product: 商品数据字典
+            
+        Returns:
+            修复后的商品数据字典
+        """
+        # 创建新字典，避免修改原始数据
+        validated_product = product.copy()
+        
+        # 验证条码
+        barcode = product.get('barcode', '')
+        is_valid, fixed_barcode, error_msg = self.validate_barcode(barcode)
+        if is_valid:
+            validated_product['barcode'] = fixed_barcode
+        else:
+            logger.warning(f"条码验证失败: {error_msg}")
+            if fixed_barcode:
+                # 即使验证失败，但如果有修复后的条码仍然使用它
+                validated_product['barcode'] = fixed_barcode
+        
+        # 验证单价
+        price = product.get('price', 0)
+        is_valid, fixed_price, is_gift, error_msg = self.validate_price(price)
+        validated_product['price'] = fixed_price
+        
+        # 如果单价验证结果表示为赠品，更新赠品标识
+        if is_gift:
+            validated_product['is_gift'] = True
+            if error_msg:
+                logger.info(error_msg)
+
+        amount = product.get('amount', None)
+        try:
+            is_amount_gift = False
+            parsed_amount = parse_monetary_string(amount)
+            if parsed_amount is None or parsed_amount == 0.0:
+                is_amount_gift = True
+            if is_amount_gift:
+                validated_product['is_gift'] = True
+        except Exception:
+            pass
+        
+        # 验证数量
+        quantity = product.get('quantity', None)
+        is_valid, fixed_quantity, error_msg = self.validate_quantity(quantity)
+        
+        # 检查数量是否为空，但单价和金额存在的情况
+        if not is_valid and error_msg == "数量为空":
+            # 获取金额
+            amount = product.get('amount', None)
+            
+            # 如果单价有效且金额存在，则可以计算数量
+            if fixed_price > 0 and amount is not None:
+                try:
+                    # 确保金额是数字
+                    amount = parse_monetary_string(amount)
+                    if amount is None:
+                        raise ValueError("无法解析金额")
+                    
+                    # 计算数量 = 金额 / 单价
+                    if amount > 0:
+                        calculated_quantity = amount / fixed_price
+                        logger.info(f"数量为空，通过金额({amount})和单价({fixed_price})计算得出数量: {calculated_quantity}")
+                        validated_product['quantity'] = calculated_quantity
+                        is_valid = True
+                except (ValueError, TypeError, ZeroDivisionError) as e:
+                    logger.warning(f"通过金额和单价计算数量失败: {e}")
+        
+        # 如果数量验证有效或通过金额计算成功
+        if is_valid:
+            validated_product['quantity'] = fixed_quantity if is_valid and fixed_quantity > 0 else validated_product.get('quantity', 0)
+        else:
+            logger.warning(f"数量验证失败: {error_msg}")
+            validated_product['quantity'] = 0.0
+        
+        return validated_product
@@ -0,0 +1,9 @@
+"""
+数据处理handlers模块初始化文件
+"""
+
+from .data_cleaner import DataCleaner
+from .column_mapper import ColumnMapper
+from .calculator import DataCalculator
+
+__all__ = ['DataCleaner', 'ColumnMapper', 'DataCalculator']
@@ -0,0 +1,378 @@
+"""
+数据计算处理器
+
+提供各种数据计算功能，如数量计算、价格计算、汇总统计等
+"""
+
+import pandas as pd
+import numpy as np
+from typing import Dict, Any, Optional, List, Union
+from ...core.utils.log_utils import get_logger
+
+logger = get_logger(__name__)
+
+
+class DataCalculator:
+    """数据计算处理器
+    
+    提供标准化的数据计算功能，支持各种业务计算规则
+    """
+    
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """初始化数据计算器
+        
+        Args:
+            config: 计算配置
+        """
+        self.config = config or {}
+        self.calculation_rules = []
+    
+    def add_rule(self, rule_type: str, **kwargs):
+        """添加计算规则
+        
+        Args:
+            rule_type: 规则类型
+            **kwargs: 规则参数
+        """
+        rule = {'type': rule_type, **kwargs}
+        self.calculation_rules.append(rule)
+        logger.debug(f"添加计算规则: {rule_type}")
+    
+    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
+        """执行数据计算
+        
+        Args:
+            df: 输入数据
+            
+        Returns:
+            计算后的数据
+        """
+        logger.info(f"开始数据计算，原始数据形状: {df.shape}")
+        
+        result_df = df.copy()
+        
+        for i, rule in enumerate(self.calculation_rules):
+            try:
+                logger.debug(f"执行计算规则 {i+1}/{len(self.calculation_rules)}: {rule['type']}")
+                result_df = self._apply_rule(result_df, rule)
+                logger.debug(f"规则执行完成，数据形状: {result_df.shape}")
+            except Exception as e:
+                logger.error(f"计算规则执行失败: {rule}, 错误: {e}")
+                # 继续执行下一个规则，而不是中断整个流程
+                continue
+        
+        logger.info(f"数据计算完成，最终数据形状: {result_df.shape}")
+        return result_df
+    
+    def _apply_rule(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
+        """应用单个计算规则
+        
+        Args:
+            df: 数据
+            rule: 规则配置
+            
+        Returns:
+            处理后的数据
+        """
+        rule_type = rule.get('type')
+        
+        if rule_type == 'multiply':
+            return self._multiply(df, rule)
+        elif rule_type == 'divide':
+            return self._divide(df, rule)
+        elif rule_type == 'add':
+            return self._add(df, rule)
+        elif rule_type == 'subtract':
+            return self._subtract(df, rule)
+        elif rule_type == 'formula':
+            return self._formula(df, rule)
+        elif rule_type == 'round':
+            return self._round(df, rule)
+        elif rule_type == 'sum':
+            return self._sum(df, rule)
+        elif rule_type == 'aggregate':
+            return self._aggregate(df, rule)
+        else:
+            logger.warning(f"未知的计算规则类型: {rule_type}")
+            return df
+    
+    def _multiply(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
+        """乘法计算
+        
+        Args:
+            df: 数据
+            rule: 规则配置
+            
+        Returns:
+            处理后的数据
+        """
+        source_column = rule.get('source_column')
+        target_column = rule.get('target_column')
+        factor = rule.get('factor', 1)
+        
+        if source_column and target_column:
+            if source_column in df.columns:
+                df[target_column] = df[source_column] * factor
+                logger.debug(f"乘法计算: {source_column} * {factor} -> {target_column}")
+            else:
+                logger.warning(f"源列不存在: {source_column}")
+        
+        return df
+    
+    def _divide(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
+        """除法计算
+        
+        Args:
+            df: 数据
+            rule: 规则配置
+            
+        Returns:
+            处理后的数据
+        """
+        source_column = rule.get('source_column')
+        target_column = rule.get('target_column')
+        divisor = rule.get('divisor', 1)
+        
+        if source_column and target_column and divisor != 0:
+            if source_column in df.columns:
+                df[target_column] = df[source_column] / divisor
+                logger.debug(f"除法计算: {source_column} / {divisor} -> {target_column}")
+            else:
+                logger.warning(f"源列不存在: {source_column}")
+        elif divisor == 0:
+            logger.error("除数不能为0")
+        
+        return df
+    
+    def _add(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
+        """加法计算
+        
+        Args:
+            df: 数据
+            rule: 规则配置
+            
+        Returns:
+            处理后的数据
+        """
+        columns = rule.get('columns', [])
+        target_column = rule.get('target_column')
+        constant = rule.get('constant', 0)
+        
+        if target_column:
+            if isinstance(columns, str):
+                columns = [columns]
+            
+            if columns:
+                # 列相加
+                valid_columns = [col for col in columns if col in df.columns]
+                if valid_columns:
+                    df[target_column] = df[valid_columns].sum(axis=1) + constant
+                    logger.debug(f"加法计算: {valid_columns} + {constant} -> {target_column}")
+                else:
+                    logger.warning(f"没有有效的列用于加法计算: {columns}")
+            else:
+                # 只加常数
+                if target_column in df.columns:
+                    df[target_column] = df[target_column] + constant
+                    logger.debug(f"加法计算: {target_column} + {constant}")
+                else:
+                    logger.warning(f"目标列不存在: {target_column}")
+        
+        return df
+    
+    def _subtract(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
+        """减法计算
+        
+        Args:
+            df: 数据
+            rule: 规则配置
+            
+        Returns:
+            处理后的数据
+        """
+        minuend = rule.get('minuend')  # 被减数列
+        subtrahend = rule.get('subtrahend')  # 减数列
+        target_column = rule.get('target_column')
+        constant = rule.get('constant', 0)
+        
+        if target_column and minuend and minuend in df.columns:
+            if subtrahend and subtrahend in df.columns:
+                df[target_column] = df[minuend] - df[subtrahend] - constant
+                logger.debug(f"减法计算: {minuend} - {subtrahend} - {constant} -> {target_column}")
+            else:
+                df[target_column] = df[minuend] - constant
+                logger.debug(f"减法计算: {minuend} - {constant} -> {target_column}")
+        else:
+            logger.warning(f"减法计算参数不完整或列不存在")
+        
+        return df
+    
+    def _formula(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
+        """公式计算
+        
+        Args:
+            df: 数据
+            rule: 规则配置
+            
+        Returns:
+            处理后的数据
+        """
+        formula = rule.get('formula')
+        target_column = rule.get('target_column')
+        
+        if formula and target_column:
+            try:
+                df[target_column] = df.eval(formula)
+                logger.debug(f"公式计算: {formula} -> {target_column}")
+            except Exception as e:
+                logger.error(f"公式计算失败: {formula}, 错误: {e}")
+        else:
+            logger.warning("公式计算缺少公式或目标列")
+        
+        return df
+    
+    def _round(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
+        """四舍五入
+        
+        Args:
+            df: 数据
+            rule: 规则配置
+            
+        Returns:
+            处理后的数据
+        """
+        columns = rule.get('columns', [])
+        decimals = rule.get('decimals', 0)
+        
+        if isinstance(columns, str):
+            columns = [columns]
+        
+        target_columns = columns or df.select_dtypes(include=[np.number]).columns
+        
+        for col in target_columns:
+            if col in df.columns and pd.api.types.is_numeric_dtype(df[col]):
+                df[col] = df[col].round(decimals)
+                logger.debug(f"四舍五入: {col} 保留 {decimals} 位小数")
+        
+        return df
+    
+    def _sum(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
+        """求和计算
+        
+        Args:
+            df: 数据
+            rule: 规则配置
+            
+        Returns:
+            处理后的数据
+        """
+        columns = rule.get('columns', [])
+        target_column = rule.get('target_column')
+        group_by = rule.get('group_by')
+        
+        if isinstance(columns, str):
+            columns = [columns]
+        
+        if group_by and group_by in df.columns:
+            # 分组求和
+            if columns:
+                for col in columns:
+                    if col in df.columns:
+                        sum_result = df.groupby(group_by)[col].sum()
+                        logger.debug(f"分组求和: {col} 按 {group_by} 分组")
+            else:
+                # 所有数值列分组求和
+                numeric_columns = df.select_dtypes(include=[np.number]).columns
+                sum_result = df.groupby(group_by)[numeric_columns].sum()
+                logger.debug(f"分组求和: 所有数值列 按 {group_by} 分组")
+        else:
+            # 总体求和
+            if columns:
+                valid_columns = [col for col in columns if col in df.columns]
+                if valid_columns and target_column:
+                    df[target_column] = df[valid_columns].sum(axis=1)
+                    logger.debug(f"求和计算: {valid_columns} -> {target_column}")
+            else:
+                # 所有数值列求和
+                numeric_columns = df.select_dtypes(include=[np.number]).columns
+                if target_column and len(numeric_columns) > 0:
+                    df[target_column] = df[numeric_columns].sum(axis=1)
+                    logger.debug(f"求和计算: {list(numeric_columns)} -> {target_column}")
+        
+        return df
+    
+    def _aggregate(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
+        """聚合计算
+        
+        Args:
+            df: 数据
+            rule: 规则配置
+            
+        Returns:
+            处理后的数据
+        """
+        group_by = rule.get('group_by')
+        aggregations = rule.get('aggregations', {})
+        
+        if group_by and group_by in df.columns:
+            # 构建聚合函数字典
+            agg_dict = {}
+            for column, func in aggregations.items():
+                if column in df.columns:
+                    if isinstance(func, str):
+                        agg_dict[column] = func
+                    elif isinstance(func, list):
+                        agg_dict[column] = func
+            
+            if agg_dict:
+                result = df.groupby(group_by).agg(agg_dict)
+                logger.debug(f"聚合计算: 按 {group_by} 分组, 聚合: {agg_dict}")
+                return result.reset_index()
+        
+        return df
+    
+    # 便捷方法
+    def multiply(self, source_column: str, target_column: str, factor: float):
+        """乘法计算"""
+        self.add_rule('multiply', source_column=source_column, 
+                     target_column=target_column, factor=factor)
+        return self
+    
+    def divide(self, source_column: str, target_column: str, divisor: float):
+        """除法计算"""
+        self.add_rule('divide', source_column=source_column, 
+                     target_column=target_column, divisor=divisor)
+        return self
+    
+    def add(self, columns: Union[str, List[str]], target_column: str, constant: float = 0):
+        """加法计算"""
+        self.add_rule('add', columns=columns, target_column=target_column, constant=constant)
+        return self
+    
+    def subtract(self, minuend: str, target_column: str, 
+                  subtrahend: Optional[str] = None, constant: float = 0):
+        """减法计算"""
+        self.add_rule('subtract', minuend=minuend, target_column=target_column,
+                     subtrahend=subtrahend, constant=constant)
+        return self
+    
+    def formula(self, formula: str, target_column: str):
+        """公式计算"""
+        self.add_rule('formula', formula=formula, target_column=target_column)
+        return self
+    
+    def round_columns(self, columns: Optional[Union[str, List[str]]] = None, decimals: int = 0):
+        """四舍五入"""
+        self.add_rule('round', columns=columns, decimals=decimals)
+        return self
+    
+    def sum_columns(self, columns: Optional[Union[str, List[str]]] = None, 
+                   target_column: Optional[str] = None, group_by: Optional[str] = None):
+        """求和计算"""
+        self.add_rule('sum', columns=columns, target_column=target_column, group_by=group_by)
+        return self
+    
+    def aggregate(self, group_by: str, aggregations: Dict[str, Union[str, List[str]]]):
+        """聚合计算"""
+        self.add_rule('aggregate', group_by=group_by, aggregations=aggregations)
+        return self
@@ -0,0 +1,382 @@
+"""
+列映射处理器
+
+提供列名映射和转换功能，支持不同供应商的列名标准化
+"""
+
+import re
+import pandas as pd
+from typing import Dict, Any, Optional, List, Union
+from ...core.utils.log_utils import get_logger
+
+logger = get_logger(__name__)
+
+
+class ColumnMapper:
+    """列映射处理器
+    
+    提供列名标准化功能，将不同供应商的列名映射到标准列名
+    """
+    
+    # 标准列名定义（所有列名别名的唯一来源）
+    STANDARD_COLUMNS = {
+        'barcode': [
+            '条码', '条形码', '商品条码', '商品条形码', '产品条码', '商品编码',
+            '商品编号', '条码（必填）', '电脑条码', '条码ID',
+            'barcode', 'Barcode', 'BarCode', 'code', '编码',
+        ],
+        'name': [
+            '商品名称', '产品名称', '名称', '商品', '产品', '商品名', '品名',
+            '品项名', '商品或服务名称', '品项', '名 称',
+            'name', 'product_name',
+        ],
+        'specification': [
+            '规格', '规格型号', '型号', '商品规格', '产品规格', '包装规格', '规 格',
+            'specification', 'spec', 'model',
+        ],
+        'quantity': [
+            '数量', '采购量', '订货数量', '订单量', '需求量', '采购数量', '购买数量',
+            '订单数量', '数量（必填）', '采购量（必填）', '入库数', '入库数量', '数 量',
+            'quantity', 'qty',
+        ],
+        'unit': [
+            '单位', '计量单位', '采购单位', '单位（必填）', '单位名称', '计价单位', '单 位',
+            'unit', 'units',
+        ],
+        'unit_price': [
+            '单价', '价格', '采购单价', '进货价', '销售价', '采购价', '参考价',
+            '入库单价', '单价（必填）', '采购单价（必填）', '价格（必填）', '单 价',
+            'unit_price', 'price',
+        ],
+        'total_price': [
+            '总价', '金额', '小计', '合计金额', '小计金额', '金额（元）',
+            '金额合计', '合计', '总额',
+            'total_price', 'total', 'amount',
+        ],
+        'gift_quantity': [
+            '赠送量', '赠品数量', '赠送数量', '赠品',
+        ],
+        'category': ['类别', '分类', '商品类别', 'category', 'type'],
+        'brand': ['品牌', '商标', 'brand'],
+        'supplier': ['供应商', '供货商', 'supplier', 'vendor'],
+    }
+    
+    def __init__(self, mapping_config: Optional[Dict[str, Any]] = None):
+        """初始化列映射器
+        
+        Args:
+            mapping_config: 映射配置
+        """
+        self.mapping_config = mapping_config or {}
+        self.custom_mappings = {}
+        self._build_reverse_mapping()
+    
+    def _build_reverse_mapping(self):
+        """构建反向映射表"""
+        self.reverse_mapping = {}
+        
+        # 添加标准列的反向映射
+        for standard_name, variations in self.STANDARD_COLUMNS.items():
+            for variation in variations:
+                self.reverse_mapping[variation.lower()] = standard_name
+        
+        # 添加自定义映射
+        for standard_name, custom_names in self.mapping_config.items():
+            if isinstance(custom_names, str):
+                custom_names = [custom_names]
+            
+            for custom_name in custom_names:
+                self.reverse_mapping[custom_name.lower()] = standard_name
+                self.custom_mappings[custom_name.lower()] = standard_name
+    
+    def map_columns(self, df: pd.DataFrame, target_columns: Optional[List[str]] = None) -> pd.DataFrame:
+        """映射列名
+        
+        Args:
+            df: 输入数据
+            target_columns: 目标列名列表，如果为None则使用所有标准列
+            
+        Returns:
+            列名映射后的数据
+        """
+        if target_columns is None:
+            target_columns = list(self.STANDARD_COLUMNS.keys())
+        
+        logger.info(f"开始列名映射，目标列: {target_columns}")
+        logger.info(f"原始列名: {list(df.columns)}")
+        
+        # 创建列名映射
+        column_mapping = {}
+        used_columns = set()
+        
+        for target_col in target_columns:
+            # 查找匹配的原始列名
+            matched_column = self._find_matching_column(df.columns, target_col)
+            if matched_column:
+                column_mapping[matched_column] = target_col
+                used_columns.add(matched_column)
+                logger.debug(f"列名映射: {matched_column} -> {target_col}")
+        
+        # 重命名列
+        if column_mapping:
+            df_mapped = df.rename(columns=column_mapping)
+            
+            # 添加缺失的目标列
+            for target_col in target_columns:
+                if target_col not in df_mapped.columns:
+                    df_mapped[target_col] = self._get_default_value(target_col)
+                    logger.debug(f"添加缺失列: {target_col}")
+            
+            # 只保留目标列
+            existing_target_columns = [col for col in target_columns if col in df_mapped.columns]
+            df_result = df_mapped[existing_target_columns]
+            
+            logger.info(f"列名映射完成，结果列名: {list(df_result.columns)}")
+            return df_result
+        else:
+            logger.warning("没有找到可映射的列名")
+            return df
+    
+    def _find_matching_column(self, columns: List[str], target_column: str) -> Optional[str]:
+        """查找匹配的列名
+        
+        Args:
+            columns: 原始列名列表
+            target_column: 目标标准列名
+            
+        Returns:
+            匹配的原始列名或None
+        """
+        # 获取目标列的所有可能变体
+        possible_names = []
+        
+        # 标准列名变体
+        if target_column in self.STANDARD_COLUMNS:
+            possible_names.extend(self.STANDARD_COLUMNS[target_column])
+        
+        # 自定义映射
+        for standard_name, custom_names in self.mapping_config.items():
+            if standard_name == target_column:
+                if isinstance(custom_names, str):
+                    possible_names.append(custom_names)
+                else:
+                    possible_names.extend(custom_names)
+        
+        # 查找匹配
+        for possible_name in possible_names:
+            # 精确匹配（忽略大小写）
+            for column in columns:
+                if column.lower() == possible_name.lower():
+                    return column
+            
+            # 模糊匹配
+            for column in columns:
+                if possible_name.lower() in column.lower() or column.lower() in possible_name.lower():
+                    return column
+        
+        return None
+    
+    def _get_default_value(self, column_name: str) -> Any:
+        """获取列的默认值
+        
+        Args:
+            column_name: 列名
+            
+        Returns:
+            默认值
+        """
+        # 根据列名类型返回合适的默认值
+        if column_name in ['quantity', 'unit_price', 'total_price']:
+            return 0
+        elif column_name in ['barcode', 'name', 'specification', 'unit', 'category', 'brand', 'supplier']:
+            return ''
+        else:
+            return None
+    
+    def add_custom_mapping(self, standard_name: str, custom_names: Union[str, List[str]]):
+        """添加自定义列名映射
+        
+        Args:
+            standard_name: 标准列名
+            custom_names: 自定义列名或列名列表
+        """
+        if isinstance(custom_names, str):
+            custom_names = [custom_names]
+        
+        # 更新配置
+        self.mapping_config[standard_name] = custom_names
+        
+        # 更新反向映射
+        for custom_name in custom_names:
+            self.reverse_mapping[custom_name.lower()] = standard_name
+            self.custom_mappings[custom_name.lower()] = standard_name
+        
+        logger.info(f"添加自定义映射: {standard_name} <- {custom_names}")
+    
+    def detect_column_types(self, df: pd.DataFrame) -> Dict[str, str]:
+        """检测列的数据类型
+        
+        Args:
+            df: 数据
+            
+        Returns:
+            列类型字典
+        """
+        column_types = {}
+        
+        for column in df.columns:
+            if pd.api.types.is_numeric_dtype(df[column]):
+                column_types[column] = 'numeric'
+            elif pd.api.types.is_datetime64_any_dtype(df[column]):
+                column_types[column] = 'datetime'
+            elif pd.api.types.is_bool_dtype(df[column]):
+                column_types[column] = 'boolean'
+            else:
+                column_types[column] = 'text'
+        
+        return column_types
+    
+    def suggest_column_mapping(self, df: pd.DataFrame) -> Dict[str, List[str]]:
+        """建议列名映射
+        
+        Args:
+            df: 数据
+            
+        Returns:
+            建议的映射关系
+        """
+        suggestions = {}
+        
+        for column in df.columns:
+            column_lower = column.lower()
+            suggestions[column] = []
+            
+            # 检查标准列名
+            for standard_name, variations in self.STANDARD_COLUMNS.items():
+                for variation in variations:
+                    if column_lower in variation.lower() or variation.lower() in column_lower:
+                        suggestions[column].append(standard_name)
+            
+            # 检查自定义映射
+            for custom_name, standard_name in self.custom_mappings.items():
+                if column_lower in custom_name or custom_name in column_lower:
+                    suggestions[column].append(standard_name)
+            
+            # 去重
+            suggestions[column] = list(set(suggestions[column]))
+        
+        # 只返回有建议的列
+        return {k: v for k, v in suggestions.items() if v}
+    
+    def validate_mapping(self, df: pd.DataFrame, required_columns: List[str]) -> Dict[str, Any]:
+        """验证列映射结果
+        
+        Args:
+            df: 映射后的数据
+            required_columns: 必需的列名列表
+            
+        Returns:
+            验证结果
+        """
+        result = {
+            'valid': True,
+            'missing_columns': [],
+            'empty_columns': [],
+            'warnings': []
+        }
+        
+        # 检查缺失列
+        for col in required_columns:
+            if col not in df.columns:
+                result['missing_columns'].append(col)
+                result['valid'] = False
+        
+        # 检查空列
+        for col in df.columns:
+            if df[col].isnull().all():
+                result['empty_columns'].append(col)
+                result['warnings'].append(f"列 '{col}' 全部为空值")
+        
+        # 检查数值列
+        numeric_columns = ['quantity', 'unit_price', 'total_price']
+        for col in numeric_columns:
+            if col in df.columns and not pd.api.types.is_numeric_dtype(df[col]):
+                result['warnings'].append(f"列 '{col}' 不是数值类型")
+        
+        return result
+
+    @classmethod
+    def find_column(cls, columns: List[str], standard_name: str) -> Optional[str]:
+        """在列名列表中查找匹配标准列名的列
+
+        匹配策略: 精确匹配 → 忽略空白匹配 → 子串匹配
+
+        Args:
+            columns: 实际列名列表
+            standard_name: 标准列名 (STANDARD_COLUMNS 的键)
+
+        Returns:
+            匹配到的实际列名，未找到返回 None
+        """
+        candidates = cls.STANDARD_COLUMNS.get(standard_name, [])
+        if not candidates:
+            return None
+
+        columns_str = [str(c) for c in columns]
+
+        # 精确匹配
+        for col in columns_str:
+            col_clean = col.strip()
+            for candidate in candidates:
+                if col_clean == candidate:
+                    return col
+
+        # 忽略空白匹配
+        for col in columns_str:
+            col_clean = re.sub(r'\s+', '', col.strip())
+            for candidate in candidates:
+                if col_clean == re.sub(r'\s+', '', candidate):
+                    return col
+
+        # 子串匹配 (候选名包含在列名中)
+        for col in columns_str:
+            col_lower = col.strip().lower()
+            for candidate in candidates:
+                if candidate.lower() in col_lower:
+                    return col
+
+        return None
+
+    @staticmethod
+    def detect_header_row(df: pd.DataFrame, max_rows: int = 10, min_matches: int = 3) -> int:
+        """检测表头所在行
+
+        扫描前 max_rows 行，返回包含最多关键词匹配的行索引。
+
+        Args:
+            df: 数据框
+            max_rows: 最大扫描行数
+            min_matches: 最少关键词匹配数
+
+        Returns:
+            表头行索引，未找到返回 -1
+        """
+        header_keywords = [
+            '条码', '条形码', '商品条码', '商品名称', '名称', '规格',
+            '单价', '数量', '金额', '单位', '必填', '编码',
+        ]
+
+        best_row = -1
+        best_matches = 0
+
+        for row_idx in range(min(max_rows, len(df))):
+            row_values = df.iloc[row_idx].astype(str)
+            matches = sum(
+                1 for kw in header_keywords
+                if any(kw in str(val) for val in row_values.values)
+            )
+            if matches >= min_matches and matches > best_matches:
+                best_matches = matches
+                best_row = row_idx
+
+        return best_row
@@ -0,0 +1,401 @@
+"""
+数据清洗处理器
+
+提供各种数据清洗功能，如空值处理、重复项处理、数据类型转换等
+"""
+
+import pandas as pd
+from typing import Dict, Any, Optional, List, Union
+from ...core.utils.log_utils import get_logger
+
+logger = get_logger(__name__)
+
+
+class DataCleaner:
+    """数据清洗处理器
+    
+    提供标准化的数据清洗功能，支持链式调用和规则配置
+    """
+    
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """初始化数据清洗器
+        
+        Args:
+            config: 清洗配置
+        """
+        self.config = config or {}
+        self.cleaning_rules = []
+    
+    def add_rule(self, rule_type: str, **kwargs):
+        """添加清洗规则
+        
+        Args:
+            rule_type: 规则类型
+            **kwargs: 规则参数
+        """
+        rule = {'type': rule_type, **kwargs}
+        self.cleaning_rules.append(rule)
+        logger.debug(f"添加清洗规则: {rule_type}")
+    
+    def clean(self, df: pd.DataFrame) -> pd.DataFrame:
+        """执行数据清洗
+        
+        Args:
+            df: 输入数据
+            
+        Returns:
+            清洗后的数据
+        """
+        logger.info(f"开始数据清洗，原始数据形状: {df.shape}")
+        
+        result_df = df.copy()
+        
+        for i, rule in enumerate(self.cleaning_rules):
+            try:
+                logger.debug(f"执行清洗规则 {i+1}/{len(self.cleaning_rules)}: {rule['type']}")
+                result_df = self._apply_rule(result_df, rule)
+                logger.debug(f"规则执行完成，数据形状: {result_df.shape}")
+            except Exception as e:
+                logger.error(f"清洗规则执行失败: {rule}, 错误: {e}")
+                # 继续执行下一个规则，而不是中断整个流程
+                continue
+        
+        logger.info(f"数据清洗完成，最终数据形状: {result_df.shape}")
+        return result_df
+    
+    def _apply_rule(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
+        """应用单个清洗规则
+        
+        Args:
+            df: 数据
+            rule: 规则配置
+            
+        Returns:
+            处理后的数据
+        """
+        rule_type = rule.get('type')
+        
+        if rule_type == 'remove_duplicates':
+            return self._remove_duplicates(df, rule)
+        elif rule_type == 'fill_na':
+            return self._fill_na(df, rule)
+        elif rule_type == 'remove_rows':
+            return self._remove_rows(df, rule)
+        elif rule_type == 'convert_type':
+            return self._convert_type(df, rule)
+        elif rule_type == 'strip_whitespace':
+            return self._strip_whitespace(df, rule)
+        elif rule_type == 'normalize_text':
+            return self._normalize_text(df, rule)
+        elif rule_type == 'validate_data':
+            return self._validate_data(df, rule)
+        else:
+            logger.warning(f"未知的清洗规则类型: {rule_type}")
+            return df
+    
+    def _remove_duplicates(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
+        """移除重复项
+        
+        Args:
+            df: 数据
+            rule: 规则配置
+            
+        Returns:
+            处理后的数据
+        """
+        subset = rule.get('subset')  # 用于判断重复的列
+        keep = rule.get('keep', 'first')  # 保留哪个重复项
+        
+        before_count = len(df)
+        df_cleaned = df.drop_duplicates(subset=subset, keep=keep)
+        after_count = len(df_cleaned)
+        
+        logger.info(f"移除重复项: {before_count - after_count} 行被移除")
+        return df_cleaned
+    
+    def _fill_na(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
+        """填充空值
+        
+        Args:
+            df: 数据
+            rule: 规则配置
+            
+        Returns:
+            处理后的数据
+        """
+        columns = rule.get('columns')  # 要处理的列
+        value = rule.get('value', 0)  # 填充值
+        method = rule.get('method')  # 填充方法（'ffill', 'bfill', 'mean', 'median'）
+        
+        if columns:
+            # 处理指定列
+            if isinstance(columns, str):
+                columns = [columns]
+            
+            for col in columns:
+                if col in df.columns:
+                    if method == 'ffill':
+                        df[col] = df[col].fillna(method='ffill')
+                    elif method == 'bfill':
+                        df[col] = df[col].fillna(method='bfill')
+                    elif method == 'mean':
+                        df[col] = df[col].fillna(df[col].mean())
+                    elif method == 'median':
+                        df[col] = df[col].fillna(df[col].median())
+                    else:
+                        df[col] = df[col].fillna(value)
+                    
+                    logger.debug(f"填充列 {col} 的空值: {method or value}")
+        else:
+            # 处理所有列
+            if method == 'ffill':
+                df = df.fillna(method='ffill')
+            elif method == 'bfill':
+                df = df.fillna(method='bfill')
+            else:
+                df = df.fillna(value)
+            
+            logger.debug(f"填充所有列的空值: {method or value}")
+        
+        return df
+    
+    def _remove_rows(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
+        """移除行
+        
+        Args:
+            df: 数据
+            rule: 规则配置
+            
+        Returns:
+            处理后的数据
+        """
+        condition = rule.get('condition')  # 条件表达式
+        columns = rule.get('columns')  # 要检查的列
+        values = rule.get('values')  # 要移除的值
+        
+        if condition:
+            # 使用条件表达式
+            try:
+                before_count = len(df)
+                df_filtered = df.query(condition)
+                after_count = len(df_filtered)
+                logger.info(f"条件过滤: {condition}, 移除了 {before_count - after_count} 行")
+                return df_filtered
+            except Exception as e:
+                logger.error(f"条件表达式执行失败: {condition}, 错误: {e}")
+                return df
+        
+        if columns and values:
+            # 基于列值过滤
+            if isinstance(columns, str):
+                columns = [columns]
+            if not isinstance(values, list):
+                values = [values]
+            
+            df_filtered = df.copy()
+            for col in columns:
+                if col in df_filtered.columns:
+                    mask = ~df_filtered[col].isin(values)
+                    df_filtered = df_filtered[mask]
+                    logger.debug(f"列 {col} 过滤值 {values}")
+            
+            return df_filtered
+        
+        logger.warning("移除行规则缺少条件或列配置")
+        return df
+    
+    def _convert_type(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
+        """类型转换
+        
+        Args:
+            df: 数据
+            rule: 规则配置
+            
+        Returns:
+            处理后的数据
+        """
+        columns = rule.get('columns')
+        target_type = rule.get('target_type', 'float')
+        errors = rule.get('errors', 'coerce')  # 错误处理方式
+        
+        if isinstance(columns, str):
+            columns = [columns]
+        
+        for col in columns:
+            if col in df.columns:
+                try:
+                    if target_type == 'int':
+                        df[col] = pd.to_numeric(df[col], errors=errors).astype('Int64')
+                    elif target_type == 'float':
+                        df[col] = pd.to_numeric(df[col], errors=errors)
+                    elif target_type == 'datetime':
+                        df[col] = pd.to_datetime(df[col], errors=errors)
+                    elif target_type == 'string':
+                        df[col] = df[col].astype(str)
+                    else:
+                        df[col] = df[col].astype(target_type)
+                    
+                    logger.debug(f"列 {col} 类型转换: {target_type}")
+                except Exception as e:
+                    logger.error(f"列 {col} 类型转换失败: {e}")
+        
+        return df
+    
+    def _strip_whitespace(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
+        """去除空白字符
+        
+        Args:
+            df: 数据
+            rule: 规则配置
+            
+        Returns:
+            处理后的数据
+        """
+        columns = rule.get('columns')
+        
+        if columns:
+            if isinstance(columns, str):
+                columns = [columns]
+            
+            for col in columns:
+                if col in df.columns and df[col].dtype == 'object':
+                    df[col] = df[col].str.strip()
+                    logger.debug(f"列 {col} 去除空白字符")
+        else:
+            # 处理所有文本列
+            text_columns = df.select_dtypes(include=['object']).columns
+            for col in text_columns:
+                df[col] = df[col].str.strip()
+            
+            logger.debug(f"所有文本列去除空白字符: {list(text_columns)}")
+        
+        return df
+    
+    def _normalize_text(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
+        """文本标准化
+        
+        Args:
+            df: 数据
+            rule: 规则配置
+            
+        Returns:
+            处理后的数据
+        """
+        columns = rule.get('columns')
+        lowercase = rule.get('lowercase', False)
+        uppercase = rule.get('uppercase', False)
+        replace_map = rule.get('replace_map', {})  # 替换映射
+        
+        if isinstance(columns, str):
+            columns = [columns]
+        
+        target_columns = columns or df.select_dtypes(include=['object']).columns
+        
+        for col in target_columns:
+            if col in df.columns and df[col].dtype == 'object':
+                if lowercase:
+                    df[col] = df[col].str.lower()
+                elif uppercase:
+                    df[col] = df[col].str.upper()
+                
+                # 应用替换映射
+                for old, new in replace_map.items():
+                    df[col] = df[col].str.replace(old, new)
+                
+                logger.debug(f"列 {col} 文本标准化完成")
+        
+        return df
+    
+    def _validate_data(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
+        """数据验证
+        
+        Args:
+            df: 数据
+            rule: 规则配置
+            
+        Returns:
+            处理后的数据
+        """
+        columns = rule.get('columns')
+        min_value = rule.get('min_value')
+        max_value = rule.get('max_value')
+        required = rule.get('required', False)
+        
+        if isinstance(columns, str):
+            columns = [columns]
+        
+        validation_results = []
+        
+        for col in columns:
+            if col in df.columns:
+                # 检查必需值
+                if required:
+                    null_count = df[col].isnull().sum()
+                    if null_count > 0:
+                        validation_results.append(f"{col}: {null_count} 个空值")
+                
+                # 检查数值范围
+                if min_value is not None or max_value is not None:
+                    if pd.api.types.is_numeric_dtype(df[col]):
+                        invalid_mask = pd.Series(False, index=df.index)
+                        if min_value is not None:
+                            invalid_mask |= df[col] < min_value
+                        if max_value is not None:
+                            invalid_mask |= df[col] > max_value
+                        
+                        invalid_count = invalid_mask.sum()
+                        if invalid_count > 0:
+                            validation_results.append(f"{col}: {invalid_count} 个值超出范围")
+        
+        if validation_results:
+            logger.warning(f"数据验证发现问题: {', '.join(validation_results)}")
+        else:
+            logger.debug("数据验证通过")
+        
+        return df
+    
+    # 便捷方法
+    def remove_duplicates(self, subset: Optional[List[str]] = None, keep: str = 'first'):
+        """移除重复项"""
+        self.add_rule('remove_duplicates', subset=subset, keep=keep)
+        return self
+    
+    def fill_na(self, columns: Optional[Union[str, List[str]]] = None, 
+                value: Any = 0, method: Optional[str] = None):
+        """填充空值"""
+        self.add_rule('fill_na', columns=columns, value=value, method=method)
+        return self
+    
+    def remove_rows(self, condition: Optional[str] = None,
+                   columns: Optional[Union[str, List[str]]] = None,
+                   values: Optional[Any] = None):
+        """移除行"""
+        self.add_rule('remove_rows', condition=condition, columns=columns, values=values)
+        return self
+    
+    def convert_type(self, columns: Union[str, List[str]], target_type: str, errors: str = 'coerce'):
+        """类型转换"""
+        self.add_rule('convert_type', columns=columns, target_type=target_type, errors=errors)
+        return self
+    
+    def strip_whitespace(self, columns: Optional[Union[str, List[str]]] = None):
+        """去除空白字符"""
+        self.add_rule('strip_whitespace', columns=columns)
+        return self
+    
+    def normalize_text(self, columns: Optional[Union[str, List[str]]] = None,
+                      lowercase: bool = False, uppercase: bool = False,
+                      replace_map: Optional[Dict[str, str]] = None):
+        """文本标准化"""
+        self.add_rule('normalize_text', columns=columns, lowercase=lowercase, 
+                     uppercase=uppercase, replace_map=replace_map or {})
+        return self
+    
+    def validate_data(self, columns: Union[str, List[str]], 
+                     min_value: Optional[float] = None,
+                     max_value: Optional[float] = None,
+                     required: bool = False):
+        """数据验证"""
+        self.add_rule('validate_data', columns=columns, min_value=min_value,
+                     max_value=max_value, required=required)
+        return self
@@ -0,0 +1,150 @@
+import re
+import pandas as pd
+from typing import List, Dict, Any, Optional
+
+def _split_quantity_unit(df: pd.DataFrame, source: str, dictionary: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
+    if source in df.columns:
+        vals = df[source].astype(str).fillna("")
+        nums = []
+        units = []
+        default_unit = (dictionary or {}).get("default_unit", "")
+        unit_synonyms = (dictionary or {}).get("unit_synonyms", {})
+        for v in vals:
+            m = re.search(r"(\d+(?:\.\d+)?)(箱|件|提|盒|瓶)", v)
+            if m:
+                nums.append(float(m.group(1)))
+                u = unit_synonyms.get(m.group(2), m.group(2))
+                units.append(u)
+            else:
+                try:
+                    nums.append(float(v))
+                    units.append(unit_synonyms.get(default_unit, default_unit))
+                except Exception:
+                    nums.append(0.0)
+                    units.append(unit_synonyms.get(default_unit, default_unit))
+        df["quantity"] = nums
+        df["unit"] = units
+    return df
+
+def _extract_spec_from_name(df: pd.DataFrame, source: str, dictionary: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
+    if source in df.columns:
+        names = df[source].astype(str).fillna("")
+        specs = []
+        packs = []
+        ignore_words = (dictionary or {}).get("ignore_words", [])
+        name_patterns = (dictionary or {}).get("name_patterns", [])
+        for s in names:
+            if ignore_words:
+                for w in ignore_words:
+                    s = s.replace(w, "")
+            matched = False
+            for pat in name_patterns:
+                try:
+                    m = re.search(pat, s)
+                    if m and len(m.groups()) >= 2:
+                        try:
+                            qty = int(m.group(len(m.groups())))
+                        except Exception:
+                            qty = None
+                        specs.append(s)
+                        packs.append(qty)
+                        matched = True
+                        break
+                except Exception:
+                    pass
+            if matched:
+                continue
+            m = re.search(r"(\d+(?:\.\d+)?)(ml|l|升|毫升)[*×xX](\d+)", s, re.IGNORECASE)
+            if m:
+                specs.append(f"{m.group(1)}{m.group(2)}*{m.group(3)}")
+                packs.append(int(m.group(3)))
+                continue
+            m2 = re.search(r"(\d+)[*×xX](\d+)", s)
+            if m2:
+                specs.append(f"1*{m2.group(2)}")
+                packs.append(int(m2.group(2)))
+                continue
+            m3 = re.search(r"(\d{2,3})\D*(\d{1,3})\D*", s)
+            if m3:
+                specs.append(f"1*{m3.group(2)}")
+                packs.append(int(m3.group(2)))
+                continue
+            specs.append("")
+            packs.append(None)
+        df["specification"] = df.get("specification", pd.Series(specs))
+        df["package_quantity"] = packs
+    return df
+
+def _normalize_unit(df: pd.DataFrame, target: str, unit_map: Dict[str, str], dictionary: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
+    if target in df.columns:
+        df[target] = df[target].astype(str)
+        df[target] = df[target].apply(lambda u: unit_map.get(u, u))
+        pack_multipliers = (dictionary or {}).get("pack_multipliers", {})
+        default_pq = (dictionary or {}).get("default_package_quantity", 1)
+        try:
+            if "quantity" in df.columns:
+                def convert_qty(row):
+                    u = row.get(target)
+                    q = row.get("quantity")
+                    pq = row.get("package_quantity")
+                    if u in ("件", "箱", "提", "盒"):
+                        mult = pq or pack_multipliers.get(u, default_pq)
+                        if pd.notna(q) and pd.notna(mult) and float(mult) > 0:
+                            return float(q) * float(mult)
+                    return q
+                df["quantity"] = df.apply(convert_qty, axis=1)
+                df[target] = df[target].apply(lambda u: "瓶" if u in ("件","箱","提","盒") else u)
+        except Exception:
+            pass
+    return df
+
+def _compute_quantity_from_total(df: pd.DataFrame) -> pd.DataFrame:
+    if "quantity" in df.columns and "unit_price" in df.columns:
+        qty = df["quantity"].fillna(0)
+        up = pd.to_numeric(df.get("unit_price", 0), errors="coerce").fillna(0)
+        tp = pd.to_numeric(df.get("total_price", 0), errors="coerce").fillna(0)
+        need = (qty <= 0) & (up > 0) & (tp > 0)
+        df.loc[need, "quantity"] = (tp[need] / up[need]).round(6)
+    return df
+
+def _fill_missing(df: pd.DataFrame, fills: Dict[str, Any]) -> pd.DataFrame:
+    for k, v in fills.items():
+        if k in df.columns:
+            df[k] = df[k].fillna(v)
+        else:
+            df[k] = v
+    return df
+
+def _mark_gift(df: pd.DataFrame) -> pd.DataFrame:
+    df["is_gift"] = False
+    tp = df.get("total_price")
+    up = df.get("unit_price")
+    flags = pd.Series([False]*len(df))
+    if tp is not None:
+        tpn = pd.to_numeric(tp, errors="coerce").fillna(0)
+        flags = flags | (tpn == 0)
+    if up is not None:
+        upn = pd.to_numeric(up, errors="coerce").fillna(0)
+        flags = flags | (upn == 0)
+    if "name" in df.columns:
+        flags = flags | df["name"].astype(str).str.contains(r"赠品|^o$|^O$", regex=True)
+    df.loc[flags, "is_gift"] = True
+    return df
+
+def apply_rules(df: pd.DataFrame, rules: List[Dict[str, Any]], dictionary: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
+    out = df.copy()
+    for r in rules or []:
+        t = r.get("type")
+        if t == "split_quantity_unit":
+            out = _split_quantity_unit(out, r.get("source", "quantity"), dictionary)
+        elif t == "extract_spec_from_name":
+            out = _extract_spec_from_name(out, r.get("source", "name"), dictionary)
+        elif t == "normalize_unit":
+            out = _normalize_unit(out, r.get("target", "unit"), r.get("map", {}), dictionary)
+        elif t == "compute_quantity_from_total":
+            out = _compute_quantity_from_total(out)
+        elif t == "fill_missing":
+            out = _fill_missing(out, r.get("fills", {}))
+        elif t == "mark_gift":
+            out = _mark_gift(out)
+    return out
@@ -0,0 +1,5 @@
+"""
+OCR订单处理系统 - OCR核心模块
+---------------------------
+提供OCR识别相关功能，包括图片预处理、文字识别和表格识别。
+""" 
@@ -0,0 +1,368 @@
+"""
+百度OCR客户端模块
+---------------
+提供百度OCR API的访问和调用功能。
+"""
+
+import time
+import base64
+import requests
+from typing import Dict, Optional, Union
+
+from ..utils.log_utils import get_logger
+
+logger = get_logger(__name__)
+
+# Token expiry constants
+_DEFAULT_TOKEN_LIFETIME = 30 * 24 * 3600  # 30 days, in seconds
+_TOKEN_EARLY_EXPIRY = 3600                 # refresh 1 hour early, in seconds
+
+class TokenManager:
+    """
+    令牌管理类，负责获取和刷新百度API访问令牌
+    """
+    
+    def __init__(self, api_key: str, secret_key: str, max_retries: int = 3, retry_delay: int = 2, token_url: str = None):
+        """
+        初始化令牌管理器
+
+        Args:
+            api_key: 百度API Key
+            secret_key: 百度Secret Key
+            max_retries: 最大重试次数
+            retry_delay: 重试延迟（秒）
+            token_url: 令牌获取地址
+        """
+        self.api_key = api_key
+        self.secret_key = secret_key
+        self.max_retries = max_retries
+        self.retry_delay = retry_delay
+        self.token_url = token_url or 'https://aip.baidubce.com/oauth/2.0/token'
+        self.access_token = None
+        self.token_expiry = 0
+    
+    def get_token(self) -> Optional[str]:
+        """
+        获取访问令牌，如果令牌已过期则刷新
+        
+        Returns:
+            访问令牌，如果获取失败则返回None
+        """
+        if self.is_token_valid():
+            return self.access_token
+        
+        return self.refresh_token()
+    
+    def is_token_valid(self) -> bool:
+        """
+        检查令牌是否有效
+        
+        Returns:
+            令牌是否有效
+        """
+        return (
+            self.access_token is not None and 
+            self.token_expiry > time.time() + 60  # 提前1分钟刷新
+        )
+    
+    def refresh_token(self) -> Optional[str]:
+        """
+        刷新访问令牌
+        
+        Returns:
+            新的访问令牌，如果获取失败则返回None
+        """
+        url = self.token_url
+        params = {
+            "grant_type": "client_credentials",
+            "client_id": self.api_key,
+            "client_secret": self.secret_key
+        }
+        
+        for attempt in range(self.max_retries):
+            try:
+                response = requests.post(url, params=params, timeout=10)
+                if response.status_code == 200:
+                    result = response.json()
+                    if "access_token" in result:
+                        self.access_token = result["access_token"]
+                        # 设置令牌过期时间（默认30天，提前1小时过期以确保安全）
+                        self.token_expiry = time.time() + result.get("expires_in", _DEFAULT_TOKEN_LIFETIME) - _TOKEN_EARLY_EXPIRY
+                        logger.info("成功获取访问令牌")
+                        return self.access_token
+                
+                logger.warning(f"获取访问令牌失败 (尝试 {attempt+1}/{self.max_retries}): {response.text}")
+                
+            except Exception as e:
+                logger.warning(f"获取访问令牌时发生错误 (尝试 {attempt+1}/{self.max_retries}): {e}")
+            
+            # 如果不是最后一次尝试，则等待后重试
+            if attempt < self.max_retries - 1:
+                time.sleep(self.retry_delay * (attempt + 1))  # 指数退避
+        
+        logger.error("无法获取访问令牌")
+        return None
+
+class BaiduOCRClient:
+    """
+    百度OCR API客户端
+    """
+    
+    def __init__(self, config):
+        """
+        初始化百度OCR客户端
+        
+        Args:
+            config: 配置信息
+        """
+        self.config = config
+        
+        # 从配置中读取API信息
+        try:
+            # 修复getint调用方式
+            self.timeout = config.get('API', 'timeout', fallback=30)
+            if isinstance(self.timeout, str):
+                self.timeout = int(self.timeout)
+                
+            self.api_key = config.get('API', 'api_key', fallback='')
+            self.secret_key = config.get('API', 'secret_key', fallback='')
+            
+            # 使用fallback而不是位置参数
+            try:
+                self.max_retries = config.getint('API', 'max_retries', fallback=3)
+            except (TypeError, AttributeError):
+                # 如果getint不支持fallback，则使用get再转换
+                self.max_retries = int(config.get('API', 'max_retries', fallback='3'))
+                
+            try:
+                self.retry_delay = config.getint('API', 'retry_delay', fallback=2)
+            except (TypeError, AttributeError):
+                # 如果getint不支持fallback，则使用get再转换
+                self.retry_delay = int(config.get('API', 'retry_delay', fallback='2'))
+                
+            self.api_url = config.get('API', 'api_url', fallback='https://aip.baidubce.com/rest/2.0/ocr/v1/table')
+            
+            # 创建令牌管理器
+            self.token_manager = TokenManager(
+                self.api_key,
+                self.secret_key,
+                self.max_retries,
+                self.retry_delay,
+                token_url=config.get('API', 'token_url', fallback='https://aip.baidubce.com/oauth/2.0/token')
+            )
+            
+            # 验证API配置
+            if not self.api_key or not self.secret_key:
+                logger.warning("API密钥未设置，请在配置文件中设置API密钥")
+        except Exception as e:
+            logger.error(f"初始化失败: {e}")
+    
+    def read_image(self, image_path: str) -> Optional[bytes]:
+        """
+        读取图片文件为二进制数据
+        
+        Args:
+            image_path: 图片文件路径
+            
+        Returns:
+            图片二进制数据，如果读取失败则返回None
+        """
+        try:
+            with open(image_path, 'rb') as f:
+                return f.read()
+        except Exception as e:
+            logger.error(f"读取图片文件失败: {image_path}, 错误: {e}")
+            return None
+    
+    def recognize_table(self, image_data: Union[str, bytes]) -> Optional[Dict]:
+        """
+        识别表格
+        
+        Args:
+            image_data: 图片数据，可以是文件路径或二进制数据
+            
+        Returns:
+            识别结果字典，如果识别失败则返回None
+        """
+        # 获取访问令牌
+        access_token = self.token_manager.get_token()
+        if not access_token:
+            logger.error("无法获取访问令牌，无法进行表格识别")
+            return None
+        
+        # 如果是文件路径，读取图片数据
+        if isinstance(image_data, str):
+            image_data = self.read_image(image_data)
+            if image_data is None:
+                return None
+        
+        # 准备请求参数
+        url = f"{self.api_url}?access_token={access_token}"
+        image_base64 = base64.b64encode(image_data).decode('utf-8')
+        
+        # 请求参数 - 添加return_excel参数，与v1版本保持一致
+        payload = {
+            'image': image_base64,
+            'is_sync': 'true',  # 同步请求
+            'request_type': 'excel',  # 输出为Excel
+            'return_excel': 'true'  # 直接返回Excel数据
+        }
+        
+        headers = {
+            'Content-Type': 'application/x-www-form-urlencoded',
+            'Accept': 'application/json'
+        }
+        
+        # 发送请求
+        for attempt in range(self.max_retries):
+            try:
+                response = requests.post(
+                    url, 
+                    data=payload, 
+                    headers=headers, 
+                    timeout=self.timeout
+                )
+                
+                if response.status_code == 200:
+                    result = response.json()
+                    # 打印返回结果以便调试
+                    logger.debug(f"百度OCR API返回结果: {result}")
+                    
+                    if 'error_code' in result:
+                        error_msg = result.get('error_msg', '未知错误')
+                        logger.error(f"百度OCR API错误: {error_msg}")
+                        # 如果是授权错误，尝试刷新令牌
+                        if result.get('error_code') in [110, 111]:  # 授权相关错误码
+                            logger.info("尝试刷新访问令牌...")
+                            self.token_manager.refresh_token()
+                        return None
+                    
+                    # 兼容不同的返回结构
+                    # 这是最关键的修改部分: 直接返回整个结果，不强制要求特定结构
+                    return result
+                else:
+                    logger.warning(f"表格识别请求失败 (尝试 {attempt+1}/{self.max_retries}): {response.text}")
+            
+            except Exception as e:
+                logger.warning(f"表格识别时发生错误 (尝试 {attempt+1}/{self.max_retries}): {e}")
+            
+            # 如果不是最后一次尝试，则等待后重试
+            if attempt < self.max_retries - 1:
+                wait_time = self.retry_delay * (2 ** attempt)  # 指数退避
+                logger.info(f"将在 {wait_time} 秒后重试...")
+                time.sleep(wait_time)
+        
+        logger.error("表格识别失败")
+        return None
+    
+    def get_excel_result(self, request_id_or_result: Union[str, Dict]) -> Optional[bytes]:
+        """
+        获取Excel结果
+        
+        Args:
+            request_id_or_result: 请求ID或完整的识别结果
+            
+        Returns:
+            Excel二进制数据，如果获取失败则返回None
+        """
+        # 获取访问令牌
+        access_token = self.token_manager.get_token()
+        if not access_token:
+            logger.error("无法获取访问令牌，无法获取Excel结果")
+            return None
+        
+        # 处理直接传入结果对象的情况
+        request_id = request_id_or_result
+        if isinstance(request_id_or_result, dict):
+            # v1版本兼容处理：如果结果中直接包含Excel数据
+            if 'result' in request_id_or_result:
+                # 如果是同步返回的Excel结果（某些API版本会直接返回）
+                if 'result_data' in request_id_or_result['result']:
+                    excel_content = request_id_or_result['result']['result_data']
+                    if excel_content:
+                        try:
+                            return base64.b64decode(excel_content)
+                        except Exception as e:
+                            logger.error(f"解析Excel数据失败: {e}")
+                
+                # 提取request_id
+                if 'request_id' in request_id_or_result['result']:
+                    request_id = request_id_or_result['result']['request_id']
+                    logger.debug(f"从result子对象中提取request_id: {request_id}")
+                elif 'tables_result' in request_id_or_result['result'] and len(request_id_or_result['result']['tables_result']) > 0:
+                    # 某些版本API可能直接返回表格内容，此时可能没有request_id
+                    logger.info("检测到API直接返回了表格内容，但没有request_id")
+                    return None
+            # 有些版本可能request_id在顶层
+            elif 'request_id' in request_id_or_result:
+                request_id = request_id_or_result['request_id']
+                logger.debug(f"从顶层对象中提取request_id: {request_id}")
+        
+        # 如果没有有效的request_id，无法获取结果
+        if not isinstance(request_id, str):
+            logger.error(f"无法从结果中提取有效的request_id: {request_id_or_result}")
+            return None
+            
+        base_url = self.config.get('API', 'form_ocr_url', fallback='https://aip.baidubce.com/rest/2.0/solution/v1/form_ocr/get_request_result')
+        url = f"{base_url}?access_token={access_token}"
+        
+        payload = {
+            'request_id': request_id,
+            'result_type': 'excel'
+        }
+        
+        headers = {
+            'Content-Type': 'application/x-www-form-urlencoded',
+            'Accept': 'application/json'
+        }
+        
+        for attempt in range(self.max_retries):
+            try:
+                response = requests.post(
+                    url, 
+                    data=payload, 
+                    headers=headers, 
+                    timeout=self.timeout
+                )
+                
+                if response.status_code == 200:
+                    try:
+                        result = response.json()
+                        logger.debug(f"获取Excel结果返回: {result}")
+                        
+                        # 检查是否还在处理中
+                        if result.get('result', {}).get('ret_code') == 3:
+                            logger.info(f"Excel结果正在处理中，等待后重试 (尝试 {attempt+1}/{self.max_retries})")
+                            time.sleep(2)
+                            continue
+                        
+                        # 检查是否有错误
+                        if 'error_code' in result or result.get('result', {}).get('ret_code') != 0:
+                            error_msg = result.get('error_msg') or result.get('result', {}).get('ret_msg', '未知错误')
+                            logger.error(f"获取Excel结果失败: {error_msg}")
+                            return None
+                        
+                        # 获取Excel内容
+                        excel_content = result.get('result', {}).get('result_data')
+                        if excel_content:
+                            return base64.b64decode(excel_content)
+                        else:
+                            logger.error("Excel结果为空")
+                            return None
+                    
+                    except Exception as e:
+                        logger.error(f"解析Excel结果时出错: {e}")
+                        return None
+                
+                else:
+                    logger.warning(f"获取Excel结果请求失败 (尝试 {attempt+1}/{self.max_retries}): {response.text}")
+            
+            except Exception as e:
+                logger.warning(f"获取Excel结果时发生错误 (尝试 {attempt+1}/{self.max_retries}): {e}")
+            
+            # 如果不是最后一次尝试，则等待后重试
+            if attempt < self.max_retries - 1:
+                time.sleep(self.retry_delay * (attempt + 1))
+        
+        logger.error("获取Excel结果失败")
+        return None 
@@ -0,0 +1,389 @@
+"""
+表格OCR处理模块
+-------------
+处理图片并提取表格内容，保存为Excel文件。
+"""
+
+import os
+import time
+import base64
+from concurrent.futures import ThreadPoolExecutor
+from typing import Dict, List, Optional, Tuple, Callable
+
+from ..utils.log_utils import get_logger
+from ..utils.file_utils import (
+    ensure_dir, 
+    get_file_extension, 
+    get_files_by_extensions, 
+    generate_timestamp_filename,
+    is_file_size_valid,
+    load_json,
+    save_json
+)
+from .baidu_ocr import BaiduOCRClient
+
+logger = get_logger(__name__)
+
+class ProcessedRecordManager:
+    """处理记录管理器，用于跟踪已处理的文件"""
+    
+    def __init__(self, record_file: str):
+        """
+        初始化处理记录管理器
+        
+        Args:
+            record_file: 记录文件路径
+        """
+        self.record_file = record_file
+        self.processed_files = self._load_record()
+    
+    def _load_record(self) -> Dict[str, str]:
+        """
+        加载处理记录
+        
+        Returns:
+            处理记录字典，键为输入文件路径，值为输出文件路径
+        """
+        return load_json(self.record_file, {})
+    
+    def save_record(self) -> None:
+        """保存处理记录"""
+        save_json(self.processed_files, self.record_file)
+    
+    def is_processed(self, image_file: str) -> bool:
+        """
+        检查图片是否已处理
+        
+        Args:
+            image_file: 图片文件路径
+            
+        Returns:
+            是否已处理
+        """
+        return image_file in self.processed_files
+    
+    def mark_as_processed(self, image_file: str, output_file: str) -> None:
+        """
+        标记图片为已处理
+        
+        Args:
+            image_file: 图片文件路径
+            output_file: 输出文件路径
+        """
+        self.processed_files[image_file] = output_file
+        self.save_record()
+    
+    def get_output_file(self, image_file: str) -> Optional[str]:
+        """
+        获取图片的输出文件路径
+        
+        Args:
+            image_file: 图片文件路径
+            
+        Returns:
+            输出文件路径，如果不存在则返回None
+        """
+        return self.processed_files.get(image_file)
+    
+    def get_unprocessed_files(self, files: List[str]) -> List[str]:
+        """
+        获取未处理的文件列表
+        
+        Args:
+            files: 文件列表
+            
+        Returns:
+            未处理的文件列表
+        """
+        return [file for file in files if not self.is_processed(file)]
+
+class OCRProcessor:
+    """
+    OCR处理器，负责协调OCR识别和结果处理
+    """
+    
+    def __init__(self, config):
+        """
+        初始化OCR处理器
+        
+        Args:
+            config: 配置信息
+        """
+        self.config = config
+        
+        # 修复ConfigParser对象没有get_path方法的问题
+        try:
+            # 获取输入和输出目录
+            self.input_folder = config.get('Paths', 'input_folder', fallback='data/input')
+            self.output_folder = config.get('Paths', 'output_folder', fallback='data/output')
+            self.temp_folder = config.get('Paths', 'temp_folder', fallback='data/temp')
+            
+            # 确保目录存在
+            os.makedirs(self.input_folder, exist_ok=True)
+            os.makedirs(self.output_folder, exist_ok=True)
+            os.makedirs(self.temp_folder, exist_ok=True)
+            
+            # 获取文件类型列表
+            allowed_extensions_str = config.get('File', 'allowed_extensions', fallback='.jpg,.jpeg,.png,.bmp')
+            self.file_types = [ext.strip() for ext in allowed_extensions_str.split(',') if ext.strip()]
+            if not self.file_types:
+                self.file_types = ['.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tif', '.tiff']
+            
+            # 初始化OCR客户端
+            self.ocr_client = BaiduOCRClient(self.config)
+            
+            # 记录实际路径
+            logger.info(f"使用输入目录: {os.path.abspath(self.input_folder)}")
+            logger.info(f"使用输出目录: {os.path.abspath(self.output_folder)}")
+            logger.info(f"使用临时目录: {os.path.abspath(self.temp_folder)}")
+            logger.info(f"允许的文件类型: {self.file_types}")
+            
+            # 初始化processed_files_json和record_manager
+            self.processed_files_json = os.path.join(self.output_folder, 'processed_files.json')
+            self.record_manager = ProcessedRecordManager(self.processed_files_json)
+            
+            # 加载已处理文件记录
+            self.processed_files = self._load_processed_files()
+            
+            logger.info(f"初始化OCRProcessor完成：输入目录={self.input_folder}, 输出目录={self.output_folder}")
+        except Exception as e:
+            logger.error(f"初始化OCRProcessor失败: {e}")
+            raise
+    
+    def _load_processed_files(self) -> Dict[str, str]:
+        """
+        加载已处理的文件记录
+        
+        Returns:
+            已处理的文件记录字典，键为输入文件路径，值为输出文件路径
+        """
+        return load_json(self.processed_files_json, {})
+    
+    def get_unprocessed_images(self) -> List[str]:
+        """
+        获取未处理的图片列表
+        
+        Returns:
+            未处理的图片文件路径列表
+        """
+        # 获取所有图片文件
+        image_files = get_files_by_extensions(self.input_folder, self.file_types)
+        
+        # 如果需要跳过已存在的文件
+        skip_existing = True
+        try:
+            skip_existing = self.config.getboolean('Performance', 'skip_existing', fallback=True)
+        except Exception:
+            pass
+
+        if skip_existing:
+            # 过滤已处理的文件
+            unprocessed_files = self.record_manager.get_unprocessed_files(image_files)
+            logger.info(f"找到 {len(image_files)} 个图片文件，其中 {len(unprocessed_files)} 个未处理")
+            return unprocessed_files
+        
+        logger.info(f"找到 {len(image_files)} 个图片文件（不跳过已处理的文件）")
+        return image_files
+    
+    def validate_image(self, image_path: str) -> bool:
+        """
+        验证图片是否有效
+        
+        Args:
+            image_path: 图片文件路径
+            
+        Returns:
+            图片是否有效
+        """
+        # 检查文件是否存在
+        if not os.path.exists(image_path):
+            logger.warning(f"图片文件不存在: {image_path}")
+            return False
+        
+        # 检查文件扩展名
+        ext = get_file_extension(image_path)
+        if ext not in self.file_types:
+            logger.warning(f"不支持的文件类型: {ext}, 文件: {image_path}")
+            return False
+        
+        # 检查文件大小
+        max_size_mb = 4.0
+        try:
+            max_size_mb = float(self.config.get('File', 'max_file_size_mb', fallback='4.0'))
+        except Exception:
+            pass
+        
+        if not is_file_size_valid(image_path, max_size_mb):
+            logger.warning(f"文件大小超过限制 ({max_size_mb}MB): {image_path}")
+            return False
+        
+        return True
+    
+    def process_image(self, image_path: str) -> Optional[str]:
+        """
+        处理单个图片
+        
+        Args:
+            image_path: 图片文件路径
+            
+        Returns:
+            输出Excel文件路径，如果处理失败则返回None
+        """
+        # 验证图片
+        if not self.validate_image(image_path):
+            return None
+        
+        # 获取是否跳过已处理文件的配置
+        skip_existing = True
+        try:
+            skip_existing = self.config.getboolean('Performance', 'skip_existing', fallback=True)
+        except Exception:
+            pass
+        
+        # 如果需要跳过已处理的文件
+        if skip_existing and self.record_manager.is_processed(image_path):
+            output_file = self.record_manager.get_output_file(image_path)
+            logger.info(f"图片已处理，跳过: {image_path}, 输出文件: {output_file}")
+            return output_file
+        
+        logger.info(f"开始处理图片: {image_path}")
+        
+        try:
+            # 获取Excel扩展名
+            excel_extension = '.xlsx'
+            try:
+                excel_extension = self.config.get('File', 'excel_extension', fallback='.xlsx')
+            except Exception:
+                pass
+            
+            # 生成输出文件路径
+            file_name = os.path.splitext(os.path.basename(image_path))[0]
+            output_file = os.path.join(self.output_folder, f"{file_name}{excel_extension}")
+            
+            # 检查是否已存在对应的Excel文件
+            if os.path.exists(output_file) and skip_existing:
+                logger.info(f"已存在对应的Excel文件，跳过处理: {os.path.basename(image_path)} -> {os.path.basename(output_file)}")
+                # 记录处理结果
+                self.record_manager.mark_as_processed(image_path, output_file)
+                return output_file
+            
+            # 进行OCR识别
+            ocr_result = self.ocr_client.recognize_table(image_path)
+            if not ocr_result:
+                logger.error(f"OCR识别失败: {image_path}")
+                return None
+                
+            # 保存Excel文件 - 按照v1版本逻辑提取Excel数据
+            excel_base64 = None
+            
+            # 从不同可能的字段中尝试获取Excel数据
+            if 'excel_file' in ocr_result:
+                excel_base64 = ocr_result['excel_file']
+                logger.debug("从excel_file字段获取Excel数据")
+            elif 'result' in ocr_result:
+                if 'result_data' in ocr_result['result']:
+                    excel_base64 = ocr_result['result']['result_data']
+                    logger.debug("从result.result_data字段获取Excel数据")
+                elif 'excel_file' in ocr_result['result']:
+                    excel_base64 = ocr_result['result']['excel_file']
+                    logger.debug("从result.excel_file字段获取Excel数据")
+                elif 'tables_result' in ocr_result['result'] and ocr_result['result']['tables_result']:
+                    for table in ocr_result['result']['tables_result']:
+                        if 'excel_file' in table:
+                            excel_base64 = table['excel_file']
+                            logger.debug("从tables_result中获取Excel数据")
+                            break
+                    
+            # 如果还是没有找到Excel数据，尝试通过get_excel_result获取
+            if not excel_base64:
+                logger.info("无法从直接返回中获取Excel数据，尝试通过API获取...")
+                excel_data = self.ocr_client.get_excel_result(ocr_result)
+                if not excel_data:
+                    logger.error(f"获取Excel结果失败: {image_path}")
+                    return None
+                    
+                # 保存Excel文件
+                os.makedirs(os.path.dirname(output_file), exist_ok=True)
+                with open(output_file, 'wb') as f:
+                    f.write(excel_data)
+            else:
+                # 解码并保存Excel文件
+                try:
+                    excel_data = base64.b64decode(excel_base64)
+                    os.makedirs(os.path.dirname(output_file), exist_ok=True)
+                    with open(output_file, 'wb') as f:
+                        f.write(excel_data)
+                except Exception as e:
+                    logger.error(f"解码或保存Excel数据时出错: {e}")
+                    return None
+            
+            logger.info(f"图片处理成功: {image_path}, 输出文件: {output_file}")
+            
+            # 标记为已处理
+            self.record_manager.mark_as_processed(image_path, output_file)
+            
+            return output_file
+            
+        except Exception as e:
+            logger.error(f"处理图片时出错: {image_path}, 错误: {e}")
+            return None
+    
+    def process_images_batch(self, batch_size: int = None, max_workers: int = None, progress_cb: Optional[Callable[[int], None]] = None) -> Tuple[int, int]:
+        """
+        批量处理图片
+        
+        Args:
+            batch_size: 批处理大小，如果为None则使用配置值
+            max_workers: 最大线程数，如果为None则使用配置值
+            
+        Returns:
+            (总处理数, 成功处理数)元组
+        """
+        # 使用配置值或参数值
+        if batch_size is None:
+            try:
+                batch_size = self.config.getint('Performance', 'batch_size', fallback=5)
+            except Exception:
+                batch_size = 5
+
+        if max_workers is None:
+            try:
+                max_workers = self.config.getint('Performance', 'max_workers', fallback=4)
+            except Exception:
+                max_workers = 4
+    
+        # 获取未处理的图片
+        unprocessed_images = self.get_unprocessed_images()
+        if not unprocessed_images:
+            logger.warning("没有需要处理的图片")
+            return 0, 0
+    
+        total = len(unprocessed_images)
+        success_count = 0
+    
+        # 按批次处理
+        for i in range(0, total, batch_size):
+            batch = unprocessed_images[i:i+batch_size]
+            logger.info(f"处理批次 {i//batch_size+1}/{(total+batch_size-1)//batch_size}: {len(batch)} 个文件")
+            try:
+                if progress_cb:
+                    # 以批次为单位估算进度（0-90%），保留10%给后续阶段
+                    percent = int(10 + (i / max(total, 1)) * 80)
+                    progress_cb(min(percent, 90))
+            except Exception:
+                pass
+        
+            # 使用多线程处理批次
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                results = list(executor.map(self.process_image, batch))
+            
+                # 统计成功数
+                success_count += sum(1 for result in results if result is not None)
+    
+        logger.info(f"所有图片处理完成, 总计: {total}, 成功: {success_count}")
+        try:
+            if progress_cb:
+                progress_cb(90)
+        except Exception:
+            pass
+        return total, success_count 
@@ -0,0 +1,9 @@
+"""
+处理器模块初始化文件
+"""
+
+from .base import BaseProcessor
+from .ocr_processor import OCRProcessor
+from .tobacco_processor import TobaccoProcessor
+
+__all__ = ['BaseProcessor', 'OCRProcessor', 'TobaccoProcessor']
@@ -0,0 +1,167 @@
+"""
+基础处理器接口模块
+
+定义所有处理器的基类，提供统一的处理接口
+"""
+
+from abc import ABC, abstractmethod
+from typing import Dict, Any, Optional, List
+from pathlib import Path
+import logging
+import pandas as pd
+
+from ...core.utils.log_utils import get_logger
+
+logger = get_logger(__name__)
+
+
+class BaseProcessor(ABC):
+    """基础处理器接口 - 所有处理器的基类
+    
+    采用策略模式设计，每个处理器负责特定类型的文件处理
+    """
+    
+    def __init__(self, config: Dict[str, Any]):
+        """初始化处理器
+        
+        Args:
+            config: 处理器配置字典
+        """
+        self.config = config
+        self.name = self.__class__.__name__
+        self.description = ""
+        self._setup_logging()
+    
+    def _setup_logging(self):
+        """设置处理器日志"""
+        self.logger = logging.getLogger(f"{__name__}.{self.name}")
+    
+    @abstractmethod
+    def can_process(self, file_path: Path) -> bool:
+        """判断是否能处理该文件
+        
+        Args:
+            file_path: 文件路径
+            
+        Returns:
+            是否能处理该文件
+        """
+        pass
+    
+    @abstractmethod
+    def process(self, input_file: Path, output_dir: Path) -> Optional[Path]:
+        """处理文件，返回输出文件路径
+        
+        Args:
+            input_file: 输入文件路径
+            output_dir: 输出目录路径
+            
+        Returns:
+            输出文件路径，处理失败返回None
+        """
+        pass
+    
+    @abstractmethod
+    def get_required_columns(self) -> List[str]:
+        """返回需要的列名列表
+        
+        Returns:
+            列名列表
+        """
+        pass
+    
+    def validate_input(self, file_path: Path) -> bool:
+        """验证输入文件有效性
+        
+        Args:
+            file_path: 文件路径
+            
+        Returns:
+            文件是否有效
+        """
+        try:
+            if not file_path.exists():
+                self.logger.warning(f"文件不存在: {file_path}")
+                return False
+            
+            if not file_path.is_file():
+                self.logger.warning(f"不是文件: {file_path}")
+                return False
+            
+            supported_extensions = self.get_supported_extensions()
+            if supported_extensions and file_path.suffix.lower() not in supported_extensions:
+                self.logger.warning(f"不支持的文件类型: {file_path.suffix}, 支持的类型: {supported_extensions}")
+                return False
+            
+            return True
+            
+        except Exception as e:
+            self.logger.error(f"验证文件时出错: {e}")
+            return False
+    
+    def get_supported_extensions(self) -> List[str]:
+        """获取支持的文件扩展名
+        
+        Returns:
+            支持的扩展名列表，空列表表示支持所有类型
+        """
+        return []
+    
+    def get_output_filename(self, input_file: Path, suffix: str = "_processed") -> str:
+        """生成输出文件名
+
+        Args:
+            input_file: 输入文件路径
+            suffix: 文件名后缀
+
+        Returns:
+            输出文件名
+        """
+        return f"{input_file.stem}{suffix}{input_file.suffix}"
+
+    def _read_excel_safely(self, file_path: Path, **kwargs) -> pd.DataFrame:
+        """根据扩展名选择合适的读取引擎
+
+        Args:
+            file_path: 文件路径
+            **kwargs: 传递给 pd.read_excel 的参数
+
+        Returns:
+            DataFrame
+
+        Raises:
+            Exception: 读取失败时抛出
+        """
+        suffix = file_path.suffix.lower()
+        if suffix == '.xlsx':
+            return pd.read_excel(file_path, engine='openpyxl', **kwargs)
+        elif suffix == '.xls':
+            try:
+                return pd.read_excel(file_path, engine='xlrd', **kwargs)
+            except Exception as e:
+                self.logger.warning(f"读取xls失败，可能缺少xlrd: {e}")
+                raise
+        else:
+            return pd.read_excel(file_path, **kwargs)
+    
+    def log_processing_start(self, input_file: Path):
+        """记录处理开始日志"""
+        self.logger.info(f"开始处理文件: {input_file}")
+        self.logger.info(f"处理器: {self.name} - {self.description}")
+    
+    def log_processing_end(self, input_file: Path, output_file: Optional[Path] = None, success: bool = True):
+        """记录处理结束日志"""
+        if success:
+            self.logger.info(f"处理完成: {input_file}")
+            if output_file:
+                self.logger.info(f"输出文件: {output_file}")
+        else:
+            self.logger.error(f"处理失败: {input_file}")
+    
+    def __str__(self) -> str:
+        """字符串表示"""
+        return f"{self.name}({self.description})"
+    
+    def __repr__(self) -> str:
+        """详细字符串表示"""
+        return f"{self.__class__.__module__}.{self.__class__.__name__}(name='{self.name}', description='{self.description}')"
@@ -0,0 +1,192 @@
+"""
+OCR处理器
+
+处理图片文件的OCR识别完整流程：图片识别 → Excel处理 → 标准采购单生成
+"""
+
+import os
+from pathlib import Path
+from typing import Optional, Dict, Any, List
+
+from .base import BaseProcessor
+from ...services.ocr_service import OCRService
+from ...services.order_service import OrderService
+from ...core.utils.log_utils import get_logger
+
+logger = get_logger(__name__)
+
+
+class OCRProcessor(BaseProcessor):
+    """OCR处理器
+    
+    处理图片文件的完整OCR识别流程：
+    1. OCR识别图片中的表格信息
+    2. 处理识别结果生成Excel文件
+    3. 转换为标准采购单格式
+    """
+    
+    def __init__(self, config: Dict[str, Any]):
+        """初始化OCR处理器
+        
+        Args:
+            config: 配置信息
+        """
+        super().__init__(config)
+        self.description = "OCR识别完整流程（图片→识别→Excel→采购单）"
+        
+        # 初始化服务
+        self.ocr_service = OCRService(config)
+        self.order_service = OrderService(config)
+    
+    def can_process(self, file_path: Path) -> bool:
+        """判断是否为支持的图片文件
+        
+        Args:
+            file_path: 文件路径
+            
+        Returns:
+            是否能处理该文件
+        """
+        if not self.validate_input(file_path):
+            return False
+        
+        # 支持的图片格式
+        supported_extensions = ['.jpg', '.jpeg', '.png', '.bmp']
+        
+        if file_path.suffix.lower() in supported_extensions:
+            self.logger.info(f"识别为图片文件: {file_path.name}")
+            return True
+        
+        return False
+    
+    def process(self, input_file: Path, output_dir: Path) -> Optional[Path]:
+        """处理图片文件的完整OCR流程
+        
+        Args:
+            input_file: 输入图片文件路径
+            output_dir: 输出目录路径
+            
+        Returns:
+            输出文件路径，处理失败返回None
+        """
+        self.log_processing_start(input_file)
+        
+        try:
+            self.logger.info("开始OCR识别流程...")
+            
+            # 步骤1: OCR识别
+            self.logger.info("步骤1/3: OCR识别图片...")
+            ocr_result = self._perform_ocr(input_file, output_dir)
+            if not ocr_result:
+                self.logger.error("OCR识别失败")
+                self.log_processing_end(input_file, success=False)
+                return None
+            
+            # 步骤2: Excel处理
+            self.logger.info("步骤2/3: 处理Excel文件...")
+            excel_result = self._process_excel(ocr_result, output_dir)
+            if not excel_result:
+                self.logger.error("Excel处理失败")
+                self.log_processing_end(input_file, success=False)
+                return None
+            
+            # 步骤3: 生成标准采购单
+            self.logger.info("步骤3/3: 生成标准采购单...")
+            final_result = self._generate_purchase_order(excel_result, output_dir)
+            
+            if final_result:
+                self.logger.info(f"OCR处理流程完成，输出文件: {final_result}")
+                self.log_processing_end(input_file, final_result, success=True)
+                return final_result
+            else:
+                self.logger.error("生成采购单失败")
+                self.log_processing_end(input_file, success=False)
+                return None
+                
+        except Exception as e:
+            self.logger.error(f"OCR处理流程出错: {e}", exc_info=True)
+            self.log_processing_end(input_file, success=False)
+            return None
+    
+    def get_required_columns(self) -> List[str]:
+        """返回需要的列名列表"""
+        # OCR处理不直接依赖列名，由后续处理步骤决定
+        return []
+    
+    def get_supported_extensions(self) -> List[str]:
+        """支持的文件扩展名"""
+        return ['.jpg', '.jpeg', '.png', '.bmp']
+    
+    def _perform_ocr(self, input_file: Path, output_dir: Path) -> Optional[Path]:
+        """执行OCR识别
+        
+        Args:
+            input_file: 输入图片文件
+            output_dir: 输出目录
+            
+        Returns:
+            OCR生成的Excel文件路径，失败返回None
+        """
+        try:
+            self.logger.info(f"开始OCR识别: {input_file}")
+            
+            # 使用OCR服务处理图片
+            result_path = self.ocr_service.process_image(str(input_file))
+            
+            if result_path:
+                # 确保结果文件在输出目录中
+                result_path = Path(result_path)
+                if result_path.exists():
+                    self.logger.info(f"OCR识别成功，输出文件: {result_path}")
+                    return result_path
+                else:
+                    self.logger.error(f"OCR结果文件不存在: {result_path}")
+                    return None
+            else:
+                self.logger.error("OCR服务返回None")
+                return None
+                
+        except Exception as e:
+            self.logger.error(f"OCR识别失败: {e}", exc_info=True)
+            return None
+    
+    def _process_excel(self, excel_file: Path, output_dir: Path) -> Optional[Path]:
+        """处理Excel文件
+        
+        Args:
+            excel_file: Excel文件路径
+            output_dir: 输出目录
+            
+        Returns:
+            处理后的Excel文件路径，失败返回None
+        """
+        try:
+            self.logger.info(f"开始处理Excel文件: {excel_file}")
+            
+            # 使用订单服务处理Excel文件（生成采购单）
+            result_path = self.order_service.process_excel(str(excel_file))
+            
+            if result_path:
+                result_path = Path(result_path)
+                if result_path.exists():
+                    self.logger.info(f"Excel处理成功，输出文件: {result_path}")
+                    return result_path
+                else:
+                    self.logger.error(f"Excel处理结果文件不存在: {result_path}")
+                    return None
+            else:
+                self.logger.error("Excel处理服务返回None")
+                return None
+                
+        except Exception as e:
+            self.logger.error(f"Excel处理失败: {e}", exc_info=True)
+            return None
+    
+    def _generate_purchase_order(self, processed_file: Path, output_dir: Path) -> Optional[Path]:
+        """采购单生成由OrderService完成，此处直接返回处理结果"""
+        try:
+            if processed_file and processed_file.exists():
+                return processed_file
+            return None
+        except Exception:
+            return None
@@ -0,0 +1,7 @@
+"""
+供应商处理器模块初始化文件
+"""
+
+from .generic_supplier_processor import GenericSupplierProcessor
+
+__all__ = ['GenericSupplierProcessor']
@@ -0,0 +1,340 @@
+"""
+通用供应商处理器
+
+可配置化的供应商处理器，支持通过配置文件定义处理规则
+"""
+
+import fnmatch
+import pandas as pd
+from typing import Optional, Dict, Any, List
+from pathlib import Path
+
+from ..base import BaseProcessor
+from ...utils.log_utils import get_logger
+from ...handlers.rule_engine import apply_rules
+from ...handlers.column_mapper import ColumnMapper
+from ...handlers.data_cleaner import DataCleaner
+from ...handlers.calculator import DataCalculator
+
+logger = get_logger(__name__)
+
+
+class GenericSupplierProcessor(BaseProcessor):
+    """通用供应商处理器
+    
+    基于配置文件处理不同供应商的Excel文件，支持：
+    - 文件名模式匹配
+    - 内容特征识别
+    - 列映射配置
+    - 数据清洗规则
+    - 计算处理规则
+    """
+    
+    def __init__(self, config: Dict[str, Any], supplier_config: Dict[str, Any]):
+        """初始化通用供应商处理器
+        
+        Args:
+            config: 系统配置
+            supplier_config: 供应商特定配置
+        """
+        super().__init__(config)
+        self.supplier_config = supplier_config
+        
+        # 从配置中提取基本信息
+        self.name = supplier_config.get('name', 'GenericSupplier')
+        self.description = supplier_config.get('description', '通用供应商处理器')
+        
+        # 处理规则配置
+        self.filename_patterns = supplier_config.get('filename_patterns', [])
+        self.content_indicators = supplier_config.get('content_indicators', [])
+        self.column_mapping = supplier_config.get('column_mapping', {})
+        self.cleaning_rules = supplier_config.get('cleaning_rules', [])
+        self.calculations = supplier_config.get('calculations', [])
+        
+        # 输出配置
+        self.output_template = supplier_config.get('output_template', 'templates/银豹-采购单模板.xls')
+        self.output_suffix = supplier_config.get('output_suffix', '_银豹采购单')
+    
+    def can_process(self, file_path: Path) -> bool:
+        """判断是否能处理该文件
+        
+        Args:
+            file_path: 文件路径
+            
+        Returns:
+            是否能处理
+        """
+        if not self.validate_input(file_path):
+            return False
+        
+        # 检查文件名模式
+        if self.filename_patterns:
+            filename_match = self._check_filename_patterns(file_path)
+            if filename_match:
+                return True
+        
+        # 检查文件内容特征
+        if self.content_indicators:
+            content_match = self._check_content_indicators(file_path)
+            if content_match:
+                return True
+        
+        # 如果都没有配置，则无法判断
+        if not self.filename_patterns and not self.content_indicators:
+            self.logger.warning(f"处理器 {self.name} 没有配置识别规则")
+            return False
+        
+        return False
+    
+    def process(self, input_file: Path, output_dir: Path) -> Optional[Path]:
+        """处理文件
+        
+        Args:
+            input_file: 输入文件路径
+            output_dir: 输出目录路径
+            
+        Returns:
+            输出文件路径，处理失败返回None
+        """
+        self.log_processing_start(input_file)
+        
+        try:
+            # 步骤1: 读取数据
+            self.logger.info("步骤1/4: 读取数据...")
+            df = self._read_supplier_data(input_file)
+            if df is None or df.empty:
+                self.logger.error("读取数据失败或数据为空")
+                self.log_processing_end(input_file, success=False)
+                return None
+            
+            # 步骤2: 应用列映射
+            self.logger.info("步骤2/4: 应用列映射...")
+            mapped_df = self._apply_column_mapping(df)
+            if mapped_df is None:
+                self.logger.error("列映射失败")
+                self.log_processing_end(input_file, success=False)
+                return None
+            
+            # 步骤3: 数据清洗
+            self.logger.info("步骤3/4: 数据清洗...")
+            cleaned_df = self._apply_data_cleaning(mapped_df)
+            if cleaned_df is None:
+                self.logger.error("数据清洗失败")
+                self.log_processing_end(input_file, success=False)
+                return None
+            try:
+                rules = self.supplier_config.get('rules', [])
+                dictionary = self.supplier_config.get('dictionary')
+                standardized_df = apply_rules(cleaned_df, rules, dictionary)
+            except Exception as e:
+                self.logger.warning(f"规则执行失败: {e}")
+                standardized_df = cleaned_df
+            
+            # 步骤4: 计算处理
+            self.logger.info("步骤4/4: 计算处理...")
+            calculated_df = self._apply_calculations(standardized_df)
+            if calculated_df is None:
+                self.logger.error("计算处理失败")
+                self.log_processing_end(input_file, success=False)
+                return None
+            
+            # 生成输出文件
+            output_file = self._generate_output(calculated_df, input_file, output_dir)
+            
+            if output_file and output_file.exists():
+                self.logger.info(f"处理完成，输出文件: {output_file}")
+                self.log_processing_end(input_file, output_file, success=True)
+                return output_file
+            else:
+                self.logger.error("输出文件生成失败")
+                self.log_processing_end(input_file, success=False)
+                return None
+                
+        except Exception as e:
+            self.logger.error(f"处理文件时出错: {e}", exc_info=True)
+            self.log_processing_end(input_file, success=False)
+            return None
+    
+    def get_required_columns(self) -> List[str]:
+        """返回需要的列名列表"""
+        # 从列映射配置中提取目标列名
+        return list(self.column_mapping.values()) if self.column_mapping else []
+    
+    def _check_filename_patterns(self, file_path: Path) -> bool:
+        """检查文件名模式
+        
+        Args:
+            file_path: 文件路径
+            
+        Returns:
+            是否匹配
+        """
+        try:
+            filename = file_path.name
+            for pattern in self.filename_patterns:
+                if fnmatch.fnmatch(filename.lower(), pattern.lower()):
+                    self.logger.info(f"文件名匹配成功: {filename} -> {pattern}")
+                    return True
+            return False
+        except Exception as e:
+            self.logger.error(f"检查文件名模式时出错: {e}")
+            return False
+    
+    def _check_content_indicators(self, file_path: Path) -> bool:
+        """检查文件内容特征
+        
+        Args:
+            file_path: 文件路径
+            
+        Returns:
+            是否匹配
+        """
+        try:
+            df = self._read_excel_safely(file_path, nrows=5)
+            
+            # 检查列名中是否包含指定关键词
+            columns_str = str(list(df.columns)).lower()
+            
+            for indicator in self.content_indicators:
+                if indicator.lower() in columns_str:
+                    self.logger.info(f"内容特征匹配成功: {indicator}")
+                    return True
+            
+            return False
+            
+        except Exception as e:
+            self.logger.error(f"检查内容特征时出错: {e}")
+            return False
+    
+    def _read_supplier_data(self, file_path: Path) -> Optional[pd.DataFrame]:
+        """读取供应商数据
+        
+        Args:
+            file_path: 文件路径
+            
+        Returns:
+            数据DataFrame或None
+        """
+        try:
+            specified = self.supplier_config.get('header_row')
+            if specified is not None:
+                try:
+                    df = self._read_excel_safely(file_path, header=int(specified))
+                except Exception:
+                    df = self._read_excel_safely(file_path)
+            else:
+                df0 = self._read_excel_safely(file_path, header=None)
+                if df0 is None:
+                    return None
+                header_row = self._find_header_row(df0)
+                if header_row is not None:
+                    df = self._read_excel_safely(file_path, header=header_row)
+                else:
+                    df = self._read_excel_safely(file_path)
+            if df is None or df.empty:
+                self.logger.warning("数据文件为空")
+                return None
+            self.logger.info(f"成功读取数据，形状: {df.shape}")
+            return df
+        except Exception as e:
+            self.logger.error(f"读取数据失败: {e}")
+            return None
+
+    def _find_header_row(self, df: pd.DataFrame) -> Optional[int]:
+        result = ColumnMapper.detect_header_row(df, max_rows=30)
+        return result if result >= 0 else None
+    
+    def _apply_column_mapping(self, df: pd.DataFrame) -> Optional[pd.DataFrame]:
+        """应用列映射
+        
+        Args:
+            df: 原始数据
+            
+        Returns:
+            映射后的数据或None
+        """
+        if not self.column_mapping:
+            self.logger.info("没有列映射配置")
+            return df
+        
+        try:
+            # 应用列重命名
+            df_renamed = df.rename(columns=self.column_mapping)
+            
+            # 检查必需的列是否存在
+            required_columns = self.get_required_columns()
+            missing_columns = [col for col in required_columns if col not in df_renamed.columns]
+            
+            if missing_columns:
+                self.logger.warning(f"缺少必需的列: {missing_columns}")
+                # 创建缺失的列并填充默认值
+                for col in missing_columns:
+                    df_renamed[col] = 0 if '量' in col or '价' in col else ''
+                    self.logger.info(f"创建缺失列: {col}，默认值: {df_renamed[col].iloc[0] if len(df_renamed) > 0 else 'N/A'}")
+            
+            self.logger.info(f"列映射完成，列名: {list(df_renamed.columns)}")
+            return df_renamed
+            
+        except Exception as e:
+            self.logger.error(f"列映射失败: {e}")
+            return None
+    
+    def _apply_data_cleaning(self, df: pd.DataFrame) -> Optional[pd.DataFrame]:
+        """应用数据清洗规则，委托给 DataCleaner"""
+        if not self.cleaning_rules:
+            self.logger.info("没有数据清洗规则")
+            return df
+        try:
+            cleaner = DataCleaner()
+            for rule in self.cleaning_rules:
+                cleaner.add_rule(rule.get('type'), **{k: v for k, v in rule.items() if k != 'type'})
+            result = cleaner.clean(df)
+            self.logger.info(f"数据清洗完成，数据形状: {result.shape}")
+            return result
+        except Exception as e:
+            self.logger.error(f"数据清洗失败: {e}")
+            return None
+    
+    def _apply_calculations(self, df: pd.DataFrame) -> Optional[pd.DataFrame]:
+        """应用计算处理，委托给 DataCalculator"""
+        if not self.calculations:
+            self.logger.info("没有计算规则")
+            return df
+        try:
+            calculator = DataCalculator()
+            for calc in self.calculations:
+                calculator.add_rule(calc.get('type'), **{k: v for k, v in calc.items() if k != 'type'})
+            result = calculator.calculate(df)
+            self.logger.info(f"计算处理完成，数据形状: {result.shape}")
+            return result
+        except Exception as e:
+            self.logger.error(f"计算处理失败: {e}")
+            return None
+    
+    def _generate_output(self, df: pd.DataFrame, input_file: Path, output_dir: Path) -> Optional[Path]:
+        """生成输出文件
+        
+        Args:
+            df: 最终数据
+            input_file: 输入文件路径
+            output_dir: 输出目录
+            
+        Returns:
+            输出文件路径或None
+        """
+        try:
+            # 生成输出文件名
+            timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
+            output_filename = f"{input_file.stem}{self.output_suffix}_{timestamp}.xls"
+            output_file = output_dir / output_filename
+            
+            # 这里应该使用实际的模板生成逻辑
+            # 暂时直接保存为Excel文件
+            df.to_excel(output_file, index=False)
+            
+            self.logger.info(f"输出文件生成成功: {output_file}")
+            return output_file
+            
+        except Exception as e:
+            self.logger.error(f"生成输出文件失败: {e}")
+            return None
@@ -0,0 +1,347 @@
+"""
+烟草订单处理器
+
+处理烟草公司特定格式的订单明细文件，生成银豹采购单
+"""
+
+import os
+import datetime
+import pandas as pd
+import xlrd
+import xlwt
+from xlutils.copy import copy
+from openpyxl import load_workbook
+from typing import Optional, Dict, Any, List, Tuple
+from pathlib import Path
+
+from .base import BaseProcessor
+from ...core.utils.log_utils import get_logger
+from ...core.utils.string_utils import parse_monetary_string
+from ...core.utils.dialog_utils import show_custom_dialog
+
+logger = get_logger(__name__)
+
+
+class TobaccoProcessor(BaseProcessor):
+    """烟草订单处理器
+    
+    处理烟草公司订单明细文件，提取商品信息并生成标准银豹采购单格式
+    """
+    
+    def __init__(self, config: Dict[str, Any]):
+        """初始化烟草订单处理器
+        
+        Args:
+            config: 配置信息
+        """
+        super().__init__(config)
+        self.description = "处理烟草公司订单明细文件"
+        self.template_file = config.get('Paths', 'template_file', fallback='templates/银豹-采购单模板.xls')
+        
+        # 输出目录配置
+        self.result_dir = Path("data/result")
+        self.result_dir.mkdir(exist_ok=True)
+        
+        # 默认输出文件名
+        self.default_output_name = "银豹采购单_烟草公司.xls"
+    
+    def can_process(self, file_path: Path) -> bool:
+        """判断是否为烟草订单文件
+        
+        Args:
+            file_path: 文件路径
+            
+        Returns:
+            是否能处理该文件
+        """
+        if not self.validate_input(file_path):
+            return False
+        
+        # 检查文件名特征
+        filename = file_path.name
+        tobacco_keywords = ['烟草', '卷烟', '订单明细', 'tobacco', '烟']
+        
+        # 检查文件内容特征
+        try:
+            df = self._read_excel_safely(file_path, nrows=5)
+            required_columns = ['商品', '盒码', '订单量']
+            
+            # 检查文件名或内容特征
+            filename_match = any(keyword in filename for keyword in tobacco_keywords)
+            content_match = all(col in df.columns for col in required_columns)
+            
+            if filename_match or content_match:
+                self.logger.info(f"识别为烟草订单文件: {filename}")
+                return True
+            
+            return False
+            
+        except Exception as e:
+            self.logger.warning(f"检查文件内容时出错: {e}")
+            # 如果无法读取内容，仅基于文件名判断
+            return any(keyword in filename for keyword in tobacco_keywords)
+    
+    def process(self, input_file: Path, output_dir: Path) -> Optional[Path]:
+        """处理烟草订单
+        
+        Args:
+            input_file: 输入文件路径
+            output_dir: 输出目录路径
+            
+        Returns:
+            输出文件路径，处理失败返回None
+        """
+        self.log_processing_start(input_file)
+        
+        try:
+            # 读取订单信息（时间和总金额）
+            order_info = self._read_order_info(input_file)
+            if not order_info:
+                self.logger.error(f"读取订单信息失败: {input_file}")
+                self.log_processing_end(input_file, success=False)
+                return None
+            
+            order_time, total_amount = order_info
+            self.logger.info(f"订单信息 - 时间: {order_time}, 总金额: {total_amount}")
+            
+            # 读取订单数据
+            order_data = self._read_order_data(input_file)
+            if order_data is None or order_data.empty:
+                self.logger.error(f"读取订单数据失败或数据为空: {input_file}")
+                self.log_processing_end(input_file, success=False)
+                return None
+            
+            self.logger.info(f"成功读取订单数据，共{len(order_data)}条记录")
+            
+            # 生成输出文件路径
+            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+            output_filename = f"银豹采购单_烟草公司_{timestamp}.xls"
+            output_file = output_dir / output_filename
+            
+            # 确保输出目录存在
+            output_file.parent.mkdir(parents=True, exist_ok=True)
+            
+            # 生成银豹采购单
+            result = self._generate_pospal_order(order_data, order_time, output_file)
+            
+            if result:
+                self.logger.info(f"采购单生成成功: {output_file}")
+                self.log_processing_end(input_file, output_file, success=True)
+                
+                # 显示处理结果
+                self._show_processing_result(output_file, order_time, len(order_data), total_amount)
+                
+                return output_file
+            else:
+                self.logger.error("生成银豹采购单失败")
+                self.log_processing_end(input_file, success=False)
+                return None
+                
+        except Exception as e:
+            self.logger.error(f"处理烟草订单时发生错误: {e}", exc_info=True)
+            self.log_processing_end(input_file, success=False)
+            return None
+    
+    def get_required_columns(self) -> List[str]:
+        """返回需要的列名列表"""
+        return ['商品', '盒码', '条码', '建议零售价', '批发价', '需求量', '订单量', '金额']
+    
+    def get_supported_extensions(self) -> List[str]:
+        """支持的文件扩展名"""
+        return ['.xlsx', '.xls']
+    
+    def _read_order_info(self, file_path: Path) -> Optional[Tuple[str, float]]:
+        """读取订单信息（时间和总金额）
+        
+        Args:
+            file_path: 文件路径
+            
+        Returns:
+            包含订单时间和总金额的元组或None
+        """
+        try:
+            wb_info = load_workbook(file_path, data_only=True)
+            ws_info = wb_info.active
+            
+            # 从指定单元格读取订单信息
+            order_time = ws_info["H1"].value or "（空）"
+            total_amount = ws_info["H3"].value or 0.0
+            
+            self.logger.info(f"成功读取订单信息: 时间={order_time}, 总金额={total_amount}")
+            return (order_time, total_amount)
+            
+        except Exception as e:
+            self.logger.error(f"读取订单信息出错: {e}")
+            return None
+    
+    def _read_order_data(self, file_path: Path) -> Optional[pd.DataFrame]:
+        """读取订单数据
+        
+        Args:
+            file_path: 文件路径
+            
+        Returns:
+            订单数据DataFrame或None
+        """
+        columns = ['商品', '盒码', '条码', '建议零售价', '批发价', '需求量', '订单量', '金额']
+        
+        try:
+            df_old = self._read_excel_safely(file_path, header=None, skiprows=3, names=columns)
+            
+            # 过滤订单量不为0的数据，并计算采购量和单价
+            df_filtered = df_old[df_old['订单量'] != 0].copy()
+            
+            if df_filtered.empty:
+                self.logger.warning("没有订单量不为0的记录")
+                return None
+            
+            # 计算采购量和单价
+            df_filtered['采购量'] = df_filtered['订单量'] * 10  # 烟草订单通常需要乘以10
+            df_filtered['采购单价'] = df_filtered['金额'] / df_filtered['采购量']
+            df_filtered = df_filtered.reset_index(drop=True)
+            
+            self.logger.info(f"成功处理订单数据，有效记录数: {len(df_filtered)}")
+            return df_filtered
+            
+        except Exception as e:
+            self.logger.error(f"读取订单数据失败: {e}")
+            return None
+
+    def _generate_pospal_order(self, order_data: pd.DataFrame, order_time: str, output_file: Path) -> bool:
+        """生成银豹采购单
+        
+        Args:
+            order_data: 订单数据
+            order_time: 订单时间
+            output_file: 输出文件路径
+            
+        Returns:
+            是否生成成功
+        """
+        try:
+            # 检查模板文件是否存在
+            template_path = Path(self.template_file)
+            if not template_path.exists():
+                self.logger.error(f"采购单模板文件不存在: {template_path}")
+                return False
+            
+            self.logger.info(f"使用模板文件: {template_path}")
+            
+            # 打开模板，准备写入
+            template_rd = xlrd.open_workbook(str(template_path), formatting_info=True)
+            template_wb = copy(template_rd)
+            template_ws = template_wb.get_sheet(0)
+            
+            # 获取模板中的表头列索引
+            header_row = template_rd.sheet_by_index(0).row_values(0)
+            
+            # 查找需要的列索引
+            try:
+                barcode_col = header_row.index("条码（必填）")
+                amount_col = header_row.index("采购量（必填）")
+                gift_col = header_row.index("赠送量")
+                price_col = header_row.index("采购单价（必填）")
+            except ValueError as e:
+                self.logger.error(f"模板列查找失败: {e}")
+                return False
+            
+            self.logger.info(f"模板列索引 - 条码:{barcode_col}, 采购量:{amount_col}, 赠送量:{gift_col}, 单价:{price_col}")
+            
+            # 写入数据到模板
+            for i, row in order_data.iterrows():
+                template_ws.write(i + 1, barcode_col, row['盒码'])  # 商品条码
+                template_ws.write(i + 1, amount_col, int(row['采购量']))  # 采购量
+                template_ws.write(i + 1, gift_col, "")  # 赠送量为空
+                template_ws.write(i + 1, price_col, round(row['采购单价'], 2))  # 采购单价保留两位小数
+            
+            # 确保输出目录存在
+            output_file.parent.mkdir(parents=True, exist_ok=True)
+            
+            # 保存输出文件
+            template_wb.save(str(output_file))
+            
+            self.logger.info(f"采购单生成成功: {output_file}")
+            return True
+            
+        except Exception as e:
+            self.logger.error(f"生成银豹采购单失败: {e}", exc_info=True)
+            return False
+    
+    def _show_processing_result(self, output_file: Path, order_time: str, total_count: int, total_amount: float):
+        """显示处理结果
+        
+        Args:
+            output_file: 输出文件路径
+            order_time: 订单时间
+            total_count: 处理条目数
+            total_amount: 总金额
+        """
+        try:
+            # 创建附加信息
+            additional_info = {
+                "订单来源": "烟草公司",
+                "处理时间": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+            }
+            
+            # 格式化金额显示
+            parsed = parse_monetary_string(total_amount)
+            total_amount = parsed if parsed is not None else 0.0
+            amount_display = f"¥{total_amount:.2f}"
+            
+            # 显示自定义对话框
+            show_custom_dialog(
+                title="烟草订单处理结果",
+                message="烟草订单处理完成",
+                result_file=str(output_file),
+                time_info=order_time,
+                count_info=f"{total_count}个商品",
+                amount_info=amount_display,
+                additional_info=additional_info
+            )
+            
+            self.logger.info(f"显示处理结果 - 文件:{output_file}, 时间:{order_time}, 数量:{total_count}, 金额:{total_amount}")
+            
+        except Exception as e:
+            self.logger.error(f"显示处理结果时出错: {e}")
+    
+    def get_latest_tobacco_order(self) -> Optional[Path]:
+        """获取最新的烟草订单明细文件（兼容旧接口）
+        
+        Returns:
+            文件路径或None
+        """
+        try:
+            # 获取今日开始时间戳
+            today = datetime.date.today()
+            today_start = datetime.datetime.combine(today, datetime.time.min).timestamp()
+            
+            # 查找订单明细文件
+            result_dir = Path("data/output")
+            if not result_dir.exists():
+                return None
+            
+            # 查找符合条件的文件
+            candidates = []
+            for file_path in result_dir.glob("订单明细*.xlsx"):
+                if file_path.stat().st_ctime >= today_start:
+                    candidates.append(file_path)
+            
+            if not candidates:
+                self.logger.warning("未找到今天创建的烟草订单明细文件")
+                # 返回最新的文件
+                all_files = list(result_dir.glob("订单明细*.xlsx"))
+                if all_files:
+                    all_files.sort(key=lambda x: x.stat().st_ctime, reverse=True)
+                    return all_files[0]
+                return None
+            
+            # 返回最新的文件
+            candidates.sort(key=lambda x: x.stat().st_ctime, reverse=True)
+            latest_file = candidates[0]
+            
+            self.logger.info(f"找到最新烟草订单明细文件: {latest_file}")
+            return latest_file
+            
+        except Exception as e:
+            self.logger.error(f"获取最新烟草订单文件时出错: {e}")
+            return None
@@ -0,0 +1,5 @@
+"""
+OCR订单处理系统 - 工具模块
+------------------------
+提供系统通用工具和辅助函数。
+""" 
@@ -0,0 +1,184 @@
+"""云端同步模块 — 基于 Gitea REST API 的文件同步"""
+
+import base64
+import json
+from typing import Optional, Tuple
+
+import requests
+
+from .log_utils import get_logger
+
+logger = get_logger(__name__)
+
+
+class GiteaSync:
+    """通过 Gitea REST API 读写仓库文件"""
+
+    def __init__(self, base_url: str, owner: str, repo: str, token: str, timeout: int = 15):
+        self.base_url = base_url.rstrip("/")
+        self.owner = owner
+        self.repo = repo
+        self.token = token
+        self.timeout = timeout
+
+    @property
+    def _headers(self) -> dict:
+        return {"Authorization": f"token {self.token}"}
+
+    def _api_url(self, path: str) -> str:
+        return f"{self.base_url}/api/v1/repos/{self.owner}/{self.repo}/contents/{path}"
+
+    def pull_file(self, remote_path: str) -> Optional[Tuple[bytes, str]]:
+        """从仓库下载文件
+
+        Returns:
+            (content_bytes, sha) 或 None（文件不存在或失败）
+        """
+        try:
+            resp = requests.get(
+                self._api_url(remote_path),
+                headers=self._headers,
+                timeout=self.timeout,
+            )
+            if resp.status_code == 404:
+                logger.info(f"云端文件不存在: {remote_path}")
+                return None
+            if resp.status_code != 200:
+                logger.warning(f"拉取文件失败: {resp.status_code} {resp.text[:200]}")
+                return None
+
+            data = resp.json()
+            sha = data.get("sha", "")
+            content_b64 = data.get("content", "")
+            # Gitea 返回的 base64 可能含换行
+            content_bytes = base64.b64decode(content_b64.replace("\n", ""))
+            logger.info(f"拉取文件成功: {remote_path} ({len(content_bytes)} bytes)")
+            return content_bytes, sha
+
+        except requests.RequestException as e:
+            logger.error(f"拉取文件网络错误: {e}")
+            return None
+
+    def push_file(
+        self,
+        remote_path: str,
+        content: bytes,
+        message: str,
+        sha: Optional[str] = None,
+    ) -> Optional[str]:
+        """上传或更新文件到仓库
+
+        Args:
+            remote_path: 仓库中的文件路径
+            content: 文件内容（bytes）
+            message: commit message
+            sha: 文件当前 sha（更新时必传，新建时省略）
+
+        Returns:
+            新的 sha，失败返回 None
+        """
+        payload = {
+            "message": message,
+            "content": base64.b64encode(content).decode("ascii"),
+        }
+        if sha:
+            payload["sha"] = sha
+
+        try:
+            resp = requests.put(
+                self._api_url(remote_path),
+                headers={**self._headers, "Content-Type": "application/json"},
+                json=payload,
+                timeout=self.timeout,
+            )
+            if resp.status_code not in (200, 201):
+                logger.warning(f"推送文件失败: {resp.status_code} {resp.text[:200]}")
+                return None
+
+            new_sha = resp.json().get("content", {}).get("sha", "")
+            logger.info(f"推送文件成功: {remote_path} (sha={new_sha[:12]})")
+            return new_sha
+
+        except requests.RequestException as e:
+            logger.error(f"推送文件网络错误: {e}")
+            return None
+
+    def file_exists(self, remote_path: str) -> Optional[str]:
+        """检查文件是否存在
+
+        Returns:
+            文件 sha（存在）或 None（不存在）
+        """
+        try:
+            resp = requests.head(
+                self._api_url(remote_path),
+                headers=self._headers,
+                timeout=self.timeout,
+            )
+            if resp.status_code == 200:
+                # HEAD 不返回 body，需要 GET 获取 sha
+                result = self.pull_file(remote_path)
+                return result[1] if result else None
+            return None
+        except requests.RequestException:
+            return None
+
+    def pull_json(self, remote_path: str) -> Optional[Tuple[dict, str]]:
+        """拉取并解析 JSON 文件
+
+        Returns:
+            (parsed_dict, sha) 或 None
+        """
+        result = self.pull_file(remote_path)
+        if result is None:
+            return None
+        content_bytes, sha = result
+        try:
+            data = json.loads(content_bytes)
+            return data, sha
+        except json.JSONDecodeError as e:
+            logger.error(f"解析 JSON 失败: {e}")
+            return None
+
+    def push_json(self, remote_path: str, data: dict, message: str, sha: Optional[str] = None) -> Optional[str]:
+        """将 dict 序列化为 JSON 并推送
+
+        Returns:
+            新的 sha，失败返回 None
+        """
+        content = json.dumps(data, ensure_ascii=False, indent=2).encode("utf-8")
+        return self.push_file(remote_path, content, message, sha)
+
+    def push_binary(self, remote_path: str, local_path: str, message: str) -> Optional[str]:
+        """读取本地二进制文件并推送到云端
+
+        Returns:
+            新的 sha，失败返回 None
+        """
+        try:
+            with open(local_path, "rb") as f:
+                content = f.read()
+        except OSError as e:
+            logger.error(f"读取本地文件失败: {local_path} — {e}")
+            return None
+
+        existing_sha = self.file_exists(remote_path)
+        return self.push_file(remote_path, content, message, sha=existing_sha)
+
+    @classmethod
+    def from_config(cls, config) -> Optional["GiteaSync"]:
+        """从 ConfigManager 创建实例
+
+        Returns:
+            GiteaSync 实例，配置不完整时返回 None
+        """
+        base_url = config.get("Gitea", "base_url", fallback="").strip()
+        owner = config.get("Gitea", "owner", fallback="").strip()
+        repo = config.get("Gitea", "repo", fallback="").strip()
+        token = config.get("Gitea", "token", fallback="").strip()
+
+        if not all([base_url, owner, repo, token]):
+            logger.debug("Gitea 配置不完整，跳过云端同步")
+            return None
+
+        return cls(base_url=base_url, owner=owner, repo=repo, token=token)
@@ -0,0 +1,286 @@
+"""
+文件操作工具模块
+--------------
+提供文件处理、查找和管理功能。
+"""
+
+import os
+import sys
+import shutil
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Optional, Union, Any
+
+from .log_utils import get_logger
+
+logger = get_logger(__name__)
+
+def ensure_dir(directory: str) -> bool:
+    """
+    确保目录存在，如果不存在则创建
+    
+    Args:
+        directory: 目录路径
+        
+    Returns:
+        是否成功创建或目录已存在
+    """
+    try:
+        os.makedirs(directory, exist_ok=True)
+        return True
+    except Exception as e:
+        logger.error(f"创建目录失败: {directory}, 错误: {e}")
+        return False
+
+def get_file_extension(file_path: str) -> str:
+    """
+    获取文件扩展名（小写）
+    
+    Args:
+        file_path: 文件路径
+        
+    Returns:
+        文件扩展名，包含点（例如 .jpg）
+    """
+    return os.path.splitext(file_path)[1].lower()
+
+def is_valid_extension(file_path: str, allowed_extensions: List[str]) -> bool:
+    """
+    检查文件扩展名是否在允许的列表中
+    
+    Args:
+        file_path: 文件路径
+        allowed_extensions: 允许的扩展名列表（例如 ['.jpg', '.png']）
+        
+    Returns:
+        文件扩展名是否有效
+    """
+    ext = get_file_extension(file_path)
+    return ext in allowed_extensions
+
+def get_files_by_extensions(directory: str, extensions: List[str], exclude_patterns: List[str] = None) -> List[str]:
+    """
+    获取指定目录下所有符合扩展名的文件路径
+    
+    Args:
+        directory: 目录路径
+        extensions: 扩展名列表（例如 ['.jpg', '.png']）
+        exclude_patterns: 排除的文件名模式（例如 ['~$', '.tmp']）
+        
+    Returns:
+        文件路径列表
+    """
+    if exclude_patterns is None:
+        exclude_patterns = ['~$', '.tmp']
+        
+    files = []
+    for file in os.listdir(directory):
+        file_path = os.path.join(directory, file)
+        
+        # 检查是否是文件
+        if not os.path.isfile(file_path):
+            continue
+            
+        # 检查扩展名
+        if not is_valid_extension(file_path, extensions):
+            continue
+            
+        # 检查排除模式
+        exclude = False
+        for pattern in exclude_patterns:
+            if pattern in file:
+                exclude = True
+                break
+                
+        if not exclude:
+            files.append(file_path)
+            
+    return files
+
+def get_latest_file(directory: str, pattern: str = "", extensions: List[str] = None) -> Optional[str]:
+    """
+    获取指定目录下最新的文件
+    
+    Args:
+        directory: 目录路径
+        pattern: 文件名包含的字符串模式
+        extensions: 限制的文件扩展名列表
+        
+    Returns:
+        最新文件的路径，如果没有找到则返回None
+    """
+    if not os.path.exists(directory):
+        logger.warning(f"目录不存在: {directory}")
+        return None
+        
+    files = []
+    for file in os.listdir(directory):
+        # 检查模式和扩展名
+        if (pattern and pattern not in file) or \
+           (extensions and not is_valid_extension(file, extensions)):
+            continue
+            
+        file_path = os.path.join(directory, file)
+        if os.path.isfile(file_path):
+            files.append((file_path, os.path.getmtime(file_path)))
+    
+    if not files:
+        logger.warning(f"未在目录 {directory} 中找到符合条件的文件")
+        return None
+    
+    # 按修改时间排序，返回最新的
+    sorted_files = sorted(files, key=lambda x: x[1], reverse=True)
+    return sorted_files[0][0]
+
+def generate_timestamp_filename(original_path: str) -> str:
+    """
+    生成基于时间戳的文件名
+    
+    Args:
+        original_path: 原始文件路径
+        
+    Returns:
+        带时间戳的新文件路径
+    """
+    dir_path = os.path.dirname(original_path)
+    ext = os.path.splitext(original_path)[1]
+    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
+    return os.path.join(dir_path, f"{timestamp}{ext}")
+
+def rename_file(source_path: str, target_path: str) -> bool:
+    """
+    重命名文件
+    
+    Args:
+        source_path: 源文件路径
+        target_path: 目标文件路径
+        
+    Returns:
+        是否成功重命名
+    """
+    try:
+        # 确保目标目录存在
+        target_dir = os.path.dirname(target_path)
+        ensure_dir(target_dir)
+        
+        # 重命名文件
+        os.rename(source_path, target_path)
+        logger.info(f"文件已重命名: {os.path.basename(source_path)} -> {os.path.basename(target_path)}")
+        return True
+    except Exception as e:
+        logger.error(f"重命名文件失败: {e}")
+        return False
+
+def load_json(file_path: str, default: Any = None) -> Any:
+    """
+    加载JSON文件
+    
+    Args:
+        file_path: JSON文件路径
+        default: 如果文件不存在或加载失败时返回的默认值
+        
+    Returns:
+        JSON内容，或者默认值
+    """
+    if not os.path.exists(file_path):
+        return default
+        
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            return json.load(f)
+    except Exception as e:
+        logger.error(f"加载JSON文件失败: {file_path}, 错误: {e}")
+        return default
+
+def save_json(data: Any, file_path: str, ensure_ascii: bool = False, indent: int = 2) -> bool:
+    """
+    保存数据到JSON文件
+    
+    Args:
+        data: 要保存的数据
+        file_path: JSON文件路径
+        ensure_ascii: 是否确保ASCII编码
+        indent: 缩进空格数
+        
+    Returns:
+        是否成功保存
+    """
+    try:
+        # 确保目录存在
+        directory = os.path.dirname(file_path)
+        ensure_dir(directory)
+        
+        with open(file_path, 'w', encoding='utf-8') as f:
+            json.dump(data, f, ensure_ascii=ensure_ascii, indent=indent)
+        logger.debug(f"JSON数据已保存到: {file_path}")
+        return True
+    except Exception as e:
+        logger.error(f"保存JSON文件失败: {file_path}, 错误: {e}")
+        return False
+
+def smart_read_excel(file_path: Union[str, Path], **kwargs) -> Any:
+    """
+    智能读取 Excel 文件，自动选择引擎并处理常见错误
+    
+    Args:
+        file_path: Excel 文件路径
+        **kwargs: 传递给 pd.read_excel 的额外参数
+        
+    Returns:
+        pandas.DataFrame 对象
+    """
+    import pandas as pd
+    
+    path_str = str(file_path)
+    ext = os.path.splitext(path_str)[1].lower()
+    
+    # 自动选择引擎
+    if ext == '.xlsx':
+        kwargs.setdefault('engine', 'openpyxl')
+    elif ext == '.xls':
+        kwargs.setdefault('engine', 'xlrd')
+    
+    try:
+        return pd.read_excel(path_str, **kwargs)
+    except Exception as e:
+        logger.error(f"读取 Excel 文件失败: {path_str}, 错误: {e}")
+        raise
+
+def get_file_size(file_path: str) -> int:
+    """
+    获取文件大小（字节）
+    
+    Args:
+        file_path: 文件路径
+        
+    Returns:
+        文件大小（字节）
+    """
+    try:
+        return os.path.getsize(file_path)
+    except Exception as e:
+        logger.error(f"获取文件大小失败: {file_path}, 错误: {e}")
+        return 0
+
+def is_file_size_valid(file_path: str, max_size_mb: float) -> bool:
+    """
+    检查文件大小是否在允许范围内
+    
+    Args:
+        file_path: 文件路径
+        max_size_mb: 最大允许大小（MB）
+        
+    Returns:
+        文件大小是否有效
+    """
+    size_bytes = get_file_size(file_path)
+    max_size_bytes = max_size_mb * 1024 * 1024
+    return size_bytes <= max_size_bytes
+
+
+def format_file_size(size_bytes: int) -> str:
+    """将字节数格式化为可读的文件大小字符串（KB/MB）"""
+    if size_bytes < 1024 * 1024:
+        return f"{size_bytes / 1024:.1f} KB"
+    return f"{size_bytes / (1024 * 1024):.1f} MB"
@@ -0,0 +1,180 @@
+"""
+日志工具模块
+----------
+提供统一的日志配置和管理功能。
+"""
+
+import os
+import sys
+import logging
+from logging.handlers import RotatingFileHandler
+from datetime import datetime
+from pathlib import Path
+from typing import Optional, Dict
+
+# Registry of the handlers created by setup_logger, keyed by
+# "<logger name>_file" and "<logger name>_console".
+_handlers: Dict[str, logging.Handler] = {}
+
+def setup_logger(name: str, 
+                log_file: Optional[str] = None, 
+                level=logging.INFO, 
+                console_output: bool = True,
+                file_output: bool = True,
+                log_format: str = '%(asctime)s - %(name)s - %(levelname)s - %(message)s') -> logging.Logger:
+    """
+    配置并返回日志记录器
+    
+    Args:
+        name: 日志记录器的名称
+        log_file: 日志文件路径，如果为None则使用默认路径
+        level: 日志级别
+        console_output: 是否输出到控制台
+        file_output: 是否输出到文件
+        log_format: 日志格式
+        
+    Returns:
+        配置好的日志记录器
+    """
+    # 获取或创建日志记录器
+    logger = logging.getLogger(name)
+    
+    # 如果已经配置过处理器，不重复配置
+    if logger.handlers:
+        return logger
+    
+    # 设置日志级别
+    logger.setLevel(level)
+    
+    # 创建格式化器
+    formatter = logging.Formatter(log_format)
+    
+    # 如果需要输出到文件
+    if file_output:
+        # 如果没有指定日志文件，使用默认路径
+        if log_file is None:
+            log_dir = os.path.abspath('logs')
+            # 确保日志目录存在
+            os.makedirs(log_dir, exist_ok=True)
+            log_file = os.path.join(log_dir, f"{name}.log")
+        
+        # 创建文件处理器
+        try:
+            # 使用滚动日志，限制单个日志大小与备份数量
+            file_handler = RotatingFileHandler(log_file, maxBytes=5 * 1024 * 1024, backupCount=3, encoding='utf-8')
+            file_handler.setFormatter(formatter)
+            file_handler.setLevel(level)
+            logger.addHandler(file_handler)
+            _handlers[f"{name}_file"] = file_handler
+            
+            # 记录活跃标记，避免被日志清理工具删除
+            active_marker = os.path.join(os.path.dirname(log_file), f"{name}.active")
+            with open(active_marker, 'w', encoding='utf-8') as f:
+                f.write(f"Active since: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+        except Exception as e:
+            print(f"无法创建日志文件处理器: {e}")
+    
+    # 如果需要输出到控制台
+    if console_output:
+        # 创建控制台处理器
+        console_handler = logging.StreamHandler(sys.stdout)
+        console_handler.setFormatter(formatter)
+        console_handler.setLevel(level)
+        logger.addHandler(console_handler)
+        _handlers[f"{name}_console"] = console_handler
+    
+    return logger
+
+def get_logger(name: str) -> logging.Logger:
+    """
+    获取已配置的日志记录器，如果不存在则创建一个新的
+    
+    Args:
+        name: 日志记录器的名称
+        
+    Returns:
+        日志记录器
+    """
+    logger = logging.getLogger(name)
+    if not logger.handlers:
+        return setup_logger(name)
+    return logger
+
+def set_log_level(level: str) -> None:
+    """
+    设置所有日志记录器的级别
+    
+    Args:
+        level: 日志级别(DEBUG, INFO, WARNING, ERROR, CRITICAL)
+    """
+    level_map = {
+        'debug': logging.DEBUG,
+        'info': logging.INFO,
+        'warning': logging.WARNING,
+        'error': logging.ERROR,
+        'critical': logging.CRITICAL
+    }
+    
+    # 获取对应的日志级别
+    log_level = level_map.get(level.lower(), logging.INFO)
+    
+    # 获取所有记录器
+    loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict]
+    
+    # 设置每个记录器的级别
+    for logger in loggers:
+        logger.setLevel(log_level)
+        
+    # 设置根记录器的级别
+    logging.getLogger().setLevel(log_level)
+    
+    print(f"所有日志记录器级别已设置为: {logging.getLevelName(log_level)}")
+
+def close_logger(name: str) -> None:
+    """
+    关闭日志记录器的所有处理器
+    
+    Args:
+        name: 日志记录器的名称
+    """
+    logger = logging.getLogger(name)
+    for handler in logger.handlers[:]:
+        handler.close()
+        logger.removeHandler(handler)
+    
+    # 清除处理器缓存
+    _handlers.pop(f"{name}_file", None)
+    _handlers.pop(f"{name}_console", None)
+
+def close_all_loggers() -> None:
+    """
+    关闭所有日志记录器的处理器
+    """
+    # 获取所有记录器
+    loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict]
+    
+    # 关闭每个记录器的处理器
+    for logger in loggers:
+        if hasattr(logger, 'handlers'):
+            for handler in logger.handlers[:]:
+                handler.close()
+                logger.removeHandler(handler)
+    
+    # 清空处理器缓存
+    _handlers.clear()
+    
+    print("所有日志记录器已关闭")
+
+def cleanup_active_marker(name: str) -> None:
+    """
+    清理日志活跃标记
+    
+    Args:
+        name: 日志记录器的名称
+    """
+    try:
+        log_dir = os.path.abspath('logs')
+        active_marker = os.path.join(log_dir, f"{name}.active")
+        if os.path.exists(active_marker):
+            os.remove(active_marker)
+    except Exception as e:
+        print(f"无法清理日志活跃标记: {e}") 
@@ -0,0 +1,279 @@
+"""
+字符串处理工具模块
+---------------
+提供字符串处理、正则表达式匹配等功能。
+"""
+
+import re
+from typing import Dict, List, Optional, Tuple, Any
+
+def clean_string(text: str) -> str:
+    """
+    清理字符串，移除多余空白
+    
+    Args:
+        text: 源字符串
+        
+    Returns:
+        清理后的字符串
+    """
+    if not isinstance(text, str):
+        return ""
+    
+    # 移除首尾空白
+    text = text.strip()
+    # 移除多余空白
+    text = re.sub(r'\s+', ' ', text)
+    return text
+
+def remove_non_digits(text: str) -> str:
+    """
+    移除字符串中的非数字字符
+    
+    Args:
+        text: 源字符串
+        
+    Returns:
+        只包含数字的字符串
+    """
+    if not isinstance(text, str):
+        return ""
+        
+    return re.sub(r'\D', '', text)
+
+def extract_number(text: str) -> Optional[float]:
+    """
+    从字符串中提取数字
+    
+    Args:
+        text: 源字符串
+        
+    Returns:
+        提取的数字，如果没有则返回None
+    """
+    if not isinstance(text, str):
+        return None
+        
+    # 匹配数字（可以包含小数点和负号）
+    match = re.search(r'-?\d+(\.\d+)?', text)
+    if match:
+        return float(match.group())
+    return None
+
+def extract_unit(text: str, units: List[str] = None) -> Optional[str]:
+    """
+    从字符串中提取单位
+    
+    Args:
+        text: 源字符串
+        units: 有效单位列表，如果为None则自动识别
+        
+    Returns:
+        提取的单位，如果没有则返回None
+    """
+    if not isinstance(text, str):
+        return None
+        
+    # 如果提供了单位列表，检查字符串中是否包含
+    if units:
+        for unit in units:
+            if unit in text:
+                return unit
+        return None
+        
+    # 否则，尝试自动识别常见单位
+    # 正则表达式：匹配数字后面的非数字部分作为单位
+    match = re.search(r'\d+\s*([^\d\s]+)', text)
+    if match:
+        return match.group(1)
+    return None
+
+def extract_number_and_unit(text: str) -> Tuple[Optional[float], Optional[str]]:
+    """
+    从字符串中同时提取数字和单位
+    
+    Args:
+        text: 源字符串
+        
+    Returns:
+        (数字, 单位)元组，如果没有则对应返回None
+    """
+    if not isinstance(text, str):
+        return None, None
+        
+    # 匹配数字和单位的组合
+    match = re.search(r'(-?\d+(?:\.\d+)?)\s*([^\d\s]+)?', text)
+    if match:
+        number = float(match.group(1))
+        unit = match.group(2) if match.group(2) else None
+        return number, unit
+    return None, None
+
+def parse_specification(spec_str: str) -> Optional[int]:
+    """
+    解析规格字符串，提取包装数量
+    支持格式：1*15, 1x15, 1*5*10
+    
+    Args:
+        spec_str: 规格字符串
+        
+    Returns:
+        包装数量，如果无法解析则返回None
+    """
+    if not spec_str or not isinstance(spec_str, str):
+        return None
+    
+    try:
+        # 清理规格字符串
+        spec_str = clean_string(spec_str)
+        
+        # 匹配重量/容量格式，如"450g*15"、"450ml*15"
+        match = re.search(r'\d+(?:g|ml|毫升|克)[*xX×](\d+)', spec_str)
+        if match:
+            # 返回后面的数量
+            return int(match.group(1))
+        
+        # 匹配1*5*10 格式的三级规格
+        match = re.search(r'(\d+)[\*xX×](\d+)[\*xX×](\d+)', spec_str)
+        if match:
+            # 取最后一个数字作为袋数量
+            return int(match.group(3))
+        
+        # 匹配1*15, 1x15 格式
+        match = re.search(r'(\d+)[\*xX×](\d+)', spec_str)
+        if match:
+            # 取第二个数字作为包装数量
+            return int(match.group(2))
+            
+        # 匹配24瓶/件等格式
+        match = re.search(r'(\d+)[瓶个支袋][/／](件|箱)', spec_str)
+        if match:
+            return int(match.group(1))
+            
+        # 匹配4L格式
+        match = re.search(r'(\d+(?:\.\d+)?)\s*[Ll升][*×]?(\d+)?', spec_str)
+        if match:
+            # 如果有第二个数字，返回它；否则返回1
+            return int(match.group(2)) if match.group(2) else 1
+            
+    except Exception:
+        pass
+        
+    return None
+
+def clean_barcode(barcode: Any) -> str:
+    """
+    清理条码格式
+    
+    Args:
+        barcode: 条码（可以是字符串、整数或浮点数）
+        
+    Returns:
+        清理后的条码字符串
+    """
+    if isinstance(barcode, (int, float)):
+        barcode = f"{barcode:.0f}"
+        
+    # 清理条码格式，移除可能的非数字字符（包括小数点）
+    barcode_clean = re.sub(r'\.0+$', '', str(barcode))  # 移除末尾0
+    barcode_clean = re.sub(r'\D', '', barcode_clean)  # 只保留数字
+    
+    return barcode_clean
+
+def is_scientific_notation(value: str) -> bool:
+    """
+    检查字符串是否是科学计数法表示
+    
+    Args:
+        value: 字符串值
+        
+    Returns:
+        是否是科学计数法
+    """
+    return bool(re.match(r'^-?\d+(\.\d+)?[eE][+-]?\d+$', str(value)))
+
+def parse_monetary_string(value: Any) -> Optional[float]:
+    """
+    解析金额/数量字符串为浮点数。
+    处理: 货币符号(¥/$)、逗号作小数点、逗号作千位分隔符、中文"元"后缀等。
+
+    Args:
+        value: 金额值（字符串、数字或其他类型）
+
+    Returns:
+        解析后的浮点数，无法解析则返回 None
+    """
+    if value is None:
+        return None
+    if isinstance(value, (int, float)):
+        return float(value)
+    if not isinstance(value, str):
+        return None
+
+    s = value.strip()
+    if not s or s.lower() in ('o', 'none', 'null', '-', '--'):
+        return None
+
+    # 移除非数字字符，保留数字、小数点、逗号和负号
+    cleaned = re.sub(r'[^\d\.\-,]', '', s)
+    if not cleaned or cleaned in ('-', '.', '-.', ','):
+        return None
+
+    # 逗号处理策略:
+    #   多个逗号 -> 千位分隔符，全部移除 (如 "1,234,567" = 1234567)
+    #   一个逗号 + 无小数点 -> 逗号当小数点 (如 "1,5" = 1.5)
+    #   一个逗号 + 有小数点 -> 千位分隔符，移除 (如 "1,234.56" = 1234.56)
+    comma_count = cleaned.count(',')
+    if comma_count > 1:
+        cleaned = cleaned.replace(',', '')
+    elif comma_count == 1 and '.' not in cleaned:
+        cleaned = cleaned.replace(',', '.')
+    elif comma_count == 1 and '.' in cleaned:
+        cleaned = cleaned.replace(',', '')
+
+    try:
+        return float(cleaned)
+    except (ValueError, TypeError):
+        return None
+
+
+def format_barcode(barcode: Any) -> str:
+    """
+    格式化条码，处理科学计数法
+    
+    Args:
+        barcode: 条码值
+        
+    Returns:
+        格式化后的条码字符串
+    """
+    if barcode is None:
+        return ""
+        
+    # 先转为字符串
+    barcode_str = str(barcode).strip()
+    
+    # 判断是否为科学计数法
+    if is_scientific_notation(barcode_str):
+        try:
+            # 科学计数法转为普通数字字符串
+            barcode_str = f"{float(barcode_str):.0f}"
+        except (ValueError, TypeError):
+            pass
+    
+    # 移除可能的小数部分（如"123456.0"变为"123456"）
+    if '.' in barcode_str:
+        barcode_str = re.sub(r'\.0+$', '', barcode_str)
+    
+    # 确保是纯数字字符串
+    if not barcode_str.isdigit():
+        # 只保留数字字符
+        barcode_str = re.sub(r'\D', '', barcode_str)
+    
+    # 新增：处理末尾多余的0，标准条码通常为12-13位
+    if len(barcode_str) > 13 and barcode_str.endswith('0'):
+        # 从末尾开始移除多余的0，直到条码长度为13位或者不再以0结尾
+        while len(barcode_str) > 13 and barcode_str.endswith('0'):
+            barcode_str = barcode_str[:-1]
+    
+    return barcode_str