commit e4d62df7e3419101763674691933193112738929 Author: houhuan Date: Mon May 4 19:51:13 2026 +0800 feat: 益选 OCR 订单处理系统初始提交 - 智能供应商识别(蓉城易购/烟草/杨碧月/通用) - 百度 OCR 表格识别集成 - 规则引擎(列映射/数据清洗/单位转换/规格推断) - 条码映射管理与云端同步(Gitea REST API) - 云端同步支持:条码映射、供应商配置、商品资料、采购模板 - 拖拽一键处理(图片→OCR→Excel→合并) - 191 个单元测试 - 移除无用的模板管理功能 - 清理 IDE 产物目录 Co-Authored-By: Claude Opus 4.7 diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..2eff444 --- /dev/null +++ b/.env.example @@ -0,0 +1,3 @@ +# 百度 OCR API 配置 +BAIDU_API_KEY=your_api_key_here +BAIDU_SECRET_KEY=your_secret_key_here diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c861b2b --- /dev/null +++ b/.gitignore @@ -0,0 +1,42 @@ +# Environment +.env + +# Python +__pycache__/ +*.pyc +*.pyo +.pytest_cache/ +.venv/ + +# Build & dist +build/ +dist/ +release/ +*.spec + +# Logs & temp +logs/ +data/temp/ + +# Runtime outputs +data/output/ +data/result/ +data/input/ +data/product_cache.db +data/user_settings.json +*.db + +# Claude Code / IDE +.claude/ +.playwright-mcp/ +.trae/ + +# Old project +wework_xiaoai_bot/ + +# OS/IDE +.DS_Store +Thumbs.db +.idea/ +.vscode/ + diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..5cb1679 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,39 @@ +# Changelog + +## [v2.2.0] - 2026-03-31 +### Added +- **UI Simplification**: Removed dedicated buttons for Rongcheng and Tobacco; all Excel orders now use the intelligent auto-routing. +- **Enhanced Yang Biyue Support**: Fixed column mapping for Yang Biyue orders, ensuring standard fields (Barcode, Quantity, Price) are correctly extracted. +- **Headless API Auto-Detect**: `headless_api.py` now automatically distinguishes between Image (OCR) and Excel (Direct) inputs based on file extension. + +### Fixed +- **Yang Biyue Preprocessing**: Resolved issue where data was empty due to incorrect column renaming. 
+- **Interference Filtering**: Added logic to exclude distractor columns like "Settlement Unit" or "Base Quantity" during preprocessing. + +### Removed +- **Redundant Files**: Cleaned up `run.py`, `clean.py`, and unused CLI modules. +- **Legacy UI Elements**: Removed tobacco-specific keyboard shortcuts and help entries. + +## [v2.1.0] - 2026-03-30 +### Added +- **Intelligent Recognition**: Automated fingerprinting for Rongcheng Yigou, Tobacco, and Yang Biyue orders. +- **Auto-Routing**: `OrderService.process_excel` now automatically handles preprocessing without explicit flags. +- **Headless API Enhancements**: `headless_api.py` updated to support the new intelligent recognition mode. +- **Comprehensive Documentation**: Added `OPENCLAW_GUIDE.md` and `FINAL_UPDATE_REPORT.md`. + +### Fixed +- **Rongcheng Yigou**: Fixed barcode splitting issue where quantities were incorrectly distributed (30 to 5). +- **Tobacco Orders**: Corrected unit price calculation (divided by 10) and quantity calculation (multiplied by 10). +- **Identification Failure**: Fixed issue where `header=0` caused identification keywords at the very first row to be missed. + +## [v2.0.0] - 2026-03-25 +### Added +- **Headless API**: First release of `headless_api.py` for OpenClaw integration. +- **Price Validation**: Integration with PosPal item data for unit price auditing. +- **Asynchronous Logging**: GUI now uses a queue for log output to prevent UI freezing. + +## [v1.1.0] - 2026-03-10 +### Added +- **Rongcheng Yigou Support**: Initial support for Rongcheng Excel templates. +- **Tobacco Support**: Initial support for Tobacco Excel templates. +- **Excel Processor**: Refactored core processing logic into `ExcelProcessor`. 
diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..a10215b --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,146 @@ +# CLAUDE.md - 益选 OCR 订单处理系统 + +## 项目概述 + +益选 OCR 订单处理系统 (orc-order-v2) 是一个面向零售与分销场景的采购单处理工具。 + +**核心流程**: 图片 OCR → Excel 规范化 → 模板填充 → 合并导出 + +**目标系统**: 银豹 (PosPal) POS 系统 + +**技术栈**: Python 3.9+, Tkinter, Pandas, Baidu OCR API, xlrd/xlwt/openpyxl + +## 项目结构 + +``` +orc-order-v2/ +├── 启动器.py # 入口桩 (~13行, 仅导入 main) +├── headless_api.py # CLI 自动化接口 (OpenClaw 对接) +├── build_exe.py # PyInstaller 打包脚本 +├── config.ini # 全局配置 (API密钥、路径) +├── config/ +│ ├── config.ini # 配置副本 +│ ├── barcode_mappings.json # 条码映射规则 +│ └── suppliers_config.json # 供应商配置 (列映射/清洗规则/计算规则) +├── app/ +│ ├── config/ +│ │ ├── settings.py # ConfigManager 单例 +│ │ └── defaults.py # 默认配置 +│ ├── core/ +│ │ ├── excel/ +│ │ │ ├── processor.py # ExcelProcessor - 标准化转换核心 +│ │ │ ├── converter.py # UnitConverter - 单位转换与规格推断 +│ │ │ ├── merger.py # PurchaseOrderMerger - 采购单合并 +│ │ │ ├── validators.py # ProductValidator +│ │ │ └── handlers/ # 条码映射、单位转换处理器 +│ │ ├── handlers/ +│ │ │ ├── rule_engine.py # 通用规则引擎 (split/extract/normalize/mark) +│ │ │ ├── column_mapper.py # 列映射器 +│ │ │ ├── data_cleaner.py # 数据清洗器 +│ │ │ └── calculator.py # 计算器 +│ │ ├── ocr/ +│ │ │ ├── table_ocr.py # OCRProcessor +│ │ │ └── baidu_ocr.py # BaiduOCRClient +│ │ ├── processors/ +│ │ │ ├── base.py # BaseProcessor 抽象基类 +│ │ │ ├── tobacco_processor.py +│ │ │ ├── ocr_processor.py +│ │ │ └── supplier_processors/ +│ │ │ └── generic_supplier_processor.py +│ │ └── utils/ +│ │ ├── file_utils.py # 文件操作工具 +│ │ ├── log_utils.py # 日志工具 +│ │ ├── string_utils.py # 字符串工具 +│ │ └── dialog_utils.py # Tkinter 对话框工具 +│ ├── services/ +│ │ ├── order_service.py # 订单服务 (智能路由分发) +│ │ ├── ocr_service.py # OCR 服务 +│ │ ├── processor_service.py # 处理器调度服务 +│ │ ├── tobacco_service.py # 烟草公司专用服务 +│ │ └── special_suppliers_service.py # 特殊供应商服务 (蓉城/杨碧月) +│ └── ui/ # GUI 模块 (从启动器.py拆分) +│ ├── error_utils.py # L0 错误对话框 +│ ├── theme.py # L0 主题管理 (THEMES, 
create_modern_button) +│ ├── logging_ui.py # L0 日志队列与GUI日志处理器 +│ ├── ui_widgets.py # L0 StatusBar, ProgressReporter, center_window +│ ├── user_settings.py # L1 用户设置与最近文件管理 +│ ├── result_previews.py # L1 处理结果预览对话框 +│ ├── command_runner.py # L1 命令执行器 (subprocess + 日志重定向) +│ ├── file_operations.py # L2 文件选择/清理/目录操作 +│ ├── action_handlers.py # L2 业务操作 (OCR/Excel/合并/拖拽) +│ ├── barcode_editor.py # L2 条码映射编辑 +│ ├── config_dialog.py # L3 系统设置对话框 +│ ├── shortcuts.py # L3 键盘快捷键绑定 +│ └── main_window.py # L4 main() 主窗口构建 +├── templates/ +│ ├── 银豹-采购单模板.xls # 输出模板 +│ └── 商品资料.xlsx # 单价校验参考数据 +├── data/ +│ ├── input/ # 输入文件 +│ ├── output/ # OCR 输出 +│ ├── result/ # 最终采购单 +│ └── user_settings.json # 用户设置 +└── docs/ + └── SYSTEM_ARCHITECTURE.md # 系统架构文档 +``` + +## 命令与运行 + +```bash +# GUI 模式 +python 启动器.py + +# CLI 模式 (OpenClaw 对接) +python headless_api.py [input] [--excel|--tobacco|--rongcheng] [--barcode X --target Y] + +# 打包 EXE +python build_exe.py + +# 条码映射更新 +python headless_api.py --update-mapping --barcode 6920584471055 --target 6920584471017 +``` + +## 供应商智能识别逻辑 + +系统通过扫描 Excel 前 50 行内容特征自动路由: + +| 供应商 | 识别特征 | 预处理逻辑 | +|--------|----------|-----------| +| 烟草公司 | "专卖证号" 或 "510109104938" | B/E/G/H 列映射, 数量*10, 单价/10 | +| 蓉城易购 | "RCDH" | E/N/Q/S 列映射, 多条码分裂均分数量 | +| 杨碧月 | "经手人" + "杨碧月" | 列对齐, 单位转换 (件→瓶) | +| 通用供应商 | suppliers_config.json 配置 | 列映射 + 规则引擎 | + +## 配置系统 + +- **ConfigManager** (`app/config/settings.py`): 单例模式, 基于 configparser 读取 `config.ini` +- **供应商配置** (`config/suppliers_config.json`): JSON 格式, 定义列映射/清洗规则/计算规则 +- **条码映射** (`config/barcode_mappings.json`): 运行时可更新的条码转换规则 + +## 关键约定 + +### 输出格式 +- 银豹采购单模板: 4 列 — 条码(B), 采购量(C), 赠送量(D), 采购单价(E) +- 单价保留 4 位小数, 使用 xlwt.XFStyle +- 采购单文件名: `采购单_{原文件名}.xls` + +### 单位转换规则 +- "件"/"箱"/"提"/"盒" → 数量*包装数量, 单价/包装数量, 单位→"瓶" +- 赠品: 价格为 0 或金额为 0 的行标记为赠品 +- 条码映射优先于单位转换 + +### 规格推断 +- 从商品名称推断: "24入纸箱" → 1*24, "450g*15" → 1*15 +- 支持三级规格: 1*5*12 +- OCR 修正: "IL" → "1L", "6oo" → "600" + +## 已知技术债务 + +1. 
~~**启动器.py 过大**~~ (已拆分为 13 个 `app/ui/` 模块, 入口桩仅 13 行) +2. **代码重复**: 表头识别、列映射、金额解析在多处重复实现 +3. **配置不统一**: config.ini + suppliers_config.json + 硬编码路径混用 +4. **无测试**: 测试目录为空, 无自动化测试 +5. **旧格式依赖**: xlrd/xlwt 仅支持 .xls, 不支持 .xlsx 写入 +6. **API 密钥明文**: config.ini 中百度 OCR API 密钥未加密 +7. **路径硬编码**: config.ini 中 `template_folder = E:\2025Code\python\orc-order-v2\templates` +8. **日志不统一**: 混用 `get_logger()` 和 `logging.getLogger()` diff --git a/README.md b/README.md new file mode 100644 index 0000000..0790012 --- /dev/null +++ b/README.md @@ -0,0 +1,110 @@ +# 益选 OCR 订单处理系统 + +面向零售与分销场景的采购单处理工具,支持图片 OCR → Excel 规范化 → 模板填充 → 合并导出全流程,输出适配银豹 (PosPal) POS 系统。 + +## 核心功能 + +- **智能供应商识别**:自动扫描 Excel 前 50 行内容特征,路由到对应的预处理逻辑(蓉城易购、烟草公司、杨碧月等) +- **图片 OCR**:调用百度 OCR 表格识别 API,将采购单图片转为结构化 Excel +- **规则引擎**:支持列映射、数据清洗、单位转换、规格推断、赠品标记等自动化规则 +- **条码映射**:可配置的条码转换规则,支持运行时编辑和云端同步 +- **单价校验**:自动比对 `商品资料.xlsx`,价差超过 1.0 元触发预警 +- **云端同步**:通过 Gitea REST API 在多台设备间同步配置文件(条码映射、供应商配置、商品资料、采购模板) +- **拖拽一键处理**:拖入图片或 Excel 自动走完 OCR → 规范化 → 合并全流程 +- **CLI 接口**:`headless_api.py` 支持无界面自动化调用 + +## 快速开始 + +```bash +# 安装依赖 +pip install -r requirements.txt + +# GUI 模式 +python 启动器.py + +# CLI 模式 +python headless_api.py data/input/xxx.xlsx +python headless_api.py data/input/xxx.jpg --barcode 6920584471055 --target 6920584471017 + +# 打包 EXE +python build_exe.py +``` + +## 项目结构 + +``` +├── 启动器.py # GUI 入口 +├── headless_api.py # CLI 自动化接口 +├── config.ini # 全局配置(API密钥、路径、Gitea) +├── config/ +│ ├── config.ini # 配置副本 +│ ├── barcode_mappings.json # 条码映射规则 +│ └── suppliers_config.json # 供应商配置(列映射/规则引擎) +├── app/ +│ ├── config/ # 配置管理(ConfigManager 单例) +│ ├── core/ +│ │ ├── excel/ # Excel 处理(标准化、转换、合并、校验) +│ │ ├── handlers/ # 规则引擎、列映射、数据清洗、计算器 +│ │ ├── ocr/ # 百度 OCR 客户端 +│ │ ├── processors/ # 处理器(通用/烟草/OCR) +│ │ └── utils/ # 工具(日志、文件、字符串、云端同步、对话框) +│ ├── services/ # 业务服务(订单、OCR、处理器调度) +│ └── ui/ # GUI 模块(主题、日志、快捷键、主窗口) +├── templates/ +│ ├── 银豹-采购单模板.xls # 输出模板(条码/采购量/赠送量/单价) +│ └── 商品资料.xlsx # 单价校验参考数据 +├── data/ +│ ├── input/ # 
输入文件 +│ ├── output/ # OCR 输出 +│ └── result/ # 最终采购单 +└── tests/ # 单元测试(191 个) +``` + +## 供应商智能路由 + +| 供应商 | 识别特征 | 处理逻辑 | +|--------|----------|----------| +| 烟草公司 | "专卖证号" 或 "510109104938" | B/E/G/H 列映射,数量×10,单价÷10 | +| 蓉城易购 | "RCDH" | E/N/Q/S 列映射,多条码分裂均分数量 | +| 杨碧月 | "经手人" + "杨碧月" | 列对齐,单位转换(件→瓶) | +| 通用供应商 | `suppliers_config.json` 配置 | 列映射 + 规则引擎 | + +## 云端同步 + +通过 Gitea REST API 在多台设备间同步配置,无需 git 客户端。 + +**支持同步的文件:** +- 条码映射 (`barcode_mappings.json`) +- 供应商配置 (`suppliers_config.json`) +- 商品资料 (`templates/商品资料.xlsx`) +- 采购单模板 (`templates/银豹-采购单模板.xls`) + +**配置方式:** +1. 系统设置 → 填入 Gitea 地址、仓库信息、Access Token +2. 主窗口 → "云端同步" 按钮 → 选择文件推拉 + +**Gitea 仓库:** `https://gitea.94kan.cn/houhuan/yixuan-sync-data` + +## 配置说明 + +| 配置项 | 文件 | 说明 | +|--------|------|------| +| API 密钥 | `.env` 或 `config.ini` | 百度 OCR API,优先从环境变量读取 | +| Gitea Token | `.env` 或 `config.ini` | 云端同步 Token,优先从环境变量读取 | +| 供应商规则 | `config/suppliers_config.json` | 列映射、清洗规则、计算规则 | +| 条码映射 | `config/barcode_mappings.json` | 条码转换规则,运行时可更新 | + +## 构建打包 + +```bash +pip install pyinstaller +python build_exe.py +# 输出: dist/OCR订单处理系统.exe +# 便携包: release/OCR订单处理系统.exe(含模板和商品资料) +``` + +## 测试 + +```bash +python -m pytest tests/ -v +``` diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 0000000..1c563d5 --- /dev/null +++ b/app/__init__.py @@ -0,0 +1,8 @@ +""" +OCR订单处理系统 +--------------- +用于自动识别和处理Excel格式的订单文件的系统。 +支持多种格式的订单处理,包括普通订单和赠品订单的处理。 +""" + +__version__ = '2.0.0' \ No newline at end of file diff --git a/app/config/__init__.py b/app/config/__init__.py new file mode 100644 index 0000000..9331d71 --- /dev/null +++ b/app/config/__init__.py @@ -0,0 +1,5 @@ +""" +OCR订单处理系统 - 配置模块 +------------------------ +负责管理系统配置,包括API密钥、路径和处理选项。 +""" \ No newline at end of file diff --git a/app/config/defaults.py b/app/config/defaults.py new file mode 100644 index 0000000..9904a87 --- /dev/null +++ b/app/config/defaults.py @@ -0,0 +1,49 @@ +""" +默认配置 +------- +包含系统的默认配置值。 +""" + +# 默认配置 +DEFAULT_CONFIG = 
class ConfigManager:
    """Process-wide configuration store backed by an INI file.

    The class is a singleton: the first ``ConfigManager(config_file)`` call
    creates and loads the instance; every later call returns that same object
    and its ``config_file`` argument is ignored.

    Options listed in ``_ENV_MAPPING`` are overridden by environment variables
    when those are set. A value taken from the environment is used in memory
    only: ``save_config`` keeps writing whatever the INI file held before.
    """

    _instance = None

    # (section, option) -> environment variable that overrides it.
    _ENV_MAPPING = {
        ('API', 'api_key'): 'BAIDU_API_KEY',
        ('API', 'secret_key'): 'BAIDU_SECRET_KEY',
        ('Gitea', 'token'): 'GITEA_TOKEN',
    }

    # Options that are always written to disk as empty strings.
    _NEVER_PERSISTED = (('API', 'api_key'), ('API', 'secret_key'))

    def __new__(cls, config_file=None):
        """Return the shared instance, creating and loading it on first use."""
        if cls._instance is None:
            cls._instance = super(ConfigManager, cls).__new__(cls)
            cls._instance._init(config_file)
        return cls._instance

    def _init(self, config_file):
        """Set up parser state and load the configuration (runs once)."""
        self.config_file = config_file or 'config.ini'
        self.config = configparser.ConfigParser()
        # (section, option) -> raw value the INI file held before an
        # environment variable replaced it in memory.
        self._env_overrides = {}
        self.load_config()

    def load_config(self) -> None:
        """Load the INI file, creating it from the defaults when missing.

        Environment overrides are applied on every path (fresh file, existing
        file, and the in-memory fallback used when reading fails), so secrets
        from the environment are available from the very first run.
        """
        if not os.path.exists(self.config_file):
            self.create_default_config(save=False)
            self._override_from_env()
            self.save_config()
            logger.info(f"已创建默认配置文件: {self.config_file}")
            return

        try:
            self.config.read(self.config_file, encoding='utf-8')
            self._fill_missing_defaults()
            self._override_from_env()
            # Persist any options that were added from the defaults.
            self.save_config()
            logger.info(f"已加载并更新配置文件: {self.config_file}")
        except Exception as e:
            logger.error(f"加载配置文件时出错: {e}")
            logger.info("使用默认配置")
            self.create_default_config(save=False)
            self._override_from_env()

    def _fill_missing_defaults(self) -> None:
        """Add sections/options from DEFAULT_CONFIG that the file lacks."""
        for section, options in DEFAULT_CONFIG.items():
            if not self.config.has_section(section):
                self.config.add_section(section)
            for option, value in options.items():
                if not self.config.has_option(section, option):
                    self.config.set(section, option, value)

    def _override_from_env(self) -> None:
        """Replace sensitive options with non-empty environment values."""
        overrides = self.__dict__.setdefault('_env_overrides', {})
        for (section, option), env_key in self._ENV_MAPPING.items():
            env_val = os.getenv(env_key, '').strip()
            if not env_val:
                continue
            if not self.config.has_section(section):
                self.config.add_section(section)
            key = (section, option)
            if key not in overrides:
                # Remember the file's own value so save_config can keep
                # writing it instead of the environment secret.
                overrides[key] = self.config.get(
                    section, option, raw=True, fallback='')
            # '%' starts an interpolation in configparser; doubling it makes
            # the value read back unchanged instead of raising.
            self.config.set(section, option, env_val.replace('%', '%%'))

    def create_default_config(self, save: bool = True) -> None:
        """Reset every option in DEFAULT_CONFIG to its default value.

        Args:
            save: When true, write the result to ``config_file`` as well.
        """
        for section, options in DEFAULT_CONFIG.items():
            if not self.config.has_section(section):
                self.config.add_section(section)
            for option, value in options.items():
                self.config.set(section, option, value)

        if save:
            self.save_config()
            logger.info(f"已创建默认配置文件: {self.config_file}")

    def save_config(self) -> None:
        """Write the configuration to ``config_file`` without leaking secrets.

        The API keys are always written as empty strings, and options that
        currently hold an environment-supplied value are written with the
        value the file had before. The in-memory values are restored
        afterwards, including when the write fails. Errors are logged, not
        raised.
        """
        on_disk = {key: '' for key in self._NEVER_PERSISTED}
        for key, file_value in getattr(self, '_env_overrides', {}).items():
            on_disk.setdefault(key, file_value)

        in_memory = {}
        try:
            for (section, option), persisted in on_disk.items():
                if not self.config.has_section(section):
                    continue
                in_memory[(section, option)] = self.config.get(
                    section, option, raw=True, fallback='')
                self.config.set(section, option, persisted)

            with open(self.config_file, 'w', encoding='utf-8') as f:
                self.config.write(f)

            logger.info(f"配置已保存到: {self.config_file}")
        except Exception as e:
            logger.error(f"保存配置文件时出错: {e}")
        finally:
            # Restore even after a failed write; otherwise the running
            # process would lose its API keys.
            for (section, option), value in in_memory.items():
                self.config.set(section, option, value)

    def get(self, section: str, option: str, fallback: Any = None) -> Any:
        """Return an option as a string, or ``fallback`` when it is absent."""
        return self.config.get(section, option, fallback=fallback)

    def getint(self, section: str, option: str, fallback: int = 0) -> int:
        """Return an option converted to ``int``."""
        return self.config.getint(section, option, fallback=fallback)

    def getfloat(self, section: str, option: str, fallback: float = 0.0) -> float:
        """Return an option converted to ``float``."""
        return self.config.getfloat(section, option, fallback=fallback)

    def getboolean(self, section: str, option: str, fallback: bool = False) -> bool:
        """Return an option converted to ``bool``."""
        return self.config.getboolean(section, option, fallback=fallback)

    def get_list(self, section: str, option: str, fallback: str = "", delimiter: str = ",") -> List[str]:
        """Split a delimited option into stripped, non-empty items."""
        value = self.get(section, option, fallback)
        # A caller may pass fallback=None; treat that as "no items".
        return [item.strip() for item in (value or "").split(delimiter) if item.strip()]

    def update(self, section: str, option: str, value: Any) -> None:
        """Set an option in memory, creating the section if needed.

        Setting an environment-overridden option to a different value makes
        the new value persistent; re-submitting the current value leaves the
        override (and the file) untouched.
        """
        if not self.config.has_section(section):
            self.config.add_section(section)

        new_value = str(value)
        overrides = getattr(self, '_env_overrides', {})
        key = (section, option)
        if key in overrides and self.config.get(section, option, fallback=None) != new_value:
            del overrides[key]

        self.config.set(section, option, new_value)
        logger.debug(f"更新配置: [{section}] {option} = {value}")

    def get_path(self, section: str, option: str, fallback: str = "", create: bool = False) -> str:
        """Return a path option as an absolute path string.

        Relative paths are resolved against the current working directory.

        Args:
            create: When true, create the directory. A path with a file
                suffix is treated as a file and only its parent is created.
                Creation errors are logged, not raised.
        """
        from pathlib import Path

        path = Path(self.get(section, option, fallback))
        if not path.is_absolute():
            path = Path(os.getcwd()) / path

        if create:
            try:
                if path.suffix:
                    directory = path.parent
                    if not directory.exists():
                        directory.mkdir(parents=True, exist_ok=True)
                        logger.info(f"已创建父目录: {directory}")
                elif not path.exists():
                    path.mkdir(parents=True, exist_ok=True)
                    logger.info(f"已创建目录: {path}")
            except Exception as e:
                logger.error(f"创建目录失败: {path}, 错误: {e}")

        return str(path.absolute())
class ProductDatabase:
    """SQLite-backed lookup table of product master data keyed by barcode.

    Rows hold barcode, name, purchase price and unit. The table is filled
    from an Excel sheet the first time the database file is created.
    """

    SCHEMA = """
    CREATE TABLE IF NOT EXISTS products (
        barcode TEXT PRIMARY KEY,
        name TEXT DEFAULT '',
        price REAL DEFAULT 0.0,
        unit TEXT DEFAULT '',
        updated_at TEXT
    );
    """

    # Bound parameters per "IN (...)" query. Kept below SQLite's
    # SQLITE_MAX_VARIABLE_NUMBER, which defaults to 999 before 3.32.0.
    _QUERY_CHUNK_SIZE = 500

    # Stringified cell values that mean "no barcode".
    _BLANK_CELLS = frozenset(('', 'nan', 'none', '<na>'))

    def __init__(self, db_path: str, excel_source: str):
        """Open the database, importing from Excel if the file is new.

        Args:
            db_path: Path of the SQLite database file.
            excel_source: Path of the product master Excel file.
        """
        self.db_path = db_path
        self.excel_source = excel_source
        self._ensure_db()

    def _connect(self) -> sqlite3.Connection:
        """Open a new connection; every caller closes its own."""
        return sqlite3.connect(self.db_path)

    @staticmethod
    def _normalize_barcode(value) -> str:
        """Return the barcode as stripped text without a float ".0" tail.

        A numeric Excel cell read as float stringifies as "6920584471055.0",
        which would never match the plain digits used for lookups.
        """
        text = str(value).strip()
        if text.endswith('.0') and text[:-2].isdigit():
            text = text[:-2]
        return text

    def _ensure_db(self):
        """Create the database file (and its directory) when it is missing."""
        if os.path.exists(self.db_path):
            return

        # Create the directory before the first connect on every path;
        # dirname is '' for a bare filename and makedirs('') would raise.
        parent = os.path.dirname(self.db_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        if not os.path.exists(self.excel_source):
            logger.warning(f"商品资料 Excel 不存在,跳过导入: {self.excel_source}")
            self._create_empty_db()
            return

        logger.info(f"首次运行,从 Excel 导入商品资料: {self.excel_source}")
        self._create_empty_db()
        count = self.import_from_excel(self.excel_source)
        logger.info(f"商品资料导入完成: {count} 条记录")

    def _create_empty_db(self):
        """Create the ``products`` table if it does not exist yet."""
        conn = self._connect()
        try:
            conn.executescript(self.SCHEMA)
            conn.commit()
        finally:
            conn.close()

    def _parse_excel(self, excel_path: str) -> List[tuple]:
        """Read the sheet into ``(barcode, name, price, unit, updated_at)`` rows.

        Returns an empty list, after logging the reason, when the sheet is
        empty or unreadable, has no barcode column, or yields no valid rows.
        """
        df = smart_read_excel(excel_path)
        if df is None or df.empty:
            logger.warning(f"Excel 文件为空或读取失败: {excel_path}")
            return []

        columns = list(df.columns)
        barcode_col = ColumnMapper.find_column(columns, 'barcode')
        if not barcode_col:
            logger.error(f"Excel 中未找到条码列: {list(df.columns)}")
            return []

        price_col = ColumnMapper.find_column(columns, 'unit_price')
        if not price_col:
            # The purchase-price header may not be a registered alias.
            price_col = next(
                (col for col in df.columns if '进货价' in str(col).strip()), None)

        # Name and unit columns are optional.
        name_col = ColumnMapper.find_column(columns, 'name')
        unit_col = ColumnMapper.find_column(columns, 'unit')

        now = datetime.now().isoformat()
        rows = []
        for _, row in df.iterrows():
            barcode = self._normalize_barcode(row.get(barcode_col, ''))
            if barcode.lower() in self._BLANK_CELLS:
                continue

            price = 0.0
            if price_col:
                try:
                    p = row.get(price_col)
                    if p is not None and str(p).strip() not in ('', 'nan', 'None'):
                        price = float(p)
                except (ValueError, TypeError):
                    # Unparseable price: keep the row with price 0.0.
                    pass

            name = str(row.get(name_col, '')).strip() if name_col else ''
            if name == 'nan':
                name = ''
            unit = str(row.get(unit_col, '')).strip() if unit_col else ''
            if unit == 'nan':
                unit = ''

            rows.append((barcode, name, price, unit, now))

        if not rows:
            logger.warning(f"Excel 中未解析出有效记录: {excel_path}")
        return rows

    def _write_rows(self, rows: List[tuple], replace_all: bool = False) -> None:
        """Upsert ``rows`` in one transaction, optionally clearing the table first."""
        conn = self._connect()
        try:
            if replace_all:
                conn.execute("DELETE FROM products")
            conn.executemany(
                "INSERT OR REPLACE INTO products (barcode, name, price, unit, updated_at) "
                "VALUES (?, ?, ?, ?, ?)",
                rows
            )
            # Single commit: a failure before this point rolls back the
            # DELETE together with the inserts.
            conn.commit()
        finally:
            conn.close()

    def import_from_excel(self, excel_path: str) -> int:
        """Upsert product rows from an Excel file.

        Args:
            excel_path: Path of the Excel file.

        Returns:
            Number of rows parsed and written (0 when nothing was usable).
        """
        rows = self._parse_excel(excel_path)
        if not rows:
            return 0
        self._write_rows(rows)
        return len(rows)

    def reimport(self) -> int:
        """Replace the whole table with the contents of ``excel_source``.

        The sheet is parsed before anything is deleted, and the delete and
        inserts share one transaction. If the sheet is unreadable or yields
        no rows, the existing data is left untouched.

        Returns:
            Number of rows imported.
        """
        rows = self._parse_excel(self.excel_source)
        if not rows:
            return 0
        self._write_rows(rows, replace_all=True)
        return len(rows)

    def get_price(self, barcode: str) -> Optional[float]:
        """Return the purchase price for ``barcode``, or ``None`` if unknown."""
        conn = self._connect()
        try:
            cursor = conn.execute(
                "SELECT price FROM products WHERE barcode = ?",
                (self._normalize_barcode(barcode),)
            )
            row = cursor.fetchone()
            return row[0] if row else None
        finally:
            conn.close()

    def get_prices(self, barcodes: List[str]) -> Dict[str, float]:
        """Return ``{barcode: price}`` for the barcodes that are known.

        The lookup is split into chunks so that arbitrarily long lists stay
        within SQLite's limit on bound parameters.
        """
        if not barcodes:
            return {}

        keys = [self._normalize_barcode(b) for b in barcodes]
        prices: Dict[str, float] = {}
        conn = self._connect()
        try:
            for start in range(0, len(keys), self._QUERY_CHUNK_SIZE):
                chunk = keys[start:start + self._QUERY_CHUNK_SIZE]
                placeholders = ','.join('?' * len(chunk))
                cursor = conn.execute(
                    f"SELECT barcode, price FROM products WHERE barcode IN ({placeholders})",
                    chunk
                )
                for code, price in cursor:
                    prices[code] = price
            return prices
        finally:
            conn.close()

    def count(self) -> int:
        """Return the number of products stored."""
        conn = self._connect()
        try:
            cursor = conn.execute("SELECT COUNT(*) FROM products")
            return cursor.fetchone()[0]
        finally:
            conn.close()
def __init__(self):
    """Load barcode mappings and set up spec patterns, handlers and validator."""
    # Special-barcode rules from the mapping configuration.
    self.special_barcodes = self.load_barcode_mappings()

    # (regex, replacement template) pairs, tried in order; first match wins.
    self.spec_patterns = [
        # Three-level form such as 1*5*12 or 1x5x12. Must precede the
        # two-level pattern, which matches a prefix of it.
        (r'(\d+)[*xX×](\d+)[*xX×](\d+)', r'\1*\2*\3'),
        # Two-level form such as 1*6, 1x12, 1X20.
        (r'(\d+)[*xX×](\d+)', r'\1*\2'),
        # "<n>入", e.g. "12入", "24入".
        (r'(\d+)入', r'1*\1'),
        # The count after each unit below is optional.
        # Litres, e.g. "1.5L*12", "12.9升".
        (r'([\d\.]+)[Ll升][*xX×]?(\d+)?', r'\1L*\2'),
        # Kilograms, e.g. "5kg*6", "5公斤".
        (r'([\d\.]+)(?:kg|公斤)[*xX×]?(\d+)?', r'\1kg*\2'),
        # Grams, e.g. "450g*15", "500克".
        (r'([\d\.]+)(?:g|克)[*xX×]?(\d+)?', r'\1g*\2'),
        # Millilitres, e.g. "500mL*15", "500ml", "500毫升".
        (r'([\d\.]+)(?:mL|ml|毫升)[*xX×]?(\d+)?', r'\1mL*\2'),
    ]

    # Build the barcode mapper and the unit handlers.
    self._init_handlers()

    # Validator applied to each product before unit conversion.
    self.validator = ProductValidator()
def extract_specification(self, text: str) -> Optional[str]:
    """Extract a normalised specification string from free text.

    Checks, in order: the "<n>入白膜" form, an explicit three-level form
    (``a*b*c``), then each ``(regex, template)`` pair in
    ``self.spec_patterns``. Only the specification is returned, never the
    surrounding text. A unit pattern with no count gets a count of 1, e.g.
    "500g" -> "500g*1".

    Args:
        text: Text that may contain a specification, e.g. a product name.

    Returns:
        The specification (e.g. "1*24", "1*5*12", "450g*15"), or None when
        ``text`` is empty, not a string, or matches no pattern.
    """
    if not text or not isinstance(text, str):
        return None

    # "<n>入白膜", e.g. "550纯净水24入白膜" -> "1*24".
    film_match = re.search(r'(\d+)入白膜', text)
    if film_match:
        result = f"1*{film_match.group(1)}"
        logger.info(f"提取规格(入白膜): {text} -> {result}")
        return result

    # Checked before the pattern list: a two-level pattern also matches the
    # first two parts of "1*5*12" and would hide the third level.
    three_level = re.search(r'(\d+)[*xX×](\d+)[*xX×](\d+)', text)
    if three_level:
        result = "*".join(three_level.groups())
        logger.info(f"提取三级规格: {text} -> {result}")
        return result

    for pattern, template in self.spec_patterns:
        match = re.search(pattern, text)
        if not match:
            continue
        # expand() renders only the template; a group that did not
        # participate becomes an empty string.
        result = match.expand(template)
        if result.endswith('*'):
            # The optional count was absent: one unit per package.
            result += '1'
        logger.info(f"提取规格: {text} -> {result}")
        return result

    return None
def parse_specification(self, spec: str) -> Tuple[int, int, Optional[int]]:
    """Parse a specification string into packaging levels.

    Whitespace is removed and the separators ``x``, ``X``, ``×`` and the
    full-width ``＊`` are normalised to ``*`` before matching. Forms are
    tried in this order:

    1. Equation form, e.g. "1件=12桶" (ASCII or full-width equals) -> (1, 12, None)
    2. Three levels, e.g. "1*5*12" -> (1, 5, 12)
    3. Weight or volume with a count, e.g. "500g*12", "500ml*15",
       "1.5L*8" -> (1, count, None)
    4. Two levels, e.g. "1*12" -> (1, 12, None)
    5. Anything else containing "*<digits>", e.g. the OCR misread
       "IL*12" -> (1, 12, None)

    Args:
        spec: Specification string.

    Returns:
        ``(level1, level2, level3)``; ``level3`` is None unless the spec has
        three levels. Empty, non-string or unparseable input gives
        ``(1, 1, None)``.
    """
    if not spec or not isinstance(spec, str):
        return 1, 1, None

    try:
        spec = re.sub(r'\s+', '', spec)
        spec = re.sub(r'[xX×＊]', '*', spec)

        logger.debug(f"解析规格: {spec}")

        # Equation form "1件=12桶": only the right-hand count is used.
        eq_match = re.match(
            r'(\d+(?:\.\d+)?)(?:件|箱|提|盒)[=＝](\d+)(?:瓶|桶|盒|支|个|袋|罐|包|卷)',
            spec)
        if eq_match:
            level2 = int(eq_match.group(2))
            logger.info(f"解析等式规格: {spec} -> 1*{level2}")
            return 1, level2, None

        three_level_match = re.match(r'(\d+)\*(\d+)\*(\d+)', spec)
        if three_level_match:
            level1, level2, level3 = (int(g) for g in three_level_match.groups())
            logger.info(f"解析三级规格: {spec} -> {level1}*{level2}*{level3}")
            return level1, level2, level3

        # "<amount><unit>*<count>": the amount is ignored and the count
        # becomes the second packaging level.
        measured_forms = (
            (r'[\d.]+(?:kg|g|克|千克|公斤)\*(\d+)', "解析重量规格"),
            (r'[\d.]+(?:ml|毫升)\*(\d+)', "解析容量(ml)规格"),
            (r'[\d.]+[Ll升]\*(\d+)', "解析容量(L)规格"),
        )
        for pattern, label in measured_forms:
            measured = re.match(pattern, spec, re.IGNORECASE)
            if measured:
                level2 = int(measured.group(1))
                logger.info(f"{label}: {spec} -> 1*{level2}")
                return 1, level2, None

        two_level_match = re.match(r'(\d+)\*(\d+)', spec)
        if two_level_match:
            level1 = int(two_level_match.group(1))
            level2 = int(two_level_match.group(2))
            logger.info(f"解析二级规格: {spec} -> {level1}*{level2}")
            return level1, level2, None

        # Irregular text such as "IL*12" or "6oo*12": take the digits after
        # the first usable "*" as the count.
        irregular_match = re.search(r'[^0-9]*\*(\d+)', spec)
        if irregular_match:
            level2 = int(irregular_match.group(1))
            logger.info(f"解析不规范规格: {spec} -> 1*{level2}")
            return 1, level2, None

        # NOTE(review): a count of 0 (e.g. "1*0") is returned as-is by the
        # branches above; confirm the unit handlers tolerate it.
        logger.warning(f"无法解析规格: {spec},使用默认值1*1")
        return 1, 1, None
    except Exception as e:
        logger.error(f"解析规格时出错: {e}")
        return 1, 1, None
def update_barcode_mappings(self, new_mappings: Dict[str, Dict[str, Any]]) -> bool:
    """
    Replace the in-memory barcode mappings and persist them to the config file.

    Args:
        new_mappings: New barcode mapping dictionary.

    Returns:
        True when the mappings were saved successfully, False otherwise
        (the value returned by ``save_barcode_mappings``).
    """
    self.special_barcodes = new_mappings

    # process_unit_conversion() resolves barcodes through self.barcode_mapper,
    # and BarcodeMapper keeps its own ``special_barcodes`` reference.
    # NOTE(review): the mapper is presumably built from the previous mapping
    # dict in __init__ (not visible here); without this refresh it would keep
    # applying the old mappings until the converter is re-created.
    mapper = getattr(self, 'barcode_mapper', None)
    if mapper is not None:
        mapper.special_barcodes = new_mappings or {}

    return self.save_barcode_mappings(new_mappings)
class BarcodeMapper:
    """
    Applies per-barcode overrides (specification, multiplier, fixed price,
    barcode remapping) taken from the special-barcode configuration.
    """

    def __init__(self, special_barcodes: Dict[str, Dict[str, Any]]):
        """
        Initialise the mapper.

        Args:
            special_barcodes: Special-barcode configuration keyed by barcode;
                ``None`` or an empty dict means "no overrides".
        """
        self.special_barcodes = special_barcodes or {}

    def map_barcode(self, product: Dict[str, Any]) -> Dict[str, Any]:
        """
        Return a copy of *product* with the overrides for its barcode applied.

        Supported config keys:

        * ``specification`` - replaces the product specification. It is
          applied whether or not a multiplier is configured, so spec-only
          entries (e.g. ``{'specification': '1*14', 'map_to': ...}``) work.
        * ``multiplier`` / ``target_unit`` - quantity is multiplied, price is
          divided, unit becomes ``target_unit`` (default ``瓶``).
        * ``fixed_price`` - overrides the divided price; only used together
          with ``multiplier``.
        * ``map_to`` - replaces the barcode; applied last so it can be
          combined with the other overrides.

        Args:
            product: Product dictionary containing at least ``barcode``.

        Returns:
            A new dictionary; the input is never modified.
        """
        result = product.copy()
        barcode = result.get('barcode', '')

        # Nothing to do for empty or unconfigured barcodes.
        if not barcode or barcode not in self.special_barcodes:
            return result

        special_config = self.special_barcodes[barcode]

        # Fixed specification: independent of the multiplier handling below.
        if 'specification' in special_config:
            result['specification'] = special_config['specification']
            logger.info(f"特殊条码({barcode})使用固定规格: {special_config['specification']}")

        if 'multiplier' in special_config:
            multiplier = special_config.get('multiplier', 1)

            if not isinstance(multiplier, (int, float)) or multiplier <= 0:
                # A zero/negative/non-numeric multiplier would divide by zero
                # or corrupt the quantity; skip the conversion instead.
                logger.warning(f"特殊条码({barcode})的倍数无效: {multiplier!r},跳过倍数处理")
            else:
                target_unit = special_config.get('target_unit', '瓶')

                quantity = result.get('quantity', 0)
                new_quantity = quantity * multiplier

                price = result.get('price', 0)
                new_price = price / multiplier if price else 0

                # A configured fixed price wins over the computed one.
                if 'fixed_price' in special_config:
                    new_price = special_config['fixed_price']
                    logger.info(f"特殊条码({barcode})使用固定单价: {new_price}")

                logger.info(f"特殊条码处理: {barcode}, 数量: {quantity} -> {new_quantity}, 单价: {price} -> {new_price}, 单位: {result.get('unit', '')} -> {target_unit}")

                result['quantity'] = new_quantity
                result['price'] = new_price
                result['unit'] = target_unit

        # Barcode remapping comes last so it can coexist with the overrides above.
        if 'map_to' in special_config:
            new_barcode = special_config['map_to']
            logger.info(f"条码映射: {barcode} -> {new_barcode}")
            result['barcode'] = new_barcode

        return result
class BoxUnitHandler(UnitHandler):
    """
    Converts products sold by the box ("箱") into single bottles ("瓶").
    """

    def can_handle(self, product: Dict[str, Any]) -> bool:
        """
        Tell whether the product's unit is a box unit.

        Args:
            product: Product dictionary.

        Returns:
            True when the stripped unit starts with "箱" (e.g. "箱", "箱装").
        """
        unit = str(product.get('unit', '')).strip()
        return unit.startswith('箱')

    def handle(self, product: Dict[str, Any], level1: int, level2: int, level3: Optional[int]) -> Dict[str, Any]:
        """
        Convert a box line: quantity x package count, price / package count,
        unit becomes "瓶".

        Args:
            product: Product dictionary.
            level1: First packaging level (unused here).
            level2: Second packaging level.
            level3: Third packaging level, or None for two-level specs.

        Returns:
            A converted copy of *product*. When the package count is not
            positive (e.g. an OCR-garbled spec such as ``1*0``) the copy is
            returned unchanged instead of raising ZeroDivisionError.
        """
        result = product.copy()

        quantity = result.get('quantity', 0)
        price = result.get('price', 0)

        # Items per box: level2 * level3, or just level2 for two-level specs.
        packaging_count = level2 * (level3 or 1)

        if packaging_count <= 0:
            # Dividing the price by zero would abort the whole row.
            logger.warning(f"箱单位处理: 包装数量无效({packaging_count}),保持原样 数量: {quantity}, 单价: {price}")
            return result

        new_quantity = quantity * packaging_count
        new_price = price / packaging_count if price else 0

        logger.info(f"箱单位处理: 数量: {quantity} -> {new_quantity}, 单价: {price} -> {new_price}, 单位: 箱 -> 瓶")

        result['quantity'] = new_quantity
        result['price'] = new_price
        result['unit'] = '瓶'

        return result
class GiftUnitHandler(UnitHandler):
    """
    Handles gift (free-of-charge) lines: quantities are still converted to
    single items, but the unit price is always forced to 0.
    """

    def can_handle(self, product: Dict[str, Any]) -> bool:
        """
        Tell whether the product is flagged as a gift.

        Args:
            product: Product dictionary.

        Returns:
            True only when ``product['is_gift']`` is exactly ``True``.
        """
        return product.get('is_gift', False) is True

    def handle(self, product: Dict[str, Any], level1: int, level2: int, level3: Optional[int]) -> Dict[str, Any]:
        """
        Convert the quantity of a gift line and zero its price.

        Units are matched the same way the other unit handlers in this file
        match them (stripped, prefix match), so values such as "件装" or
        " 箱" are converted as well:

        * "件" / "箱": quantity x (level2 * level3), unit becomes "瓶"
        * "提" / "盒" with a three-level spec: quantity x level3, unit "瓶"
        * anything else: quantity and unit stay as they are

        Args:
            product: Product dictionary.
            level1: First packaging level (unused here).
            level2: Second packaging level.
            level3: Third packaging level, or None for two-level specs.

        Returns:
            A converted copy of *product* whose ``price`` is 0.
        """
        result = product.copy()

        unit = result.get('unit', '')
        quantity = result.get('quantity', 0)
        # Normalised form used only for matching; logs keep the raw unit.
        unit_key = str(unit).strip()

        if unit_key.startswith(('件', '箱')):
            # Items per case: level2 * level3, or just level2 for two-level specs.
            packaging_count = level2 * (level3 or 1)
            new_quantity = quantity * packaging_count

            logger.info(f"赠品{unit}单位处理: 数量: {quantity} -> {new_quantity}, 单价: 0, 单位: {unit} -> 瓶")

            result['quantity'] = new_quantity
            result['unit'] = '瓶'
        elif unit_key.startswith(('提', '盒')) and level3 is not None:
            # Three-level spec: one 提/盒 holds level3 items.
            new_quantity = quantity * level3

            logger.info(f"赠品{unit}单位(三级规格)处理: 数量: {quantity} -> {new_quantity}, 单价: 0, 单位: {unit} -> 瓶")

            result['quantity'] = new_quantity
            result['unit'] = '瓶'
        else:
            logger.info(f"赠品{unit}单位处理: 保持原样 数量: {quantity}, 单价: 0, 单位: {unit}")

        # A gift never carries a price.
        result['price'] = 0

        return result
def get_purchase_orders(self) -> List[str]:
    """
    List the purchase-order workbooks found in ``data/result``.

    Only ``.xls`` / ``.xlsx`` files whose base name starts with ``采购单_``
    are returned. The directory is created when it does not exist.

    Returns:
        Matching file paths ordered by modification time, newest first;
        an empty list when nothing matches.
    """
    result_dir = "data/result"
    logger.info(f"搜索目录 {result_dir} 中的采购单Excel文件")

    # Make sure the search directory exists before scanning it.
    os.makedirs(result_dir, exist_ok=True)

    candidates = get_files_by_extensions(result_dir, ['.xls', '.xlsx'])
    orders = []
    for path in candidates:
        if os.path.basename(path).startswith('采购单_'):
            orders.append(path)

    if len(orders) == 0:
        logger.warning(f"未在 {result_dir} 目录下找到采购单Excel文件")
        return []

    # Newest first.
    newest_first = sorted(orders, key=os.path.getmtime, reverse=True)

    logger.info(f"找到 {len(newest_first)} 个采购单Excel文件")
    return newest_first
def merge_purchase_orders(self, file_paths: List[str]) -> Optional[pd.DataFrame]:
    """
    Read several purchase-order workbooks and aggregate them into one frame.

    Each readable workbook is reduced to the columns 条码 / 采购量 / 采购单价 /
    赠送量, cleaned, and filtered; rows sharing the same barcode and unit
    price (rounded to 4 decimals) are then summed together.

    Args:
        file_paths: Paths of the purchase-order files to merge.

    Returns:
        The aggregated frame sorted by barcode, with a gift quantity of 0
        replaced by ``pd.NA``; ``None`` when there is nothing to merge.
    """
    if not file_paths:
        logger.warning("没有需要合并的采购单文件")
        return None

    # Load everything up front; unreadable files come back as None.
    loaded = [
        frame
        for frame in (self.read_purchase_order(path) for path in file_paths)
        if frame is not None
    ]
    if not loaded:
        logger.warning("没有成功读取的采购单文件")
        return None

    logger.info(f"开始合并 {len(loaded)} 个采购单文件")

    mandatory = ['条码', '采购量', '采购单价']
    usable = []
    for position, frame in enumerate(loaded):
        absent = [name for name in mandatory if name not in frame.columns]
        if absent:
            logger.warning(f"数据帧 {position} 缺少必要的列: {absent}")
            continue

        # A missing gift column means "no gifts".
        if '赠送量' not in frame.columns:
            frame['赠送量'] = 0

        tidy = pd.DataFrame()
        # Barcodes become clean strings; missing values become ''.
        tidy['条码'] = frame['条码'].apply(lambda x: format_barcode(x) if pd.notna(x) else '')
        # Numeric columns: unparseable values become 0; the price keeps 4 decimals.
        for column, digits in (('采购量', None), ('采购单价', 4), ('赠送量', None)):
            numeric = pd.to_numeric(frame[column], errors='coerce').fillna(0)
            tidy[column] = numeric if digits is None else numeric.round(digits)

        # Rows without a barcode or without a positive purchase quantity are dropped.
        # NOTE(review): this also drops rows that only carry a gift quantity - confirm intended.
        keep = tidy['条码'].ne('') & tidy['采购量'].gt(0)
        kept = tidy.loc[keep]

        if kept.empty:
            logger.warning(f"处理文件 {position+1}: 没有有效记录")
        else:
            usable.append(kept)
            logger.info(f"处理文件 {position+1}: 有效记录 {len(kept)} 行")

    if not usable:
        logger.warning("没有有效的数据帧用于合并")
        return None

    combined = pd.concat(usable, ignore_index=True)

    # Round again so float noise cannot split identical prices into separate groups.
    combined['采购单价'] = combined['采购单价'].round(4)

    result = (
        combined
        .groupby(['条码', '采购单价'], as_index=False)
        .agg({'采购量': 'sum', '赠送量': 'sum'})
        .sort_values('条码')
        .reset_index(drop=True)
    )

    # A zero gift quantity is written out as "empty".
    result.loc[result['赠送量'] == 0, '赠送量'] = pd.NA

    logger.info(f"合并完成,共 {len(result)} 条商品记录")
    return result
def process(self, file_paths: Optional[List[str]] = None, progress_cb: Optional[Callable[[int], None]] = None) -> Optional[str]:
    """
    Run the whole merge: collect files, merge them, write the merged workbook.

    Args:
        file_paths: Files to merge; when None they are discovered through
            ``get_purchase_orders``.
        progress_cb: Optional callback receiving progress percentages
            (97, 98, 100). Errors raised by the callback are ignored.

    Returns:
        Path of the merged workbook, or None when there was nothing to merge
        or a step failed.
    """
    def report(percent: int) -> None:
        # Progress reporting is best-effort and must never break the merge.
        try:
            if progress_cb:
                progress_cb(percent)
        except Exception:
            pass

    if file_paths is None:
        file_paths = self.get_purchase_orders()
    report(97)

    if not file_paths:
        logger.warning("没有找到可合并的采购单文件")
        return None

    merged_df = self.merge_purchase_orders(file_paths)
    if merged_df is None:
        logger.error("合并采购单失败")
        return None
    report(98)

    output_file = self.create_merged_purchase_order(merged_df)
    if output_file is None:
        logger.error("创建合并采购单文件失败")
        return None
    report(100)

    # Remember which merged workbook each source file ended up in.
    for source_path in file_paths:
        self.merged_files[source_path] = output_file
    self._save_merged_files()

    return output_file
def get_latest_excel(self) -> Optional[str]:
    """
    Return the newest Excel workbook in the output directory.

    The lookup is delegated to ``get_latest_file`` restricted to ``.xlsx`` /
    ``.xls``. When that newest file is itself a purchase order (base name
    starting with ``采购单_``) nothing is returned.

    Returns:
        Path of the newest workbook, or None when no file was found or the
        newest one is a purchase order.
    """
    logger.info(f"搜索目录 {self.output_dir} 中的Excel文件")

    # An empty pattern means "any file name"; only the extension is restricted.
    candidate = get_latest_file(
        self.output_dir,
        pattern="",
        extensions=['.xlsx', '.xls']
    )

    if not candidate:
        logger.warning(f"未在 {self.output_dir} 目录下找到未处理的Excel文件")
        return None

    # Purchase orders are outputs of this tool, not inputs.
    is_purchase_order = os.path.basename(candidate).startswith('采购单_')
    if is_purchase_order:
        logger.warning(f"找到的最新文件是采购单,不作处理: {candidate}")
        return None

    logger.info(f"找到最新的Excel文件: {candidate}")
    return candidate
def extract_product_info(self, df: pd.DataFrame) -> List[Dict]:
    """
    Extract one product dictionary per usable row of the data frame.

    For every row the method reads barcode, name, unit, price, amount,
    quantity and specification (preferring the fixed Chinese column names and
    falling back to the detected column mapping), flags gifts from the
    amount, infers a missing specification/unit, and finally passes the
    product through ``self.unit_converter.process_unit_conversion``.

    Rows are skipped when the barcode is empty or when a remark-like column
    contains a non-purchase keyword. Any exception raised while handling a
    row is logged and that row is skipped.

    Args:
        df: Data frame holding the recognised order table.

    Returns:
        List of product dictionaries.
    """
    products = []

    # Detect which data-frame columns hold the standard fields.
    column_mapping = self._detect_column_mapping(df)
    logger.info(f"检测到列映射: {column_mapping}")

    # Process every row.
    for idx, row in df.iterrows():
        try:
            # Start from an empty product record.
            product = {
                'barcode': '',  # barcode
                'name': '',  # product name
                'specification': '',  # specification
                'quantity': 0,  # quantity
                'unit': '',  # unit
                'price': 0,  # unit price
                'amount': 0,  # line amount
                'is_gift': False  # whether the line is a gift
            }

            # Barcode: fixed column name first, then the detected mapping.
            if '条码' in df.columns and not pd.isna(row['条码']):
                product['barcode'] = str(row['条码']).strip()
            elif column_mapping.get('barcode') and not pd.isna(row[column_mapping['barcode']]):
                product['barcode'] = str(row[column_mapping['barcode']]).strip()

            # Rows without a barcode are skipped.
            if not product['barcode']:
                continue

            # Inspect remark-like columns and drop exchange / return / void rows.
            skip_row = False
            for col in df.columns:
                col_str = str(col)
                if any(k in col_str for k in ['备注', '说明', '类型', '备注1']):
                    val = str(row[col]).strip()
                    # Common keywords that mark a non-purchase line.
                    if any(k in val for k in ['换货', '退货', '作废', '减钱', '冲减', '赠品单', '补货']):
                        logger.info(f"过滤非采购行: {product['barcode']} - {product.get('name', '')}, 原因: {col_str}包含 '{val}'")
                        skip_row = True
                        break
            if skip_row:
                continue

            # Product name.
            if '商品名称' in df.columns and not pd.isna(row['商品名称']):
                product['name'] = str(row['商品名称']).strip()
            elif '名称' in df.columns and not pd.isna(row['名称']):
                product['name'] = str(row['名称']).strip()
            elif column_mapping.get('name') and not pd.isna(row[column_mapping['name']]):
                product['name'] = str(row[column_mapping['name']]).strip()

            # Unit.
            if '单位' in df.columns and not pd.isna(row['单位']):
                product['unit'] = str(row['单位']).strip()
            elif column_mapping.get('unit') and not pd.isna(row[column_mapping['unit']]):
                product['unit'] = str(row[column_mapping['unit']]).strip()

            # Unit price (stored as read from the cell, without conversion).
            if '单价' in df.columns and not pd.isna(row['单价']):
                product['price'] = row['单价']
            elif column_mapping.get('price') and not pd.isna(row[column_mapping['price']]):
                product['price'] = row[column_mapping['price']]

            # Line amount.
            if '金额' in df.columns and not pd.isna(row['金额']):
                product['amount'] = row['金额']
            elif '小计' in df.columns and not pd.isna(row['小计']):
                product['amount'] = row['小计']
            elif column_mapping.get('amount') and not pd.isna(row[column_mapping['amount']]):
                product['amount'] = row[column_mapping['amount']]
            # Gift detection from the amount: zero, empty, or the letter o/O.
            # NOTE(review): 'amount' defaults to 0 above, so a sheet without any
            # amount column reaches the final branch with amt == 0; if
            # parse_monetary_string(0) yields 0.0 every such row is flagged as a
            # gift - confirm this is intended.
            amt = product.get('amount', None)
            try:
                is_amt_gift = False
                if amt is None:
                    is_amt_gift = True
                elif isinstance(amt, str):
                    parsed = parse_monetary_string(amt)
                    is_amt_gift = (parsed is None or parsed == 0.0)
                else:
                    parsed = parse_monetary_string(amt)
                    is_amt_gift = (parsed is not None and parsed == 0.0)
                if is_amt_gift:
                    product['is_gift'] = True
            except Exception:
                # Gift detection is best-effort; a parse failure leaves the flag untouched.
                pass

            # Quantity.
            if '数量' in df.columns and not pd.isna(row['数量']):
                product['quantity'] = row['数量']
            elif column_mapping.get('quantity') and not pd.isna(row[column_mapping['quantity']]):
                product['quantity'] = row[column_mapping['quantity']]

            # Compound quantity strings such as "2箱" or "3件": split number and unit.
            if isinstance(product['quantity'], str) and product['quantity']:
                num, unit = self.unit_converter.extract_unit_from_quantity(product['quantity'])
                if unit:
                    product['unit'] = unit
                if num is not None:
                    product['quantity'] = num

            # Specification and package quantity.
            if '规格' in df.columns and not pd.isna(row['规格']):
                product['specification'] = str(row['规格'])
                # Repair the OCR misread "4.51*4" -> "4.5L*4" (trailing 1 read instead of L).
                product['specification'] = re.sub(r'(\d+\.\d+)1\*(\d+)', r'\1L*\2', product['specification'])
                package_quantity = self.parse_specification(product['specification'])
                if package_quantity:
                    product['package_quantity'] = package_quantity
                    logger.info(f"解析规格: {product['specification']} -> 包装数量={package_quantity}")
            elif column_mapping.get('specification') and not pd.isna(row[column_mapping['specification']]):
                product['specification'] = str(row[column_mapping['specification']])
                # Repair the OCR misread "4.51*4" -> "4.5L*4" (trailing 1 read instead of L).
                product['specification'] = re.sub(r'(\d+\.\d+)1\*(\d+)', r'\1L*\2', product['specification'])
                package_quantity = self.parse_specification(product['specification'])
                if package_quantity:
                    product['package_quantity'] = package_quantity
                    logger.info(f"从映射列解析规格: {product['specification']} -> 包装数量={package_quantity}")
            else:
                # Only when the sheet provides no specification: infer it from the name.
                if product['name']:
                    # First look for a "volume*count" pattern inside the name.
                    container_pattern = r'.*?(\d+(?:\.\d+)?)\s*(?:ml|[mM][lL]|[lL]|升|毫升)[*×xX](\d+).*'
                    match = re.search(container_pattern, product['name'])
                    if match:
                        # e.g. "1.8L*8瓶": the count part is the package quantity.
                        # NOTE(review): the spec is always written with an "L"
                        # suffix, even when the matched unit was ml/毫升.
                        volume = match.group(1)
                        count = match.group(2)
                        inferred_spec = f"{volume}L*{count}"
                        inferred_qty = int(count)
                        product['specification'] = inferred_spec
                        product['package_quantity'] = inferred_qty
                        logger.info(f"从商品名称提取容量*数量格式: {product['name']} -> {inferred_spec}, 包装数量={inferred_qty}")
                    # Older "weight/volume * number" handling.
                    else:
                        weight_volume_pattern = r'.*?\d+(?:g|ml|毫升|克)[*xX×](\d+)'
                        match = re.search(weight_volume_pattern, product['name'])
                        if match:
                            inferred_spec = f"1*{match.group(1)}"
                            inferred_qty = int(match.group(1))
                            product['specification'] = inferred_spec
                            product['package_quantity'] = inferred_qty
                            logger.info(f"从商品名称提取重量/容量规格: {product['name']} -> {inferred_spec}, 包装数量={inferred_qty}")
                        else:
                            # General name-based inference via the unit converter.
                            inferred_spec = self.unit_converter.infer_specification_from_name(product['name'])
                            if inferred_spec:
                                product['specification'] = inferred_spec
                                package_quantity = self.parse_specification(inferred_spec)
                                if package_quantity:
                                    product['package_quantity'] = package_quantity
                                    logger.info(f"从商品名称推断规格: {product['name']} -> {inferred_spec}, 包装数量={package_quantity}")

            # A specification is set but no package quantity has been derived yet.
            if product.get('specification') and not product.get('package_quantity'):
                package_quantity = self.parse_specification(product['specification'])
                if package_quantity:
                    product['package_quantity'] = package_quantity
                    logger.info(f"解析已设置的规格: {product['specification']} -> 包装数量={package_quantity}")

            # No unit given: a "volume*count" or "n*m" specification implies the unit "件".
            if not product['unit'] and product.get('barcode') and product.get('specification') and product.get('quantity') and product.get('price') is not None:
                # Does the specification look like volume*count?
                volume_pattern = r'(\d+(?:\.\d+)?)\s*(?:ml|[mL]L|l|L|升|毫升)[*×xX](\d+)'
                match = re.search(volume_pattern, product['specification'])

                # Decide whether the unit should be inferred as "件".
                if match:
                    product['unit'] = '件'
                    logger.info(f"根据规格推断单位: {product['specification']} -> 单位=件")
                else:
                    # Plain count*count form.
                    simple_pattern = r'(\d+)[*×xX](\d+)'
                    match = re.search(simple_pattern, product['specification'])
                    if match:
                        product['unit'] = '件'
                        logger.info(f"根据规格推断单位: {product['specification']} -> 单位=件")

            # Apply the unit conversion rules.
            product = self.unit_converter.process_unit_conversion(product)

            # Quantity missing or 0 while price and amount exist: quantity = amount / price.
            if (product['quantity'] == 0 or product['quantity'] is None) and product['price'] > 0 and product['amount']:
                try:
                    amount = parse_monetary_string(product['amount'])
                    if amount is not None and amount > 0:
                        quantity = amount / product['price']
                        logger.info(f"数量为空或为0,通过金额({amount})和单价({product['price']})计算得出数量: {quantity}")
                        product['quantity'] = quantity
                except Exception as e:
                    logger.warning(f"通过金额和单价计算数量失败: {e}")

            products.append(product)
        except Exception as e:
            # One bad row must not abort the whole sheet: log it and move on.
            logger.error(f"提取第{idx+1}行商品信息时出错: {e}", exc_info=True)
            continue

    logger.info(f"提取到 {len(products)} 个商品信息")
    return products
logger.info(f"发现正常商品:条码{barcode}, 数量={quantity}, 单价={price}") + else: + # 如果有多个正常商品记录,累加数量 + barcode_groups[barcode]['normal']['quantity'] += quantity + logger.info(f"累加正常商品数量:条码{barcode}, 新增={quantity}, 累计={barcode_groups[barcode]['normal']['quantity']}") + + # 如果单价不同,取平均值 + if price != barcode_groups[barcode]['normal']['price']: + avg_price = (barcode_groups[barcode]['normal']['price'] + price) / 2 + barcode_groups[barcode]['normal']['price'] = avg_price + logger.info(f"调整单价(取平均值):条码{barcode}, 原价={barcode_groups[barcode]['normal']['price']}, 新价={price}, 平均={avg_price}") + + # 输出调试信息 + logger.info(f"分组后共{len(barcode_groups)} 个不同条码的商品") + for barcode, group in barcode_groups.items(): + if group['normal'] is not None: + logger.info(f"条码 {barcode} 处理结果:正常商品数量{group['normal']['quantity']},单价{group['normal']['price']},赠品数量{group['gift_quantity']}") + else: + logger.info(f"条码 {barcode} 处理结果:只有赠品,数量={group['gift_quantity']}") + + # 准备填充数据 + row_index = 1 # 从第2行开始填充(索引从0开始) + + for barcode, group in barcode_groups.items(): + # 1. 列B(1): 条码(必填) + output_sheet.write(row_index, 1, barcode) + + if group['normal'] is not None: + # 有正常商品 + product = group['normal']['product'] + + # 2. 列C(2): 采购量(必填) 使用正常商品的采购量 + normal_quantity = group['normal']['quantity'] + output_sheet.write(row_index, 2, normal_quantity) + + # 3. 列D(3): 赠送量 - 添加赠品数量 + if group['gift_quantity'] > 0: + output_sheet.write(row_index, 3, group['gift_quantity']) + logger.info(f"条码 {barcode} 填充:采购量={normal_quantity},赠品数量{group['gift_quantity']}") + + # 4. 
列E(4): 采购单价(必填) + purchase_price = group['normal']['price'] + style = xlwt.XFStyle() + style.num_format_str = '0.0000' + output_sheet.write(row_index, 4, round(purchase_price, 4), style) + else: + # 只有赠品,没有正常商品 + # 采购量填0,赠送量填赠品数量 + output_sheet.write(row_index, 2, 0) # 采购量为0 + output_sheet.write(row_index, 3, group['gift_quantity']) # 赠送量 + output_sheet.write(row_index, 4, 0) # 单价为0 + + logger.info(f"条码 {barcode} 填充:仅有赠品,采购量=0,赠品数量={group['gift_quantity']}") + + # 移到下一行 + row_index += 1 + + # 保存文件 + output_workbook.save(output_file_path) + logger.info(f"采购单已保存到: {output_file_path}") + return True + + except Exception as e: + logger.error(f"填充模板时出错: {e}") + return False + + def _find_header_row(self, df: pd.DataFrame) -> Optional[int]: + """自动识别表头行,委托给 ColumnMapper.detect_header_row""" + result = ColumnMapper.detect_header_row(df, max_rows=30) + if result >= 0: + logger.info(f"找到表头行: 第{result+1}行") + return result + # 回退:找第一个非空行 + for row in range(len(df)): + if df.iloc[row].notna().sum() > 3: + logger.info(f"未找到明确表头,使用第一个有效行: 第{row+1}行") + return row + logger.warning("无法识别表头行") + return None + + def process_specific_file(self, file_path: str, progress_cb: Optional[Callable[[int], None]] = None) -> Optional[str]: + """ + 处理指定的Excel文件 + + Args: + file_path: Excel文件路径 + + Returns: + 输出文件路径,如果处理失败则返回None + """ + logger.info(f"开始处理Excel文件: {file_path}") + + if not os.path.exists(file_path): + logger.error(f"文件不存在: {file_path}") + return None + + try: + # 读取Excel文件时不立即指定表头 + if progress_cb: + try: + progress_cb(92) + except Exception: + pass + df = pd.read_excel(file_path, header=None) + logger.info(f"成功读取Excel文件: {file_path}, 共 {len(df)} 行") + + # 自动识别表头行 + header_row = self._find_header_row(df) + if header_row is None: + logger.error("无法识别表头行") + return None + + logger.info(f"识别到表头在第 {header_row+1} 行") + + # 重新设置表头,避免二次读取 + if progress_cb: + try: + progress_cb(94) + except Exception: + pass + + # 使用识别到的表头行设置列名,并过滤掉表头之前的行 + df.columns = df.iloc[header_row] + df = 
df.iloc[header_row + 1:].reset_index(drop=True) + + logger.info(f"重新整理数据结构,共 {len(df)} 行有效数据") + + # 提取商品信息 + if progress_cb: + try: + progress_cb(96) + except Exception: + pass + products = self.extract_product_info(df) + + if not products: + logger.warning("未提取到有效商品信息") + return None + + # 生成输出文件名,保存到data/result目录 + file_name = os.path.splitext(os.path.basename(file_path))[0] + result_dir = "data/result" + os.makedirs(result_dir, exist_ok=True) + output_file = os.path.join(result_dir, f"采购单_{file_name}.xls") + + # 填充模板并保存 + if self.fill_template(products, output_file): + # 记录已处理文件 + self.processed_files[file_path] = output_file + self._save_processed_files() + + # 不再自动打开输出目录 + logger.info(f"采购单已保存到: {output_file}") + if progress_cb: + try: + progress_cb(100) + except Exception: + pass + + return output_file + + return None + + except Exception as e: + logger.error(f"处理Excel文件时出错: {file_path}, 错误: {e}") + return None + + def process_latest_file(self, progress_cb: Optional[Callable[[int], None]] = None) -> Optional[str]: + """ + 处理最新的Excel文件 + + Returns: + 输出文件路径,如果处理失败则返回None + """ + # 获取最新的Excel文件 + latest_file = self.get_latest_excel() + if not latest_file: + logger.warning("未找到可处理的Excel文件") + return None + + # 处理文件 + return self.process_specific_file(latest_file, progress_cb=progress_cb) + + def _detect_column_mapping(self, df: pd.DataFrame) -> Dict[str, str]: + """ + 自动检测列名映射 + + Args: + df: 数据框 + + Returns: + 列名映射字典,键为标准列名,值为实际列名 + """ + # 提取有用的列 + barcode_cols = self.extract_barcode(df) + + # 如果没有找到条码列,无法继续处理 + if not barcode_cols: + logger.error("未找到条码列,无法处理") + return {} + + # 使用 ColumnMapper 统一查找列名 + mapped_columns = {'barcode': barcode_cols[0]} + logger.info(f"使用条码列: {mapped_columns['barcode']}") + + # 内部键名 -> 标准列名映射 (processor.py 使用 price/amount 作为内部键名) + field_map = [ + ('name', 'name'), + ('specification', 'specification'), + ('quantity', 'quantity'), + ('unit', 'unit'), + ('price', 'unit_price'), + ('amount', 'total_price'), + ] + + for internal_key, 
standard_name in field_map: + matched = ColumnMapper.find_column(list(df.columns), standard_name) + if matched: + mapped_columns[internal_key] = matched + logger.info(f"找到{internal_key}列: {matched}") + + return mapped_columns + + def infer_specification_from_name(self, product_name: str) -> Tuple[Optional[str], Optional[int]]: + """ + 从商品名称推断规格 + 根据特定的命名规则匹配规格信息 + + Args: + product_name: 商品名称 + + Returns: + 规格字符串和包装数量的元组 + """ + if not product_name or not isinstance(product_name, str): + logger.warning(f"无效的商品名: {product_name}") + return None, None + + product_name = product_name.strip() + + # 特殊处理:重量/容量*数字格式 + weight_volume_pattern = r'.*?\d+(?:g|ml|毫升|克)[*xX×](\d+)' + match = re.search(weight_volume_pattern, product_name) + if match: + inferred_spec = f"1*{match.group(1)}" + inferred_qty = int(match.group(1)) + logger.info(f"从商品名称提取重量/容量规格: {product_name} -> {inferred_spec}, 包装数量={inferred_qty}") + return inferred_spec, inferred_qty + + # 使用单位转换器推断规格 + inferred_spec = self.unit_converter.infer_specification_from_name(product_name) + if inferred_spec: + # 解析规格中的包装数量 + package_quantity = self.parse_specification(inferred_spec) + if package_quantity: + logger.info(f"从商品名称推断规格: {product_name} -> {inferred_spec}, 包装数量={package_quantity}") + return inferred_spec, package_quantity + + # 特定商品规则匹配 + spec_rules = [ + # XX入白膜格式,如"550纯净水24入白膜" + (r'.*?(\d+)入白膜', lambda m: (f"1*{m.group(1)}", int(m.group(1)))), + + # 白膜格式,如"550水24白膜" + (r'.*?(\d+)白膜', lambda m: (f"1*{m.group(1)}", int(m.group(1)))), + + # 445水溶C系列 + (r'445水溶C.*?(\d+)[入个]纸箱', lambda m: (f"1*{m.group(1)}", int(m.group(1)))), + + # 东方树叶系列 + (r'东方树叶.*?(\d+\*\d+).*纸箱', lambda m: (m.group(1), int(m.group(1).split('*')[1]))), + + # 桶装 + (r'(\d+\.?\d*L)桶装', lambda m: (f"{m.group(1)}*1", 1)), + + # 树叶茶系 + (r'树叶.*?(\d+)[入个]纸箱', lambda m: (f"1*{m.group(1)}", int(m.group(1)))), + + # 茶π系列 + (r'茶[πΠπ].*?(\d+)纸箱', lambda m: (f"1*{m.group(1)}", int(m.group(1)))), + + # 通用入数匹配 + (r'.*?(\d+)[入个](?:纸箱|箱装|白膜)', lambda m: 
(f"1*{m.group(1)}", int(m.group(1)))), + + # 通用数字+纸箱格式 + (r'.*?(\d+)纸箱', lambda m: (f"1*{m.group(1)}", int(m.group(1)))) + ] + + # 尝试所有规则 + for pattern, formatter in spec_rules: + match = re.search(pattern, product_name) + if match: + spec, qty = formatter(match) + logger.info(f"根据特定规则推断规格: {product_name} -> {spec}, 包装数量={qty}") + return spec, qty + + # 尝试直接从名称中提取数字*数字格式 + match = re.search(r'(\d+\*\d+)', product_name) + if match: + spec = match.group(1) + package_quantity = self.parse_specification(spec) + if package_quantity: + logger.info(f"从名称中直接提取规格: {spec}, 包装数量={package_quantity}") + return spec, package_quantity + + # 最后尝试提取任何位置的数字,默认典型件装数 + numbers = re.findall(r'\d+', product_name) + if numbers: + for num in numbers: + # 检查是否为典型的件装数(12/15/24/30) + if num in ['12', '15', '24', '30']: + spec = f"1*{num}" + logger.info(f"从名称中提取可能的件装数: {spec}, 包装数量={int(num)}") + return spec, int(num) + + logger.warning(f"无法从商品名'{product_name}' 推断规格") + return None, None + + def parse_specification(self, spec_str: str) -> Optional[int]: + """ + 解析规格字符串,提取包装数量 + 支持格式:1*15, 1x15, 1*5*10, 5kg*6, IL*12等 + + Args: + spec_str: 规格字符串 + + Returns: + 包装数量,如果无法解析则返回None + """ + if not spec_str or not isinstance(spec_str, str): + return None + + try: + # 清理规格字符串 + spec_str = clean_string(spec_str) + + # 处理可能的OCR误识别,如"IL"应为"1L","6oo"应为"600" + spec_str = re.sub(r'(\b|^)[iIlL](\d+)', r'1\2', spec_str) # 将"IL"替换为"1L" + spec_str = re.sub(r'(\d+)[oO0]{2,}', lambda m: m.group(1) + '00', spec_str) # 将"6oo"替换为"600" + spec_str = spec_str.replace('×', '*').replace('x', '*').replace('X', '*') # 统一乘号 + + logger.debug(f"清理后的规格字符串: {spec_str}") + + # 新增:匹配“1件=12桶/袋/盒…”等等式规格,取右侧数量作为包装数量 + eq_match = re.search(r'(\d+(?:\.\d+)?)\s*(?:件|箱|提|盒)\s*[==]\s*(\d+)\s*(?:瓶|桶|盒|支|个|袋|罐|包|卷)', spec_str) + if eq_match: + return int(eq_match.group(2)) + + # 匹配带单位的格式,如"5kg*6"、"450g*15"、"450ml*15" + weight_pattern = r'(\d+(?:\.\d+)?)\s*(?:kg|KG|千克|公斤)[*×](\d+)' + match = re.search(weight_pattern, spec_str) + if match: 
class ProductValidator:
    """
    Product data validator: checks and repairs the barcode, quantity and
    price fields of a product record.
    """

    def __init__(self):
        """Initialise the validator."""
        # Cell values that mark a warehouse row rather than a real barcode.
        self.warehouse_identifiers = ["仓库", "仓库全名", "warehouse"]

    def validate_barcode(self, barcode: Any) -> Tuple[bool, str, Optional[str]]:
        """
        Validate and repair a barcode.

        Non-digit characters are removed. A barcode longer than 8 digits that
        starts with '5' (but not '53') has its first digit replaced by '6'.
        A barcode longer than 13 digits is shortened by dropping trailing
        zeros. The result must be 8 to 13 digits long.

        Args:
            barcode: Raw barcode value.

        Returns:
            ``(is_valid, repaired_barcode, error_message)``.
        """
        if barcode is None:
            return False, "", "条码为空"

        barcode_str = str(barcode).strip()

        # Warehouse marker rows are not products.
        if barcode_str in self.warehouse_identifiers:
            return False, barcode_str, "条码为仓库标识"

        # Keep digits only.
        barcode_clean = re.sub(r'\D', '', barcode_str)
        if not barcode_clean:
            return False, barcode_str, "条码不包含数字"

        # Known misread: leading 5 instead of 6 (53-prefixed codes are left alone).
        if len(barcode_clean) > 8 and barcode_clean.startswith('5') and not barcode_clean.startswith('53'):
            original_barcode = barcode_clean
            barcode_clean = '6' + barcode_clean[1:]
            logger.info(f"修正条码前缀 5->6: {original_barcode} -> {barcode_clean}")

        # Over-long barcodes: strip trailing zeros down to at most 13 digits.
        if len(barcode_clean) > 13:
            original_length = len(barcode_clean)
            if barcode_clean.endswith('0'):
                while len(barcode_clean) > 13 and barcode_clean.endswith('0'):
                    barcode_clean = barcode_clean[:-1]
                logger.info(f"修正条码长度: 从{original_length}位截断到{len(barcode_clean)}位")
            else:
                error_message = f"条码长度异常: {barcode_clean}, 长度={len(barcode_clean)}"
                logger.warning(error_message)
                return False, barcode_clean, error_message

        if len(barcode_clean) < 8 or len(barcode_clean) > 13:
            error_message = f"条码长度异常: {barcode_clean}, 长度={len(barcode_clean)}"
            logger.warning(error_message)
            return False, barcode_clean, error_message

        # No separate isdigit() check is needed: after the \D substitution
        # above the string can only contain digits.

        # Explicitly whitelisted barcode; logged separately.
        if barcode_clean == "5321545613":
            logger.info(f"特殊条码验证通过: {barcode_clean}")
            return True, barcode_clean, None

        logger.debug(f"条码验证通过: {barcode_clean}")
        return True, barcode_clean, None

    def validate_quantity(self, quantity: Any) -> Tuple[bool, float, Optional[str]]:
        """
        Validate and repair a quantity.

        Strings are stripped of everything except digits and dots before
        conversion. None and NaN (an empty spreadsheet cell) are both reported
        as an empty quantity, so callers can fall back to amount / price.

        Args:
            quantity: Raw quantity value.

        Returns:
            ``(is_valid, repaired_quantity, error_message)``.
        """
        import math  # local import keeps this class self-contained

        if quantity is None:
            return False, 0.0, "数量为空"

        raw_value = quantity
        if isinstance(quantity, str):
            raw_value = re.sub(r'[^\d\.]', '', quantity.strip())
            if not raw_value:
                return False, 0.0, "数量不包含数字"

        try:
            quantity_value = float(raw_value)
        except (ValueError, TypeError):
            return False, 0.0, f"无法将数量 '{quantity}' 转换为数字"

        # NaN compares False against everything, so without this check it
        # would slip through the "> 0" test below and be reported as valid.
        if math.isnan(quantity_value):
            return False, 0.0, "数量为空"

        if quantity_value <= 0:
            return False, 0.0, f"数量必须大于0,当前值: {quantity_value}"

        return True, quantity_value, None

    def validate_price(self, price: Any) -> Tuple[bool, float, bool, Optional[str]]:
        """
        Validate and repair a unit price.

        A missing (None/NaN), zero, negative or unparseable price is treated
        as a gift, as are the literal gift markers.

        Args:
            price: Raw unit price value.

        Returns:
            ``(is_valid, repaired_price, is_gift, error_message)``.
        """
        import math  # local import keeps this class self-contained

        if price is None:
            return False, 0.0, True, "单价为空,视为赠品"

        if isinstance(price, str):
            price_str = price.strip().lower()
            if price_str in ["赠品", "gift", "赠送", "0", ""]:
                return True, 0.0, True, None

            price_value = parse_monetary_string(price_str)
            if price_value is None:
                return False, 0.0, True, f"无法将单价 '{price}' 转换为数字,视为赠品"
        else:
            try:
                price_value = float(price)
            except (ValueError, TypeError):
                return False, 0.0, True, f"无法将单价 '{price}' 转换为数字,视为赠品"

        # NaN (empty cell) would otherwise pass both comparisons below and be
        # returned as a valid, non-gift price.
        if isinstance(price_value, float) and math.isnan(price_value):
            return False, 0.0, True, "单价为空,视为赠品"

        if price_value == 0:
            return True, 0.0, True, None

        if price_value < 0:
            return False, 0.0, True, f"单价不能为负数: {price_value},视为赠品"

        return True, price_value, False, None

    def validate_product(self, product: Dict[str, Any]) -> Dict[str, Any]:
        """
        Validate and repair a product record.

        Args:
            product: Product dict; ``barcode``, ``price``, ``amount`` and
                ``quantity`` are read.

        Returns:
            A shallow copy of ``product`` with repaired ``barcode``, ``price``
            and ``quantity`` and, where detected, ``is_gift`` set to True.
        """
        # Work on a copy so the caller's dict is left untouched.
        validated_product = product.copy()

        # Barcode: the repaired value is used even when validation fails,
        # as long as there is one.
        is_valid, fixed_barcode, error_msg = self.validate_barcode(product.get('barcode', ''))
        if is_valid:
            validated_product['barcode'] = fixed_barcode
        else:
            logger.warning(f"条码验证失败: {error_msg}")
            if fixed_barcode:
                validated_product['barcode'] = fixed_barcode

        # Price.
        is_valid, fixed_price, is_gift, error_msg = self.validate_price(product.get('price', 0))
        validated_product['price'] = fixed_price
        if is_gift:
            validated_product['is_gift'] = True
            if error_msg:
                logger.info(error_msg)

        # An amount that parses to nothing or to 0 also marks the record as a gift.
        # NOTE(review): a record with no 'amount' key at all is marked as a
        # gift too if parse_monetary_string(None) returns None - confirm that
        # this is intended.
        try:
            parsed_amount = parse_monetary_string(product.get('amount', None))
        except Exception as e:
            # Best effort: an unparseable amount must not fail validation,
            # but it should leave a trace.
            logger.debug(f"解析金额失败,跳过金额赠品判断: {e}")
        else:
            if parsed_amount is None or parsed_amount == 0.0:
                validated_product['is_gift'] = True

        # Quantity.
        is_valid, fixed_quantity, error_msg = self.validate_quantity(product.get('quantity', None))

        # Empty quantity but usable price and amount: quantity = amount / price.
        if not is_valid and error_msg == "数量为空":
            amount = product.get('amount', None)
            if fixed_price > 0 and amount is not None:
                try:
                    amount = parse_monetary_string(amount)
                    if amount is None:
                        raise ValueError("无法解析金额")

                    if amount > 0:
                        fixed_quantity = amount / fixed_price
                        logger.info(f"数量为空,通过金额({amount})和单价({fixed_price})计算得出数量: {fixed_quantity}")
                        is_valid = True
                except (ValueError, TypeError, ZeroDivisionError) as e:
                    logger.warning(f"通过金额和单价计算数量失败: {e}")

        if is_valid:
            validated_product['quantity'] = fixed_quantity
        else:
            logger.warning(f"数量验证失败: {error_msg}")
            validated_product['quantity'] = 0.0

        return validated_product
class DataCalculator:
    """Rule-driven calculator for pandas DataFrames.

    Rules are queued with :meth:`add_rule` (or the fluent helpers such as
    :meth:`multiply`) and applied in insertion order by :meth:`calculate`.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialise the calculator.

        Args:
            config: Optional calculation configuration; stored as-is.
        """
        self.config = config or {}
        self.calculation_rules = []

    def add_rule(self, rule_type: str, **kwargs):
        """Queue a calculation rule.

        Args:
            rule_type: Rule type name (see :meth:`_apply_rule`).
            **kwargs: Rule parameters, stored next to the type.
        """
        self.calculation_rules.append({'type': rule_type, **kwargs})
        logger.debug(f"添加计算规则: {rule_type}")

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply all queued rules to a copy of ``df``.

        A rule that raises is logged and skipped; the remaining rules still run.

        Args:
            df: Input data (not modified).

        Returns:
            The processed copy.
        """
        logger.info(f"开始数据计算,原始数据形状: {df.shape}")

        result_df = df.copy()
        total = len(self.calculation_rules)

        for position, rule in enumerate(self.calculation_rules, start=1):
            try:
                logger.debug(f"执行计算规则 {position}/{total}: {rule['type']}")
                result_df = self._apply_rule(result_df, rule)
                logger.debug(f"规则执行完成,数据形状: {result_df.shape}")
            except Exception as e:
                # Deliberately best-effort: one failing rule must not abort the pipeline.
                logger.error(f"计算规则执行失败: {rule}, 错误: {e}")

        logger.info(f"数据计算完成,最终数据形状: {result_df.shape}")
        return result_df

    def _apply_rule(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Dispatch a single rule to its handler.

        Args:
            df: Data to process.
            rule: Rule configuration; ``rule['type']`` selects the handler.

        Returns:
            The processed data; ``df`` unchanged for an unknown rule type.
        """
        handlers = {
            'multiply': self._multiply,
            'divide': self._divide,
            'add': self._add,
            'subtract': self._subtract,
            'formula': self._formula,
            'round': self._round,
            'sum': self._sum,
            'aggregate': self._aggregate,
        }
        rule_type = rule.get('type')
        handler = handlers.get(rule_type)
        if handler is None:
            logger.warning(f"未知的计算规则类型: {rule_type}")
            return df
        return handler(df, rule)

    def _multiply(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Write ``source_column * factor`` into ``target_column``."""
        source_column = rule.get('source_column')
        target_column = rule.get('target_column')
        factor = rule.get('factor', 1)

        if source_column and target_column:
            if source_column in df.columns:
                df[target_column] = df[source_column] * factor
                logger.debug(f"乘法计算: {source_column} * {factor} -> {target_column}")
            else:
                logger.warning(f"源列不存在: {source_column}")

        return df

    def _divide(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Write ``source_column / divisor`` into ``target_column``; a zero divisor is rejected."""
        source_column = rule.get('source_column')
        target_column = rule.get('target_column')
        divisor = rule.get('divisor', 1)

        if source_column and target_column and divisor != 0:
            if source_column in df.columns:
                df[target_column] = df[source_column] / divisor
                logger.debug(f"除法计算: {source_column} / {divisor} -> {target_column}")
            else:
                logger.warning(f"源列不存在: {source_column}")
        elif divisor == 0:
            logger.error("除数不能为0")

        return df

    def _add(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Write the row-wise sum of ``columns`` plus ``constant`` into ``target_column``.

        Without ``columns`` the constant is added to ``target_column`` itself.
        """
        columns = rule.get('columns', [])
        target_column = rule.get('target_column')
        constant = rule.get('constant', 0)

        if not target_column:
            return df

        if isinstance(columns, str):
            columns = [columns]

        if columns:
            valid_columns = [col for col in columns if col in df.columns]
            if valid_columns:
                df[target_column] = df[valid_columns].sum(axis=1) + constant
                logger.debug(f"加法计算: {valid_columns} + {constant} -> {target_column}")
            else:
                logger.warning(f"没有有效的列用于加法计算: {columns}")
        elif target_column in df.columns:
            df[target_column] = df[target_column] + constant
            logger.debug(f"加法计算: {target_column} + {constant}")
        else:
            logger.warning(f"目标列不存在: {target_column}")

        return df

    def _subtract(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Write ``minuend - subtrahend - constant`` into ``target_column``.

        The subtrahend column is optional; without it only the constant is subtracted.
        """
        minuend = rule.get('minuend')
        subtrahend = rule.get('subtrahend')
        target_column = rule.get('target_column')
        constant = rule.get('constant', 0)

        if target_column and minuend and minuend in df.columns:
            if subtrahend and subtrahend in df.columns:
                df[target_column] = df[minuend] - df[subtrahend] - constant
                logger.debug(f"减法计算: {minuend} - {subtrahend} - {constant} -> {target_column}")
            else:
                df[target_column] = df[minuend] - constant
                logger.debug(f"减法计算: {minuend} - {constant} -> {target_column}")
        else:
            logger.warning("减法计算参数不完整或列不存在")

        return df

    def _formula(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Evaluate ``formula`` with ``DataFrame.eval`` and store it in ``target_column``."""
        formula = rule.get('formula')
        target_column = rule.get('target_column')

        if formula and target_column:
            try:
                # NOTE: DataFrame.eval evaluates the expression string; formulas
                # must come from trusted configuration, never from user input.
                df[target_column] = df.eval(formula)
                logger.debug(f"公式计算: {formula} -> {target_column}")
            except Exception as e:
                logger.error(f"公式计算失败: {formula}, 错误: {e}")
        else:
            logger.warning("公式计算缺少公式或目标列")

        return df

    def _round(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Round the given columns (all numeric columns when none are given)."""
        columns = rule.get('columns', [])
        decimals = rule.get('decimals', 0)

        if isinstance(columns, str):
            columns = [columns]

        target_columns = columns or df.select_dtypes(include=[np.number]).columns

        for col in target_columns:
            if col in df.columns and pd.api.types.is_numeric_dtype(df[col]):
                df[col] = df[col].round(decimals)
                logger.debug(f"四舍五入: {col} 保留 {decimals} 位小数")

        return df

    def _sum(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Sum columns row-wise, optionally totalled per group.

        Without ``group_by``: ``target_column`` receives the row-wise sum of
        ``columns`` (all numeric columns when none are given).

        With ``group_by``: the row-wise sums are additionally totalled within
        each group and the group total is written to ``target_column`` on
        every row of that group. Previously the grouped sums were computed and
        then discarded, so the rule had no effect.
        """
        columns = rule.get('columns') or []
        target_column = rule.get('target_column')
        group_by = rule.get('group_by')

        if isinstance(columns, str):
            columns = [columns]

        if group_by and group_by in df.columns:
            if columns:
                value_columns = [col for col in columns if col in df.columns]
            else:
                # All numeric columns except the grouping key itself.
                value_columns = [
                    col for col in df.select_dtypes(include=[np.number]).columns
                    if col != group_by
                ]

            if not target_column:
                logger.warning(f"分组求和缺少目标列,结果未写入: 按 {group_by} 分组")
            elif not value_columns:
                logger.warning(f"没有有效的列用于分组求和: {columns}")
            else:
                row_totals = df[value_columns].sum(axis=1)
                df[target_column] = row_totals.groupby(df[group_by]).transform('sum')
                logger.debug(f"分组求和: {value_columns} 按 {group_by} 分组 -> {target_column}")
            return df

        if columns:
            valid_columns = [col for col in columns if col in df.columns]
            if valid_columns and target_column:
                df[target_column] = df[valid_columns].sum(axis=1)
                logger.debug(f"求和计算: {valid_columns} -> {target_column}")
        else:
            numeric_columns = df.select_dtypes(include=[np.number]).columns
            if target_column and len(numeric_columns) > 0:
                df[target_column] = df[numeric_columns].sum(axis=1)
                logger.debug(f"求和计算: {list(numeric_columns)} -> {target_column}")

        return df

    def _aggregate(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Group by ``group_by`` and aggregate; returns the aggregated frame.

        ``aggregations`` maps column name to a function name or a list of
        function names; entries for unknown columns or of other types are
        ignored. ``df`` is returned unchanged when nothing can be aggregated.
        """
        group_by = rule.get('group_by')
        aggregations = rule.get('aggregations', {})

        if group_by and group_by in df.columns:
            agg_dict = {
                column: func
                for column, func in aggregations.items()
                if column in df.columns and isinstance(func, (str, list))
            }
            if agg_dict:
                result = df.groupby(group_by).agg(agg_dict)
                logger.debug(f"聚合计算: 按 {group_by} 分组, 聚合: {agg_dict}")
                return result.reset_index()

        return df

    # Fluent helpers: each queues one rule and returns self for chaining.
    def multiply(self, source_column: str, target_column: str, factor: float):
        """Queue a multiplication rule."""
        self.add_rule('multiply', source_column=source_column,
                      target_column=target_column, factor=factor)
        return self

    def divide(self, source_column: str, target_column: str, divisor: float):
        """Queue a division rule."""
        self.add_rule('divide', source_column=source_column,
                      target_column=target_column, divisor=divisor)
        return self

    def add(self, columns: Union[str, List[str]], target_column: str, constant: float = 0):
        """Queue an addition rule."""
        self.add_rule('add', columns=columns, target_column=target_column, constant=constant)
        return self

    def subtract(self, minuend: str, target_column: str,
                 subtrahend: Optional[str] = None, constant: float = 0):
        """Queue a subtraction rule."""
        self.add_rule('subtract', minuend=minuend, target_column=target_column,
                      subtrahend=subtrahend, constant=constant)
        return self

    def formula(self, formula: str, target_column: str):
        """Queue a formula rule."""
        self.add_rule('formula', formula=formula, target_column=target_column)
        return self

    def round_columns(self, columns: Optional[Union[str, List[str]]] = None, decimals: int = 0):
        """Queue a rounding rule."""
        self.add_rule('round', columns=columns, decimals=decimals)
        return self

    def sum_columns(self, columns: Optional[Union[str, List[str]]] = None,
                    target_column: Optional[str] = None, group_by: Optional[str] = None):
        """Queue a sum rule."""
        self.add_rule('sum', columns=columns, target_column=target_column, group_by=group_by)
        return self

    def aggregate(self, group_by: str, aggregations: Dict[str, Union[str, List[str]]]):
        """Queue an aggregation rule."""
        self.add_rule('aggregate', group_by=group_by, aggregations=aggregations)
        return self
class ColumnMapper:
    """Map supplier-specific column headers onto standard column names.

    ``STANDARD_COLUMNS`` lists every known alias of each standard column.
    Extra aliases can be supplied through ``mapping_config`` (constructor) or
    ``add_custom_mapping``.
    """

    # Standard column name -> accepted aliases (single source of truth for
    # every alias used by the mapper).
    STANDARD_COLUMNS = {
        'barcode': [
            '条码', '条形码', '商品条码', '商品条形码', '产品条码', '商品编码',
            '商品编号', '条码(必填)', '电脑条码', '条码ID',
            'barcode', 'Barcode', 'BarCode', 'code', '编码',
        ],
        'name': [
            '商品名称', '产品名称', '名称', '商品', '产品', '商品名', '品名',
            '品项名', '商品或服务名称', '品项', '名 称',
            'name', 'product_name',
        ],
        'specification': [
            '规格', '规格型号', '型号', '商品规格', '产品规格', '包装规格', '规 格',
            'specification', 'spec', 'model',
        ],
        'quantity': [
            '数量', '采购量', '订货数量', '订单量', '需求量', '采购数量', '购买数量',
            '订单数量', '数量(必填)', '采购量(必填)', '入库数', '入库数量', '数 量',
            'quantity', 'qty',
        ],
        'unit': [
            '单位', '计量单位', '采购单位', '单位(必填)', '单位名称', '计价单位', '单 位',
            'unit', 'units',
        ],
        'unit_price': [
            '单价', '价格', '采购单价', '进货价', '销售价', '采购价', '参考价',
            '入库单价', '单价(必填)', '采购单价(必填)', '价格(必填)', '单 价',
            'unit_price', 'price',
        ],
        'total_price': [
            '总价', '金额', '小计', '合计金额', '小计金额', '金额(元)',
            '金额合计', '合计', '总额',
            'total_price', 'total', 'amount',
        ],
        'gift_quantity': [
            '赠送量', '赠品数量', '赠送数量', '赠品',
        ],
        'category': ['类别', '分类', '商品类别', 'category', 'type'],
        'brand': ['品牌', '商标', 'brand'],
        'supplier': ['供应商', '供货商', 'supplier', 'vendor'],
    }

    def __init__(self, mapping_config: Optional[Dict[str, Any]] = None):
        """Create a mapper.

        Args:
            mapping_config: optional ``{standard_name: alias or [aliases]}``
                with additional aliases.
        """
        self.mapping_config = mapping_config or {}
        self.custom_mappings = {}
        self._build_reverse_mapping()

    def _build_reverse_mapping(self):
        """Build the lower-cased ``alias -> standard name`` lookup tables."""
        self.reverse_mapping = {}

        # Built-in aliases.
        for standard_name, variations in self.STANDARD_COLUMNS.items():
            for variation in variations:
                self.reverse_mapping[variation.lower()] = standard_name

        # Aliases from the configuration (these also go to custom_mappings).
        for standard_name, custom_names in self.mapping_config.items():
            if isinstance(custom_names, str):
                custom_names = [custom_names]

            for custom_name in custom_names:
                key = str(custom_name).lower()
                self.reverse_mapping[key] = standard_name
                self.custom_mappings[key] = standard_name

    def map_columns(self, df: pd.DataFrame, target_columns: Optional[List[str]] = None) -> pd.DataFrame:
        """Rename the columns of ``df`` to standard names.

        Args:
            df: input frame.
            target_columns: standard names to produce; defaults to every key
                of ``STANDARD_COLUMNS``.

        Returns:
            A frame holding exactly the target columns (missing ones are added
            with a default value). If no column could be matched at all, the
            input frame is returned unchanged.
        """
        if target_columns is None:
            target_columns = list(self.STANDARD_COLUMNS.keys())

        logger.info(f"开始列名映射,目标列: {target_columns}")
        logger.info(f"原始列名: {list(df.columns)}")

        column_mapping = {}
        used_columns = set()

        for target_col in target_columns:
            # Only offer columns that no earlier target has claimed; otherwise
            # a fuzzy match (e.g. alias '商品' inside '商品条码') would overwrite
            # the earlier, better mapping of that column.
            available = [col for col in df.columns if col not in used_columns]
            matched_column = self._find_matching_column(available, target_col)
            if matched_column is not None:
                column_mapping[matched_column] = target_col
                used_columns.add(matched_column)
                logger.debug(f"列名映射: {matched_column} -> {target_col}")

        if not column_mapping:
            logger.warning("没有找到可映射的列名")
            return df

        df_mapped = df.rename(columns=column_mapping)

        # Add target columns that had no source column.
        for target_col in target_columns:
            if target_col not in df_mapped.columns:
                df_mapped[target_col] = self._get_default_value(target_col)
                logger.debug(f"添加缺失列: {target_col}")

        # Keep only the target columns, in the requested order.
        existing_target_columns = [col for col in target_columns if col in df_mapped.columns]
        df_result = df_mapped[existing_target_columns]

        logger.info(f"列名映射完成,结果列名: {list(df_result.columns)}")
        return df_result

    def _find_matching_column(self, columns: List[str], target_column: str) -> Optional[str]:
        """Find the source column that corresponds to ``target_column``.

        Aliases are tried in priority order (built-in first, then custom); for
        each alias an exact case-insensitive match is preferred over a
        substring match in either direction.

        Args:
            columns: candidate source column labels (any hashable; non-string
                labels are compared through ``str()``).
            target_column: standard column name.

        Returns:
            The matching label exactly as it appears in ``columns``, or None.
        """
        possible_names = []

        if target_column in self.STANDARD_COLUMNS:
            possible_names.extend(self.STANDARD_COLUMNS[target_column])

        custom_names = self.mapping_config.get(target_column)
        if custom_names is not None:
            if isinstance(custom_names, str):
                possible_names.append(custom_names)
            else:
                possible_names.extend(custom_names)

        # Normalise once. Blank labels are dropped because an empty string is
        # a substring of every alias and would match anything.
        normalised = [(col, str(col).strip().lower()) for col in columns]
        normalised = [(col, text) for col, text in normalised if text]

        for possible_name in possible_names:
            wanted = str(possible_name).lower()
            if not wanted:
                continue

            # Exact match (case-insensitive).
            for col, text in normalised:
                if text == wanted:
                    return col

            # Fuzzy match: containment in either direction.
            for col, text in normalised:
                if wanted in text or text in wanted:
                    return col

        return None

    def _get_default_value(self, column_name: str) -> Any:
        """Return the fill value used for a target column that is missing.

        Numeric columns default to 0, textual columns to '', anything else to
        None.
        """
        if column_name in ['quantity', 'unit_price', 'total_price']:
            return 0
        elif column_name in ['barcode', 'name', 'specification', 'unit', 'category', 'brand', 'supplier']:
            return ''
        else:
            return None

    def add_custom_mapping(self, standard_name: str, custom_names: Union[str, List[str]]):
        """Register additional aliases for a standard column.

        Aliases registered earlier for the same standard name are kept.

        Args:
            standard_name: standard column name.
            custom_names: one alias or a list of aliases.
        """
        if isinstance(custom_names, str):
            custom_names = [custom_names]

        # Merge with what is already configured instead of replacing it, so
        # that matching keeps agreeing with the reverse lookup tables.
        existing = self.mapping_config.get(standard_name, [])
        if isinstance(existing, str):
            existing = [existing]
        merged = list(existing)
        for custom_name in custom_names:
            if custom_name not in merged:
                merged.append(custom_name)
        self.mapping_config[standard_name] = merged

        for custom_name in custom_names:
            key = str(custom_name).lower()
            self.reverse_mapping[key] = standard_name
            self.custom_mappings[key] = standard_name

        logger.info(f"添加自定义映射: {standard_name} <- {custom_names}")

    def detect_column_types(self, df: pd.DataFrame) -> Dict[str, str]:
        """Classify every column as 'numeric', 'datetime', 'boolean' or 'text'.

        The checks run in that order, so a boolean column (which pandas also
        treats as numeric) is reported as 'numeric'.
        """
        column_types = {}

        for column in df.columns:
            if pd.api.types.is_numeric_dtype(df[column]):
                column_types[column] = 'numeric'
            elif pd.api.types.is_datetime64_any_dtype(df[column]):
                column_types[column] = 'datetime'
            elif pd.api.types.is_bool_dtype(df[column]):
                column_types[column] = 'boolean'
            else:
                column_types[column] = 'text'

        return column_types

    def suggest_column_mapping(self, df: pd.DataFrame) -> Dict[str, List[str]]:
        """Suggest standard names for each column of ``df``.

        A standard name is suggested when one of its aliases contains the
        column label or is contained in it (case-insensitive).

        Returns:
            ``{column label: [standard names]}`` for columns with at least one
            suggestion.
        """
        suggestions = {}

        for column in df.columns:
            column_lower = str(column).strip().lower()
            if not column_lower:
                # A blank label is contained in every alias; it carries no
                # information, so make no suggestion for it.
                continue

            found = set()

            # Built-in aliases.
            for standard_name, variations in self.STANDARD_COLUMNS.items():
                for variation in variations:
                    variation_lower = variation.lower()
                    if column_lower in variation_lower or variation_lower in column_lower:
                        found.add(standard_name)

            # Custom aliases (keys are already lower-cased).
            for custom_name, standard_name in self.custom_mappings.items():
                if column_lower in custom_name or custom_name in column_lower:
                    found.add(standard_name)

            if found:
                suggestions[column] = list(found)

        return suggestions

    def validate_mapping(self, df: pd.DataFrame, required_columns: List[str]) -> Dict[str, Any]:
        """Check a mapped frame.

        Args:
            df: frame after column mapping.
            required_columns: columns that must be present.

        Returns:
            Dict with ``valid`` (False when a required column is missing),
            ``missing_columns``, ``empty_columns`` and ``warnings``.
        """
        result = {
            'valid': True,
            'missing_columns': [],
            'empty_columns': [],
            'warnings': []
        }

        # Required columns that are absent invalidate the mapping.
        for col in required_columns:
            if col not in df.columns:
                result['missing_columns'].append(col)
                result['valid'] = False

        # Entirely empty columns only produce a warning.
        for col in df.columns:
            if df[col].isnull().all():
                result['empty_columns'].append(col)
                result['warnings'].append(f"列 '{col}' 全部为空值")

        # Columns expected to be numeric.
        numeric_columns = ['quantity', 'unit_price', 'total_price']
        for col in numeric_columns:
            if col in df.columns and not pd.api.types.is_numeric_dtype(df[col]):
                result['warnings'].append(f"列 '{col}' 不是数值类型")

        return result

    @classmethod
    def find_column(cls, columns: List[str], standard_name: str) -> Optional[str]:
        """Find the column matching a standard name among ``columns``.

        Strategy: exact match, then whitespace-insensitive match, then
        substring match (alias contained in the column label).

        Args:
            columns: actual column labels.
            standard_name: key of ``STANDARD_COLUMNS``.

        Returns:
            The matching label converted with ``str()``, or None.
        """
        candidates = cls.STANDARD_COLUMNS.get(standard_name, [])
        if not candidates:
            return None

        columns_str = [str(c) for c in columns]

        # Exact match (surrounding whitespace ignored).
        for col in columns_str:
            col_clean = col.strip()
            for candidate in candidates:
                if col_clean == candidate:
                    return col

        # Match with all whitespace removed on both sides.
        for col in columns_str:
            col_clean = re.sub(r'\s+', '', col.strip())
            for candidate in candidates:
                if col_clean == re.sub(r'\s+', '', candidate):
                    return col

        # Substring match: the alias appears inside the column label.
        for col in columns_str:
            col_lower = col.strip().lower()
            for candidate in candidates:
                if candidate.lower() in col_lower:
                    return col

        return None

    @staticmethod
    def detect_header_row(df: pd.DataFrame, max_rows: int = 10, min_matches: int = 3) -> int:
        """Locate the header row of a raw sheet.

        Scans the first ``max_rows`` rows and returns the index of the row
        containing the most header keywords.

        Args:
            df: raw frame.
            max_rows: number of leading rows to scan.
            min_matches: minimum number of distinct keywords a row must hit.

        Returns:
            Positional row index, or -1 when no row qualifies.
        """
        header_keywords = [
            '条码', '条形码', '商品条码', '商品名称', '名称', '规格',
            '单价', '数量', '金额', '单位', '必填', '编码',
        ]

        best_row = -1
        best_matches = 0

        for row_idx in range(min(max_rows, len(df))):
            row_values = df.iloc[row_idx].astype(str)
            matches = sum(
                1 for kw in header_keywords
                if any(kw in str(val) for val in row_values.values)
            )
            # Strict '>' keeps the earliest row on ties.
            if matches >= min_matches and matches > best_matches:
                best_matches = matches
                best_row = row_idx

        return best_row
Args: + rule_type: 规则类型 + **kwargs: 规则参数 + """ + rule = {'type': rule_type, **kwargs} + self.cleaning_rules.append(rule) + logger.debug(f"添加清洗规则: {rule_type}") + + def clean(self, df: pd.DataFrame) -> pd.DataFrame: + """执行数据清洗 + + Args: + df: 输入数据 + + Returns: + 清洗后的数据 + """ + logger.info(f"开始数据清洗,原始数据形状: {df.shape}") + + result_df = df.copy() + + for i, rule in enumerate(self.cleaning_rules): + try: + logger.debug(f"执行清洗规则 {i+1}/{len(self.cleaning_rules)}: {rule['type']}") + result_df = self._apply_rule(result_df, rule) + logger.debug(f"规则执行完成,数据形状: {result_df.shape}") + except Exception as e: + logger.error(f"清洗规则执行失败: {rule}, 错误: {e}") + # 继续执行下一个规则,而不是中断整个流程 + continue + + logger.info(f"数据清洗完成,最终数据形状: {result_df.shape}") + return result_df + + def _apply_rule(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame: + """应用单个清洗规则 + + Args: + df: 数据 + rule: 规则配置 + + Returns: + 处理后的数据 + """ + rule_type = rule.get('type') + + if rule_type == 'remove_duplicates': + return self._remove_duplicates(df, rule) + elif rule_type == 'fill_na': + return self._fill_na(df, rule) + elif rule_type == 'remove_rows': + return self._remove_rows(df, rule) + elif rule_type == 'convert_type': + return self._convert_type(df, rule) + elif rule_type == 'strip_whitespace': + return self._strip_whitespace(df, rule) + elif rule_type == 'normalize_text': + return self._normalize_text(df, rule) + elif rule_type == 'validate_data': + return self._validate_data(df, rule) + else: + logger.warning(f"未知的清洗规则类型: {rule_type}") + return df + + def _remove_duplicates(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame: + """移除重复项 + + Args: + df: 数据 + rule: 规则配置 + + Returns: + 处理后的数据 + """ + subset = rule.get('subset') # 用于判断重复的列 + keep = rule.get('keep', 'first') # 保留哪个重复项 + + before_count = len(df) + df_cleaned = df.drop_duplicates(subset=subset, keep=keep) + after_count = len(df_cleaned) + + logger.info(f"移除重复项: {before_count - after_count} 行被移除") + return df_cleaned + + def 
_fill_na(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame: + """填充空值 + + Args: + df: 数据 + rule: 规则配置 + + Returns: + 处理后的数据 + """ + columns = rule.get('columns') # 要处理的列 + value = rule.get('value', 0) # 填充值 + method = rule.get('method') # 填充方法('ffill', 'bfill', 'mean', 'median') + + if columns: + # 处理指定列 + if isinstance(columns, str): + columns = [columns] + + for col in columns: + if col in df.columns: + if method == 'ffill': + df[col] = df[col].fillna(method='ffill') + elif method == 'bfill': + df[col] = df[col].fillna(method='bfill') + elif method == 'mean': + df[col] = df[col].fillna(df[col].mean()) + elif method == 'median': + df[col] = df[col].fillna(df[col].median()) + else: + df[col] = df[col].fillna(value) + + logger.debug(f"填充列 {col} 的空值: {method or value}") + else: + # 处理所有列 + if method == 'ffill': + df = df.fillna(method='ffill') + elif method == 'bfill': + df = df.fillna(method='bfill') + else: + df = df.fillna(value) + + logger.debug(f"填充所有列的空值: {method or value}") + + return df + + def _remove_rows(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame: + """移除行 + + Args: + df: 数据 + rule: 规则配置 + + Returns: + 处理后的数据 + """ + condition = rule.get('condition') # 条件表达式 + columns = rule.get('columns') # 要检查的列 + values = rule.get('values') # 要移除的值 + + if condition: + # 使用条件表达式 + try: + before_count = len(df) + df_filtered = df.query(condition) + after_count = len(df_filtered) + logger.info(f"条件过滤: {condition}, 移除了 {before_count - after_count} 行") + return df_filtered + except Exception as e: + logger.error(f"条件表达式执行失败: {condition}, 错误: {e}") + return df + + if columns and values: + # 基于列值过滤 + if isinstance(columns, str): + columns = [columns] + if not isinstance(values, list): + values = [values] + + df_filtered = df.copy() + for col in columns: + if col in df_filtered.columns: + mask = ~df_filtered[col].isin(values) + df_filtered = df_filtered[mask] + logger.debug(f"列 {col} 过滤值 {values}") + + return df_filtered + + 
logger.warning("移除行规则缺少条件或列配置") + return df + + def _convert_type(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame: + """类型转换 + + Args: + df: 数据 + rule: 规则配置 + + Returns: + 处理后的数据 + """ + columns = rule.get('columns') + target_type = rule.get('target_type', 'float') + errors = rule.get('errors', 'coerce') # 错误处理方式 + + if isinstance(columns, str): + columns = [columns] + + for col in columns: + if col in df.columns: + try: + if target_type == 'int': + df[col] = pd.to_numeric(df[col], errors=errors).astype('Int64') + elif target_type == 'float': + df[col] = pd.to_numeric(df[col], errors=errors) + elif target_type == 'datetime': + df[col] = pd.to_datetime(df[col], errors=errors) + elif target_type == 'string': + df[col] = df[col].astype(str) + else: + df[col] = df[col].astype(target_type) + + logger.debug(f"列 {col} 类型转换: {target_type}") + except Exception as e: + logger.error(f"列 {col} 类型转换失败: {e}") + + return df + + def _strip_whitespace(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame: + """去除空白字符 + + Args: + df: 数据 + rule: 规则配置 + + Returns: + 处理后的数据 + """ + columns = rule.get('columns') + + if columns: + if isinstance(columns, str): + columns = [columns] + + for col in columns: + if col in df.columns and df[col].dtype == 'object': + df[col] = df[col].str.strip() + logger.debug(f"列 {col} 去除空白字符") + else: + # 处理所有文本列 + text_columns = df.select_dtypes(include=['object']).columns + for col in text_columns: + df[col] = df[col].str.strip() + + logger.debug(f"所有文本列去除空白字符: {list(text_columns)}") + + return df + + def _normalize_text(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame: + """文本标准化 + + Args: + df: 数据 + rule: 规则配置 + + Returns: + 处理后的数据 + """ + columns = rule.get('columns') + lowercase = rule.get('lowercase', False) + uppercase = rule.get('uppercase', False) + replace_map = rule.get('replace_map', {}) # 替换映射 + + if isinstance(columns, str): + columns = [columns] + + target_columns = columns or 
df.select_dtypes(include=['object']).columns + + for col in target_columns: + if col in df.columns and df[col].dtype == 'object': + if lowercase: + df[col] = df[col].str.lower() + elif uppercase: + df[col] = df[col].str.upper() + + # 应用替换映射 + for old, new in replace_map.items(): + df[col] = df[col].str.replace(old, new) + + logger.debug(f"列 {col} 文本标准化完成") + + return df + + def _validate_data(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame: + """数据验证 + + Args: + df: 数据 + rule: 规则配置 + + Returns: + 处理后的数据 + """ + columns = rule.get('columns') + min_value = rule.get('min_value') + max_value = rule.get('max_value') + required = rule.get('required', False) + + if isinstance(columns, str): + columns = [columns] + + validation_results = [] + + for col in columns: + if col in df.columns: + # 检查必需值 + if required: + null_count = df[col].isnull().sum() + if null_count > 0: + validation_results.append(f"{col}: {null_count} 个空值") + + # 检查数值范围 + if min_value is not None or max_value is not None: + if pd.api.types.is_numeric_dtype(df[col]): + invalid_mask = pd.Series(False, index=df.index) + if min_value is not None: + invalid_mask |= df[col] < min_value + if max_value is not None: + invalid_mask |= df[col] > max_value + + invalid_count = invalid_mask.sum() + if invalid_count > 0: + validation_results.append(f"{col}: {invalid_count} 个值超出范围") + + if validation_results: + logger.warning(f"数据验证发现问题: {', '.join(validation_results)}") + else: + logger.debug("数据验证通过") + + return df + + # 便捷方法 + def remove_duplicates(self, subset: Optional[List[str]] = None, keep: str = 'first'): + """移除重复项""" + self.add_rule('remove_duplicates', subset=subset, keep=keep) + return self + + def fill_na(self, columns: Optional[Union[str, List[str]]] = None, + value: Any = 0, method: Optional[str] = None): + """填充空值""" + self.add_rule('fill_na', columns=columns, value=value, method=method) + return self + + def remove_rows(self, condition: Optional[str] = None, + columns: Optional[Union[str, 
# Patterns used by the rule helpers, compiled once.
_QUANTITY_UNIT_RE = re.compile(r"(\d+(?:\.\d+)?)(箱|件|提|盒|瓶)")
_VOLUME_PACK_RE = re.compile(r"(\d+(?:\.\d+)?)(ml|l|升|毫升)[*×xX](\d+)", re.IGNORECASE)
_COUNT_PACK_RE = re.compile(r"(\d+)[*×xX](\d+)")
_LOOSE_PACK_RE = re.compile(r"(\d{2,3})\D*(\d{1,3})\D*")

# Units that denote a pack and are converted to single bottles.
_PACK_UNITS = ("件", "箱", "提", "盒")


def _cell_text(value: Any) -> str:
    """Return a cell as text; missing values (None/NaN) become ''."""
    return "" if pd.isna(value) else str(value)


def _positive_number(value: Any) -> Optional[float]:
    """Return ``value`` as a float when it is a number > 0, otherwise None."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    # NaN fails this comparison as well, so it is rejected here.
    return number if number > 0 else None


def _split_quantity_unit(df: pd.DataFrame, source: str, dictionary: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
    """Split values such as '2箱' in ``source`` into ``quantity`` and ``unit``.

    Values without a unit suffix are parsed as plain numbers and receive the
    dictionary's ``default_unit``. Missing or unparseable values yield
    quantity 0.0. Units are translated through ``unit_synonyms``.
    Does nothing when ``source`` is not a column.
    """
    if source not in df.columns:
        return df

    settings = dictionary or {}
    unit_synonyms = settings.get("unit_synonyms", {})
    default_unit = settings.get("default_unit", "")
    fallback_unit = unit_synonyms.get(default_unit, default_unit)

    quantities = []
    units = []
    for raw in df[source].tolist():
        text = _cell_text(raw)
        match = _QUANTITY_UNIT_RE.search(text)
        if match:
            quantities.append(float(match.group(1)))
            units.append(unit_synonyms.get(match.group(2), match.group(2)))
            continue

        try:
            number = float(text)
        except ValueError:
            number = 0.0
        if pd.isna(number):
            # A literal "nan" parses as a float but is not a usable quantity.
            number = 0.0
        quantities.append(number)
        units.append(fallback_unit)

    df["quantity"] = quantities
    df["unit"] = units
    return df


def _parse_spec(text: str, name_patterns: List[str]):
    """Return ``(specification, package_quantity)`` parsed from a product name.

    Configured ``name_patterns`` are tried first (the last capture group is
    the pack size and the whole name is kept as specification), followed by
    the built-in 'volume*count', 'a*b' and loose two-number patterns.
    """
    for pattern in name_patterns:
        try:
            match = re.search(pattern, text)
        except (re.error, TypeError):
            # A broken configured pattern must not abort the other patterns.
            continue
        if match and len(match.groups()) >= 2:
            try:
                pack = int(match.group(len(match.groups())))
            except (TypeError, ValueError):
                pack = None
            return text, pack

    match = _VOLUME_PACK_RE.search(text)
    if match:
        return f"{match.group(1)}{match.group(2)}*{match.group(3)}", int(match.group(3))

    match = _COUNT_PACK_RE.search(text)
    if match:
        return f"1*{match.group(2)}", int(match.group(2))

    match = _LOOSE_PACK_RE.search(text)
    if match:
        return f"1*{match.group(2)}", int(match.group(2))

    return "", None


def _extract_spec_from_name(df: pd.DataFrame, source: str, dictionary: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
    """Derive ``specification`` and ``package_quantity`` from product names.

    An existing ``specification`` column is kept as is; ``package_quantity``
    is always (re)written. ``ignore_words`` from the dictionary are removed
    from the name before parsing. Does nothing when ``source`` is not a
    column.
    """
    if source not in df.columns:
        return df

    settings = dictionary or {}
    ignore_words = settings.get("ignore_words", [])
    name_patterns = settings.get("name_patterns", [])

    specs = []
    packs = []
    for raw in df[source].tolist():
        text = _cell_text(raw)
        for word in ignore_words:
            text = text.replace(word, "")
        spec, pack = _parse_spec(text, name_patterns)
        specs.append(spec)
        packs.append(pack)

    if "specification" not in df.columns:
        # Assign positionally; a Series with a fresh RangeIndex would be
        # realigned against df.index and turn into NaN for other indexes.
        df["specification"] = specs
    df["package_quantity"] = packs
    return df


def _normalize_unit(df: pd.DataFrame, target: str, unit_map: Dict[str, str], dictionary: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
    """Normalise unit names and convert pack quantities to single bottles.

    Units are translated through ``unit_map``. When a ``quantity`` column
    exists, rows whose unit is a pack unit are multiplied by the row's
    ``package_quantity``; if that is missing or not positive, by the
    dictionary's ``pack_multipliers[unit]`` or ``default_package_quantity``.
    Their unit is then set to '瓶'. Does nothing when ``target`` is not a
    column.
    """
    if target not in df.columns:
        return df

    settings = dictionary or {}
    pack_multipliers = settings.get("pack_multipliers", {})
    default_pq = settings.get("default_package_quantity", 1)

    # Missing units stay missing instead of becoming the string 'nan'.
    df[target] = [
        unit if pd.isna(unit) else unit_map.get(str(unit), str(unit))
        for unit in df[target].tolist()
    ]

    if "quantity" not in df.columns:
        return df

    def to_bottles(unit, quantity, package_quantity):
        if unit not in _PACK_UNITS:
            return quantity
        multiplier = _positive_number(package_quantity)
        if multiplier is None:
            # NaN/None/0 pack size: fall back to the configured multiplier.
            multiplier = _positive_number(pack_multipliers.get(unit, default_pq))
        if multiplier is None or pd.isna(quantity):
            return quantity
        try:
            return float(quantity) * multiplier
        except (TypeError, ValueError):
            return quantity

    units = df[target].tolist()
    quantities = df["quantity"].tolist()
    if "package_quantity" in df.columns:
        pack_sizes = df["package_quantity"].tolist()
    else:
        pack_sizes = [None] * len(df)

    df["quantity"] = [to_bottles(u, q, pq) for u, q, pq in zip(units, quantities, pack_sizes)]
    df[target] = ["瓶" if u in _PACK_UNITS else u for u in units]
    return df


def _compute_quantity_from_total(df: pd.DataFrame) -> pd.DataFrame:
    """Fill non-positive quantities with ``total_price / unit_price``.

    Only rows where both prices are positive are touched. The rule is a no-op
    unless ``quantity``, ``unit_price`` and ``total_price`` all exist.
    """
    if not {"quantity", "unit_price", "total_price"}.issubset(df.columns):
        return df

    qty = pd.to_numeric(df["quantity"], errors="coerce").fillna(0)
    up = pd.to_numeric(df["unit_price"], errors="coerce").fillna(0)
    tp = pd.to_numeric(df["total_price"], errors="coerce").fillna(0)

    need = (qty <= 0) & (up > 0) & (tp > 0)
    if need.any():
        column = df["quantity"]
        # Make room for float results before writing into the column.
        if pd.api.types.is_numeric_dtype(column):
            if not pd.api.types.is_float_dtype(column):
                df["quantity"] = column.astype(float)
        else:
            df["quantity"] = column.astype(object)
        df.loc[need, "quantity"] = (tp[need] / up[need]).round(6)
    return df


def _fill_missing(df: pd.DataFrame, fills: Dict[str, Any]) -> pd.DataFrame:
    """Fill NaN in existing columns, or create the column, per ``fills``."""
    for column, value in fills.items():
        if column in df.columns:
            df[column] = df[column].fillna(value)
        else:
            df[column] = value
    return df


def _mark_gift(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``is_gift``: true for zero/missing price rows or gift-like names.

    A row is a gift when ``total_price`` or ``unit_price`` (if present) is 0
    or not numeric, or when ``name`` contains '赠品' or is exactly 'o'/'O'.
    """
    # Built on df.index so the flags line up for any index.
    flags = pd.Series(False, index=df.index)

    for column in ("total_price", "unit_price"):
        if column in df.columns:
            prices = pd.to_numeric(df[column], errors="coerce").fillna(0)
            flags = flags | (prices == 0)

    if "name" in df.columns:
        flags = flags | df["name"].astype(str).str.contains(r"赠品|^o$|^O$", regex=True)

    df["is_gift"] = flags.astype(bool)
    return df


def apply_rules(df: pd.DataFrame, rules: List[Dict[str, Any]], dictionary: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
    """Apply ``rules`` in order to a copy of ``df`` and return the result.

    Supported rule types: split_quantity_unit, extract_spec_from_name,
    normalize_unit, compute_quantity_from_total, fill_missing, mark_gift.
    Rules of any other type are ignored. ``df`` itself is not modified.

    Args:
        df: input frame.
        rules: list of rule dicts (``None`` is treated as empty).
        dictionary: optional supplier dictionary passed to the helpers.
    """
    handlers = {
        "split_quantity_unit": lambda frame, r: _split_quantity_unit(frame, r.get("source", "quantity"), dictionary),
        "extract_spec_from_name": lambda frame, r: _extract_spec_from_name(frame, r.get("source", "name"), dictionary),
        "normalize_unit": lambda frame, r: _normalize_unit(frame, r.get("target", "unit"), r.get("map", {}), dictionary),
        "compute_quantity_from_total": lambda frame, r: _compute_quantity_from_total(frame),
        "fill_missing": lambda frame, r: _fill_missing(frame, r.get("fills", {})),
        "mark_gift": lambda frame, r: _mark_gift(frame),
    }

    out = df.copy()
    for rule in rules or []:
        handler = handlers.get(rule.get("type"))
        if handler is not None:
            out = handler(out, rule)
    return out
def refresh_token(self) -> Optional[str]:
    """Request a new access token from the token endpoint.

    Makes up to ``max_retries`` attempts. On success the token and its expiry
    time are stored on the instance.

    Returns:
        The new access token, or None when every attempt failed.
    """
    credentials = {
        "grant_type": "client_credentials",
        "client_id": self.api_key,
        "client_secret": self.secret_key
    }
    last_attempt = self.max_retries - 1

    for attempt in range(self.max_retries):
        try:
            response = requests.post(self.token_url, params=credentials, timeout=10)
            if response.status_code == 200:
                result = response.json()
                if "access_token" in result:
                    lifetime = result.get("expires_in", _DEFAULT_TOKEN_LIFETIME)
                    self.access_token = result["access_token"]
                    # Expire early so a token close to its end is not used.
                    self.token_expiry = time.time() + lifetime - _TOKEN_EARLY_EXPIRY
                    logger.info("成功获取访问令牌")
                    return self.access_token

            logger.warning(f"获取访问令牌失败 (尝试 {attempt+1}/{self.max_retries}): {response.text}")

        except Exception as e:
            logger.warning(f"获取访问令牌时发生错误 (尝试 {attempt+1}/{self.max_retries}): {e}")

        # Wait before the next attempt; the delay grows linearly.
        if attempt < last_attempt:
            time.sleep(self.retry_delay * (attempt + 1))

    logger.error("无法获取访问令牌")
    return None
def recognize_table(self, image_data: Union[str, bytes]) -> Optional[Dict]:
    """Send an image to the table-recognition API.

    Args:
        image_data: path of an image file, or the raw image bytes.

    Returns:
        The decoded JSON response, or None on failure. The response is
        returned whole; no particular structure is required of it.
    """
    access_token = self.token_manager.get_token()
    if not access_token:
        logger.error("无法获取访问令牌,无法进行表格识别")
        return None

    # A string argument is a file path; load its content.
    if isinstance(image_data, str):
        image_data = self.read_image(image_data)
        if image_data is None:
            return None

    image_base64 = base64.b64encode(image_data).decode('utf-8')

    payload = {
        'image': image_base64,
        'is_sync': 'true',  # synchronous request
        'request_type': 'excel',  # produce Excel output
        'return_excel': 'true'  # return the Excel data directly
    }

    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'application/json'
    }

    for attempt in range(self.max_retries):
        # Built per attempt so a refreshed token is actually used.
        url = f"{self.api_url}?access_token={access_token}"
        try:
            response = requests.post(
                url,
                data=payload,
                headers=headers,
                timeout=self.timeout
            )

            if response.status_code == 200:
                result = response.json()
                logger.debug(f"百度OCR API返回结果: {result}")

                if 'error_code' not in result:
                    return result

                error_msg = result.get('error_msg', '未知错误')
                logger.error(f"百度OCR API错误: {error_msg}")

                # Any error other than an authorization error is final.
                if result.get('error_code') not in [110, 111]:
                    return None

                # Authorization error: fetch a fresh token and retry with it
                # instead of giving up with the new token unused.
                logger.info("尝试刷新访问令牌...")
                access_token = self.token_manager.refresh_token()
                if not access_token:
                    return None
                continue
            else:
                logger.warning(f"表格识别请求失败 (尝试 {attempt+1}/{self.max_retries}): {response.text}")

        except Exception as e:
            logger.warning(f"表格识别时发生错误 (尝试 {attempt+1}/{self.max_retries}): {e}")

        # Exponential backoff before the next attempt.
        if attempt < self.max_retries - 1:
            wait_time = self.retry_delay * (2 ** attempt)
            logger.info(f"将在 {wait_time} 秒后重试...")
            time.sleep(wait_time)

    logger.error("表格识别失败")
    return None
某些版本API可能直接返回表格内容,此时可能没有request_id + logger.info("检测到API直接返回了表格内容,但没有request_id") + return None + # 有些版本可能request_id在顶层 + elif 'request_id' in request_id_or_result: + request_id = request_id_or_result['request_id'] + logger.debug(f"从顶层对象中提取request_id: {request_id}") + + # 如果没有有效的request_id,无法获取结果 + if not isinstance(request_id, str): + logger.error(f"无法从结果中提取有效的request_id: {request_id_or_result}") + return None + + base_url = self.config.get('API', 'form_ocr_url', fallback='https://aip.baidubce.com/rest/2.0/solution/v1/form_ocr/get_request_result') + url = f"{base_url}?access_token={access_token}" + + payload = { + 'request_id': request_id, + 'result_type': 'excel' + } + + headers = { + 'Content-Type': 'application/x-www-form-urlencoded', + 'Accept': 'application/json' + } + + for attempt in range(self.max_retries): + try: + response = requests.post( + url, + data=payload, + headers=headers, + timeout=self.timeout + ) + + if response.status_code == 200: + try: + result = response.json() + logger.debug(f"获取Excel结果返回: {result}") + + # 检查是否还在处理中 + if result.get('result', {}).get('ret_code') == 3: + logger.info(f"Excel结果正在处理中,等待后重试 (尝试 {attempt+1}/{self.max_retries})") + time.sleep(2) + continue + + # 检查是否有错误 + if 'error_code' in result or result.get('result', {}).get('ret_code') != 0: + error_msg = result.get('error_msg') or result.get('result', {}).get('ret_msg', '未知错误') + logger.error(f"获取Excel结果失败: {error_msg}") + return None + + # 获取Excel内容 + excel_content = result.get('result', {}).get('result_data') + if excel_content: + return base64.b64decode(excel_content) + else: + logger.error("Excel结果为空") + return None + + except Exception as e: + logger.error(f"解析Excel结果时出错: {e}") + return None + + else: + logger.warning(f"获取Excel结果请求失败 (尝试 {attempt+1}/{self.max_retries}): {response.text}") + + except Exception as e: + logger.warning(f"获取Excel结果时发生错误 (尝试 {attempt+1}/{self.max_retries}): {e}") + + # 如果不是最后一次尝试,则等待后重试 + if attempt < self.max_retries - 1: + 
class ProcessedRecordManager:
    """Keeps track of which input files have already been processed.

    The record is a JSON file mapping input file path to output file path.
    """

    def __init__(self, record_file: str):
        """Load the record stored in ``record_file``.

        Args:
            record_file: path of the JSON record file.
        """
        self.record_file = record_file
        self.processed_files = self._load_record()

    def _load_record(self) -> Dict[str, str]:
        """Read the record file; ``{}`` is passed to the loader as default.

        Returns:
            Mapping of input file path to output file path.
        """
        records = load_json(self.record_file, {})
        return records

    def save_record(self) -> None:
        """Write the current record to the record file."""
        save_json(self.processed_files, self.record_file)

    def is_processed(self, image_file: str) -> bool:
        """Return True when ``image_file`` has an entry in the record."""
        already_done = image_file in self.processed_files
        return already_done

    def mark_as_processed(self, image_file: str, output_file: str) -> None:
        """Record ``output_file`` as the result of ``image_file`` and save.

        Args:
            image_file: input image path.
            output_file: generated output path.
        """
        self.processed_files[image_file] = output_file
        self.save_record()

    def get_output_file(self, image_file: str) -> Optional[str]:
        """Return the recorded output path of ``image_file``, or None."""
        try:
            return self.processed_files[image_file]
        except KeyError:
            return None

    def get_unprocessed_files(self, files: List[str]) -> List[str]:
        """Return the entries of ``files`` without a record, in input order."""
        pending = []
        for candidate in files:
            if self.is_processed(candidate):
                continue
            pending.append(candidate)
        return pending
List[str]: + """ + 获取未处理的图片列表 + + Returns: + 未处理的图片文件路径列表 + """ + # 获取所有图片文件 + image_files = get_files_by_extensions(self.input_folder, self.file_types) + + # 如果需要跳过已存在的文件 + skip_existing = True + try: + skip_existing = self.config.getboolean('Performance', 'skip_existing', fallback=True) + except Exception: + pass + + if skip_existing: + # 过滤已处理的文件 + unprocessed_files = self.record_manager.get_unprocessed_files(image_files) + logger.info(f"找到 {len(image_files)} 个图片文件,其中 {len(unprocessed_files)} 个未处理") + return unprocessed_files + + logger.info(f"找到 {len(image_files)} 个图片文件(不跳过已处理的文件)") + return image_files + + def validate_image(self, image_path: str) -> bool: + """ + 验证图片是否有效 + + Args: + image_path: 图片文件路径 + + Returns: + 图片是否有效 + """ + # 检查文件是否存在 + if not os.path.exists(image_path): + logger.warning(f"图片文件不存在: {image_path}") + return False + + # 检查文件扩展名 + ext = get_file_extension(image_path) + if ext not in self.file_types: + logger.warning(f"不支持的文件类型: {ext}, 文件: {image_path}") + return False + + # 检查文件大小 + max_size_mb = 4.0 + try: + max_size_mb = float(self.config.get('File', 'max_file_size_mb', fallback='4.0')) + except Exception: + pass + + if not is_file_size_valid(image_path, max_size_mb): + logger.warning(f"文件大小超过限制 ({max_size_mb}MB): {image_path}") + return False + + return True + + def process_image(self, image_path: str) -> Optional[str]: + """ + 处理单个图片 + + Args: + image_path: 图片文件路径 + + Returns: + 输出Excel文件路径,如果处理失败则返回None + """ + # 验证图片 + if not self.validate_image(image_path): + return None + + # 获取是否跳过已处理文件的配置 + skip_existing = True + try: + skip_existing = self.config.getboolean('Performance', 'skip_existing', fallback=True) + except Exception: + pass + + # 如果需要跳过已处理的文件 + if skip_existing and self.record_manager.is_processed(image_path): + output_file = self.record_manager.get_output_file(image_path) + logger.info(f"图片已处理,跳过: {image_path}, 输出文件: {output_file}") + return output_file + + logger.info(f"开始处理图片: {image_path}") + + try: + # 获取Excel扩展名 + 
excel_extension = '.xlsx' + try: + excel_extension = self.config.get('File', 'excel_extension', fallback='.xlsx') + except Exception: + pass + + # 生成输出文件路径 + file_name = os.path.splitext(os.path.basename(image_path))[0] + output_file = os.path.join(self.output_folder, f"{file_name}{excel_extension}") + + # 检查是否已存在对应的Excel文件 + if os.path.exists(output_file) and skip_existing: + logger.info(f"已存在对应的Excel文件,跳过处理: {os.path.basename(image_path)} -> {os.path.basename(output_file)}") + # 记录处理结果 + self.record_manager.mark_as_processed(image_path, output_file) + return output_file + + # 进行OCR识别 + ocr_result = self.ocr_client.recognize_table(image_path) + if not ocr_result: + logger.error(f"OCR识别失败: {image_path}") + return None + + # 保存Excel文件 - 按照v1版本逻辑提取Excel数据 + excel_base64 = None + + # 从不同可能的字段中尝试获取Excel数据 + if 'excel_file' in ocr_result: + excel_base64 = ocr_result['excel_file'] + logger.debug("从excel_file字段获取Excel数据") + elif 'result' in ocr_result: + if 'result_data' in ocr_result['result']: + excel_base64 = ocr_result['result']['result_data'] + logger.debug("从result.result_data字段获取Excel数据") + elif 'excel_file' in ocr_result['result']: + excel_base64 = ocr_result['result']['excel_file'] + logger.debug("从result.excel_file字段获取Excel数据") + elif 'tables_result' in ocr_result['result'] and ocr_result['result']['tables_result']: + for table in ocr_result['result']['tables_result']: + if 'excel_file' in table: + excel_base64 = table['excel_file'] + logger.debug("从tables_result中获取Excel数据") + break + + # 如果还是没有找到Excel数据,尝试通过get_excel_result获取 + if not excel_base64: + logger.info("无法从直接返回中获取Excel数据,尝试通过API获取...") + excel_data = self.ocr_client.get_excel_result(ocr_result) + if not excel_data: + logger.error(f"获取Excel结果失败: {image_path}") + return None + + # 保存Excel文件 + os.makedirs(os.path.dirname(output_file), exist_ok=True) + with open(output_file, 'wb') as f: + f.write(excel_data) + else: + # 解码并保存Excel文件 + try: + excel_data = base64.b64decode(excel_base64) + 
def process_images_batch(self, batch_size: int = None, max_workers: int = None, progress_cb: Optional[Callable[[int], None]] = None) -> Tuple[int, int]:
    """Run OCR over every pending image, one batch at a time.

    Args:
        batch_size: Number of images submitted to the thread pool per batch.
            ``None`` reads ``Performance.batch_size`` from the config
            (default 5).  Values below 1 are raised to 1.
        max_workers: Thread-pool size.  ``None`` reads
            ``Performance.max_workers`` from the config (default 4).  Values
            below 1 are raised to 1.
        progress_cb: Optional callback receiving an integer percentage.  It is
            called before each batch with a value between 10 and 90 and once
            more with 90 when all batches are done; the remaining range is
            left to later pipeline stages.  Errors raised by the callback
            never interrupt processing.

    Returns:
        ``(total, succeeded)``: number of images attempted and number for
        which ``process_image`` returned an output path.
    """
    def _from_config(option: str, default: int) -> int:
        # Best effort: any config problem falls back to the built-in default.
        try:
            return self.config.getint('Performance', option, fallback=default)
        except Exception:
            return default

    def _report(percent: int) -> None:
        if not progress_cb:
            return
        try:
            progress_cb(percent)
        except Exception as e:
            # A faulty UI callback must not abort the OCR run.
            logger.debug(f"进度回调执行失败: {e}")

    if batch_size is None:
        batch_size = _from_config('batch_size', 5)
    if max_workers is None:
        max_workers = _from_config('max_workers', 4)

    # range() rejects a zero step and ThreadPoolExecutor rejects
    # max_workers <= 0, so a bad argument or config value would crash here.
    batch_size = max(1, int(batch_size))
    max_workers = max(1, int(max_workers))

    unprocessed_images = self.get_unprocessed_images()
    if not unprocessed_images:
        logger.warning("没有需要处理的图片")
        return 0, 0

    total = len(unprocessed_images)
    success_count = 0
    batch_count = (total + batch_size - 1) // batch_size

    for i in range(0, total, batch_size):
        batch = unprocessed_images[i:i+batch_size]
        logger.info(f"处理批次 {i//batch_size+1}/{batch_count}: {len(batch)} 个文件")

        # Progress is estimated per batch and capped at 90%.
        _report(min(int(10 + (i / max(total, 1)) * 80), 90))

        # A fresh pool per batch bounds the number of in-flight OCR requests.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            results = list(executor.map(self.process_image, batch))

        success_count += sum(1 for result in results if result is not None)

    logger.info(f"所有图片处理完成, 总计: {total}, 成功: {success_count}")
    _report(90)
    return total, success_count
class BaseProcessor(ABC):
    """Abstract base class for all file processors (strategy pattern).

    Each concrete processor handles one kind of input file.  Subclasses must
    implement ``can_process``, ``process`` and ``get_required_columns``; the
    remaining methods are shared helpers.

    Path arguments of the helper methods are annotated as ``Path`` but plain
    strings are accepted too and converted with ``Path(...)``.
    """

    def __init__(self, config: Dict[str, Any]):
        """Store the configuration and set up a per-processor logger.

        Args:
            config: Processor configuration object.
        """
        self.config = config
        self.name = self.__class__.__name__
        self.description = ""
        self._setup_logging()

    def _setup_logging(self):
        """Create ``self.logger``, named after this module and the processor name."""
        self.logger = logging.getLogger(f"{__name__}.{self.name}")

    @abstractmethod
    def can_process(self, file_path: Path) -> bool:
        """Return True when this processor can handle ``file_path``.

        Args:
            file_path: Path of the candidate file.
        """
        pass

    @abstractmethod
    def process(self, input_file: Path, output_dir: Path) -> Optional[Path]:
        """Process ``input_file`` and return the path of the generated file.

        Args:
            input_file: Path of the file to process.
            output_dir: Directory that receives the output.

        Returns:
            Output file path, or None when processing failed.
        """
        pass

    @abstractmethod
    def get_required_columns(self) -> List[str]:
        """Return the column names this processor needs."""
        pass

    def validate_input(self, file_path: Path) -> bool:
        """Check that ``file_path`` exists, is a file and has a supported extension.

        Args:
            file_path: Path of the file to check (``Path`` or string).

        Returns:
            True when the file passes all checks.  Failures are logged and
            reported as False; this method does not raise.
        """
        try:
            # Accept plain strings as well as Path objects.
            file_path = Path(file_path)

            if not file_path.exists():
                self.logger.warning(f"文件不存在: {file_path}")
                return False

            if not file_path.is_file():
                self.logger.warning(f"不是文件: {file_path}")
                return False

            # An empty list means "no restriction on the extension".
            supported_extensions = self.get_supported_extensions()
            if supported_extensions and file_path.suffix.lower() not in supported_extensions:
                self.logger.warning(f"不支持的文件类型: {file_path.suffix}, 支持的类型: {supported_extensions}")
                return False

            return True

        except Exception as e:
            self.logger.error(f"验证文件时出错: {e}")
            return False

    def get_supported_extensions(self) -> List[str]:
        """Return the supported file extensions.

        Returns:
            Lower-case extensions including the leading dot; the default empty
            list means every extension is accepted.
        """
        return []

    def get_output_filename(self, input_file: Path, suffix: str = "_processed") -> str:
        """Build an output file name from the input name.

        Args:
            input_file: Path of the input file (``Path`` or string).
            suffix: Text inserted between the stem and the extension.

        Returns:
            ``<stem><suffix><extension>`` of ``input_file``.
        """
        input_file = Path(input_file)
        return f"{input_file.stem}{suffix}{input_file.suffix}"

    def _read_excel_safely(self, file_path: Path, **kwargs) -> pd.DataFrame:
        """Read an Excel file with an engine chosen from its extension.

        ``.xlsx`` is read with openpyxl and ``.xls`` with xlrd; any other
        extension leaves the engine choice to pandas.

        Args:
            file_path: Path of the workbook (``Path`` or string).
            **kwargs: Passed through to ``pd.read_excel``.

        Returns:
            The loaded DataFrame.

        Raises:
            Exception: Whatever ``pd.read_excel`` raises; for ``.xls`` files
                the error is logged before being re-raised.
        """
        file_path = Path(file_path)
        suffix = file_path.suffix.lower()
        if suffix == '.xlsx':
            return pd.read_excel(file_path, engine='openpyxl', **kwargs)
        elif suffix == '.xls':
            try:
                return pd.read_excel(file_path, engine='xlrd', **kwargs)
            except Exception as e:
                self.logger.warning(f"读取xls失败,可能缺少xlrd: {e}")
                raise
        else:
            return pd.read_excel(file_path, **kwargs)

    def log_processing_start(self, input_file: Path):
        """Log that processing of ``input_file`` has started."""
        self.logger.info(f"开始处理文件: {input_file}")
        self.logger.info(f"处理器: {self.name} - {self.description}")

    def log_processing_end(self, input_file: Path, output_file: Optional[Path] = None, success: bool = True):
        """Log the outcome of processing ``input_file``.

        Args:
            input_file: Path of the processed file.
            output_file: Generated file, logged only on success when given.
            success: Whether processing succeeded.
        """
        if success:
            self.logger.info(f"处理完成: {input_file}")
            if output_file:
                self.logger.info(f"输出文件: {output_file}")
        else:
            self.logger.error(f"处理失败: {input_file}")

    def __str__(self) -> str:
        """Return ``<name>(<description>)``."""
        return f"{self.name}({self.description})"

    def __repr__(self) -> str:
        """Return a detailed representation including the module path."""
        return f"{self.__class__.__module__}.{self.__class__.__name__}(name='{self.name}', description='{self.description}')"
success=False) + return None + + # 步骤2: Excel处理 + self.logger.info("步骤2/3: 处理Excel文件...") + excel_result = self._process_excel(ocr_result, output_dir) + if not excel_result: + self.logger.error("Excel处理失败") + self.log_processing_end(input_file, success=False) + return None + + # 步骤3: 生成标准采购单 + self.logger.info("步骤3/3: 生成标准采购单...") + final_result = self._generate_purchase_order(excel_result, output_dir) + + if final_result: + self.logger.info(f"OCR处理流程完成,输出文件: {final_result}") + self.log_processing_end(input_file, final_result, success=True) + return final_result + else: + self.logger.error("生成采购单失败") + self.log_processing_end(input_file, success=False) + return None + + except Exception as e: + self.logger.error(f"OCR处理流程出错: {e}", exc_info=True) + self.log_processing_end(input_file, success=False) + return None + + def get_required_columns(self) -> List[str]: + """返回需要的列名列表""" + # OCR处理不直接依赖列名,由后续处理步骤决定 + return [] + + def get_supported_extensions(self) -> List[str]: + """支持的文件扩展名""" + return ['.jpg', '.jpeg', '.png', '.bmp'] + + def _perform_ocr(self, input_file: Path, output_dir: Path) -> Optional[Path]: + """执行OCR识别 + + Args: + input_file: 输入图片文件 + output_dir: 输出目录 + + Returns: + OCR生成的Excel文件路径,失败返回None + """ + try: + self.logger.info(f"开始OCR识别: {input_file}") + + # 使用OCR服务处理图片 + result_path = self.ocr_service.process_image(str(input_file)) + + if result_path: + # 确保结果文件在输出目录中 + result_path = Path(result_path) + if result_path.exists(): + self.logger.info(f"OCR识别成功,输出文件: {result_path}") + return result_path + else: + self.logger.error(f"OCR结果文件不存在: {result_path}") + return None + else: + self.logger.error("OCR服务返回None") + return None + + except Exception as e: + self.logger.error(f"OCR识别失败: {e}", exc_info=True) + return None + + def _process_excel(self, excel_file: Path, output_dir: Path) -> Optional[Path]: + """处理Excel文件 + + Args: + excel_file: Excel文件路径 + output_dir: 输出目录 + + Returns: + 处理后的Excel文件路径,失败返回None + """ + try: + self.logger.info(f"开始处理Excel文件: 
{excel_file}") + + # 使用订单服务处理Excel文件(生成采购单) + result_path = self.order_service.process_excel(str(excel_file)) + + if result_path: + result_path = Path(result_path) + if result_path.exists(): + self.logger.info(f"Excel处理成功,输出文件: {result_path}") + return result_path + else: + self.logger.error(f"Excel处理结果文件不存在: {result_path}") + return None + else: + self.logger.error("Excel处理服务返回None") + return None + + except Exception as e: + self.logger.error(f"Excel处理失败: {e}", exc_info=True) + return None + + def _generate_purchase_order(self, processed_file: Path, output_dir: Path) -> Optional[Path]: + """采购单生成由OrderService完成,此处直接返回处理结果""" + try: + if processed_file and processed_file.exists(): + return processed_file + return None + except Exception: + return None diff --git a/app/core/processors/supplier_processors/__init__.py b/app/core/processors/supplier_processors/__init__.py new file mode 100644 index 0000000..d9e1616 --- /dev/null +++ b/app/core/processors/supplier_processors/__init__.py @@ -0,0 +1,7 @@ +""" +供应商处理器模块初始化文件 +""" + +from .generic_supplier_processor import GenericSupplierProcessor + +__all__ = ['GenericSupplierProcessor'] \ No newline at end of file diff --git a/app/core/processors/supplier_processors/generic_supplier_processor.py b/app/core/processors/supplier_processors/generic_supplier_processor.py new file mode 100644 index 0000000..38525ca --- /dev/null +++ b/app/core/processors/supplier_processors/generic_supplier_processor.py @@ -0,0 +1,340 @@ +""" +通用供应商处理器 + +可配置化的供应商处理器,支持通过配置文件定义处理规则 +""" + +import fnmatch +import pandas as pd +from typing import Optional, Dict, Any, List +from pathlib import Path + +from ..base import BaseProcessor +from ...utils.log_utils import get_logger +from ...handlers.rule_engine import apply_rules +from ...handlers.column_mapper import ColumnMapper +from ...handlers.data_cleaner import DataCleaner +from ...handlers.calculator import DataCalculator + +logger = get_logger(__name__) + + +class 
GenericSupplierProcessor(BaseProcessor): + """通用供应商处理器 + + 基于配置文件处理不同供应商的Excel文件,支持: + - 文件名模式匹配 + - 内容特征识别 + - 列映射配置 + - 数据清洗规则 + - 计算处理规则 + """ + + def __init__(self, config: Dict[str, Any], supplier_config: Dict[str, Any]): + """初始化通用供应商处理器 + + Args: + config: 系统配置 + supplier_config: 供应商特定配置 + """ + super().__init__(config) + self.supplier_config = supplier_config + + # 从配置中提取基本信息 + self.name = supplier_config.get('name', 'GenericSupplier') + self.description = supplier_config.get('description', '通用供应商处理器') + + # 处理规则配置 + self.filename_patterns = supplier_config.get('filename_patterns', []) + self.content_indicators = supplier_config.get('content_indicators', []) + self.column_mapping = supplier_config.get('column_mapping', {}) + self.cleaning_rules = supplier_config.get('cleaning_rules', []) + self.calculations = supplier_config.get('calculations', []) + + # 输出配置 + self.output_template = supplier_config.get('output_template', 'templates/银豹-采购单模板.xls') + self.output_suffix = supplier_config.get('output_suffix', '_银豹采购单') + + def can_process(self, file_path: Path) -> bool: + """判断是否能处理该文件 + + Args: + file_path: 文件路径 + + Returns: + 是否能处理 + """ + if not self.validate_input(file_path): + return False + + # 检查文件名模式 + if self.filename_patterns: + filename_match = self._check_filename_patterns(file_path) + if filename_match: + return True + + # 检查文件内容特征 + if self.content_indicators: + content_match = self._check_content_indicators(file_path) + if content_match: + return True + + # 如果都没有配置,则无法判断 + if not self.filename_patterns and not self.content_indicators: + self.logger.warning(f"处理器 {self.name} 没有配置识别规则") + return False + + return False + + def process(self, input_file: Path, output_dir: Path) -> Optional[Path]: + """处理文件 + + Args: + input_file: 输入文件路径 + output_dir: 输出目录路径 + + Returns: + 输出文件路径,处理失败返回None + """ + self.log_processing_start(input_file) + + try: + # 步骤1: 读取数据 + self.logger.info("步骤1/4: 读取数据...") + df = self._read_supplier_data(input_file) + if df is 
None or df.empty: + self.logger.error("读取数据失败或数据为空") + self.log_processing_end(input_file, success=False) + return None + + # 步骤2: 应用列映射 + self.logger.info("步骤2/4: 应用列映射...") + mapped_df = self._apply_column_mapping(df) + if mapped_df is None: + self.logger.error("列映射失败") + self.log_processing_end(input_file, success=False) + return None + + # 步骤3: 数据清洗 + self.logger.info("步骤3/4: 数据清洗...") + cleaned_df = self._apply_data_cleaning(mapped_df) + if cleaned_df is None: + self.logger.error("数据清洗失败") + self.log_processing_end(input_file, success=False) + return None + try: + rules = self.supplier_config.get('rules', []) + dictionary = self.supplier_config.get('dictionary') + standardized_df = apply_rules(cleaned_df, rules, dictionary) + except Exception as e: + self.logger.warning(f"规则执行失败: {e}") + standardized_df = cleaned_df + + # 步骤4: 计算处理 + self.logger.info("步骤4/4: 计算处理...") + calculated_df = self._apply_calculations(standardized_df) + if calculated_df is None: + self.logger.error("计算处理失败") + self.log_processing_end(input_file, success=False) + return None + + # 生成输出文件 + output_file = self._generate_output(calculated_df, input_file, output_dir) + + if output_file and output_file.exists(): + self.logger.info(f"处理完成,输出文件: {output_file}") + self.log_processing_end(input_file, output_file, success=True) + return output_file + else: + self.logger.error("输出文件生成失败") + self.log_processing_end(input_file, success=False) + return None + + except Exception as e: + self.logger.error(f"处理文件时出错: {e}", exc_info=True) + self.log_processing_end(input_file, success=False) + return None + + def get_required_columns(self) -> List[str]: + """返回需要的列名列表""" + # 从列映射配置中提取目标列名 + return list(self.column_mapping.values()) if self.column_mapping else [] + + def _check_filename_patterns(self, file_path: Path) -> bool: + """检查文件名模式 + + Args: + file_path: 文件路径 + + Returns: + 是否匹配 + """ + try: + filename = file_path.name + for pattern in self.filename_patterns: + if fnmatch.fnmatch(filename.lower(), 
def _check_content_indicators(self, file_path: Path) -> bool:
    """Return True when a configured indicator appears in the sheet's header text.

    The sheet is read without a header so that nothing is consumed as column
    names.  Two rows are inspected: the first physical row (often a title or
    the header itself) and the header row, taken from the ``header_row``
    supplier setting or detected with ``_find_header_row``.  Matching is a
    case-insensitive substring test.

    Args:
        file_path: Path of the workbook to inspect.

    Returns:
        True when any entry of ``content_indicators`` is found; False when
        nothing matches or the file cannot be read (the error is logged).
    """
    try:
        # 30 rows matches the window _find_header_row() searches.
        preview = self._read_excel_safely(file_path, header=None, nrows=30)
        if preview is None or preview.empty:
            return False

        configured = self.supplier_config.get('header_row')
        header_idx = None
        if configured is not None:
            try:
                header_idx = int(configured)
            except (TypeError, ValueError):
                header_idx = None
        if header_idx is None:
            header_idx = self._find_header_row(preview)

        # Row 0 is always checked; add the real header row when it differs.
        rows_to_check = [0]
        if header_idx is not None and 0 < header_idx < len(preview):
            rows_to_check.append(header_idx)

        cells = [
            str(value)
            for row in rows_to_check
            for value in preview.iloc[row].tolist()
            if not pd.isna(value)
        ]
        # Newline separator prevents a match spanning two adjacent cells.
        haystack = "\n".join(cells).lower()

        for indicator in self.content_indicators:
            if str(indicator).lower() in haystack:
                self.logger.info(f"内容特征匹配成功: {indicator}")
                return True

        return False

    except Exception as e:
        self.logger.error(f"检查内容特征时出错: {e}")
        return False
def _generate_output(self, df: pd.DataFrame, input_file: Path, output_dir: Path) -> Optional[Path]:
    """Write the final data to a timestamped Excel file in ``output_dir``.

    The file is named ``<input stem><output_suffix>_<YYYYmmdd_HHMMSS>.xls``.
    The output directory is created when missing.  If the installed pandas
    cannot write the legacy ``.xls`` format, the data is written to the same
    name with an ``.xlsx`` extension instead and that path is returned.

    NOTE: the purchase-order template is not applied yet; the DataFrame is
    saved as a plain sheet.

    Args:
        df: Final data to write.
        input_file: Path of the source file (only its stem is used).
        output_dir: Directory that receives the output file.

    Returns:
        Path of the written file, or None when writing failed (the error is
        logged).
    """
    try:
        output_dir = Path(output_dir)
        # to_excel() does not create missing directories.
        output_dir.mkdir(parents=True, exist_ok=True)

        timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
        output_filename = f"{input_file.stem}{self.output_suffix}_{timestamp}.xls"
        output_file = output_dir / output_filename

        try:
            df.to_excel(output_file, index=False)
        except (ValueError, ImportError) as e:
            # pandas without an xls writer engine raises here.
            fallback_file = output_file.with_suffix('.xlsx')
            self.logger.warning(f"无法写入xls格式({e}),改为输出xlsx: {fallback_file}")
            if output_file.exists():
                output_file.unlink()
            df.to_excel(fallback_file, index=False)
            output_file = fallback_file

        self.logger.info(f"输出文件生成成功: {output_file}")
        return output_file

    except Exception as e:
        self.logger.error(f"生成输出文件失败: {e}")
        return None
any(keyword in filename for keyword in tobacco_keywords) + content_match = all(col in df.columns for col in required_columns) + + if filename_match or content_match: + self.logger.info(f"识别为烟草订单文件: {filename}") + return True + + return False + + except Exception as e: + self.logger.warning(f"检查文件内容时出错: {e}") + # 如果无法读取内容,仅基于文件名判断 + return any(keyword in filename for keyword in tobacco_keywords) + + def process(self, input_file: Path, output_dir: Path) -> Optional[Path]: + """处理烟草订单 + + Args: + input_file: 输入文件路径 + output_dir: 输出目录路径 + + Returns: + 输出文件路径,处理失败返回None + """ + self.log_processing_start(input_file) + + try: + # 读取订单信息(时间和总金额) + order_info = self._read_order_info(input_file) + if not order_info: + self.logger.error(f"读取订单信息失败: {input_file}") + self.log_processing_end(input_file, success=False) + return None + + order_time, total_amount = order_info + self.logger.info(f"订单信息 - 时间: {order_time}, 总金额: {total_amount}") + + # 读取订单数据 + order_data = self._read_order_data(input_file) + if order_data is None or order_data.empty: + self.logger.error(f"读取订单数据失败或数据为空: {input_file}") + self.log_processing_end(input_file, success=False) + return None + + self.logger.info(f"成功读取订单数据,共{len(order_data)}条记录") + + # 生成输出文件路径 + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + output_filename = f"银豹采购单_烟草公司_{timestamp}.xls" + output_file = output_dir / output_filename + + # 确保输出目录存在 + output_file.parent.mkdir(parents=True, exist_ok=True) + + # 生成银豹采购单 + result = self._generate_pospal_order(order_data, order_time, output_file) + + if result: + self.logger.info(f"采购单生成成功: {output_file}") + self.log_processing_end(input_file, output_file, success=True) + + # 显示处理结果 + self._show_processing_result(output_file, order_time, len(order_data), total_amount) + + return output_file + else: + self.logger.error("生成银豹采购单失败") + self.log_processing_end(input_file, success=False) + return None + + except Exception as e: + self.logger.error(f"处理烟草订单时发生错误: {e}", exc_info=True) + 
def _read_order_data(self, file_path: Path) -> Optional[pd.DataFrame]:
    """Read the order lines and derive purchase quantity and unit price.

    The sheet is read without a header, skipping the first three rows, and
    the eight columns are named explicitly.  Only rows whose order quantity
    is a non-zero number are kept: blank rows and text rows would otherwise
    pass a plain ``!= 0`` filter and break the later ``int(...)`` conversion
    of the purchase quantity.

    Added columns:
        采购量   = 订单量 * 10
        采购单价 = 金额 / 采购量

    Args:
        file_path: Path of the order workbook.

    Returns:
        The filtered DataFrame with a fresh 0-based index, or None when no
        row qualifies or reading fails (the problem is logged).
    """
    columns = ['商品', '盒码', '条码', '建议零售价', '批发价', '需求量', '订单量', '金额']

    try:
        raw = self._read_excel_safely(file_path, header=None, skiprows=3, names=columns)

        # Non-numeric or empty quantities become NaN and are dropped below.
        order_qty = pd.to_numeric(raw['订单量'], errors='coerce')
        keep = order_qty.notna() & (order_qty != 0)
        df_filtered = raw[keep].copy()

        if df_filtered.empty:
            self.logger.warning("没有订单量不为0的记录")
            return None

        df_filtered['订单量'] = order_qty[keep]
        df_filtered['采购量'] = df_filtered['订单量'] * 10
        df_filtered['采购单价'] = df_filtered['金额'] / df_filtered['采购量']
        # The template writer uses the index as the row offset, so it must be contiguous.
        df_filtered = df_filtered.reset_index(drop=True)

        self.logger.info(f"成功处理订单数据,有效记录数: {len(df_filtered)}")
        return df_filtered

    except Exception as e:
        self.logger.error(f"读取订单数据失败: {e}")
        return None
self.logger.error(f"采购单模板文件不存在: {template_path}") + return False + + self.logger.info(f"使用模板文件: {template_path}") + + # 打开模板,准备写入 + template_rd = xlrd.open_workbook(str(template_path), formatting_info=True) + template_wb = copy(template_rd) + template_ws = template_wb.get_sheet(0) + + # 获取模板中的表头列索引 + header_row = template_rd.sheet_by_index(0).row_values(0) + + # 查找需要的列索引 + try: + barcode_col = header_row.index("条码(必填)") + amount_col = header_row.index("采购量(必填)") + gift_col = header_row.index("赠送量") + price_col = header_row.index("采购单价(必填)") + except ValueError as e: + self.logger.error(f"模板列查找失败: {e}") + return False + + self.logger.info(f"模板列索引 - 条码:{barcode_col}, 采购量:{amount_col}, 赠送量:{gift_col}, 单价:{price_col}") + + # 写入数据到模板 + for i, row in order_data.iterrows(): + template_ws.write(i + 1, barcode_col, row['盒码']) # 商品条码 + template_ws.write(i + 1, amount_col, int(row['采购量'])) # 采购量 + template_ws.write(i + 1, gift_col, "") # 赠送量为空 + template_ws.write(i + 1, price_col, round(row['采购单价'], 2)) # 采购单价保留两位小数 + + # 确保输出目录存在 + output_file.parent.mkdir(parents=True, exist_ok=True) + + # 保存输出文件 + template_wb.save(str(output_file)) + + self.logger.info(f"采购单生成成功: {output_file}") + return True + + except Exception as e: + self.logger.error(f"生成银豹采购单失败: {e}", exc_info=True) + return False + + def _show_processing_result(self, output_file: Path, order_time: str, total_count: int, total_amount: float): + """显示处理结果 + + Args: + output_file: 输出文件路径 + order_time: 订单时间 + total_count: 处理条目数 + total_amount: 总金额 + """ + try: + # 创建附加信息 + additional_info = { + "订单来源": "烟草公司", + "处理时间": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + } + + # 格式化金额显示 + parsed = parse_monetary_string(total_amount) + total_amount = parsed if parsed is not None else 0.0 + amount_display = f"¥{total_amount:.2f}" + + # 显示自定义对话框 + show_custom_dialog( + title="烟草订单处理结果", + message="烟草订单处理完成", + result_file=str(output_file), + time_info=order_time, + count_info=f"{total_count}个商品", + 
class GiteaSync:
    """Read and write repository files through the Gitea REST contents API.

    Every network method is best-effort: failures are logged and reported to
    the caller as ``None`` instead of raising.
    """

    def __init__(self, base_url: str, owner: str, repo: str, token: str, timeout: int = 15):
        """Store the connection settings.

        Args:
            base_url: Root URL of the Gitea server; trailing slashes are removed.
            owner: Repository owner.
            repo: Repository name.
            token: Access token sent in the ``Authorization`` header.
            timeout: Per-request timeout in seconds.
        """
        self.base_url = base_url.rstrip("/")
        self.owner = owner
        self.repo = repo
        self.token = token
        self.timeout = timeout

    @property
    def _headers(self) -> dict:
        """Authentication headers attached to every request."""
        return {"Authorization": f"token {self.token}"}

    def _api_url(self, path: str) -> str:
        """Return the contents-API URL for a repository-relative path.

        A leading slash on ``path`` is dropped so the URL never contains ``//``.
        """
        return f"{self.base_url}/api/v1/repos/{self.owner}/{self.repo}/contents/{path.lstrip('/')}"

    @staticmethod
    def _decode_contents_payload(data) -> Optional[Tuple[bytes, str]]:
        """Extract ``(content_bytes, sha)`` from a parsed contents response.

        Returns None when the payload is not a single-file object (for example
        a list, which is what a directory path yields), when it carries no
        string ``content``, or when the base64 text cannot be decoded.
        """
        if not isinstance(data, dict):
            return None
        content_b64 = data.get("content")
        if not isinstance(content_b64, str):
            return None
        try:
            # The server may wrap the base64 text with newlines.
            content_bytes = base64.b64decode(content_b64.replace("\n", ""))
        except ValueError:  # binascii.Error is a ValueError subclass
            return None
        return content_bytes, data.get("sha") or ""

    def pull_file(self, remote_path: str) -> Optional[Tuple[bytes, str]]:
        """Download a file from the repository.

        Args:
            remote_path: Path of the file inside the repository.

        Returns:
            ``(content_bytes, sha)``, or None when the file does not exist,
            the request fails, or the response cannot be decoded.
        """
        try:
            resp = requests.get(
                self._api_url(remote_path),
                headers=self._headers,
                timeout=self.timeout,
            )
        except requests.RequestException as e:
            logger.error(f"拉取文件网络错误: {e}")
            return None

        if resp.status_code == 404:
            logger.info(f"云端文件不存在: {remote_path}")
            return None
        if resp.status_code != 200:
            logger.warning(f"拉取文件失败: {resp.status_code} {resp.text[:200]}")
            return None

        try:
            payload = resp.json()
        except ValueError as e:  # body is not JSON (e.g. a proxy error page)
            logger.error(f"拉取文件响应解析失败: {remote_path} — {e}")
            return None

        result = self._decode_contents_payload(payload)
        if result is None:
            logger.error(f"拉取文件内容无效(不是文件或 base64 损坏): {remote_path}")
            return None

        logger.info(f"拉取文件成功: {remote_path} ({len(result[0])} bytes)")
        return result

    def push_file(
        self,
        remote_path: str,
        content: bytes,
        message: str,
        sha: Optional[str] = None,
    ) -> Optional[str]:
        """Create or update a file in the repository.

        Args:
            remote_path: Path of the file inside the repository.
            content: Raw file content.
            message: Commit message.
            sha: Current sha of the file; pass it when updating an existing
                file and omit it when creating a new one.

        Returns:
            The new sha (an empty string when the server accepted the write
            but reported no sha), or None on failure.
        """
        payload = {
            "message": message,
            "content": base64.b64encode(content).decode("ascii"),
        }
        if sha:
            payload["sha"] = sha

        try:
            resp = requests.put(
                self._api_url(remote_path),
                headers={**self._headers, "Content-Type": "application/json"},
                json=payload,
                timeout=self.timeout,
            )
        except requests.RequestException as e:
            logger.error(f"推送文件网络错误: {e}")
            return None

        if resp.status_code not in (200, 201):
            logger.warning(f"推送文件失败: {resp.status_code} {resp.text[:200]}")
            return None

        # The write already succeeded; a missing/odd body must not raise here.
        try:
            body = resp.json()
        except ValueError:
            body = None
        content_info = body.get("content") if isinstance(body, dict) else None
        new_sha = (content_info.get("sha") if isinstance(content_info, dict) else None) or ""
        logger.info(f"推送文件成功: {remote_path} (sha={new_sha[:12]})")
        return new_sha

    def file_exists(self, remote_path: str) -> Optional[str]:
        """Return the sha of ``remote_path``, or None when it is unavailable.

        Only the GET issued by ``pull_file`` is sent: a HEAD probe carries no
        sha, so probing first would cost a second round trip for every
        existing file.
        """
        result = self.pull_file(remote_path)
        return result[1] if result else None

    def pull_json(self, remote_path: str) -> Optional[Tuple[dict, str]]:
        """Download a file and parse it as JSON.

        Returns:
            ``(parsed_data, sha)``, or None when the download fails or the
            content is not valid UTF-8 JSON.
        """
        result = self.pull_file(remote_path)
        if result is None:
            return None
        content_bytes, sha = result
        try:
            data = json.loads(content_bytes)
        except ValueError as e:  # JSONDecodeError and UnicodeDecodeError
            logger.error(f"解析 JSON 失败: {e}")
            return None
        return data, sha

    def push_json(self, remote_path: str, data: dict, message: str, sha: Optional[str] = None) -> Optional[str]:
        """Serialize ``data`` as pretty-printed UTF-8 JSON and push it.

        Returns:
            The new sha, or None on failure.
        """
        content = json.dumps(data, ensure_ascii=False, indent=2).encode("utf-8")
        return self.push_file(remote_path, content, message, sha)

    def push_binary(self, remote_path: str, local_path: str, message: str) -> Optional[str]:
        """Read a local file in binary mode and push it to the repository.

        The current remote sha is looked up first so an existing file is
        updated rather than rejected.

        Returns:
            The new sha, or None on failure.
        """
        try:
            with open(local_path, "rb") as f:
                content = f.read()
        except OSError as e:
            logger.error(f"读取本地文件失败: {local_path} — {e}")
            return None

        existing_sha = self.file_exists(remote_path)
        return self.push_file(remote_path, content, message, sha=existing_sha)

    @classmethod
    def from_config(cls, config) -> Optional["GiteaSync"]:
        """Create an instance from the ``Gitea`` section of a config object.

        ``config`` must offer ``get(section, option, fallback=...)``.

        Returns:
            A ``GiteaSync`` instance, or None when any of base_url, owner,
            repo or token is missing or blank.
        """
        def _read(option: str) -> str:
            value = config.get("Gitea", option, fallback="")
            return "" if value is None else str(value).strip()

        base_url = _read("base_url")
        owner = _read("owner")
        repo = _read("repo")
        token = _read("token")

        if not all([base_url, owner, repo, token]):
            logger.debug("Gitea 配置不完整,跳过云端同步")
            return None

        return cls(base_url=base_url, owner=owner, repo=repo, token=token)
isinstance(additional_info, dict): + for key, value in additional_info.items(): + tk.Label(result_frame, text=f"{key}: {value}", font=("Arial", 12)).pack(anchor=tk.W, padx=20, pady=5) + + # 如果有结果文件,显示文件信息 + if result_file and os.path.exists(result_file): + tk.Label(result_frame, text=f"输出文件: {os.path.basename(result_file)}", font=("Arial", 12)).pack(anchor=tk.W, padx=20, pady=5) + + # 成功提示 + tk.Label(result_frame, text="处理已成功完成!", font=("Arial", 12, "bold"), fg="#28a745").pack(pady=10) + + # 文件信息框 + file_frame = tk.Frame(result_frame, relief=tk.GROOVE, borderwidth=1) + file_frame.pack(fill=tk.X, padx=15, pady=5) + + tk.Label(file_frame, text="文件信息", font=("Arial", 10, "bold")).pack(anchor=tk.W, padx=10, pady=5) + + # 获取文件大小和时间 + try: + file_size = os.path.getsize(result_file) + file_time = datetime.fromtimestamp(os.path.getmtime(result_file)) + + from .file_utils import format_file_size + size_text = format_file_size(file_size) + + tk.Label(file_frame, text=f"文件大小: {size_text}", font=("Arial", 10)).pack(anchor=tk.W, padx=10, pady=2) + tk.Label(file_frame, text=f"创建时间: {file_time.strftime('%Y-%m-%d %H:%M:%S')}", font=("Arial", 10)).pack(anchor=tk.W, padx=10, pady=2) + except Exception: + tk.Label(file_frame, text="无法获取文件信息", font=("Arial", 10)).pack(anchor=tk.W, padx=10, pady=2) + + # 添加按钮 + button_frame = tk.Frame(dialog) + button_frame.pack(pady=10) + + tk.Button(button_frame, text="打开文件", command=lambda: os.startfile(result_file)).pack(side=tk.LEFT, padx=5) + tk.Button(button_frame, text="打开所在文件夹", command=lambda: os.startfile(os.path.dirname(result_file))).pack(side=tk.LEFT, padx=5) + tk.Button(button_frame, text="关闭", command=dialog.destroy).pack(side=tk.LEFT, padx=5) + else: + # 如果没有结果文件或文件不存在 + if result_file: + tk.Label(result_frame, text="未找到输出文件", font=("Arial", 12)).pack(anchor=tk.W, padx=20, pady=5) + tk.Label(result_frame, text="请检查输出目录", font=("Arial", 12, "bold"), fg="#dc3545").pack(pady=10) + + # 添加按钮 + button_frame = tk.Frame(dialog) + 
def create_barcode_mapping_dialog(parent=None, on_save=None, current_mappings=None):
    """
    Build the modal barcode-mapping editor.

    The dialog has two tabs: plain "source barcode -> target barcode"
    mappings, and "special handling" rules (multiplier, target unit, fixed
    price, specification, description). A special rule may also carry a
    mapping target, which is kept inside its description as
    "映射到: <barcode>" and split out again when the data is built.

    Args:
        parent: Parent window, or None.
        on_save: Optional callback invoked with the resulting mappings dict
            when the user presses 保存.
        current_mappings: Existing mappings ({barcode: {field: value}}) used
            to pre-populate both tabs.

    Returns:
        dialog: The created ``tk.Toplevel``.
    """
    map_marker = "映射到:"
    special_keys = ('multiplier', 'target_unit', 'fixed_price', 'specification')

    dialog = tk.Toplevel(parent)
    dialog.title("条码映射编辑")
    dialog.geometry("600x500")
    dialog.resizable(True, True)

    center_window(dialog)

    main_frame = tk.Frame(dialog)
    main_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)

    tab_control = ttk.Notebook(main_frame)
    tab1 = tk.Frame(tab_control)
    tab2 = tk.Frame(tab_control)
    tab_control.add(tab1, text="条码映射")
    tab_control.add(tab2, text="特殊处理")
    tab_control.pack(expand=True, fill=tk.BOTH)

    # ========= Barcode-mapping tab =========
    input_frame = tk.Frame(tab1)
    input_frame.pack(fill=tk.X, padx=5, pady=5)

    tk.Label(input_frame, text="源条码:").grid(row=0, column=0, padx=5, pady=5)
    source_entry = tk.Entry(input_frame, width=20)
    source_entry.grid(row=0, column=1, padx=5, pady=5)

    tk.Label(input_frame, text="目标条码:").grid(row=0, column=2, padx=5, pady=5)
    target_entry = tk.Entry(input_frame, width=20)
    target_entry.grid(row=0, column=3, padx=5, pady=5)

    # Source of truth for the mapping tab. Rows are kept in the same order
    # as the Treeview rows so a row index addresses both.
    mapping_list = []

    list_frame = tk.Frame(tab1)
    list_frame.pack(fill=tk.BOTH, expand=True, padx=5, pady=5)

    columns = ("源条码", "目标条码")
    mapping_tree = ttk.Treeview(list_frame, columns=columns, show="headings", selectmode="browse")
    for col in columns:
        mapping_tree.heading(col, text=col)
        mapping_tree.column(col, width=100)
    mapping_tree.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

    scrollbar = ttk.Scrollbar(list_frame, orient=tk.VERTICAL, command=mapping_tree.yview)
    scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
    mapping_tree.configure(yscrollcommand=scrollbar.set)

    # ========= Special-handling tab =========
    special_input_frame = tk.Frame(tab2)
    special_input_frame.pack(fill=tk.X, padx=5, pady=5)

    tk.Label(special_input_frame, text="条码:").grid(row=0, column=0, padx=5, pady=5)
    special_barcode_entry = tk.Entry(special_input_frame, width=20)
    special_barcode_entry.grid(row=0, column=1, padx=5, pady=5)

    tk.Label(special_input_frame, text="乘数:").grid(row=1, column=0, padx=5, pady=5)
    multiplier_entry = tk.Entry(special_input_frame, width=10)
    multiplier_entry.grid(row=1, column=1, padx=5, pady=5)

    tk.Label(special_input_frame, text="目标单位:").grid(row=1, column=2, padx=5, pady=5)
    unit_entry = tk.Entry(special_input_frame, width=10)
    unit_entry.grid(row=1, column=3, padx=5, pady=5)

    tk.Label(special_input_frame, text="固定单价:").grid(row=2, column=0, padx=5, pady=5)
    price_entry = tk.Entry(special_input_frame, width=10)
    price_entry.grid(row=2, column=1, padx=5, pady=5)

    tk.Label(special_input_frame, text="规格:").grid(row=2, column=2, padx=5, pady=5)
    spec_entry = tk.Entry(special_input_frame, width=10)
    spec_entry.grid(row=2, column=3, padx=5, pady=5)

    tk.Label(special_input_frame, text="描述:").grid(row=3, column=0, padx=5, pady=5)
    desc_entry = tk.Entry(special_input_frame, width=40)
    desc_entry.grid(row=3, column=1, columnspan=3, padx=5, pady=5)

    special_list_frame = tk.Frame(tab2)
    special_list_frame.pack(fill=tk.BOTH, expand=True, padx=5, pady=5)

    special_columns = ("条码", "乘数", "目标单位", "固定单价", "规格", "描述")
    special_tree = ttk.Treeview(special_list_frame, columns=special_columns, show="headings", selectmode="browse")
    for col in special_columns:
        special_tree.heading(col, text=col)
        special_tree.column(col, width=80)
    special_tree.column("描述", width=200)
    special_tree.tag_configure("mapped", foreground="blue")
    special_tree.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

    special_scrollbar = ttk.Scrollbar(special_list_frame, orient=tk.VERTICAL, command=special_tree.yview)
    special_scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
    special_tree.configure(yscrollcommand=special_scrollbar.set)

    # Source of truth for the special tab; same ordering rule as mapping_list.
    special_list = []

    # ========= Row editing =========
    def add_mapping():
        source = source_entry.get().strip()
        target = target_entry.get().strip()

        if not source or not target:
            messagebox.showwarning("输入错误", "源条码和目标条码不能为空")
            return

        if any(existing == source for existing, _ in mapping_list):
            messagebox.showwarning("重复条码", f"条码 {source} 已存在映射")
            return

        mapping_list.append((source, target))
        mapping_tree.insert("", tk.END, values=(source, target))

        source_entry.delete(0, tk.END)
        target_entry.delete(0, tk.END)

    def remove_mapping():
        selected = mapping_tree.selection()
        if not selected:
            messagebox.showwarning("未选择", "请先选择要删除的条目")
            return

        # Address the row by position: values read back from a Treeview may
        # come back as int for all-digit barcodes and then never equal the
        # stored strings.
        row = mapping_tree.index(selected[0])
        if 0 <= row < len(mapping_list):
            del mapping_list[row]
        mapping_tree.delete(selected[0])

    def add_special():
        barcode = special_barcode_entry.get().strip()
        multiplier = multiplier_entry.get().strip()
        unit = unit_entry.get().strip()
        price = price_entry.get().strip()
        spec = spec_entry.get().strip()
        desc = desc_entry.get().strip()

        if not barcode:
            messagebox.showwarning("输入错误", "条码不能为空")
            return

        if any(row[0] == barcode for row in special_list):
            messagebox.showwarning("重复条码", f"条码 {barcode} 已存在特殊处理")
            return

        special_list.append((barcode, multiplier, unit, price, spec, desc))
        special_tree.insert("", tk.END, values=(barcode, multiplier, unit, price, spec, desc))

        for entry in (special_barcode_entry, multiplier_entry, unit_entry,
                      price_entry, spec_entry, desc_entry):
            entry.delete(0, tk.END)

    def remove_special():
        selected = special_tree.selection()
        if not selected:
            messagebox.showwarning("未选择", "请先选择要删除的条目")
            return

        row = special_tree.index(selected[0])  # positional, see remove_mapping
        if 0 <= row < len(special_list):
            del special_list[row]
        special_tree.delete(selected[0])

    btn_frame = tk.Frame(tab1)
    btn_frame.pack(fill=tk.X, padx=5, pady=5)

    add_btn = tk.Button(btn_frame, text="添加映射", command=add_mapping)
    add_btn.pack(side=tk.LEFT, padx=5)

    remove_btn = tk.Button(btn_frame, text="删除映射", command=remove_mapping)
    remove_btn.pack(side=tk.LEFT, padx=5)

    special_btn_frame = tk.Frame(tab2)
    special_btn_frame.pack(fill=tk.X, padx=5, pady=5)

    add_special_btn = tk.Button(special_btn_frame, text="添加特殊处理", command=add_special)
    add_special_btn.pack(side=tk.LEFT, padx=5)

    remove_special_btn = tk.Button(special_btn_frame, text="删除特殊处理", command=remove_special)
    remove_special_btn.pack(side=tk.LEFT, padx=5)

    def add_mapping_to_special():
        """Attach (or replace) a mapping target on the selected special rule."""
        selected = special_tree.selection()
        if not selected:
            messagebox.showwarning("未选择", "请先选择要添加映射的特殊处理条目")
            return

        row = special_tree.index(selected[0])  # positional, see remove_mapping
        if not 0 <= row < len(special_list):
            return
        barcode, mult, unit, price, spec, desc = special_list[row]

        target_barcode = simpledialog.askstring("添加映射", f"为条码 {barcode} 添加映射目标条码:", parent=dialog)
        target_barcode = (target_barcode or "").strip()
        if not target_barcode:
            return

        # Drop any previous mapping note before appending the new one.
        desc = str(desc)
        if map_marker in desc:
            desc = desc.split(map_marker)[0].strip()
        new_desc = f"{desc} 映射到: {target_barcode}".strip()

        special_list[row] = (barcode, mult, unit, price, spec, new_desc)
        special_tree.item(selected[0],
                          values=(barcode, mult, unit, price, spec, new_desc),
                          tags=("mapped",))

    map_special_btn = tk.Button(special_btn_frame, text="添加条码映射", command=add_mapping_to_special)
    map_special_btn.pack(side=tk.LEFT, padx=5)

    # ========= Data <-> widgets =========
    def _build_current_mappings():
        """Build the mappings dict from the rows currently held by the dialog."""
        mappings = {}

        for source, target in mapping_list:
            mappings[source] = {
                'map_to': target,
                'description': f'条码映射:{source} -> {target}'
            }

        for barcode, multiplier, unit, price, spec, desc in special_list:
            rule = mappings.setdefault(barcode, {})

            if multiplier:
                value = multiplier
                if isinstance(multiplier, str):
                    try:
                        value = float(multiplier) if '.' in multiplier else int(multiplier)
                    except ValueError:
                        value = multiplier  # keep the raw text when it is not numeric
                rule['multiplier'] = value

            if unit:
                rule['target_unit'] = unit

            if price:
                try:
                    rule['fixed_price'] = float(price)
                except (TypeError, ValueError):
                    rule['fixed_price'] = price  # keep the raw text when it is not numeric

            if spec:
                rule['specification'] = spec

            if desc and map_marker in str(desc):
                parts = str(desc).split(map_marker)
                base_desc = parts[0].strip()
                if base_desc:
                    rule['description'] = base_desc
                rule['map_to'] = parts[1].strip()
            elif desc:
                rule['description'] = desc

        return mappings

    def _refresh_trees(new_mappings):
        """Replace the content of both tabs with ``new_mappings``."""
        for item in mapping_tree.get_children():
            mapping_tree.delete(item)
        mapping_list.clear()
        for item in special_tree.get_children():
            special_tree.delete(item)
        special_list.clear()

        for barcode, data in (new_mappings or {}).items():
            if not isinstance(data, dict):
                continue  # malformed entry; nothing sensible to show

            has_target = 'map_to' in data
            # Only a rule without any special field is a plain mapping;
            # otherwise those fields would be lost on the next save.
            if has_target and not any(key in data for key in special_keys):
                mapping_list.append((barcode, data['map_to']))
                mapping_tree.insert('', tk.END, values=(barcode, data['map_to']))
                continue

            mult = data.get('multiplier', '')
            unit = data.get('target_unit', '')
            price = data.get('fixed_price', '')
            spec = data.get('specification', '')
            desc = data.get('description', '')
            if has_target:
                desc = f"{desc} 映射到: {data['map_to']}" if desc else f"映射到: {data['map_to']}"
            special_list.append((barcode, mult, unit, price, spec, desc))
            special_tree.insert('', tk.END,
                                values=(barcode, mult, unit, price, spec, desc),
                                tags=("mapped",) if has_target else ())

    # ========= Bottom buttons =========
    bottom_frame = tk.Frame(dialog)
    bottom_frame.pack(fill=tk.X, padx=10, pady=10)

    def save_mappings():
        mappings = _build_current_mappings()

        if on_save:
            on_save(mappings)

        messagebox.showinfo("保存成功", f"已保存{len(mapping_list)}个条码映射和{len(special_list)}个特殊处理规则")
        dialog.destroy()

    def cancel():
        dialog.destroy()

    save_btn = tk.Button(bottom_frame, text="保存", command=save_mappings)
    save_btn.pack(side=tk.RIGHT, padx=5)

    cancel_btn = tk.Button(bottom_frame, text="取消", command=cancel)
    cancel_btn.pack(side=tk.RIGHT, padx=5)

    # ---- Cloud sync buttons ----
    def _get_sync():
        """Return a GiteaSync instance, warning the user when it is not configured."""
        sync = GiteaSync.from_config(ConfigManager())
        if sync is None:
            messagebox.showwarning("云端同步", "请先在「系统设置」中配置 Gitea 云端同步参数(token)")
        return sync

    def push_to_cloud():
        sync = _get_sync()
        if not sync:
            return
        mappings = _build_current_mappings()
        if not mappings:
            messagebox.showwarning("同步到云端", "当前没有映射数据可同步")
            return
        # An update needs the sha of the file currently stored in the cloud.
        existing = sync.pull_file("barcode_mappings.json")
        sha = existing[1] if existing else None
        new_sha = sync.push_json(
            "barcode_mappings.json",
            mappings,
            f"同步条码映射 ({len(mappings)} 条)",
            sha=sha,
        )
        if new_sha:
            messagebox.showinfo("同步成功", f"已推送 {len(mappings)} 条映射到云端")
        else:
            messagebox.showerror("同步失败", "推送到云端失败,请检查网络和 Gitea 配置")

    def pull_from_cloud():
        sync = _get_sync()
        if not sync:
            return
        result = sync.pull_json("barcode_mappings.json")
        if result is None:
            messagebox.showwarning("拉取失败", "云端没有找到条码映射文件,或网络错误")
            return
        data, _sha = result
        if not isinstance(data, dict) or len(data) == 0:
            messagebox.showwarning("拉取失败", "云端数据格式异常")
            return
        # Persist locally as well, then show the pulled data in the dialog.
        from app.core.excel.converter import UnitConverter
        UnitConverter().update_barcode_mappings(data)
        _refresh_trees(data)
        messagebox.showinfo("拉取成功", f"已从云端拉取 {len(data)} 条映射,本地已同步更新")

    sync_frame = tk.Frame(bottom_frame)
    sync_frame.pack(side=tk.LEFT, padx=5)

    push_btn = tk.Button(sync_frame, text="同步到云端", command=push_to_cloud, fg="white", bg="#4a90d9")
    push_btn.pack(side=tk.LEFT, padx=3)

    pull_btn = tk.Button(sync_frame, text="从云端拉取", command=pull_from_cloud, fg="white", bg="#5cb85c")
    pull_btn.pack(side=tk.LEFT, padx=3)

    # Pre-populate through the same loader the cloud pull uses, so both
    # paths classify entries identically.
    if current_mappings:
        _refresh_trees(current_mappings)

    # Make the dialog modal relative to its parent.
    dialog.transient(parent)
    dialog.grab_set()

    return dialog
Key + ttk.Label(api_frame, text="Secret Key:").pack(anchor=tk.W, pady=2) + secret_key_entry = ttk.Entry(api_frame, width=50) + secret_key_entry.pack(fill=tk.X, pady=2) + secret_key_entry.insert(0, config_manager.get('API', 'secret_key', '')) + entries[('API', 'secret_key')] = secret_key_entry + + # 超时设置 + ttk.Label(api_frame, text="超时时间(秒):").pack(anchor=tk.W, pady=2) + timeout_entry = ttk.Entry(api_frame, width=10) + timeout_entry.pack(anchor=tk.W, pady=2) + timeout_entry.insert(0, config_manager.get('API', 'timeout', '30')) + entries[('API', 'timeout')] = timeout_entry + + # 路径设置 + ttk.Label(paths_frame, text="系统路径设置", font=("Arial", 12, "bold")).pack(anchor=tk.W, pady=5) + + # 输入目录 + ttk.Label(paths_frame, text="输入目录:").pack(anchor=tk.W, pady=2) + input_dir_entry = ttk.Entry(paths_frame, width=50) + input_dir_entry.pack(fill=tk.X, pady=2) + input_dir_entry.insert(0, config_manager.get('Paths', 'input_folder', 'data/input')) + entries[('Paths', 'input_folder')] = input_dir_entry + + # 输出目录 + ttk.Label(paths_frame, text="输出目录:").pack(anchor=tk.W, pady=2) + output_dir_entry = ttk.Entry(paths_frame, width=50) + output_dir_entry.pack(fill=tk.X, pady=2) + output_dir_entry.insert(0, config_manager.get('Paths', 'output_folder', 'data/output')) + entries[('Paths', 'output_folder')] = output_dir_entry + + # 性能设置 + ttk.Label(performance_frame, text="性能设置", font=("Arial", 12, "bold")).pack(anchor=tk.W, pady=5) + + # 最大工作线程数 + ttk.Label(performance_frame, text="最大工作线程数:").pack(anchor=tk.W, pady=2) + max_workers_entry = ttk.Entry(performance_frame, width=10) + max_workers_entry.pack(anchor=tk.W, pady=2) + max_workers_entry.insert(0, config_manager.get('Performance', 'max_workers', '4')) + entries[('Performance', 'max_workers')] = max_workers_entry + + # 批处理大小 + ttk.Label(performance_frame, text="批处理大小:").pack(anchor=tk.W, pady=2) + batch_size_entry = ttk.Entry(performance_frame, width=10) + batch_size_entry.pack(anchor=tk.W, pady=2) + batch_size_entry.insert(0, 
config_manager.get('Performance', 'batch_size', '5')) + entries[('Performance', 'batch_size')] = batch_size_entry + + # 文件设置 + ttk.Label(file_frame, text="文件设置", font=("Arial", 12, "bold")).pack(anchor=tk.W, pady=5) + + # 允许的文件扩展名 + ttk.Label(file_frame, text="允许的文件扩展名:").pack(anchor=tk.W, pady=2) + extensions_entry = ttk.Entry(file_frame, width=50) + extensions_entry.pack(fill=tk.X, pady=2) + extensions_entry.insert(0, config_manager.get('File', 'allowed_extensions', '.jpg,.jpeg,.png,.bmp')) + entries[('File', 'allowed_extensions')] = extensions_entry + + # 最大文件大小 + ttk.Label(file_frame, text="最大文件大小(MB):").pack(anchor=tk.W, pady=2) + max_size_entry = ttk.Entry(file_frame, width=10) + max_size_entry.pack(anchor=tk.W, pady=2) + max_size_entry.insert(0, config_manager.get('File', 'max_file_size_mb', '4')) + entries[('File', 'max_file_size_mb')] = max_size_entry + + def save_config(): + """保存配置""" + try: + # 收集所有输入框的值 + for (section, option), entry in entries.items(): + value = entry.get().strip() + config_manager.update(section, option, value) + + # 保存配置 + config_manager.save_config() + + if on_save: + on_save() + + messagebox.showinfo("成功", "配置已保存") + dialog.destroy() + except Exception as e: + messagebox.showerror("错误", f"保存配置时出错: {str(e)}") + + # 按钮框架 + button_frame = ttk.Frame(main_frame) + button_frame.pack(fill=tk.X, pady=10) + + # 保存按钮 + ttk.Button(button_frame, text="保存", command=save_config).pack(side=tk.RIGHT, padx=5) + + # 取消按钮 + ttk.Button(button_frame, text="取消", command=dialog.destroy).pack(side=tk.RIGHT, padx=5) + + # 设置模态 + dialog.transient(parent) + dialog.grab_set() + + +# ────────────────────────────────────────────────────────────── +# 云端同步管理对话框 +# ────────────────────────────────────────────────────────────── + +SYNC_FILES = [ + { + "name": "条码映射", + "remote": "barcode_mappings.json", + "local": "config/barcode_mappings.json", + "type": "json", + }, + { + "name": "供应商配置", + "remote": "suppliers_config.json", + "local": 
"config/suppliers_config.json", + "type": "json", + }, + { + "name": "商品资料", + "remote": "templates/商品资料.xlsx", + "local": "templates/商品资料.xlsx", + "type": "binary", + }, + { + "name": "采购单模板", + "remote": "templates/银豹-采购单模板.xls", + "local": "templates/银豹-采购单模板.xls", + "type": "binary", + }, +] + + +def _format_size(path: str) -> str: + try: + size = os.path.getsize(path) + if size < 1024 * 1024: + return f"{size / 1024:.1f} KB" + return f"{size / (1024 * 1024):.1f} MB" + except OSError: + return "—" + + +def show_cloud_sync_dialog(parent=None): + """统一云端同步管理对话框""" + + sync = GiteaSync.from_config(ConfigManager()) + if sync is None: + messagebox.showwarning( + "配置不完整", + "请先在「系统设置」中配置 Gitea 地址和 Access Token", + ) + return + + dlg = tk.Toplevel(parent) + dlg.title("云端同步管理") + dlg.geometry("620x440") + dlg.resizable(False, False) + + # 居中 + dlg.update_idletasks() + x = (dlg.winfo_screenwidth() - 620) // 2 + y = (dlg.winfo_screenheight() - 440) // 2 + dlg.geometry(f"620x440+{x}+{y}") + + # ── Treeview ── + columns = ("name", "local_status", "cloud_status") + tree = ttk.Treeview(dlg, columns=columns, show="headings", height=6) + tree.heading("name", text="文件") + tree.heading("local_status", text="本地状态") + tree.heading("cloud_status", text="云端状态") + tree.column("name", width=140) + tree.column("local_status", width=220) + tree.column("cloud_status", width=220) + tree.pack(fill=tk.BOTH, expand=True, padx=16, pady=(16, 8)) + + # tag 颜色 + tree.tag_configure("synced", foreground="#2e7d32") + tree.tag_configure("cloud_only", foreground="#e65100") + tree.tag_configure("local_only", foreground="#1565c0") + tree.tag_configure("missing", foreground="#999999") + + # 用 iid = remote_path 标识每行 + cloud_sha_cache: dict = {} # remote_path -> sha + + def _load_local_status(): + """仅加载本地状态,不发网络请求""" + for item in tree.get_children(): + tree.delete(item) + for entry in SYNC_FILES: + local = entry["local"] + if os.path.exists(local): + if entry["type"] == "json": + try: + with open(local, 
"r", encoding="utf-8") as f: + data = json.load(f) + if isinstance(data, dict): + local_text = f"{len(data)} 项" + elif isinstance(data, list): + local_text = f"{len(data)} 条记录" + else: + local_text = "已存在" + except Exception: + local_text = "已存在(解析异常)" + else: + local_text = _format_size(local) + tag = "local_only" + else: + local_text = "不存在" + tag = "missing" + tree.insert( + "", tk.END, + iid=entry["remote"], + values=(entry["name"], local_text, "点「刷新状态」检查"), + tags=(tag,), + ) + + def refresh_status(): + """刷新每行的本地/云端状态""" + cloud_sha_cache.clear() + for item in tree.get_children(): + tree.delete(item) + + for entry in SYNC_FILES: + remote = entry["remote"] + local = entry["local"] + + # 本地状态 + if os.path.exists(local): + if entry["type"] == "json": + try: + with open(local, "r", encoding="utf-8") as f: + data = json.load(f) + if isinstance(data, dict): + local_text = f"{len(data)} 项" + elif isinstance(data, list): + local_text = f"{len(data)} 条记录" + else: + local_text = "已存在" + except Exception: + local_text = "已存在(解析异常)" + else: + local_text = _format_size(local) + else: + local_text = "不存在" + + # 云端状态 — 网络请求,可能慢 + sha = sync.file_exists(remote) + if sha: + cloud_sha_cache[remote] = sha + cloud_text = "已存在" + else: + cloud_text = "未上传" + + # tag + local_ok = os.path.exists(local) + cloud_ok = sha is not None + if local_ok and cloud_ok: + tag = "synced" + elif cloud_ok and not local_ok: + tag = "cloud_only" + elif local_ok and not cloud_ok: + tag = "local_only" + else: + tag = "missing" + + tree.insert( + "", tk.END, + iid=remote, + values=(entry["name"], local_text, cloud_text), + tags=(tag,), + ) + + # ── 操作函数 ── + def _get_selected_entries(): + """获取选中的文件条目列表""" + selected = tree.selection() + if not selected: + messagebox.showinfo("提示", "请先选中要操作的文件") + return [] + return [e for e in SYNC_FILES if e["remote"] in selected] + + def push_selected(): + entries = _get_selected_entries() + if not entries: + return + ok, fail = 0, 0 + for entry in entries: + 
local, remote = entry["local"], entry["remote"] + if not os.path.exists(local): + messagebox.showwarning("跳过", f"本地文件不存在: {local}") + fail += 1 + continue + + if entry["type"] == "json": + try: + with open(local, "r", encoding="utf-8") as f: + data = json.load(f) + sha = cloud_sha_cache.get(remote) + result = sync.push_json(remote, data, f"同步 {entry['name']}", sha=sha) + except Exception as e: + messagebox.showerror("推送失败", f"{entry['name']}: {e}") + fail += 1 + continue + else: + result = sync.push_binary(remote, local, f"同步 {entry['name']}") + + if result: + ok += 1 + else: + fail += 1 + + if ok: + messagebox.showinfo("推送完成", f"成功 {ok} 个" + (f",失败 {fail} 个" if fail else "")) + refresh_status() + + def pull_selected(): + entries = _get_selected_entries() + if not entries: + return + ok, fail = 0, 0 + for entry in entries: + remote, local = entry["remote"], entry["local"] + + if entry["type"] == "json": + result = sync.pull_json(remote) + if result is None: + messagebox.showwarning("拉取失败", f"云端文件不存在: {entry['name']}") + fail += 1 + continue + content, sha = result + # 写入本地 + os.makedirs(os.path.dirname(local) or ".", exist_ok=True) + with open(local, "w", encoding="utf-8") as f: + json.dump(content, f, ensure_ascii=False, indent=2) + # 特殊后处理 + _post_pull(entry, content) + else: + result = sync.pull_file(remote) + if result is None: + messagebox.showwarning("拉取失败", f"云端文件不存在: {entry['name']}") + fail += 1 + continue + content, sha = result + os.makedirs(os.path.dirname(local) or ".", exist_ok=True) + with open(local, "wb") as f: + f.write(content) + + ok += 1 + + if ok: + messagebox.showinfo("拉取完成", f"成功 {ok} 个" + (f",失败 {fail} 个" if fail else "")) + refresh_status() + + def _post_pull(entry, data): + """拉取 JSON 文件后的特殊处理""" + if entry["remote"] == "barcode_mappings.json": + try: + from app.core.excel.converter import UnitConverter + UnitConverter().update_barcode_mappings(data) + except Exception: + pass + elif entry["remote"] == "suppliers_config.json": + try: + 
def pull_all():
    """Download every entry in ``SYNC_FILES`` from the cloud.

    Entries whose remote copy is missing (``None`` result) or whose local
    file cannot be written are counted as failures; the batch always runs
    to completion, shows one summary dialog and refreshes the status table.
    """
    ok, fail = 0, 0
    for entry in SYNC_FILES:
        remote, local = entry["remote"], entry["local"]
        is_json = entry["type"] == "json"

        result = sync.pull_json(remote) if is_json else sync.pull_file(remote)
        if result is None:
            fail += 1
            continue
        content, _sha = result

        try:
            # dirname is "" for a bare filename; fall back to the current dir.
            os.makedirs(os.path.dirname(local) or ".", exist_ok=True)
            if is_json:
                with open(local, "w", encoding="utf-8") as f:
                    json.dump(content, f, ensure_ascii=False, indent=2)
            else:
                with open(local, "wb") as f:
                    f.write(content)
        except (OSError, TypeError, ValueError):
            # One unwritable file must not abort the rest of the batch or
            # suppress the summary below.
            fail += 1
            continue

        if is_json:
            _post_pull(entry, content)
        ok += 1

    messagebox.showinfo("批量拉取完成", f"成功 {ok} 个,失败 {fail} 个")
    refresh_status()
def ensure_dir(directory: str) -> bool:
    """Ensure that *directory* exists, creating it (with parents) if needed.

    Args:
        directory: Directory path. The empty string, which is what
            ``os.path.dirname`` returns for a bare filename, denotes the
            current working directory and is treated as already existing.

    Returns:
        True if the directory exists or was created, False if creation failed.
    """
    if directory == "":
        # os.makedirs("") raises FileNotFoundError, which used to be logged
        # as an error every time a bare filename was saved or renamed.
        return True

    try:
        os.makedirs(directory, exist_ok=True)
        return True
    except Exception as e:
        logger.error(f"创建目录失败: {directory}, 错误: {e}")
        return False
def get_latest_file(directory: str, pattern: str = "", extensions: List[str] = None) -> Optional[str]:
    """
    Return the most recently modified regular file in *directory*.

    Args:
        directory: Directory to scan (not recursive).
        pattern: Substring the file name must contain; empty means no filter.
        extensions: Allowed extensions; falsy means no filter.

    Returns:
        Path of the newest matching file, or None when the directory does
        not exist or nothing matches.
    """
    if not os.path.exists(directory):
        logger.warning(f"目录不存在: {directory}")
        return None

    newest_path = None
    newest_mtime = None
    for entry_name in os.listdir(directory):
        if pattern and pattern not in entry_name:
            continue
        if extensions and not is_valid_extension(entry_name, extensions):
            continue

        candidate = os.path.join(directory, entry_name)
        if not os.path.isfile(candidate):
            continue

        mtime = os.path.getmtime(candidate)
        # Strict comparison keeps the first-listed file on equal timestamps.
        if newest_mtime is None or mtime > newest_mtime:
            newest_path, newest_mtime = candidate, mtime

    if newest_path is None:
        logger.warning(f"未在目录 {directory} 中找到符合条件的文件")
        return None

    return newest_path
def save_json(data: Any, file_path: str, ensure_ascii: bool = False, indent: int = 2) -> bool:
    """
    Serialize *data* as JSON and write it to *file_path* (UTF-8).

    Args:
        data: Object to serialize.
        file_path: Destination path; its parent directory is created if needed.
        ensure_ascii: Passed through to the JSON encoder.
        indent: Number of spaces used for indentation.

    Returns:
        True on success, False if serialization or writing failed.
    """
    try:
        # Serialize before opening the target: opening with 'w' truncates, so
        # a non-serializable object would otherwise wipe the existing file.
        payload = json.dumps(data, ensure_ascii=ensure_ascii, indent=indent)

        # dirname is "" for a bare filename; there is nothing to create then.
        directory = os.path.dirname(file_path)
        if directory:
            ensure_dir(directory)

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(payload)
        logger.debug(f"JSON数据已保存到: {file_path}")
        return True
    except Exception as e:
        logger.error(f"保存JSON文件失败: {file_path}, 错误: {e}")
        return False
def format_file_size(size_bytes: int) -> str:
    """Render a byte count as a one-decimal string in KB (below 1 MiB) or MB."""
    kib = 1024
    mib = kib * kib
    divisor, suffix = (kib, "KB") if size_bytes < mib else (mib, "MB")
    return f"{size_bytes / divisor:.1f} {suffix}"
def set_log_level(level: str) -> None:
    """
    Set the level of every known logger, the root logger and their handlers.

    Handlers are updated as well because a handler keeps the level it was
    created with: if only the logger level is lowered (e.g. to DEBUG), a
    handler still set to INFO silently drops the extra records.

    Args:
        level: Level name (DEBUG, INFO, WARNING, ERROR, CRITICAL),
            case-insensitive. Unknown names fall back to INFO.
    """
    level_map = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'critical': logging.CRITICAL
    }
    log_level = level_map.get(level.lower(), logging.INFO)

    # Snapshot the registry keys first; then include the root logger.
    loggers = [logging.getLogger(name) for name in list(logging.root.manager.loggerDict)]
    loggers.append(logging.getLogger())

    for logger in loggers:
        logger.setLevel(log_level)
        for handler in logger.handlers:
            handler.setLevel(log_level)

    print(f"所有日志记录器级别已设置为: {logging.getLevelName(log_level)}")
def extract_number(text: str) -> Optional[float]:
    """
    Pull the first number out of a string.

    Args:
        text: Source string.

    Returns:
        The first integer or decimal found (an immediately preceding minus
        sign is kept) as a float, or None if *text* is not a string or
        contains no digits.
    """
    if isinstance(text, str):
        found = re.search(r'-?\d+(\.\d+)?', text)
        if found is not None:
            return float(found.group())
    return None
def clean_barcode(barcode: Any) -> str:
    """
    Normalize a barcode value to a digits-only string.

    Args:
        barcode: Barcode as a string, int or float (other types are passed
            through ``str``).

    Returns:
        The barcode with a trailing ``.0`` style decimal part and every
        non-digit character removed.
    """
    if isinstance(barcode, (int, float)):
        # Render numbers without exponent or fractional digits.
        barcode = f"{barcode:.0f}"

    # Strip surrounding whitespace first: "123.0 " would otherwise miss the
    # end-anchored pattern below and come out as "1230".
    text = str(barcode).strip()
    text = re.sub(r'\.0+$', '', text)   # drop a trailing ".0" / ".00"
    return re.sub(r'\D', '', text)      # keep digits only
def parse_monetary_string(value: Any) -> Optional[float]:
    """
    Convert an amount/quantity value to a float.

    Numbers are returned as floats directly. Strings are stripped of
    everything except digits, dots, commas and minus signs (so currency
    symbols and unit suffixes disappear) before conversion.

    Comma handling:
        * exactly one comma and no dot: the comma is the decimal point
          ("1,5" -> 1.5)
        * any other commas: thousands separators, removed
          ("1,234,567" -> 1234567, "1,234.56" -> 1234.56)

    Args:
        value: Amount as a string, a number, or anything else.

    Returns:
        The parsed float, or None when the value cannot be interpreted.
    """
    if isinstance(value, (int, float)):
        return float(value)
    if not isinstance(value, str):
        # Covers None and every unsupported type.
        return None

    stripped = value.strip()
    placeholders = ('o', 'none', 'null', '-', '--')
    if stripped == "" or stripped.lower() in placeholders:
        return None

    numeric_text = re.sub(r'[^\d\.\-,]', '', stripped)
    if numeric_text in ('', '-', '.', '-.', ','):
        return None

    commas = numeric_text.count(',')
    if commas == 1 and '.' not in numeric_text:
        numeric_text = numeric_text.replace(',', '.')
    elif commas:
        numeric_text = numeric_text.replace(',', '')

    try:
        return float(numeric_text)
    except (ValueError, TypeError):
        return None
class OCRService:
    """
    Service layer in front of ``OCRProcessor``: turns images into Excel files.
    """

    def __init__(self, config: Optional[ConfigManager] = None):
        """
        Build the service and its OCR processor.

        Args:
            config: Configuration manager; a new one is created when omitted.
        """
        logger.info("初始化OCRService")
        self.config = config or ConfigManager()
        self.ocr_processor = OCRProcessor(self.config)
        logger.info("OCRService初始化完成")

    def get_unprocessed_images(self) -> List[str]:
        """
        Return the image paths the OCR processor reports as unprocessed.
        """
        return self.ocr_processor.get_unprocessed_images()

    def process_image(self, image_path: str) -> Optional[str]:
        """
        Run OCR on one image and produce its Excel file.

        An image whose Excel output already exists is not processed again;
        the existing path is returned.

        Args:
            image_path: Path of the image file.

        Returns:
            Path of the Excel file, or None on any failure.
        """
        try:
            if not os.path.exists(image_path):
                logger.error(f"文件不存在: {image_path}")
                return None

            if not self._is_valid_image(image_path):
                logger.error(f"不支持的文件类型: {image_path}")
                return None

            # Skip OCR when the output from an earlier run is still there.
            existing_excel = self._get_excel_path(image_path)
            if os.path.exists(existing_excel):
                logger.info(f"文件已处理过,跳过OCR识别: {image_path}")
                return existing_excel

            ocr_result = self.ocr_processor.process_image(image_path)
            if not ocr_result:
                logger.error(f"OCR识别失败: {image_path}")
                return None

            excel_file = self._generate_excel(ocr_result, image_path)
            if not excel_file:
                logger.error(f"生成Excel文件失败: {image_path}")
                return None

            logger.info(f"处理完成: {image_path} -> {excel_file}")
            return excel_file

        except Exception as e:
            logger.error(f"处理图片时发生错误: {e}", exc_info=True)
            return None

    def process_images_batch(self, batch_size: int = None, max_workers: int = None, progress_cb: Optional[Callable[[int], None]] = None) -> Tuple[int, int]:
        """
        Process pending images in bulk via the OCR processor.

        Args:
            batch_size: Batch size, forwarded unchanged.
            max_workers: Maximum worker count, forwarded unchanged.
            progress_cb: Optional progress callback, forwarded unchanged.

        Returns:
            (total processed, successfully processed) tuple.
        """
        logger.info(f"OCRService开始批量处理图片, batch_size={batch_size}, max_workers={max_workers}")
        return self.ocr_processor.process_images_batch(batch_size, max_workers, progress_cb)

    # Alias of process_images_batch kept for compatibility.
    def batch_process(self, batch_size: int = None, max_workers: int = None, progress_cb: Optional[Callable[[int], None]] = None) -> Tuple[int, int]:
        """
        Alias of ``process_images_batch`` with the same arguments and result.
        """
        logger.info("OCRService.batch_process被调用,转发到process_images_batch")
        return self.process_images_batch(batch_size, max_workers, progress_cb)

    def validate_image(self, image_path: str) -> bool:
        """
        Ask the OCR processor whether *image_path* is a valid image.
        """
        return self.ocr_processor.validate_image(image_path)

    def _is_valid_image(self, image_path: str) -> bool:
        """
        Internal shorthand for ``validate_image``.
        """
        return self.validate_image(image_path)

    def _get_excel_path(self, image_path: str) -> str:
        """
        Map an image path to its Excel output path.

        The result is ``<output_folder>/<image stem>.xlsx`` where the folder
        comes from the ``Paths.output_folder`` setting (default
        ``data/output``).
        """
        stem, _ext = os.path.splitext(os.path.basename(image_path))
        target_dir = self.config.get('Paths', 'output_folder', fallback='data/output')
        return os.path.join(target_dir, f"{stem}.xlsx")

    def _generate_excel(self, ocr_result: dict, image_path: str) -> Optional[str]:
        """
        Produce the Excel file for an OCR result.

        Args:
            ocr_result: Result returned by the OCR processor.
            image_path: Path of the source image.

        Returns:
            Path of the Excel file, or None on failure.
        """
        try:
            excel_path = self._get_excel_path(image_path)
            os.makedirs(os.path.dirname(excel_path), exist_ok=True)

            if hasattr(self.ocr_processor, 'generate_excel'):
                produced = bool(self.ocr_processor.generate_excel(ocr_result, excel_path))
            else:
                # No generator on the processor: succeed only if the file is
                # already on disk.
                produced = os.path.exists(excel_path)

            return excel_path if produced else None

        except Exception as e:
            logger.error(f"生成Excel文件时发生错误: {e}", exc_info=True)
            return None
def process_excel(self, file_path: Optional[str] = None, progress_cb: Optional[Callable[[int], None]] = None) -> Optional[str]:
    """
    Turn an Excel order file into a standard purchase order.

    The file is first passed to ``_check_special_preprocess``; when that
    returns a path, the preprocessed file is processed instead of the
    original.

    Args:
        file_path: Excel file to process; when falsy, the latest file
            reported by the Excel processor is used.
        progress_cb: Optional progress callback forwarded to the processor.

    Returns:
        Path of the generated purchase order, or None on failure.
    """
    if not file_path:
        file_path = self.excel_processor.get_latest_excel()
        if not file_path:
            logger.warning("未找到可处理的Excel文件")
            return None
        logger.info("OrderService开始处理最新Excel文件")
    else:
        logger.info(f"OrderService开始处理指定Excel文件: {file_path}")

    # _check_special_preprocess imports the supplier services it needs
    # lazily, so no service is constructed here. Building one up front was
    # dead code, and a failure there skipped detection for every supplier.
    try:
        preprocessed_path = self._check_special_preprocess(file_path)
        if preprocessed_path:
            logger.info(f"检测到特殊供应商,已生成预处理文件: {preprocessed_path}")
            file_path = preprocessed_path
    except Exception as e:
        logger.error(f"检查特殊预处理时出错: {e}")

    return self.excel_processor.process_specific_file(file_path, progress_cb=progress_cb)
识别:烟草公司 (Tobacco) + # 特征:内容中包含“专卖证号”或特定证号“510109104938” + is_tobacco = df_str.apply(lambda x: x.str.contains('专卖证号|510109104938')).any().any() + if is_tobacco: + logger.info("识别到烟草公司订单,执行专用预处理...") + from .tobacco_service import TobaccoService + tobacco_svc = TobaccoService(self.config) + return tobacco_svc.preprocess_tobacco_order(file_path) + + # 2. 识别:蓉城易购 (Rongcheng Yigou) + # 特征:内容中包含单号标识“RCDH” + is_rongcheng = df_str.apply(lambda x: x.str.contains('RCDH')).any().any() + if is_rongcheng: + logger.info("识别到蓉城易购订单,执行专用预处理...") + from .special_suppliers_service import SpecialSuppliersService + special_svc = SpecialSuppliersService(self.config) + return special_svc.preprocess_rongcheng_yigou(file_path) + + # 3. 识别:杨碧月 (Yang Biyue) + # 特征:经手人列包含“杨碧月” + handler_col = None + for col in df_head.columns: + # 在前50行中搜索“经手人”关键字 + if df_head[col].astype(str).str.contains('经手人').any(): + handler_col = col + break + + if handler_col is not None: + # 检查该列是否有“杨碧月” + if df_head[handler_col].astype(str).str.contains('杨碧月').any(): + logger.info("识别到杨碧月订单,执行专用预处理...") + from .special_suppliers_service import SpecialSuppliersService + special_svc = SpecialSuppliersService(self.config) + return special_svc.process_yang_biyue_only(file_path) + + except Exception as e: + logger.warning(f"智能预处理识别失败: {e}") + return None + + def get_purchase_orders(self) -> List[str]: + """ + 获取采购单文件列表 + + Returns: + 采购单文件路径列表 + """ + return self.order_merger.get_purchase_orders() + + def merge_purchase_orders(self, file_paths: List[str], progress_cb: Optional[Callable[[int], None]] = None) -> Optional[str]: + """ + 合并指定的采购单文件 + + Args: + file_paths: 采购单文件路径列表 + + Returns: + 合并后的采购单文件路径,如果合并失败则返回None + """ + logger.info(f"OrderService开始合并指定采购单: {file_paths}") + return self.merge_orders(file_paths, progress_cb) + + def merge_all_purchase_orders(self, progress_cb: Optional[Callable[[int], None]] = None) -> Optional[str]: + """ + 合并所有可用的采购单文件 + + Returns: + 合并后的采购单文件路径,如果合并失败则返回None + """ + 
def validate_unit_price(self, result_path: str) -> List[str]:
    """
    Compare purchase-order unit prices with the stored purchase prices.

    Args:
        result_path: Path of the purchase order to check.

    Returns:
        One message per row whose unit price differs from the stored price
        by more than 1.0; an empty list when there are no differences, the
        required columns are missing, or an error occurs.
    """
    try:
        from app.core.utils.file_utils import smart_read_excel
        from app.core.handlers.column_mapper import ColumnMapper as CM

        # Use the injected configuration, like the rest of this service,
        # instead of building a second ConfigManager.
        config = self.config
        template_folder = config.get('Paths', 'template_folder', fallback='templates')
        item_data = config.get('Templates', 'item_data', fallback='商品资料.xlsx')
        item_path = os.path.join(template_folder, item_data)
        product_db_path = config.get('Paths', 'product_db', fallback='data/product_cache.db')

        product_db = ProductDatabase(product_db_path, item_path)

        df_res = smart_read_excel(result_path)

        res_barcode_col = CM.find_column(list(df_res.columns), 'barcode')
        res_price_col = CM.find_column(list(df_res.columns), 'unit_price')

        if not res_barcode_col or not res_price_col:
            logger.warning("未能在采购单中找到条码或单价列")
            return []

        # Build the barcode strings once and reuse them for the lookup and
        # the comparison. Iterating rows with iterrows() can upcast integer
        # barcodes to floats ("123.0"), which would never match these keys.
        barcode_series = df_res[res_barcode_col].astype(str).str.strip()
        item_prices = product_db.get_prices(barcode_series.tolist())

        results = []
        for bc, raw_price in zip(barcode_series, df_res[res_price_col]):
            if bc not in item_prices:
                continue

            try:
                res_price = float(raw_price)
            except (ValueError, TypeError):
                continue

            item_price = item_prices[bc]
            diff = abs(res_price - item_price)
            if diff > 1.0:
                results.append(f"条码 {bc}: 采购单价={res_price} vs 进货价={item_price} 差异={diff:.2f}")

        return results

    except Exception as e:
        logger.error(f"单价校验过程中发生错误: {e}")
        return []
..core.processors.supplier_processors.generic_supplier_processor import GenericSupplierProcessor + processor = GenericSupplierProcessor(self.config, supplier_config) + self.processors.append(processor) + logger.info(f"加载供应商处理器: {processor.name}") + except Exception as e: + logger.error(f"加载供应商处理器失败: {e}") + + logger.info(f"成功加载{len(self.processors)}个处理器") + + except Exception as e: + logger.error(f"加载处理器时出错: {e}", exc_info=True) + self.processors = [ + TobaccoProcessor(self.config), + OCRProcessor(self.config), + ] + + def _validate_suppliers_config(self, data): + try: + suppliers = data.get('suppliers') + errors = [] + valid = [] + if not isinstance(suppliers, list) or not suppliers: + errors.append('suppliers必须是非空数组') + return False, errors, [] + for idx, s in enumerate(suppliers): + e = self._validate_single_supplier(s, idx) + if e: + errors.extend(e) + else: + valid.append(s) + return len(errors) == 0, errors, valid + except Exception as e: + return False, [f'配置解析异常: {e}'], [] + + def _validate_single_supplier(self, s, idx): + errs = [] + prefix = f'suppliers[{idx}]' + name = s.get('name') + if not name or not isinstance(name, str): + errs.append(f'{prefix}.name 必须为字符串') + fp = s.get('filename_patterns', []) + ci = s.get('content_indicators', []) + if not fp and not ci: + errs.append(f'{prefix} 必须至少提供 filename_patterns 或 content_indicators 之一') + cm = s.get('column_mapping', {}) + if cm and not isinstance(cm, dict): + errs.append(f'{prefix}.column_mapping 必须为对象') + cr = s.get('cleaning_rules', []) + if cr and not isinstance(cr, list): + errs.append(f'{prefix}.cleaning_rules 必须为数组') + else: + for i, rule in enumerate(cr): + rtype = rule.get('type') + if rtype not in ('remove_rows','fill_na','convert_type'): + errs.append(f'{prefix}.cleaning_rules[{i}].type 非法: {rtype}') + if rtype == 'remove_rows' and not rule.get('condition'): + errs.append(f'{prefix}.cleaning_rules[{i}].condition 必填') + if rtype in ('fill_na','convert_type'): + if not rule.get('columns') and 
not rule.get('column'): + errs.append(f'{prefix}.cleaning_rules[{i}] 需提供 columns 或 column') + calc = s.get('calculations', []) + if calc and not isinstance(calc, list): + errs.append(f'{prefix}.calculations 必须为数组') + else: + for i, c in enumerate(calc): + ctype = c.get('type') + if ctype not in ('multiply','divide','formula'): + errs.append(f'{prefix}.calculations[{i}].type 非法: {ctype}') + if ctype in ('multiply','divide'): + if not c.get('source_column') or not c.get('target_column'): + errs.append(f'{prefix}.calculations[{i}] 需提供 source_column 与 target_column') + if ctype == 'formula' and (not c.get('formula') or not c.get('target_column')): + errs.append(f'{prefix}.calculations[{i}] 需提供 formula 与 target_column') + return errs + + def process_file(self, input_file: Path, output_dir: Path, + preferred_processor: Optional[str] = None) -> Optional[Path]: + """处理文件 - 自动选择合适的处理器 + + Args: + input_file: 输入文件路径 + output_dir: 输出目录路径 + preferred_processor: 优先使用的处理器名称(可选) + + Returns: + 输出文件路径,处理失败返回None + """ + if not input_file.exists(): + logger.error(f"输入文件不存在: {input_file}") + return None + + if not output_dir.exists(): + output_dir.mkdir(parents=True, exist_ok=True) + + try: + # 如果指定了优先处理器,先尝试使用它 + if preferred_processor: + processor = self._get_processor_by_name(preferred_processor) + if processor and processor.can_process(input_file): + logger.info(f"使用指定的处理器: {processor.name}") + return processor.process(input_file, output_dir) + else: + logger.warning(f"指定的处理器不可用或无法处理该文件: {preferred_processor}") + + # 自动选择合适的处理器 + suitable_processors = [p for p in self.processors if p.can_process(input_file)] + + if not suitable_processors: + logger.warning(f"未找到适合处理文件的处理器: {input_file}") + logger.info(f"支持的文件类型: {self.get_supported_types()}") + return None + + # 使用第一个合适的处理器 + processor = suitable_processors[0] + logger.info(f"使用处理器 {processor.name} 处理文件: {input_file}") + + return processor.process(input_file, output_dir) + + except Exception as e: + logger.error(f"处理文件时出错: {e}", 
exc_info=True) + return None + + def _get_processor_by_name(self, name: str) -> Optional[BaseProcessor]: + """根据名称获取处理器 + + Args: + name: 处理器名称 + + Returns: + 处理器实例或None + """ + for processor in self.processors: + if processor.name == name or processor.__class__.__name__ == name: + return processor + return None + + def get_supported_types(self) -> List[Dict[str, Any]]: + """获取支持的文件类型信息 + + Returns: + 处理器类型信息列表 + """ + return [ + { + 'name': processor.name, + 'description': processor.description, + 'extensions': processor.get_supported_extensions(), + 'class_name': processor.__class__.__name__ + } + for processor in self.processors + ] + + def get_processor_info(self) -> List[Dict[str, Any]]: + """获取处理器详细信息 + + Returns: + 处理器详细信息列表 + """ + return [ + { + 'name': processor.name, + 'description': processor.description, + 'extensions': processor.get_supported_extensions(), + 'required_columns': processor.get_required_columns(), + 'class_name': processor.__class__.__name__, + 'module': processor.__class__.__module__ + } + for processor in self.processors + ] + + def can_process_file(self, file_path: Path) -> bool: + """检查是否有处理器能处理该文件 + + Args: + file_path: 文件路径 + + Returns: + 是否有处理器能处理 + """ + if not file_path.exists(): + return False + + return any(processor.can_process(file_path) for processor in self.processors) + + def get_suitable_processors(self, file_path: Path) -> List[BaseProcessor]: + """获取能处理该文件的所有处理器 + + Args: + file_path: 文件路径 + + Returns: + 合适的处理器列表 + """ + if not file_path.exists(): + return [] + + return [p for p in self.processors if p.can_process(file_path)] + + def reload_processors(self): + """重新加载处理器""" + logger.info("重新加载处理器...") + self.processors.clear() + self._load_processors() + logger.info(f"重新加载完成,共{len(self.processors)}个处理器") + + def add_processor(self, processor: BaseProcessor): + """添加处理器 + + Args: + processor: 处理器实例 + """ + self.processors.append(processor) + logger.info(f"添加处理器: {processor.name}") + + def remove_processor(self, 
processor_name: str) -> bool: + """移除处理器 + + Args: + processor_name: 处理器名称 + + Returns: + 是否成功移除 + """ + for i, processor in enumerate(self.processors): + if processor.name == processor_name or processor.__class__.__name__ == processor_name: + del self.processors[i] + logger.info(f"移除处理器: {processor_name}") + return True + logger.warning(f"未找到要移除的处理器: {processor_name}") + return False diff --git a/app/services/special_suppliers_service.py b/app/services/special_suppliers_service.py new file mode 100644 index 0000000..5cb7444 --- /dev/null +++ b/app/services/special_suppliers_service.py @@ -0,0 +1,227 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +import re +import time +import pandas as pd +from typing import Optional, Callable + +from ..core.utils.log_utils import get_logger + +logger = get_logger(__name__) + +class SpecialSuppliersService: + """ + 处理特殊供应商逻辑的服务类,如蓉城易购等 + """ + + def __init__(self, config_manager=None): + self.config_manager = config_manager + + def process_yang_biyue_only(self, src_path: str) -> Optional[str]: + """ + 仅执行杨碧月订单的预处理,返回预处理后的文件路径 + """ + try: + from app.core.utils.file_utils import smart_read_excel + # 读取原始数据 + df = smart_read_excel(src_path) + + # 检查是否包含“杨碧月” + handler_col = None + for col in df.columns: + if '经手人' in str(col): + handler_col = col + break + + if handler_col is None or not df[handler_col].astype(str).str.contains('杨碧月').any(): + return None + + # 识别到杨碧月订单,执行专用清洗 + logger.info("识别到杨碧月订单,正在执行专用清洗...") + + # 定义列映射关系 (映射到 ExcelProcessor 期望的中文列名) + # 使用精确匹配优先,防止“结算单位”匹配到“单位” + column_map = { + '商品条码': '商品条码', + '商品名称': '商品名称', + '商品规格': '规格', + '单位': '单位', + '数量': '数量', + '含税单价': '单价', + '含税金额': '金额' + } + + found_cols = {} + # 1. 第一遍:尝试精确匹配 + for target_zh, std_name in column_map.items(): + for col in df.columns: + if str(col).strip() == target_zh: + found_cols[col] = std_name + break + + # 2. 
def process_yang_biyue(self, src_path: str, progress_cb: Optional[Callable[[int, str], None]] = None) -> Optional[str]:
    """Preprocess a Yang Biyue order and feed it to the standard Excel conversion.

    Args:
        src_path: Path of the original order workbook.
        progress_cb: Optional callback invoked as ``progress_cb(percent, message)``.

    Returns:
        Whatever ``OrderService.process_excel`` returns for the preprocessed
        file, or None when preprocessing yields no file or any step raises.
    """
    def _notify(percent, message):
        # Progress reporting is optional; stay silent without a callback.
        if progress_cb:
            progress_cb(percent, message)

    def _forward(p):
        # Rescale the converter's progress value into the 60+ range
        # (presumably p is a 0-100 percentage - confirm against OrderService).
        if progress_cb:
            return progress_cb(60 + int(p * 0.4), "生成采购单中...")
        return None

    try:
        _notify(10, "正在进行杨碧月订单预处理...")
        cleaned_path = self.process_yang_biyue_only(src_path)
        if not cleaned_path:
            return None

        _notify(60, "预处理文件已保存,开始标准转换流程...")

        # Imported lazily to avoid a circular import with order_service.
        from app.services.order_service import OrderService
        service = OrderService(self.config_manager)
        return service.process_excel(cleaned_path, progress_cb=_forward)
    except Exception as e:
        logger.error(f"处理杨碧月订单出错: {e}")
        return None
2是表头,Row 3开始是数据 + df_raw = smart_read_excel(src_path, header=None) + + # 检查数据行数 + if len(df_raw) <= 3: + logger.error("蓉城易购文件数据行数不足") + return None + + # 提取数据部分 (Row 3开始) + df_data = df_raw.iloc[3:].reset_index(drop=True) + + # 用户指定列映射: + # E列 (Index 4) -> 商品条码 + # N列 (Index 13) -> 数量 + # Q列 (Index 16) -> 单价 + # S列 (Index 18) -> 金额 + # C列 (Index 2) -> 商品名称 (通用需求) + + idx_map = { + 2: '商品名称', + 4: '商品条码', + 13: '数量', + 16: '单价', + 18: '金额' + } + + # 确保列索引不越界 + available_indices = [i for i in idx_map.keys() if i < df_data.shape[1]] + df2 = df_data.iloc[:, available_indices].copy() + df2.columns = [idx_map[i] for i in available_indices] + + # 强制转换类型 + for c in ['数量', '单价', '金额']: + if c in df2.columns: + df2[c] = pd.to_numeric(df2[c], errors='coerce').fillna(0) + + # 过滤掉空的条码行 + df2 = df2.dropna(subset=['商品条码']) + df2['商品条码'] = df2['商品条码'].astype(str).str.strip() + df2 = df2[df2['商品条码'] != ''] + + # 核心逻辑:分裂多条码行并均分数量 + if '商品条码' in df2.columns and '数量' in df2.columns: + rows = [] + for _, row in df2.iterrows(): + bc_val = str(row.get('商品条码', '')).strip() + # 识别分隔符:/ , , 、 + if any(sep in bc_val for sep in ['/', ',', ',', '、']): + parts = re.split(r'[/,,、]+', bc_val) + parts = [p.strip() for p in parts if p.strip()] + + if len(parts) >= 2: + q_total = float(row.get('数量', 0) or 0) + if q_total > 0: + n = len(parts) + base_qty = int(q_total // n) + remainder = int(q_total % n) + + for i, p_bc in enumerate(parts): + new_row = row.copy() + new_row['商品条码'] = p_bc + current_qty = base_qty + (1 if i < remainder else 0) + new_row['数量'] = current_qty + if '单价' in new_row: + try: + up = float(new_row['单价'] or 0) + new_row['金额'] = up * current_qty + except Exception: + pass + rows.append(new_row) + continue + rows.append(row) + df2 = pd.DataFrame(rows) + + # 保存预处理文件 + out_dir = os.path.dirname(src_path) + base = os.path.basename(src_path) + final_path = os.path.join(out_dir, f"预处理之后_{base}") + df2.to_excel(final_path, index=False) + + if progress_cb: progress_cb(100, "蓉城易购预处理完成") + 
return final_path + + except Exception as e: + logger.error(f"预处理蓉城易购订单出错: {e}") + return None + + def process_rongcheng_yigou(self, src_path: str, progress_cb: Optional[Callable[[int, str], None]] = None) -> Optional[str]: + """ + 兼容性方法:处理蓉城易购订单并执行后续转换 + """ + cleaned_path = self.preprocess_rongcheng_yigou(src_path, progress_cb) + if cleaned_path: + from app.services.order_service import OrderService + order_service = OrderService(self.config_manager) + return order_service.process_excel(cleaned_path, progress_cb=lambda p: progress_cb(60 + int(p*0.4), "生成采购单中...") if progress_cb else None) + return None diff --git a/app/services/tobacco_service.py b/app/services/tobacco_service.py new file mode 100644 index 0000000..8564079 --- /dev/null +++ b/app/services/tobacco_service.py @@ -0,0 +1,336 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +烟草公司订单处理服务 +---------------- +处理烟草公司特定格式的订单明细文件,生成银豹采购单 +""" + +import os +import glob +import datetime +import pandas as pd +import xlrd +import xlwt +import re +from xlutils.copy import copy +from openpyxl import load_workbook +from typing import Optional, Dict, Any, List, Tuple +from app.core.utils.log_utils import get_logger +from app.core.utils.string_utils import parse_monetary_string +from app.core.utils.dialog_utils import show_custom_dialog # 导入自定义弹窗工具 +from ..config.settings import ConfigManager + +logger = get_logger(__name__) + +class TobaccoService: + """烟草公司订单处理服务""" + + def __init__(self, config: Dict[str, Any]): + """ + 初始化服务 + + Args: + config: 配置信息 + """ + self.config = config + # 修复配置获取方式,使用fallback机制 + self.output_dir = config.get('Paths', 'output_folder', fallback='data/output') + self.template_file = config.get('Paths', 'template_file', fallback='templates/银豹-采购单模板.xls') + # 将烟草订单保存到result目录 + result_dir = "data/result" + os.makedirs(result_dir, exist_ok=True) + self.output_file = os.path.join(result_dir, '银豹采购单_烟草公司.xls') + + def get_latest_tobacco_order(self) -> Optional[str]: + """ + 
获取最新的烟草订单明细文件 + + Returns: + 文件路径或None + """ + # 获取今日开始时间戳 + today = datetime.date.today() + today_start = datetime.datetime.combine(today, datetime.time.min).timestamp() + + # 查找订单明细文件 + file_pattern = os.path.join(self.output_dir, "订单明细*.xlsx") + candidates = glob.glob(file_pattern) + + if not candidates: + logger.warning("未找到烟草公司订单明细文件") + return None + + # 按创建时间排序 + candidates.sort(key=os.path.getctime, reverse=True) + latest_file = candidates[0] + + # 检查是否是今天的文件 + if os.path.getctime(latest_file) >= today_start: + logger.info(f"找到最新烟草订单明细文件: {latest_file}") + return latest_file + else: + logger.warning(f"找到的烟草订单明细文件不是今天创建的: {latest_file}") + return latest_file # 仍然返回最新文件,但给出警告 + + def preprocess_tobacco_order(self, file_path: str) -> Optional[str]: + """ + 烟草订单预处理:按用户提供的 B, E, G, H 列索引进行强制清洗 + """ + try: + logger.info(f"执行烟草订单专用预处理: {file_path}") + from app.core.utils.file_utils import smart_read_excel + + # 烟草格式:Row 0是专卖证号,Row 1是表头,Row 2是合计,Row 3开始是数据 + df_raw = smart_read_excel(file_path, header=None) + + if len(df_raw) <= 3: + logger.error("烟草订单文件数据行数不足") + return None + + # 提取数据部分 (Row 3开始) + df_data = df_raw.iloc[3:].reset_index(drop=True) + + # 用户指定列映射: + # A列 (Index 0) -> 商品名称 + # B列 (Index 1) -> 商品条码 (盒码) + # E列 (Index 4) -> 批发价 (单价) + # G列 (Index 6) -> 订单量 (数量) + # H列 (Index 7) -> 金额 + + idx_map = { + 0: '商品名称', + 1: '商品条码', + 4: '批发价', + 6: '数量', + 7: '金额' + } + + available_indices = [i for i in idx_map.keys() if i < df_data.shape[1]] + df = df_data.iloc[:, available_indices].copy() + df.columns = [idx_map[i] for i in available_indices] + + # 1. 过滤订单量不为0的数据 + df['数量'] = pd.to_numeric(df['数量'], errors='coerce').fillna(0) + df = df[df['数量'] != 0].copy() + + if df.empty: + logger.warning("烟草订单无有效订单量记录") + return None + + # 2. 核心清洗逻辑: + # 数量 = 订单量 * 10 (G列) + # 单价 = 批发价 / 10 (E列) + df['单价'] = pd.to_numeric(df['批发价'], errors='coerce').fillna(0) / 10 + df['数量'] = df['数量'] * 10 + + # 3. 
校验金额 (H列) + df['金额'] = pd.to_numeric(df['金额'], errors='coerce').fillna(0) + + # 4. 只保留需要的列 + final_cols = ['商品条码', '商品名称', '数量', '单价', '金额'] + df_final = df[final_cols].copy() + + # 保存预处理文件 + out_dir = os.path.dirname(file_path) + base = os.path.basename(file_path) + final_path = os.path.join(out_dir, f"预处理之后_{base}") + df_final.to_excel(final_path, index=False) + + logger.info(f"烟草订单预处理完成: {final_path}") + return final_path + + except Exception as e: + logger.error(f"烟草订单预处理失败: {e}") + return None + + def process_tobacco_order(self, input_file=None): + """ + 处理烟草订单 + + Args: + input_file: 输入文件路径,如果为None则自动查找最新文件 + + Returns: + 输出文件路径或None(如果处理失败) + """ + try: + # 如果没有指定输入文件,查找最新的文件 + if input_file is None: + input_file = self.get_latest_tobacco_order() + + if input_file is None: + logger.warning("未找到烟草公司订单明细文件") + logger.error("未找到可处理的烟草订单明细文件") + return None + + logger.info(f"开始处理烟草公司订单: {input_file}") + + # 读取订单时间和总金额 + order_info = self._read_order_info(input_file) + if not order_info: + logger.error(f"读取订单信息失败: {input_file}") + return None + + order_time, total_amount = order_info + + # 读取订单数据 + order_data = self._read_order_data(input_file) + if order_data is None or order_data.empty: + logger.error(f"读取订单数据失败: {input_file}") + return None + + # 生成银豹采购单 + output_file = self._generate_pospal_order(order_data, order_time) + if not output_file: + logger.error("生成银豹采购单失败") + return None + + # 获取处理条目数 + total_count = len(order_data) + + # 输出处理结果 + logger.info(f"烟草公司订单处理成功,订单时间: {order_time}, 总金额: {total_amount}, 处理条目: {total_count}") + logger.info(f"采购单已生成: {output_file}") + + # 显示处理结果对话框 + self.show_result_dialog(output_file, order_time, total_count, total_amount) + + return output_file + + except Exception as e: + logger.error(f"处理烟草公司订单时发生错误: {e}", exc_info=True) + return None + + def _read_order_info(self, file_path: str) -> Optional[Tuple[str, float]]: + """ + 读取订单信息(时间和总金额) + + Args: + file_path: 文件路径 + + Returns: + 包含订单时间和总金额的元组或None + """ + try: + wb_info 
def _read_order_data(self, file_path: str) -> Optional[pd.DataFrame]:
    """Read the order rows of a tobacco order workbook.

    The first three rows are skipped and the remaining columns are named
    positionally. Only rows with a non-zero order quantity are kept, and
    two derived columns are added: ``采购量`` (order quantity * 10) and
    ``采购单价`` (amount / ``采购量``).

    ``订单量`` and ``金额`` are coerced to numbers first, with blanks and
    non-numeric cells becoming 0. Without this, empty rows (NaN != 0 is
    true) pass the filter and later fail when the quantity is converted
    to ``int``. The same coercion is used by ``preprocess_tobacco_order``.

    Args:
        file_path: Path of the order workbook.

    Returns:
        The filtered DataFrame with a fresh 0-based index, or None when
        reading or processing fails (the error is logged).
    """
    columns = ['商品', '盒码', '条码', '建议零售价', '批发价', '需求量', '订单量', '金额']

    try:
        from app.core.utils.file_utils import smart_read_excel
        df_old = smart_read_excel(file_path, header=None, skiprows=3, names=columns)

        # Normalise the numeric columns used below; bad cells count as 0.
        for col in ('订单量', '金额'):
            df_old[col] = pd.to_numeric(df_old[col], errors='coerce').fillna(0)

        # Keep ordered items only, then derive purchase quantity and price.
        df_filtered = df_old[df_old['订单量'] != 0].copy()
        df_filtered['采购量'] = df_filtered['订单量'] * 10
        df_filtered['采购单价'] = df_filtered['金额'] / df_filtered['采购量']
        # The sheet writer uses the row index as the output row offset.
        df_filtered = df_filtered.reset_index(drop=True)

        return df_filtered
    except Exception as e:
        logger.error(f"读取订单数据失败: {e}")
        return None
def show_result_dialog(self, output_file, order_time, total_count, total_amount):
    """Show the result dialog for a processed tobacco order and log the outcome.

    Args:
        output_file: Path of the generated purchase order.
        order_time: Order time text shown in the dialog.
        total_count: Number of processed items.
        total_amount: Order total; passed through ``parse_monetary_string``
            and treated as 0.0 when that returns None.
    """
    processed_at = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    extra = {
        "订单来源": "烟草公司",
        "处理时间": processed_at,
    }

    # Normalise the amount to a number before formatting it.
    amount = parse_monetary_string(total_amount)
    if amount is None:
        amount = 0.0
    total_amount = amount

    show_custom_dialog(
        title="烟草订单处理结果",
        message="烟草订单处理完成",
        result_file=output_file,
        time_info=order_time,
        count_info=f"{total_count}个商品",
        amount_info=f"¥{total_amount:.2f}",
        additional_info=extra,
    )

    logger.info(f"烟草公司订单处理成功,订单时间: {order_time}, 总金额: {total_amount}, 处理条目: {total_count}")
def _ask_and_merge_purchase_orders(order_service, log_widget, add_to_recent=False):
    """Ask the user whether to merge the purchase orders and merge on "yes".

    With zero or one purchase order nothing is asked. Otherwise a
    yes/no/cancel dialog lists the files, and only "yes" triggers
    ``order_service.merge_all_purchase_orders()``.

    Args:
        order_service: Service providing ``get_purchase_orders`` and
            ``merge_all_purchase_orders``.
        log_widget: Widget that receives the progress messages.
        add_to_recent: When true, record a successful merge result in the
            recent-files list (failures there are only logged at debug level).

    Returns:
        The merge result on success, otherwise None. Any exception is
        reported to the log widget and also yields None.
    """
    try:
        orders = order_service.get_purchase_orders()

        if len(orders) == 0:
            add_to_log(log_widget, "没有找到采购单文件,跳过合并步骤\n", "info")
            return None
        if len(orders) == 1:
            add_to_log(log_widget, f"只有1个采购单文件,无需合并: {os.path.basename(orders[0])}\n", "info")
            return None

        add_to_log(log_widget, f"找到{len(orders)}个采购单文件\n", "info")

        file_list = "\n".join(f"• {os.path.basename(path)}" for path in orders)
        answer = messagebox.askyesnocancel(
            "采购单合并选择",
            f"发现{len(orders)}个采购单文件:\n\n{file_list}\n\n是否需要合并这些采购单?\n\n• 选择'是':合并所有采购单\n• 选择'否':保持文件分离\n• 选择'取消':跳过此步骤",
            icon='question'
        )

        if answer is False:
            add_to_log(log_widget, "用户选择不合并采购单,保持文件分离\n", "info")
            return None
        if answer is not True:
            # Anything that is neither yes nor no is treated as "cancel".
            add_to_log(log_widget, "用户取消合并操作\n", "info")
            return None

        add_to_log(log_widget, "开始合并采购单...\n", "info")
        merged = order_service.merge_all_purchase_orders()
        if not merged:
            add_to_log(log_widget, "合并失败\n", "warning")
            return None

        add_to_log(log_widget, "采购单合并完成\n", "success")
        if add_to_recent:
            try:
                add_recent_file(merged)
            except Exception as e:
                logger.debug(f"添加最近文件失败: {e}")
        return merged
    except Exception as e:
        add_to_log(log_widget, f"合并过程出现问题: {str(e)}\n", "warning")
    return None
def process_single_image_with_status(log_widget, status_bar):
    """Let the user pick one image, OCR it on a worker thread and preview the result.

    The file is chosen on the calling thread; the OCR work runs in a daemon
    thread. While it runs, root-logger output is mirrored into ``log_widget``
    through a ``GUILogHandler``. Console handlers are detached for that time
    and re-attached afterwards. File handlers are left alone: they subclass
    ``logging.StreamHandler``, so a plain ``isinstance`` check would remove
    them as well.

    Args:
        log_widget: Widget that receives log output.
        status_bar: Status bar providing ``set_status`` and ``set_running``.
    """
    status_bar.set_status("选择图片中...")
    file_path = select_file(log_widget, [("图片文件", "*.jpg *.jpeg *.png *.bmp"), ("所有文件", "*.*")], "选择图片")
    if not file_path:
        status_bar.set_status("操作已取消")
        add_to_log(log_widget, "未选择文件,操作已取消\n", "warning")
        return

    def run_in_thread():
        root_logger = logging.getLogger()
        detached_handlers = []  # console handlers to put back when done
        try:
            status_bar.set_running(True)
            status_bar.set_status("开始处理图片...")

            gui_handler = GUILogHandler(log_widget)
            gui_handler.setLevel(logging.INFO)
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            gui_handler.setFormatter(formatter)

            for handler in root_logger.handlers[:]:
                # Detach console-style handlers only; keep file logging alive.
                if isinstance(handler, logging.StreamHandler) and not isinstance(handler, logging.FileHandler):
                    root_logger.removeHandler(handler)
                    detached_handlers.append(handler)
            root_logger.addHandler(gui_handler)
            root_logger.setLevel(logging.INFO)

            ocr_service = OCRService()
            add_to_log(log_widget, f"开始处理图片: {file_path}\n", "info")
            try:
                add_recent_file(file_path)
            except Exception as e:
                logger.debug(f"添加最近文件失败: {e}")
            excel_path = ocr_service.process_image(file_path)

            if excel_path:
                add_to_log(log_widget, "图片OCR处理完成\n", "success")
                preview_output = f"采购单已保存到: {excel_path}\n"
                show_excel_result_preview(preview_output)
                try:
                    add_recent_file(excel_path)
                except Exception as e:
                    logger.debug(f"添加最近文件失败: {e}")
            else:
                add_to_log(log_widget, "图片OCR处理失败\n", "error")

        except Exception as e:
            add_to_log(log_widget, f"处理单个图片时出错: {str(e)}\n", "error")
            sugg = get_error_suggestion(str(e))
            if sugg:
                show_error_dialog("OCR处理错误", str(e), sugg)
        finally:
            try:
                for handler in root_logger.handlers[:]:
                    if isinstance(handler, GUILogHandler):
                        root_logger.removeHandler(handler)
                        handler.close()
                # Re-attach what was detached above so logging survives this task.
                for handler in detached_handlers:
                    if handler not in root_logger.handlers:
                        root_logger.addHandler(handler)
            except Exception as e:
                logger.debug(f"清理日志处理器失败: {e}")
            status_bar.set_running(False)
            status_bar.set_status("就绪")

    thread = Thread(target=run_in_thread)
    thread.daemon = True
    thread.start()
run_pipeline_directly(log_widget, status_bar): + """直接运行完整处理流程""" + if get_running_task() is not None: + messagebox.showinfo("任务进行中", "请等待当前任务完成后再执行新的操作。") + return + + def run_in_thread(): + set_running_task("pipeline") + + if status_bar: + status_bar.set_running(True) + status_bar.set_status("开始完整处理流程...") + + start_time = datetime.datetime.now() + start_perf = time.perf_counter() + log_widget.configure(state=tk.NORMAL) + log_widget.delete(1.0, tk.END) + log_widget.insert(tk.END, "执行命令: 完整处理流程\n", "command") + log_widget.insert(tk.END, f"开始时间: {start_time.strftime('%Y-%m-%d %H:%M:%S')}\n", "time") + log_widget.insert(tk.END, "=" * 50 + "\n\n", "separator") + log_widget.configure(state=tk.DISABLED) + + try: + config = ConfigManager() + + gui_handler = init_gui_logger(log_widget) + + ocr_service = OCRService(config) + order_service = OrderService(config) + + reporter = ProgressReporter(status_bar) + reporter.running() + reporter.set("开始OCR批量处理...", 10) + + total, success = ocr_service.batch_process(progress_cb=lambda p: reporter.set("OCR处理中...", p)) + if total == 0: + add_to_log(log_widget, "没有找到需要处理的图片\n", "warning") + if status_bar: + status_bar.set_status("未找到图片文件") + return + elif success == 0: + add_to_log(log_widget, "OCR处理没有成功处理任何新文件\n", "warning") + else: + add_to_log(log_widget, f"OCR处理完成,共处理 {success}/{total} 个文件\n", "success") + try: + processed_map = {} + config = ConfigManager() + pjson = config.get('Paths', 'processed_record', fallback='data/processed_files.json') + if os.path.exists(pjson): + with open(pjson, 'r', encoding='utf-8') as f: + processed_map = json.load(f) + outputs = list(processed_map.values()) + for p in outputs[-10:]: + if p: + add_recent_file(os.path.abspath(p)) + except Exception as e: + logger.debug(f"加载已处理文件记录失败: {e}") + reporter.set("开始Excel处理...", 92) + + add_to_log(log_widget, "开始Excel处理...\n", "info") + result = order_service.process_excel() + + if not result: + add_to_log(log_widget, "Excel处理失败\n", "error") + else: + 
def batch_ocr_with_status(log_widget, status_bar):
    """Run batch OCR on a daemon thread and report the outcome in the GUI.

    ``OCRService.batch_process()`` is unpacked as ``(total, success)`` by the
    pipeline handler in this module. A two-item result is therefore judged by
    its counts rather than by truthiness, because a non-empty tuple is always
    truthy, even ``(0, 0)``. Any other return value falls back to truthiness.

    On success the last ten recorded output and input paths from the
    processed-files record are added to the recent-files list.

    Args:
        log_widget: Widget that receives log output.
        status_bar: Status bar handed to ``ProgressReporter``.
    """
    def run_in_thread():
        reporter = None  # stays None if ProgressReporter itself fails
        try:
            reporter = ProgressReporter(status_bar)
            reporter.running()
            reporter.set("正在进行OCR批量识别...", 10)
            add_to_log(log_widget, "开始OCR批量识别\n", "info")

            init_gui_logger(log_widget)

            ocr_service = OCRService()

            result = ocr_service.batch_process()

            if isinstance(result, (tuple, list)) and len(result) == 2:
                total, success = result
                if total == 0:
                    add_to_log(log_widget, "没有找到需要处理的图片\n", "warning")
                    succeeded = False
                elif success == 0:
                    add_to_log(log_widget, "OCR处理没有成功处理任何新文件\n", "warning")
                    succeeded = False
                else:
                    succeeded = True
            else:
                succeeded = bool(result)
                if not succeeded:
                    add_to_log(log_widget, "OCR批量识别失败\n", "error")

            if succeeded:
                add_to_log(log_widget, "OCR批量识别完成\n", "success")
                show_ocr_result_preview("OCR批量识别成功完成")
                reporter.set("批量识别完成", 100)
                try:
                    processed_map = {}
                    config = ConfigManager()
                    pjson = config.get('Paths', 'processed_record', fallback='data/processed_files.json')
                    if os.path.exists(pjson):
                        with open(pjson, 'r', encoding='utf-8') as f:
                            processed_map = json.load(f)
                    # Values are recorded outputs, keys the corresponding inputs.
                    for p in list(processed_map.values())[-10:]:
                        if p:
                            add_recent_file(p)
                    for p in list(processed_map.keys())[-10:]:
                        if p:
                            add_recent_file(p)
                except Exception as e:
                    logger.debug(f"加载已处理文件记录失败: {e}")

        except Exception as e:
            add_to_log(log_widget, f"OCR批量识别出错: {str(e)}\n", "error")
            sugg = get_error_suggestion(str(e))
            if sugg:
                show_error_dialog("OCR处理错误", str(e), sugg)
        finally:
            dispose_gui_logger()
            if reporter is not None:
                reporter.done()

    thread = Thread(target=run_in_thread)
    thread.daemon = True
    thread.start()
def process_excel_file_with_status(log_widget, status_bar):
    """Let the user pick an Excel file and convert it on a daemon thread.

    When no file is chosen the task ends immediately. Otherwise the file is
    recorded as recent, processed with ``OrderService.process_excel``, the
    result is previewed, and unit prices are validated. The status bar is
    reset in ``finally`` so it cannot stay in the running state, including
    on the cancel path.

    Args:
        log_widget: Widget that receives log output.
        status_bar: Status bar providing ``set_status`` and ``set_running``.
    """
    def run_in_thread():
        try:
            status_bar.set_running(True)
            status_bar.set_status("选择Excel文件中...")
            file_path = select_excel_file(log_widget)

            if not file_path:
                status_bar.set_status("操作已取消")
                add_to_log(log_widget, "未选择文件,操作已取消\n", "warning")
                return

            status_bar.set_status("开始处理Excel文件...")
            add_to_log(log_widget, f"开始处理Excel文件: {file_path}\n", "info")

            init_gui_logger(log_widget)

            order_service = OrderService()

            # file_path is always set here, so no "latest file" fallback is needed.
            try:
                add_recent_file(file_path)
            except Exception as e:
                logger.debug(f"添加最近文件失败: {e}")
            result = order_service.process_excel(file_path, progress_cb=lambda p: status_bar.set_status("Excel处理中...", p))

            if result:
                add_to_log(log_widget, "Excel文件处理完成\n", "success")
                show_excel_result_preview(f"采购单已保存到: {result}\n")
                try:
                    add_recent_file(result)
                except Exception as e:
                    logger.debug(f"添加最近文件失败: {e}")
                try:
                    validate_unit_price_against_item_data(result, log_widget)
                except Exception as e:
                    logger.debug(f"单价校验失败: {e}")
            else:
                add_to_log(log_widget, "Excel文件处理失败\n", "error")

        except Exception as e:
            add_to_log(log_widget, f"Excel文件处理出错: {str(e)}\n", "error")
            msg = str(e)
            # Point at the missing Excel engine when the message names one.
            suggestion = None
            if 'openpyxl' in msg or 'engine' in msg:
                suggestion = "安装依赖:pip install openpyxl"
            elif 'xlrd' in msg:
                suggestion = "安装依赖:pip install xlrd"
            if suggestion:
                show_error_dialog("Excel处理错误", msg, suggestion)
        finally:
            dispose_gui_logger()
            status_bar.set_running(False)
            status_bar.set_status("就绪")

    thread = Thread(target=run_in_thread)
    thread.daemon = True
    thread.start()
result = order_service.process_excel(excel_path, progress_cb=lambda p: reporter.set("Excel处理中...", p)) + if not result: + add_to_log(log_widget, "Excel处理失败\n", "error") + return + add_to_log(log_widget, f"Excel处理完成: {result}\n", "success") + try: + add_recent_file(result) + except Exception as e: + logger.debug(f"添加最近文件失败: {e}") + try: + validate_unit_price_against_item_data(result, log_widget) + except Exception as e: + logger.debug(f"单价校验失败: {e}") + + # 步骤3: 合并采购单 + reporter.set("检查合并采购单...", 80) + _ask_and_merge_purchase_orders(order_service, log_widget, add_to_recent=True) + + reporter.set("处理完成", 100) + add_to_log(log_widget, "一键处理完成!\n", "success") + finally: + dispose_gui_logger() + reporter.done() + t = Thread(target=_run_img) + t.daemon = True + t.start() + elif ext in ['.xlsx', '.xls']: + def _run_xls(): + try: + reporter = ProgressReporter(status_bar) + reporter.running() + init_gui_logger(log_widget) + order_service = OrderService() + add_to_log(log_widget, f"开始一键处理Excel文件: {file_path}\n", "info") + try: + add_recent_file(file_path) + except Exception as e: + logger.debug(f"添加最近文件失败: {e}") + + # 步骤1: Excel处理 + reporter.set("Excel处理中...", 20) + result = order_service.process_excel(file_path, progress_cb=lambda p: reporter.set("Excel处理中...", p)) + if not result: + add_to_log(log_widget, "Excel文件处理失败\n", "error") + return + add_to_log(log_widget, f"Excel处理完成: {result}\n", "success") + try: + add_recent_file(result) + except Exception as e: + logger.debug(f"添加最近文件失败: {e}") + try: + validate_unit_price_against_item_data(result, log_widget) + except Exception as e: + logger.debug(f"单价校验失败: {e}") + + # 步骤2: 合并采购单 + reporter.set("检查合并采购单...", 80) + _ask_and_merge_purchase_orders(order_service, log_widget, add_to_recent=True) + + reporter.set("处理完成", 100) + add_to_log(log_widget, "一键处理完成!\n", "success") + finally: + dispose_gui_logger() + reporter.done() + t = Thread(target=_run_xls) + t.daemon = True + t.start() + else: + add_to_log(log_widget, f"不支持的文件类型: 
def run_command_with_logging(command, log_widget, status_bar=None, on_complete=None):
    """Run *command* as a subprocess and mirror its output into the log widget.

    Only one command runs at a time: if ``_RUNNING_TASK`` is already set an
    information dialog is shown and nothing is started. Otherwise a worker
    thread launches the subprocess, redirects ``sys.stdout``/``sys.stderr``
    through ``LogRedirector`` while it runs, and reports the outcome through
    Tk callbacks scheduled with ``log_widget.after``.

    Args:
        command: Argument list handed to ``subprocess.Popen``.
        log_widget: Tk text widget that receives the output.
        status_bar: Optional object exposing ``set_running`` / ``set_status``.
        on_complete: Optional callback invoked as
            ``on_complete(returncode, output_text)`` instead of the default
            success / failure handling.
    """
    if _RUNNING_TASK is not None:
        messagebox.showinfo("任务进行中", "请等待当前任务完成后再执行新的操作。")
        return

    def run_in_thread():
        global _RUNNING_TASK
        _RUNNING_TASK = command

        old_stdout = sys.stdout
        old_stderr = sys.stderr

        # Everything after claiming the task slot runs inside try/finally so
        # a failure during setup can never leave _RUNNING_TASK set.
        try:
            if status_bar:
                status_bar.set_running(True)

            start_time = datetime.datetime.now()
            start_perf = time.perf_counter()
            log_widget.configure(state=tk.NORMAL)
            log_widget.delete(1.0, tk.END)
            log_widget.insert(tk.END, f"执行命令: {' '.join(command)}\n", "command")
            log_widget.insert(tk.END, f"开始时间: {start_time.strftime('%Y-%m-%d %H:%M:%S')}\n", "time")
            log_widget.insert(tk.END, "=" * 50 + "\n\n", "separator")
            log_widget.configure(state=tk.DISABLED)

            log_redirector = LogRedirector(log_widget)

            # Pass the working directories to the child through the environment,
            # falling back to the default layout when the config is unavailable.
            env = os.environ.copy()
            try:
                from app.config.settings import ConfigManager
                cfg = ConfigManager()
                env["OCR_OUTPUT_DIR"] = cfg.get_path('Paths', 'output_folder', fallback='data/output', create=True)
                env["OCR_INPUT_DIR"] = cfg.get_path('Paths', 'input_folder', fallback='data/input', create=True)
                env["OCR_TEMP_DIR"] = cfg.get_path('Paths', 'temp_folder', fallback='data/temp', create=True)
            except Exception:
                env["OCR_OUTPUT_DIR"] = os.path.abspath("data/output")
                env["OCR_INPUT_DIR"] = os.path.abspath("data/input")
                env["OCR_TEMP_DIR"] = os.path.abspath("data/temp")
            env["OCR_LOG_LEVEL"] = "DEBUG"

            sys.stdout = log_redirector
            sys.stderr = log_redirector

            print("日志重定向已启动,现在同时输出到终端和GUI")

            output_data = []
            # The context manager closes the pipe and waits for the child,
            # so the file descriptor is released on errors too.
            with subprocess.Popen(
                command,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                bufsize=1,
                env=env,
            ) as process:
                for line in process.stdout:
                    output_data.append(line)
                    print(line.rstrip())

                    if status_bar:
                        progress = extract_progress_from_log(line)
                        if progress is not None:
                            log_widget.after(0, lambda p=progress: status_bar.set_status(f"处理中: {p}%完成", p))

            returncode = process.returncode
            end_time = datetime.datetime.now()
            duration_sec = max(0.0, time.perf_counter() - start_perf)

            print(f"\n{'=' * 50}")
            print(f"执行完毕!返回码: {returncode}")
            print(f"结束时间: {end_time.strftime('%Y-%m-%d %H:%M:%S')}")
            print(f"耗时: {duration_sec:.2f} 秒")

            output_text = ''.join(output_data)

            is_pipeline = "pipeline" in command
            no_merge_files = "未找到采购单文件" in output_text
            single_file = "只有1个采购单文件" in output_text

            if is_pipeline and (no_merge_files or single_file):
                # A pipeline run with nothing to merge still counts as success.
                print("完整流程中没有需要合并的文件,但其他步骤执行成功,视为成功完成")
                if status_bar:
                    log_widget.after(0, lambda: status_bar.set_status("处理完成", 100))
                log_widget.after(0, lambda: show_result_preview(command, output_text))
            elif on_complete:
                log_widget.after(0, lambda: on_complete(returncode, output_text))
            elif returncode == 0:
                if status_bar:
                    log_widget.after(0, lambda: status_bar.set_status("处理完成", 100))
                log_widget.after(0, lambda: show_result_preview(command, output_text))
            else:
                if status_bar:
                    log_widget.after(0, lambda: status_bar.set_status(f"处理失败 (返回码: {returncode})", 0))
                log_widget.after(0, lambda: messagebox.showerror("操作失败", f"处理失败,返回码:{returncode}"))

        except Exception as e:
            # Capture the text now: ``e`` is unbound when this clause ends,
            # but the callbacks below only run later on the Tk main loop.
            error_text = str(e)
            print(f"\n执行出错: {error_text}")
            if status_bar:
                log_widget.after(0, lambda: status_bar.set_status(f"执行出错: {error_text}", 0))
            log_widget.after(0, lambda: messagebox.showerror("执行错误", f"执行命令时出错: {error_text}"))
        finally:
            sys.stdout = old_stdout
            sys.stderr = old_stderr

            _RUNNING_TASK = None
            if status_bar:
                log_widget.after(0, lambda: status_bar.set_running(False))

    Thread(target=run_in_thread).start()


def extract_progress_from_log(log_line):
    """Extract a progress percentage from one line of command output.

    Recognises ``处理批次 <current>/<total>`` first and any ``<digits>%``
    otherwise.

    Args:
        log_line: A single line of text.

    Returns:
        An int clamped to ``0..100``, or ``None`` when the line carries no
        usable progress information (including a batch total of zero).
    """
    batch_match = re.search(r'处理批次 (\d+)/(\d+)', log_line)
    if batch_match:
        current = int(batch_match.group(1))
        total = int(batch_match.group(2))
        if total > 0:
            # Integer arithmetic: int(29 / 100 * 100) truncates to 28 in floats.
            return min(100, current * 100 // total)

    percent_match = re.search(r'(\d+)%', log_line)
    if percent_match:
        return min(100, int(percent_match.group(1)))

    return None
index 0000000..b89399e --- /dev/null +++ b/app/ui/config_dialog.py @@ -0,0 +1,207 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +"""系统设置对话框模块""" + +import os +import tkinter as tk +from tkinter import messagebox, filedialog, ttk + +from app.config.settings import ConfigManager + +from .user_settings import load_user_settings, save_user_settings +from .ui_widgets import center_window +from app.core.utils.dialog_utils import show_cloud_sync_dialog + +# 模块级状态 +_PROCESSOR_SERVICE = None + + +def show_config_dialog(root, cfg: ConfigManager): + global _PROCESSOR_SERVICE + + settings = load_user_settings() + dlg = tk.Toplevel(root) + dlg.title("系统设置") + dlg.geometry("560x680") + center_window(dlg) + + content = ttk.Frame(dlg) + content.pack(fill=tk.BOTH, expand=True, padx=16, pady=16) + for i in range(2): + content.columnconfigure(i, weight=1) + + # 当前值 + log_level_val = tk.StringVar(value=settings.get('log_level', 'INFO')) + max_workers_val = tk.StringVar(value=str(settings.get('concurrency_max_workers', cfg.getint('Performance', 'max_workers', 4)))) + batch_size_val = tk.StringVar(value=str(settings.get('concurrency_batch_size', cfg.getint('Performance', 'batch_size', 5)))) + template_path_val = tk.StringVar(value=settings.get('template_path', os.path.join(cfg.get('Paths', 'template_folder', 'templates'), cfg.get('Templates', 'purchase_order', '银豹-采购单模板.xls')))) + input_dir_val = tk.StringVar(value=settings.get('input_folder', cfg.get('Paths', 'input_folder', 'data/input'))) + output_dir_val = tk.StringVar(value=settings.get('output_folder', cfg.get('Paths', 'output_folder', 'data/output'))) + result_dir_val = tk.StringVar(value=settings.get('result_folder', 'data/result')) + + def add_row(row, label_text, widget): + ttk.Label(content, text=label_text).grid(row=row, column=0, sticky='w', padx=4, pady=6) + widget.grid(row=row, column=1, sticky='ew', padx=4, pady=6) + + # 日志级别 + lvl = ttk.Combobox(content, textvariable=log_level_val, values=['DEBUG', 'INFO', 
'WARNING', 'ERROR'], state='readonly') + add_row(0, "日志级别", lvl) + + # 并发参数 + maxw_entry = ttk.Entry(content, textvariable=max_workers_val) + add_row(1, "最大并发(max_workers)", maxw_entry) + batch_entry = ttk.Entry(content, textvariable=batch_size_val) + add_row(2, "批次大小(batch_size)", batch_entry) + + # 模板路径 + tpl_frame = ttk.Frame(content) + tpl_entry = ttk.Entry(tpl_frame, textvariable=template_path_val) + tpl_entry.pack(side=tk.LEFT, fill=tk.X, expand=True) + + def _select_template(): + p = filedialog.askopenfilename(title="选择模板文件", filetypes=[("Excel模板", "*.xls *.xlsx"), ("所有文件", "*.*")]) + if p: + try: + template_path_val.set(os.path.relpath(p, os.getcwd())) + except Exception: + template_path_val.set(p) + + ttk.Button(tpl_frame, text="选择", command=_select_template).pack(side=tk.LEFT, padx=6) + add_row(3, "采购单模板文件", tpl_frame) + + # 目录 + def dir_row(row_idx, label, var): + f = ttk.Frame(content) + e = ttk.Entry(f, textvariable=var) + e.pack(side=tk.LEFT, fill=tk.X, expand=True) + + def _select_dir(): + d = filedialog.askdirectory(title=f"选择{label}") + if d: + try: + var.set(os.path.relpath(d, os.getcwd())) + except Exception: + var.set(d) + + ttk.Button(f, text="选择", command=_select_dir).pack(side=tk.LEFT, padx=6) + add_row(row_idx, label, f) + + dir_row(4, "输入目录", input_dir_val) + dir_row(5, "输出目录", output_dir_val) + dir_row(6, "结果目录", result_dir_val) + + api_key_val = tk.StringVar(value=settings.get('api_key', cfg.get('API', 'api_key', ''))) + secret_key_val = tk.StringVar(value=settings.get('secret_key', cfg.get('API', 'secret_key', ''))) + timeout_val = tk.StringVar(value=str(settings.get('timeout', cfg.getint('API', 'timeout', 30)))) + max_retries_val = tk.StringVar(value=str(settings.get('max_retries', cfg.getint('API', 'max_retries', 3)))) + retry_delay_val = tk.StringVar(value=str(settings.get('retry_delay', cfg.getint('API', 'retry_delay', 2)))) + api_url_val = tk.StringVar(value=settings.get('api_url', cfg.get('API', 'api_url', ''))) + + api_key_entry = 
ttk.Entry(content, textvariable=api_key_val) + add_row(7, "API Key", api_key_entry) + secret_key_entry = ttk.Entry(content, textvariable=secret_key_val) + secret_key_entry.configure(show='*') + add_row(8, "Secret Key", secret_key_entry) + add_row(9, "Timeout", ttk.Entry(content, textvariable=timeout_val)) + add_row(10, "Max Retries", ttk.Entry(content, textvariable=max_retries_val)) + add_row(11, "Retry Delay", ttk.Entry(content, textvariable=retry_delay_val)) + add_row(12, "API URL", ttk.Entry(content, textvariable=api_url_val)) + + # ---- Gitea 云端同步配置 ---- + ttk.Separator(content).grid(row=13, column=0, columnspan=2, sticky='ew', pady=8) + ttk.Label(content, text="云端同步 (Gitea)", font=("Arial", 10, "bold")).grid(row=14, column=0, sticky='w', padx=4, pady=4) + + gitea_url_val = tk.StringVar(value=cfg.get('Gitea', 'base_url', fallback='https://gitea.94kan.cn')) + gitea_owner_val = tk.StringVar(value=cfg.get('Gitea', 'owner', fallback='houhuan')) + gitea_repo_val = tk.StringVar(value=cfg.get('Gitea', 'repo', fallback='yixuan-sync-data')) + gitea_token_val = tk.StringVar(value=cfg.get('Gitea', 'token', fallback='')) + + add_row(15, "Gitea 地址", ttk.Entry(content, textvariable=gitea_url_val)) + add_row(16, "仓库所有者", ttk.Entry(content, textvariable=gitea_owner_val)) + add_row(17, "仓库名称", ttk.Entry(content, textvariable=gitea_repo_val)) + gitea_token_entry = ttk.Entry(content, textvariable=gitea_token_val, show='*') + add_row(18, "Access Token", gitea_token_entry) + + # 操作按钮 + btns = ttk.Frame(content) + btns.grid(row=19, column=0, columnspan=2, sticky='ew', pady=10) + btns.columnconfigure(0, weight=1) + + def save_settings(): + try: + s = load_user_settings() + s['log_level'] = log_level_val.get() + s['concurrency_max_workers'] = int(max_workers_val.get() or '4') + s['concurrency_batch_size'] = int(batch_size_val.get() or '5') + tp = template_path_val.get() + inp = input_dir_val.get() + outp = output_dir_val.get() + resp = result_dir_val.get() + try: + if tp: + tp = 
os.path.relpath(tp, os.getcwd()) if os.path.isabs(tp) else tp + if inp: + inp = os.path.relpath(inp, os.getcwd()) if os.path.isabs(inp) else inp + if outp: + outp = os.path.relpath(outp, os.getcwd()) if os.path.isabs(outp) else outp + if resp: + resp = os.path.relpath(resp, os.getcwd()) if os.path.isabs(resp) else resp + except Exception: + pass + s['template_path'] = tp + s['input_folder'] = inp + s['output_folder'] = outp + s['result_folder'] = resp + save_user_settings(s) + try: + from app.core.utils.log_utils import set_log_level + set_log_level(s['log_level']) + except Exception: + pass + try: + tpl_path = s['template_path'] + tpl_dir = os.path.dirname(tpl_path) + tpl_name = os.path.basename(tpl_path) + cfg.update('Paths', 'template_folder', tpl_dir) + cfg.update('Templates', 'purchase_order', tpl_name) + try: + cfg.update('Paths', 'template_file', os.path.join(tpl_dir, tpl_name)) + except Exception: + pass + cfg.update('Paths', 'input_folder', s['input_folder']) + cfg.update('Paths', 'output_folder', s['output_folder']) + cfg.update('Performance', 'max_workers', s['concurrency_max_workers']) + cfg.update('Performance', 'batch_size', s['concurrency_batch_size']) + cfg.update('API', 'api_key', api_key_val.get()) + cfg.update('API', 'secret_key', secret_key_val.get()) + cfg.update('API', 'timeout', timeout_val.get()) + cfg.update('API', 'max_retries', max_retries_val.get()) + cfg.update('API', 'retry_delay', retry_delay_val.get()) + cfg.update('API', 'api_url', api_url_val.get()) + cfg.update('Gitea', 'base_url', gitea_url_val.get()) + cfg.update('Gitea', 'owner', gitea_owner_val.get()) + cfg.update('Gitea', 'repo', gitea_repo_val.get()) + cfg.update('Gitea', 'token', gitea_token_val.get()) + cfg.save_config() + except Exception: + pass + messagebox.showinfo("设置已保存", "系统设置已更新并保存") + dlg.destroy() + except Exception as e: + messagebox.showerror("保存失败", str(e)) + + def reload_suppliers(): + global _PROCESSOR_SERVICE + try: + from app.services.processor_service 
def get_error_suggestion(message: str) -> Optional[str]:
    """Map an error message to a short remediation hint.

    The message is lower-cased and checked against an ordered list of
    rules; the hint of the first matching rule is returned.

    Args:
        message: Error text; ``None`` or an empty string is accepted.

    Returns:
        The hint string, or ``None`` when no rule matches.
    """
    text = (message or "").lower()

    # Each rule is (alternatives, hint). An alternative is a tuple of
    # substrings that must ALL be present; any one alternative is enough.
    rules = (
        ((('openpyxl',), ('engine', 'xlsx')), '安装依赖:pip install openpyxl'),
        ((('xlrd',), ('engine', 'xls')), '安装依赖:pip install xlrd'),
        ((('timeout',), ('timed out',)), '检查网络,增大API超时时间或稍后重试'),
        ((('invalid access_token',), ('access token',)), '刷新百度OCR令牌或检查api_key/secret_key'),
        ((('429',), ('too many requests',)), '降低识别频率或稍后重试'),
        ((('模板文件不存在',), ('no such file', '模板')), '在系统设置中选择正确的模板文件路径'),
        ((('没有找到采购单',), ('未在 data/result 目录下找到采购单',)), '确认data/result目录内存在采购单文件'),
        ((('permission denied',),), '以管理员权限运行或更改目录写入权限'),
    )

    for alternatives, hint in rules:
        for required in alternatives:
            if all(fragment in text for fragment in required):
                return hint
    return None
cache_files: + if os.path.exists(cache_file): + os.remove(cache_file) + add_to_log(log_widget, f"已清除缓存文件: {cache_file}\n", "success") + + temp_dir = os.path.join("data/temp") + if os.path.exists(temp_dir): + for file in os.listdir(temp_dir): + file_path = os.path.join(temp_dir, file) + try: + if os.path.isfile(file_path): + os.remove(file_path) + add_to_log(log_widget, f"已清除临时文件: {file_path}\n", "info") + except Exception as e: + add_to_log(log_widget, f"清除文件时出错: {file_path}, 错误: {str(e)}\n", "error") + + log_dir = "logs" + if os.path.exists(log_dir): + for file in os.listdir(log_dir): + if file.endswith(".active"): + file_path = os.path.join(log_dir, file) + try: + os.remove(file_path) + add_to_log(log_widget, f"已清除活动日志标记: {file_path}\n", "info") + except Exception as e: + add_to_log(log_widget, f"清除文件时出错: {file_path}, 错误: {str(e)}\n", "error") + + set_running_task(None) + + add_to_log(log_widget, "缓存清除完成,系统将重新处理所有文件\n", "success") + messagebox.showinfo("缓存清除", "缓存已清除,系统将重新处理所有文件。") + except Exception as e: + add_to_log(log_widget, f"清除缓存时出错: {str(e)}\n", "error") + messagebox.showerror("错误", f"清除缓存时出错: {str(e)}") + + +def open_result_directory(): + try: + result_dir = os.path.abspath("data/result") + if not os.path.exists(result_dir): + os.makedirs(result_dir, exist_ok=True) + os.startfile(result_dir) + except Exception as e: + messagebox.showerror("错误", f"无法打开结果目录: {str(e)}") + + +def _open_directory_from_settings(settings_key, default_path, label): + """通用的从用户设置读取路径并打开目录""" + from .user_settings import load_user_settings + try: + s = load_user_settings() + path = os.path.abspath(s.get(settings_key, default_path)) + if not os.path.exists(path): + os.makedirs(path, exist_ok=True) + os.startfile(path) + except Exception as e: + messagebox.showerror("错误", f"无法打开{label}: {str(e)}") + + +def open_input_directory_from_settings(): + _open_directory_from_settings('input_folder', 'data/input', '输入目录') + + +def open_output_directory_from_settings(): + 
def _remove_files_in(directory, log_widget):
    """Delete the regular files directly inside *directory* and return how many were removed.

    Sub-directories are left untouched. A file that cannot be deleted is
    reported through ``add_to_log`` and skipped, so one locked file does
    not abort the rest of the cleanup.
    """
    removed = 0
    for name in os.listdir(directory):
        file_path = os.path.join(directory, name)
        if not os.path.isfile(file_path):
            continue
        try:
            os.remove(file_path)
        except OSError as e:
            add_to_log(log_widget, f"清除文件时出错: {file_path}, 错误: {str(e)}\n", "error")
        else:
            removed += 1
    return removed


def clean_data_files(log_widget):
    """Delete the files in ``data/input`` and ``data/output`` after user confirmation.

    Args:
        log_widget: Widget passed to ``add_to_log`` for progress messages.
    """
    try:
        if not messagebox.askyesno("确认清理", "确定要清理input和output目录的文件吗?这将删除所有输入和输出数据。"):
            add_to_log(log_widget, "操作已取消\n", "info")
            return

        files_cleaned = 0

        for label, directory in (("input", "data/input"), ("output", "data/output")):
            if os.path.exists(directory):
                files_cleaned += _remove_files_in(directory, log_widget)
                add_to_log(log_widget, f"已清理{label}目录\n", "info")

        add_to_log(log_widget, f"清理完成,共清理 {files_cleaned} 个文件\n", "success")
        messagebox.showinfo("清理完成", f"已成功清理 {files_cleaned} 个文件")
    except Exception as e:
        add_to_log(log_widget, f"清理数据文件时出错: {str(e)}\n", "error")
        messagebox.showerror("错误", f"清理数据文件时出错: {str(e)}")


def clean_result_files(log_widget):
    """Delete the files in ``data/result`` after user confirmation.

    Args:
        log_widget: Widget passed to ``add_to_log`` for progress messages.
    """
    try:
        if not messagebox.askyesno("确认清理", "确定要清理result目录的文件吗?这将删除所有已生成的采购单文件。"):
            add_to_log(log_widget, "操作已取消\n", "info")
            return
        count = 0
        result_dir = "data/result"
        if os.path.exists(result_dir):
            count = _remove_files_in(result_dir, log_widget)
        add_to_log(log_widget, f"已清理result目录,共 {count} 个文件\n", "success")
        messagebox.showinfo("清理完成", f"已清理result目录 {count} 个文件")
    except Exception as e:
        add_to_log(log_widget, f"清理result目录时出错: {str(e)}\n", "error")
        messagebox.showerror("错误", f"清理result目录时出错: {str(e)}")
# Global queue through which any thread hands (text, tag) pairs to the GUI;
# poll_log_queue drains it on the Tk main loop.
LOG_QUEUE = queue.Queue()


class LogRedirector:
    """File-like object that tees writes to the original stdout and a Tk text widget."""

    # (tag, markers) pairs checked in order against the lower-cased text;
    # the first pair with a matching marker decides the display tag.
    _TAG_MARKERS = (
        ("error", ("错误", "error", "失败", "异常", "exception")),
        ("warning", ("警告", "warning")),
        ("success", ("成功", "success", "完成", "成功处理")),
        ("info", ("info", "信息", "开始", "处理中")),
    )

    def __init__(self, text_widget):
        self.text_widget = text_widget
        self.buffer = ""
        # May be None when the interpreter has no console (e.g. pythonw).
        self.terminal = sys.__stdout__

    def write(self, string):
        """Buffer *string*, echo it to the terminal and schedule a widget update."""
        self.buffer += string
        if self.terminal is not None:
            self.terminal.write(string)
        self.text_widget.after(0, self.update_text_widget)

    def update_text_widget(self):
        """Append the buffered text to the widget with a tag chosen by keyword."""
        # Take ownership of the pending text first so anything written while
        # the widget is being updated stays queued for the next callback.
        pending, self.buffer = self.buffer, ""

        self.text_widget.configure(state=tk.NORMAL)

        if pending.strip():
            lowered = pending.lower()
            tag = "normal"
            for candidate, markers in self._TAG_MARKERS:
                if any(marker in lowered for marker in markers):
                    tag = candidate
                    break
            self.text_widget.insert(tk.END, pending, tag)
        else:
            self.text_widget.insert(tk.END, pending)

        self.text_widget.see(tk.END)
        self.text_widget.configure(state=tk.DISABLED)

    def flush(self):
        """Flush the underlying terminal stream, if there is one."""
        if self.terminal is not None:
            self.terminal.flush()


class GUILogHandler(logging.Handler):
    """Logging handler that puts formatted records on LOG_QUEUE for the GUI to display."""

    def __init__(self, text_widget):
        super().__init__()
        self.text_widget = text_widget

    def emit(self, record):
        """Format *record* and enqueue it with a tag derived from its level."""
        try:
            msg = self.format(record)
            if record.levelno >= logging.ERROR:
                tag = "error"
            elif record.levelno >= logging.WARNING:
                tag = "warning"
            elif record.levelno >= logging.INFO:
                tag = "info"
            else:
                tag = "normal"

            LOG_QUEUE.put((msg + "\n", tag))
        except Exception:
            self.handleError(record)


def poll_log_queue(text_widget):
    """Drain LOG_QUEUE into *text_widget* and reschedule itself every 100 ms."""
    try:
        updated = False
        while True:
            try:
                msg, tag = LOG_QUEUE.get_nowait()
            except queue.Empty:
                break
            text_widget.configure(state=tk.NORMAL)
            text_widget.insert(tk.END, msg, tag)
            updated = True

        if updated:
            text_widget.see(tk.END)
            text_widget.configure(state=tk.DISABLED)

    except Exception:
        # Best effort: a failed widget update must not stop the polling loop.
        pass
    finally:
        text_widget.after(100, lambda: poll_log_queue(text_widget))


def init_gui_logger(text_widget, level=logging.INFO):
    """Attach a GUILogHandler to the root logger and return the attached handler.

    Plain stream (console) handlers are removed from the root logger.
    File handlers are kept: ``logging.FileHandler`` subclasses
    ``StreamHandler``, so it has to be excluded explicitly.

    Args:
        text_widget: Widget stored on the handler.
        level: Level applied to a newly created handler and to the root logger.

    Returns:
        The GUILogHandler installed on the root logger. An already
        installed one is reused instead of adding a duplicate.
    """
    root_logger = logging.getLogger()
    for h in root_logger.handlers[:]:
        if isinstance(h, logging.StreamHandler) and not isinstance(h, logging.FileHandler):
            root_logger.removeHandler(h)
    root_logger.setLevel(level)

    for h in root_logger.handlers:
        if isinstance(h, GUILogHandler):
            return h

    handler = GUILogHandler(text_widget)
    handler.setLevel(level)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    root_logger.addHandler(handler)
    return handler


def dispose_gui_logger():
    """Remove and close every GUILogHandler attached to the root logger."""
    root_logger = logging.getLogger()
    for handler in root_logger.handlers[:]:
        if isinstance(handler, GUILogHandler):
            root_logger.removeHandler(handler)
            try:
                handler.close()
            except Exception:
                pass


def add_to_log(log_widget, text, tag="normal"):
    """Queue *text* for display; print it instead when there is no log widget."""
    if log_widget is None:
        print(f"[{tag}] {text}", end="")
        return

    LOG_QUEUE.put((text, tag))
def _create_left_panel(content_frame, theme, log_text, status_bar):
    """Build the left panel: full pipeline, OCR, Excel processing and recent files.

    Args:
        content_frame: Parent container for the panel card.
        theme: Mapping providing the ``"card_bg"`` and ``"fg"`` colours.
        log_text: Log widget passed on to the button handlers.
        status_bar: Status bar passed on to the button handlers.
    """
    card_bg = theme["card_bg"]

    left_panel = create_card_frame(content_frame)
    left_panel.pack(side=tk.LEFT, fill=tk.BOTH, expand=False, padx=(0, 5), pady=5)
    left_panel.configure(width=160)

    panel_content = tk.Frame(left_panel, bg=card_bg)
    panel_content.pack(fill=tk.BOTH, expand=True, padx=10, pady=(5, 10))

    def add_section(title):
        # A titled, borderless group with an inner frame that holds its buttons.
        section = tk.LabelFrame(
            panel_content, text=title, bg=card_bg, fg=theme["fg"],
            font=("Microsoft YaHei UI", 10, "bold"), relief="flat", borderwidth=0
        )
        section.pack(fill=tk.X, pady=(0, 8))
        body = tk.Frame(section, bg=card_bg)
        body.pack(fill=tk.X, padx=8, pady=6)
        return body

    def add_button_pair(body, first, second):
        # One row with two half-width buttons placed side by side.
        row = tk.Frame(body, bg=card_bg)
        row.pack(fill=tk.X, pady=3)
        for (label, handler), pad in ((first, (0, 3)), (second, (3, 0))):
            button = create_modern_button(
                row, label, lambda h=handler: h(log_text, status_bar),
                "primary", px_width=72, px_height=32
            )
            button.pack(side=tk.LEFT, padx=pad)

    # Full pipeline section
    pipeline_body = add_section("完整流程")
    pipeline_button = create_modern_button(
        pipeline_body, "一键处理", lambda: run_pipeline_directly(log_text, status_bar),
        "primary", px_width=150, px_height=32
    )
    pipeline_button.pack(anchor='w', pady=3)

    # OCR section
    add_button_pair(
        add_section("OCR处理"),
        ("批量识别", batch_ocr_with_status),
        ("单个识别", process_single_image_with_status),
    )

    # Excel section
    add_button_pair(
        add_section("Excel处理"),
        ("批量处理", batch_process_orders_with_status),
        ("单个处理", process_excel_file_with_status),
    )

    # Recent files section
    _create_recent_files_section(panel_content, theme, log_text)
recent_top.configure(height=h) + except Exception: + pass + + try: + recent_top.pack_propagate(False) + except Exception: + pass + recent_frame.bind('', _resize_recent_top) + + recent_rect = tk.Frame(recent_top, bg=theme["card_bg"], highlightbackground=theme["border"], highlightthickness=1) + recent_rect.pack(fill=tk.BOTH, expand=True) + recent_list = tk.Listbox(recent_rect, height=12) + recent_scrollbar = tk.Scrollbar(recent_rect) + recent_list.configure(yscrollcommand=recent_scrollbar.set) + recent_scrollbar.configure(command=recent_list.yview) + recent_list.pack(side=tk.LEFT, fill=tk.BOTH, expand=True) + recent_scrollbar.pack(side=tk.RIGHT, fill=tk.Y) + + import app.ui.user_settings as _us_mod + _us_mod.RECENT_LIST_WIDGET = recent_list + + def _open_selected_event(evt=None): + try: + idxs = recent_list.curselection() + if not idxs: + return + p = _extract_path_from_recent_item(recent_list.get(idxs[0])) + if os.path.exists(p): + os.startfile(p) + else: + messagebox.showwarning("文件不存在", p) + except Exception as e: + messagebox.showerror("打开失败", str(e)) + + recent_list.bind('', _open_selected_event) + refresh_recent_list_widget() + rf_btns = tk.Frame(recent_frame, bg=theme["card_bg"]) + rf_btns.pack(fill=tk.X, pady=6) + + def clear_list(): + clear_recent_files() + recent_list.delete(0, tk.END) + + create_modern_button(rf_btns, "清空列表", clear_list, "primary", px_width=72, px_height=32).pack(side=tk.LEFT, padx=(3, 0)) + + def purge_invalid(): + try: + kept = [] + for i in range(recent_list.size()): + item = recent_list.get(i) + p = _extract_path_from_recent_item(item) + if os.path.exists(p): + kept.append(p) + try: + kept_sorted = sorted(kept, key=lambda p: os.path.getmtime(p), reverse=True) + except Exception: + kept_sorted = kept + s = load_user_settings() + s['recent_files'] = kept_sorted + save_user_settings(s) + recent_list.delete(0, tk.END) + for i, p in enumerate(s['recent_files'][:recent_list.size() or len(s['recent_files'])], start=1): + 
recent_list.insert(tk.END, f"{i}. {p}") + refresh_recent_list_widget() + add_to_log(log_text, "已清理无效的最近文件条目\n", "success") + except Exception as e: + messagebox.showerror("清理失败", str(e)) + + create_modern_button(rf_btns, "清理无效", purge_invalid, "primary", px_width=72, px_height=32).pack(side=tk.LEFT, padx=(3, 0)) + + +def _create_right_panel(content_frame, theme, log_text, root): + """创建右侧面板:快捷操作、系统设置""" + right_panel = create_card_frame(content_frame) + right_panel.pack(side=tk.RIGHT, fill=tk.BOTH, expand=False, padx=(5, 0), pady=5) + right_panel.configure(width=380) + + right_panel_content = tk.Frame(right_panel, bg=theme["card_bg"]) + right_panel_content.pack(fill=tk.BOTH, expand=True, padx=10, pady=(5, 10)) + + # 工具功能区 + tools_section = tk.LabelFrame( + right_panel_content, text="快捷操作", bg=theme["card_bg"], fg=theme["fg"], + font=("Microsoft YaHei UI", 10, "bold"), relief="flat", borderwidth=0 + ) + tools_section.pack(fill=tk.X, pady=(0, 8)) + tools_buttons_frame = tk.Frame(tools_section, bg=theme["card_bg"]) + tools_buttons_frame.pack(fill=tk.X, padx=8, pady=6) + tk.Frame(tools_buttons_frame, bg=theme["card_bg"]).pack(fill=tk.X, pady=3) + + create_modern_button(tools_buttons_frame, "打开结果目录", lambda: open_result_directory(), "primary", px_width=132, px_height=32).pack(anchor='w', pady=3) + create_modern_button(tools_buttons_frame, "打开输出目录", lambda: os.startfile(os.path.abspath("data/output")), "primary", px_width=132, px_height=32).pack(anchor='w', pady=3) + create_modern_button(tools_buttons_frame, "打开输入目录", lambda: os.startfile(os.path.abspath("data/input")), "primary", px_width=132, px_height=32).pack(anchor='w', pady=3) + create_modern_button(tools_buttons_frame, "合并订单", lambda: merge_orders_with_status(log_text, StatusBar(root)), "primary", px_width=132, px_height=32).pack(anchor='w', pady=3) + create_modern_button(tools_buttons_frame, "清除缓存", lambda: clean_cache(log_text), "primary", px_width=132, px_height=32).pack(anchor='w', pady=3) + 
create_modern_button(tools_buttons_frame, "清理input/out文件", lambda: clean_data_files(log_text), "primary", px_width=132, px_height=32).pack(anchor='w', pady=3) + create_modern_button(tools_buttons_frame, "清理result文件", lambda: clean_result_files(log_text), "primary", px_width=132, px_height=32).pack(anchor='w', pady=3) + + # 系统设置区 + settings_section = tk.LabelFrame( + right_panel_content, text="系统设置", bg=theme["card_bg"], fg=theme["fg"], + font=("Microsoft YaHei UI", 10, "bold"), relief="flat", borderwidth=0 + ) + settings_section.pack(fill=tk.X, pady=(0, 8)) + settings_buttons_frame = tk.Frame(settings_section, bg=theme["card_bg"]) + settings_buttons_frame.pack(fill=tk.X, padx=8, pady=6) + create_modern_button(settings_buttons_frame, "系统设置", lambda: show_config_dialog(root, ConfigManager()), "primary", px_width=132, px_height=32).pack(anchor='w', pady=3) + create_modern_button(settings_buttons_frame, "条码映射", lambda: edit_barcode_mappings(log_text), "primary", px_width=132, px_height=32).pack(anchor='w', pady=3) + create_modern_button(settings_buttons_frame, "云端同步", lambda: show_cloud_sync_dialog(root), "primary", px_width=132, px_height=32).pack(anchor='w', pady=3) + + +def _setup_drag_area(mid_container, theme, dnd_supported, log_text, status_bar): + """创建拖拽/点击选择文件区域""" + drag_panel = create_card_frame(mid_container) + drag_panel.pack(side=tk.TOP, fill=tk.X, padx=(5, 5), pady=(0, 5)) + drag_panel_content = tk.Frame(drag_panel, bg=theme["card_bg"]) + drag_panel_content.pack(fill=tk.X, padx=10, pady=6) + + dnd_section = tk.LabelFrame( + drag_panel_content, bg=theme["card_bg"], fg=theme["fg"], + font=("Microsoft YaHei UI", 10, "bold"), relief="flat", borderwidth=0 + ) + dnd_section.pack(fill=tk.X, pady=(0, 0)) + dnd_frame = tk.Frame(dnd_section, bg=theme["card_bg"], highlightthickness=1, highlightbackground=theme["border"]) + dnd_frame.configure(height=60) + dnd_frame.pack(fill=tk.X, padx=8, pady=6) + try: + dnd_frame.pack_propagate(False) + except Exception: + pass + 
+ def _set_highlight(active: bool): + try: + dnd_frame.configure(highlightbackground=theme["info"] if active else theme["border"]) + except Exception: + pass + + dnd_frame.bind('', lambda e: _set_highlight(True)) + dnd_frame.bind('', lambda e: _set_highlight(False)) + + msg_row = tk.Frame(dnd_frame, bg=theme["card_bg"]) + msg_row.pack(fill=tk.X) + if dnd_supported: + tk.Label( + msg_row, text="拖拽已启用:拖拽或点击此区域选择文件", + bg=theme["card_bg"], fg="#999999", justify="center" + ).pack(fill=tk.X) + else: + tk.Label( + msg_row, text="点击此区域选择文件;可安装拖拽支持", + bg=theme["card_bg"], fg="#999999", justify="center" + ).pack(fill=tk.X) + + if not dnd_supported: + btn_row = tk.Frame(dnd_frame, bg=theme["card_bg"]) + btn_row.pack(fill=tk.X) + + def copy_install(): + try: + mid_container.winfo_toplevel().clipboard_clear() + mid_container.winfo_toplevel().clipboard_append("pip install tkinterdnd2") + messagebox.showinfo("已复制", "已复制安装命令:pip install tkinterdnd2") + except Exception as e: + messagebox.showwarning("复制失败", str(e)) + + create_modern_button(btn_row, "复制安装命令", copy_install, "primary", px_width=132, px_height=28).pack(side=tk.RIGHT) + + def install_and_restart(): + try: + add_to_log(log_text, "开始安装拖拽支持库 tkinterdnd2...\n", "info") + cmd = [sys.executable, "-m", "pip", "install", "tkinterdnd2"] + result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + add_to_log(log_text, result.stdout + "\n", "info") + add_to_log(log_text, "安装成功,准备重启程序以启用拖拽...\n", "success") + if messagebox.askyesno("安装完成", "已安装拖拽支持,是否立即重启应用?"): + os.execl(sys.executable, sys.executable, *sys.argv) + except subprocess.CalledProcessError as e: + add_to_log(log_text, f"安装失败: {e.stderr}\n", "error") + messagebox.showerror("安装失败", f"安装输出:\n{e.stderr}") + except Exception as e: + add_to_log(log_text, f"安装失败: {str(e)}\n", "error") + messagebox.showerror("安装失败", str(e)) + + create_modern_button(btn_row, "一键安装拖拽", install_and_restart, "primary", px_width=132, 
px_height=28).pack(side=tk.RIGHT, padx=(3, 0)) + + # 点击拖拽框选择文件 + def _click_select(evt=None): + try: + files = filedialog.askopenfilenames( + title="选择图片或Excel文件", + filetypes=[ + ("支持文件", "*.xlsx *.xls *.jpg *.jpeg *.png *.bmp"), + ("Excel", "*.xlsx *.xls"), + ("图片", "*.jpg *.jpeg *.png *.bmp"), + ("所有文件", "*.*"), + ] + ) + if not files: + return + for p in files: + process_dropped_file(log_text, status_bar, p) + except Exception as e: + messagebox.showerror("选择失败", str(e)) + + dnd_frame.bind('', _click_select) + msg_row.bind('', _click_select) + + if dnd_supported: + def _on_drop(event): + try: + data = event.data + paths = [] + buf = "" + in_brace = False + for ch in data: + if ch == '{': + in_brace = True + buf = "" + elif ch == '}': + in_brace = False + paths.append(buf) + buf = "" + elif ch == ' ' and not in_brace: + if buf: + paths.append(buf) + buf = "" + else: + buf += ch + if buf: + paths.append(buf) + for p in paths: + process_dropped_file(log_text, status_bar, p) + except Exception as e: + add_to_log(log_text, f"拖拽处理失败: {str(e)}\n", "error") + + try: + from tkinterdnd2 import DND_FILES + dnd_frame.drop_target_register(DND_FILES) + dnd_frame.dnd_bind('<>', _on_drop) + except Exception: + pass + + +def _create_log_panel(mid_container, theme): + """创建中间日志面板,返回 log_text widget""" + log_panel = create_card_frame(mid_container, "处理日志") + log_panel.pack(side=tk.TOP, fill=tk.BOTH, expand=True, padx=(5, 5), pady=5) + + log_text = scrolledtext.ScrolledText( + log_panel, wrap=tk.WORD, width=68, height=26, + bg=theme["log_bg"], fg=theme["log_fg"], + font=("Consolas", 9), state=tk.DISABLED, + relief="flat", borderwidth=0 + ) + log_text.pack(fill=tk.BOTH, expand=True, padx=10, pady=(5, 10)) + + log_text.tag_configure("command", foreground=theme["info"], font=("Consolas", 9, "bold")) + log_text.tag_configure("time", foreground=theme["secondary_bg"], font=("Consolas", 8)) + log_text.tag_configure("separator", foreground=theme["border"]) + 
log_text.tag_configure("success", foreground=theme["success"], font=("Consolas", 9, "bold")) + log_text.tag_configure("error", foreground=theme["error"], font=("Consolas", 9, "bold")) + log_text.tag_configure("warning", foreground=theme["warning"], font=("Consolas", 9, "bold")) + log_text.tag_configure("info", foreground=theme["info"], font=("Consolas", 9)) + + poll_log_queue(log_text) + + add_to_log(log_text, "欢迎使用 益选-OCR订单处理系统 v1.1.0\n", "success") + add_to_log(log_text, "系统已就绪,请选择相应功能进行操作。\n\n", "info") + add_to_log(log_text, "功能说明:\n", "command") + add_to_log(log_text, "• 完整处理流程:一键完成OCR识别和Excel处理\n", "info") + add_to_log(log_text, "• 批量处理订单:批量处理多个订单文件\n", "info") + add_to_log(log_text, "• 处理烟草订单:专门处理烟草类订单\n", "info") + add_to_log(log_text, "• 合并订单:将多个订单合并为一个文件\n\n", "info") + add_to_log(log_text, "请将需要处理的图片文件放入 data/input 目录中。\n", "warning") + add_to_log(log_text, "OCR识别结果保存在 data/output 目录,处理完成的订单保存在 result 目录中。\n\n", "warning") + add_to_log(log_text, "=" * 50 + "\n\n", "separator") + + return log_text + + +def main(): + """主函数""" + try: + root, theme, settings, dnd_supported = _init_window() + + # 主容器 + main_container = tk.Frame(root, bg=theme["bg"]) + main_container.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) + content_frame = tk.Frame(main_container, bg=theme["bg"]) + content_frame.pack(fill=tk.BOTH, expand=True) + + # 中间容器(拖拽区 + 日志区) + mid_container = tk.Frame(content_frame, bg=theme["bg"]) + mid_container.pack(side=tk.LEFT, fill=tk.BOTH, expand=True, padx=(5, 5), pady=5) + + log_text = _create_log_panel(mid_container, theme) + + # 状态栏 + status_bar = StatusBar(root) + status_bar.pack(side=tk.BOTTOM, fill=tk.X) + + # 左侧面板 + _create_left_panel(content_frame, theme, log_text, status_bar) + + # 右侧面板 + _create_right_panel(content_frame, theme, log_text, root) + + # 拖拽区域 + _setup_drag_area(mid_container, theme, dnd_supported, log_text, status_bar) + + # 快捷键 + 关闭事件 + def on_close(): + try: + w = root.winfo_width() + h = root.winfo_height() + 
settings['window_size'] = f"{w}x{h}" + settings['theme_mode'] = get_theme_mode() + save_user_settings(settings) + except Exception: + pass + root.destroy() + + root.protocol("WM_DELETE_WINDOW", on_close) + bind_keyboard_shortcuts(root, log_text, status_bar) + + root.mainloop() + + except Exception as e: + import traceback + error_msg = f"程序启动失败: {str(e)}\n详细错误信息:\n{traceback.format_exc()}" + print(error_msg) + try: + import tkinter.messagebox as mb + mb.showerror("启动错误", f"程序启动失败:\n{str(e)}") + except Exception: + pass diff --git a/app/ui/result_previews.py b/app/ui/result_previews.py new file mode 100644 index 0000000..6ee53ca --- /dev/null +++ b/app/ui/result_previews.py @@ -0,0 +1,371 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +"""处理结果预览对话框模块""" + +import os +import re +import datetime +import tkinter as tk +from tkinter import messagebox, scrolledtext + +from .theme import THEMES, get_theme_mode, apply_theme +from .ui_widgets import center_window +from app.core.utils.file_utils import format_file_size + +TOBACCO_PREVIEW_WINDOW = None + + +def show_result_preview(command, output): + """显示处理结果预览""" + if "ocr" in command: + show_ocr_result_preview(output) + elif "excel" in command: + show_excel_result_preview(output) + elif "merge" in command: + show_merge_result_preview(output) + elif "pipeline" in command: + show_pipeline_result_preview(output) + else: + messagebox.showinfo("处理完成", "操作已成功完成!\n请在data/output目录查看结果。") + + +def show_ocr_result_preview(output): + """显示OCR处理结果预览""" + files_match = re.search(r'找到 (\d+) 个图片文件,其中 (\d+) 个未处理', output) + processed_match = re.search(r'所有图片处理完成, 总计: (\d+), 成功: (\d+)', output) + + if processed_match: + total = int(processed_match.group(1)) + success = int(processed_match.group(2)) + + preview = tk.Toplevel() + preview.title("OCR处理结果") + preview.geometry("400x300") + preview.resizable(False, False) + center_window(preview) + + tk.Label(preview, text="OCR处理完成", font=("Arial", 16, "bold")).pack(pady=10) + + result_frame 
= tk.Frame(preview) + result_frame.pack(pady=10, fill=tk.BOTH, expand=True) + + tk.Label(result_frame, text=f"总共处理: {total} 个文件", font=("Arial", 12)).pack(anchor=tk.W, padx=20, pady=5) + tk.Label(result_frame, text=f"成功处理: {success} 个文件", font=("Arial", 12)).pack(anchor=tk.W, padx=20, pady=5) + tk.Label(result_frame, text=f"失败数量: {total - success} 个文件", font=("Arial", 12)).pack(anchor=tk.W, padx=20, pady=5) + + if success == total: + result_text = "全部处理成功!" + result_color = "#28a745" + elif success > total * 0.8: + result_text = "大部分处理成功。" + result_color = "#ffc107" + else: + result_text = "处理失败较多,请检查日志。" + result_color = "#dc3545" + + tk.Label(result_frame, text=result_text, font=("Arial", 12, "bold"), fg=result_color).pack(pady=10) + + button_frame = tk.Frame(preview) + button_frame.pack(pady=10) + + tk.Button(button_frame, text="查看输出文件", command=lambda: os.startfile(os.path.abspath("data/output"))).pack(side=tk.LEFT, padx=10) + tk.Button(button_frame, text="关闭", command=preview.destroy).pack(side=tk.LEFT, padx=10) + else: + messagebox.showinfo("OCR处理完成", "OCR处理已完成,请在data/output目录查看结果。") + + +def show_excel_result_preview(output): + """显示Excel处理结果预览""" + extract_match = re.search(r'提取到 (\d+) 个商品信息', output) + file_match = re.search(r'采购单已保存到: (.+?)(?:\n|$)', output) + + if extract_match and file_match: + products_count = int(extract_match.group(1)) + output_file = file_match.group(1) + + preview = tk.Toplevel() + preview.title("Excel处理结果") + preview.geometry("450x320") + preview.resizable(False, False) + center_window(preview) + + tk.Label(preview, text="Excel处理完成", font=("Arial", 16, "bold")).pack(pady=10) + + result_frame = tk.Frame(preview) + result_frame.pack(pady=10, fill=tk.BOTH, expand=True) + + tk.Label(result_frame, text=f"提取商品数量: {products_count} 个", font=("Arial", 12)).pack(anchor=tk.W, padx=20, pady=5) + tk.Label(result_frame, text=f"输出文件: {os.path.basename(output_file)}", font=("Arial", 12)).pack(anchor=tk.W, padx=20, pady=5) + + 
tk.Label(result_frame, text="采购单已成功生成!", font=("Arial", 12, "bold"), fg="#28a745").pack(pady=10) + + file_frame = tk.Frame(result_frame, relief=tk.GROOVE, borderwidth=1) + file_frame.pack(fill=tk.X, padx=15, pady=5) + + tk.Label(file_frame, text="文件信息", font=("Arial", 10, "bold")).pack(anchor=tk.W, padx=10, pady=5) + + try: + file_size = os.path.getsize(output_file) + file_time = datetime.datetime.fromtimestamp(os.path.getmtime(output_file)) + size_text = format_file_size(file_size) + tk.Label(file_frame, text=f"文件大小: {size_text}", font=("Arial", 10)).pack(anchor=tk.W, padx=10, pady=2) + tk.Label(file_frame, text=f"创建时间: {file_time.strftime('%Y-%m-%d %H:%M:%S')}", font=("Arial", 10)).pack(anchor=tk.W, padx=10, pady=2) + except Exception: + tk.Label(file_frame, text="无法获取文件信息", font=("Arial", 10)).pack(anchor=tk.W, padx=10, pady=2) + + button_frame = tk.Frame(preview) + button_frame.pack(pady=10) + + tk.Button(button_frame, text="打开文件", command=lambda: os.startfile(output_file)).pack(side=tk.LEFT, padx=5) + tk.Button(button_frame, text="打开所在文件夹", command=lambda: os.startfile(os.path.dirname(output_file))).pack(side=tk.LEFT, padx=5) + tk.Button(button_frame, text="关闭", command=preview.destroy).pack(side=tk.LEFT, padx=5) + else: + messagebox.showinfo("Excel处理完成", "Excel处理已完成,请在data/output目录查看结果。") + + +def show_merge_result_preview(output): + """显示合并结果预览""" + merged_match = re.search(r'合并了 (\d+) 个采购单', output) + product_match = re.search(r'共处理 (\d+) 个商品', output) + output_match = re.search(r'已保存到: (.+?)(?:\n|$)', output) + + if merged_match and output_match: + merged_count = int(merged_match.group(1)) + product_count = int(product_match.group(1)) if product_match else 0 + output_file = output_match.group(1) + + preview = tk.Toplevel() + preview.title("采购单合并结果") + preview.geometry("450x300") + preview.resizable(False, False) + apply_theme(preview) + + tk.Label(preview, text="采购单合并完成", font=("Arial", 16, "bold")).pack(pady=10) + + result_frame = tk.Frame(preview) + 
result_frame.pack(pady=10, fill=tk.BOTH, expand=True) + + tk.Label(result_frame, text=f"合并采购单数量: {merged_count} 个", font=("Arial", 12)).pack(anchor=tk.W, padx=20, pady=5) + tk.Label(result_frame, text=f"处理商品数量: {product_count} 个", font=("Arial", 12)).pack(anchor=tk.W, padx=20, pady=5) + tk.Label(result_frame, text=f"输出文件: {os.path.basename(output_file)}", font=("Arial", 12)).pack(anchor=tk.W, padx=20, pady=5) + + theme = THEMES[get_theme_mode()] + tk.Label(result_frame, text="采购单已成功合并!", font=("Arial", 12, "bold"), fg=theme["success"]).pack(pady=10) + + button_frame = tk.Frame(preview) + button_frame.pack(pady=10) + + tk.Button(button_frame, text="打开文件", command=lambda: os.startfile(output_file)).pack(side=tk.LEFT, padx=10) + tk.Button(button_frame, text="打开所在文件夹", command=lambda: os.startfile(os.path.dirname(output_file))).pack(side=tk.LEFT, padx=10) + tk.Button(button_frame, text="关闭", command=preview.destroy).pack(side=tk.LEFT, padx=10) + else: + messagebox.showinfo("采购单合并完成", "采购单合并已完成,请在data/output目录查看结果。") + + +def show_pipeline_result_preview(output): + """显示完整流程结果预览""" + ocr_match = re.search(r'所有图片处理完成, 总计: (\d+), 成功: (\d+)', output) + excel_match = re.search(r'提取到 (\d+) 个商品信息', output) + output_file_match = re.search(r'采购单已保存到: (.+?)(?:\n|$)', output) + + preview = tk.Toplevel() + preview.title("完整流程处理结果") + preview.geometry("500x400") + preview.resizable(False, False) + center_window(preview) + + tk.Label(preview, text="完整处理流程已完成", font=("Arial", 16, "bold")).pack(pady=10) + + no_files_match = re.search(r'未找到可合并的文件', output) + if no_files_match: + tk.Label(preview, text="未找到可合并的文件,但其他步骤已成功执行", font=("Arial", 12)).pack(pady=0) + + result_frame = tk.Frame(preview) + result_frame.pack(pady=10, fill=tk.BOTH, expand=True) + + result_text = scrolledtext.ScrolledText(result_frame, wrap=tk.WORD, height=15, width=60) + result_text.pack(fill=tk.BOTH, expand=True, padx=15, pady=5) + result_text.configure(state=tk.NORMAL) + + result_text.insert(tk.END, "===== 流程执行结果 
=====\n\n", "title") + + result_text.insert(tk.END, "步骤1: OCR识别\n", "step") + if ocr_match: + total = int(ocr_match.group(1)) + success = int(ocr_match.group(2)) + result_text.insert(tk.END, f" 处理图片: {total} 个\n", "info") + result_text.insert(tk.END, f" 成功识别: {success} 个\n", "info") + if success == total: + result_text.insert(tk.END, " 结果: 全部识别成功\n", "success") + else: + result_text.insert(tk.END, f" 结果: 部分识别成功 ({success}/{total})\n", "warning") + else: + result_text.insert(tk.END, " 结果: 无OCR处理或处理信息不完整\n", "warning") + + result_text.insert(tk.END, "\n步骤2: Excel处理\n", "step") + if excel_match: + products = int(excel_match.group(1)) + result_text.insert(tk.END, f" 提取商品: {products} 个\n", "info") + result_text.insert(tk.END, " 结果: 成功生成采购单\n", "success") + if output_file_match: + output_file = output_file_match.group(1) + result_text.insert(tk.END, f" 输出文件: {os.path.basename(output_file)}\n", "info") + else: + result_text.insert(tk.END, " 结果: 无Excel处理或处理信息不完整\n", "warning") + + result_text.insert(tk.END, "\n===== 整体评估 =====\n", "title") + + has_errors = "错误" in output or "失败" in output + + no_files_match2 = re.search(r'未找到采购单文件', output) + single_file_match = re.search(r'只有1个采购单文件', output) + + if no_files_match2: + result_text.insert(tk.END, "没有找到可合并的文件,但处理流程已成功完成。\n", "warning") + result_text.insert(tk.END, "可以选择打开Excel文件或查看输出文件夹。\n", "info") + elif single_file_match: + result_text.insert(tk.END, "只有一个采购单文件,无需合并,处理流程已成功完成。\n", "warning") + result_text.insert(tk.END, "可以选择打开生成的Excel文件。\n", "info") + elif ocr_match and excel_match and not has_errors: + result_text.insert(tk.END, "流程完整执行成功!\n", "success") + elif ocr_match or excel_match: + result_text.insert(tk.END, "流程部分执行成功,请检查日志获取详情。\n", "warning") + else: + result_text.insert(tk.END, "流程执行可能存在问题,请查看详细日志。\n", "error") + + result_text.tag_configure("title", font=("Arial", 12, "bold")) + result_text.tag_configure("step", font=("Arial", 11, "bold")) + result_text.tag_configure("info", font=("Arial", 10)) + 
result_text.tag_configure("success", font=("Arial", 10, "bold"), foreground="#28a745") + result_text.tag_configure("warning", font=("Arial", 10, "bold"), foreground="#ffc107") + result_text.tag_configure("error", font=("Arial", 10, "bold"), foreground="#dc3545") + + result_text.configure(state=tk.DISABLED) + + button_frame = tk.Frame(preview) + button_frame.pack(pady=10) + + if output_file_match: + output_file = output_file_match.group(1) + tk.Button(button_frame, text="打开Excel文件", command=lambda: os.startfile(output_file)).pack(side=tk.LEFT, padx=10) + else: + if excel_match or no_files_match or single_file_match: + output_dir = os.path.abspath("data/output") + excel_files = [f for f in os.listdir(output_dir) if f.startswith('采购单_') and (f.endswith('.xls') or f.endswith('.xlsx'))] + if excel_files: + excel_files.sort(key=lambda x: os.path.getmtime(os.path.join(output_dir, x)), reverse=True) + latest_file = os.path.join(output_dir, excel_files[0]) + tk.Button(button_frame, text="打开最新Excel文件", + command=lambda: os.startfile(latest_file)).pack(side=tk.LEFT, padx=10) + + tk.Button(button_frame, text="查看输出文件夹", command=lambda: os.startfile(os.path.abspath("data/output"))).pack(side=tk.LEFT, padx=10) + tk.Button(button_frame, text="关闭", command=preview.destroy).pack(side=tk.LEFT, padx=10) + + +def show_tobacco_result_preview(returncode, output): + """显示烟草订单处理结果预览""" + global TOBACCO_PREVIEW_WINDOW + if returncode != 0: + return + + try: + try: + if TOBACCO_PREVIEW_WINDOW and TOBACCO_PREVIEW_WINDOW.winfo_exists(): + TOBACCO_PREVIEW_WINDOW.lift() + return + except Exception: + TOBACCO_PREVIEW_WINDOW = None + + result_file = None + order_time = "(未知)" + total_amount = "(未知)" + items_count = 0 + + abs_path_match = re.search(r'烟草订单处理完成,绝对路径: (.+)(?:\n|$)', output) + if abs_path_match: + result_file = abs_path_match.group(1).strip() + + for line in output.split('\n'): + if "烟草公司订单处理成功" in line and "订单时间" in line: + time_match = re.search(r'订单时间: ([^,]+)', line) + amount_match 
= re.search(r'总金额: ([^,]+)', line) + items_match = re.search(r'处理条目: (\d+)', line) + + if time_match: + order_time = time_match.group(1).strip() + if amount_match: + total_amount = amount_match.group(1).strip() + if items_match: + items_count = int(items_match.group(1).strip()) + + if not result_file or not os.path.exists(result_file): + default_path = os.path.abspath("data/output/银豹采购单_烟草公司.xls") + if os.path.exists(default_path): + result_file = default_path + + preview = tk.Toplevel() + preview.title("烟草订单处理结果") + preview.geometry("450x320") + preview.resizable(False, False) + TOBACCO_PREVIEW_WINDOW = preview + + def _close_preview(): + global TOBACCO_PREVIEW_WINDOW + TOBACCO_PREVIEW_WINDOW = None + try: + preview.destroy() + except Exception: + pass + + preview.protocol("WM_DELETE_WINDOW", _close_preview) + center_window(preview) + + tk.Label(preview, text="烟草订单处理完成", font=("Arial", 16, "bold")).pack(pady=10) + + result_frame = tk.Frame(preview) + result_frame.pack(pady=10, fill=tk.BOTH, expand=True) + + tk.Label(result_frame, text=f"订单时间: {order_time}", font=("Arial", 12)).pack(anchor=tk.W, padx=20, pady=5) + tk.Label(result_frame, text=f"订单总金额: {total_amount}", font=("Arial", 12)).pack(anchor=tk.W, padx=20, pady=5) + tk.Label(result_frame, text=f"处理商品数量: {items_count} 个", font=("Arial", 12)).pack(anchor=tk.W, padx=20, pady=5) + + if result_file and os.path.exists(result_file): + tk.Label(result_frame, text=f"输出文件: {os.path.basename(result_file)}", font=("Arial", 12)).pack(anchor=tk.W, padx=20, pady=5) + tk.Label(result_frame, text="银豹采购单已成功生成!", font=("Arial", 12, "bold"), fg="#28a745").pack(pady=10) + + file_frame = tk.Frame(result_frame, relief=tk.GROOVE, borderwidth=1) + file_frame.pack(fill=tk.X, padx=15, pady=5) + tk.Label(file_frame, text="文件信息", font=("Arial", 10, "bold")).pack(anchor=tk.W, padx=10, pady=5) + + try: + file_size = os.path.getsize(result_file) + file_time = datetime.datetime.fromtimestamp(os.path.getmtime(result_file)) + size_text = 
format_file_size(file_size) + tk.Label(file_frame, text=f"文件大小: {size_text}", font=("Arial", 10)).pack(anchor=tk.W, padx=10, pady=2) + tk.Label(file_frame, text=f"创建时间: {file_time.strftime('%Y-%m-%d %H:%M:%S')}", font=("Arial", 10)).pack(anchor=tk.W, padx=10, pady=2) + except Exception: + tk.Label(file_frame, text="无法获取文件信息", font=("Arial", 10)).pack(anchor=tk.W, padx=10, pady=2) + + button_frame = tk.Frame(preview) + button_frame.pack(pady=10) + tk.Button(button_frame, text="打开文件", command=lambda: os.startfile(result_file)).pack(side=tk.LEFT, padx=5) + tk.Button(button_frame, text="打开所在文件夹", command=lambda: os.startfile(os.path.dirname(result_file))).pack(side=tk.LEFT, padx=5) + tk.Button(button_frame, text="关闭", command=_close_preview).pack(side=tk.LEFT, padx=5) + else: + tk.Label(result_frame, text="未找到输出文件", font=("Arial", 12)).pack(anchor=tk.W, padx=20, pady=5) + tk.Label(result_frame, text="请检查data/output目录", font=("Arial", 12, "bold"), fg="#dc3545").pack(pady=10) + + button_frame = tk.Frame(preview) + button_frame.pack(pady=10) + tk.Button(button_frame, text="打开输出目录", command=lambda: os.startfile(os.path.abspath("data/output"))).pack(side=tk.LEFT, padx=5) + tk.Button(button_frame, text="关闭", command=_close_preview).pack(side=tk.LEFT, padx=5) + + preview.lift() + preview.attributes('-topmost', True) + preview.after_idle(lambda: preview.attributes('-topmost', False)) + + except Exception as e: + messagebox.showerror( + "处理异常", + f"显示预览时发生错误: {e}\n请检查日志了解详细信息。" + ) diff --git a/app/ui/shortcuts.py b/app/ui/shortcuts.py new file mode 100644 index 0000000..2aa3b5e --- /dev/null +++ b/app/ui/shortcuts.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +"""键盘快捷键模块""" + +import tkinter as tk +from tkinter import messagebox + +from .ui_widgets import center_window +from .action_handlers import ( + process_single_image_with_status, + process_excel_file_with_status, + batch_ocr_with_status, + run_pipeline_directly, + merge_orders_with_status, +) +from 
.file_operations import clean_cache + + +def bind_keyboard_shortcuts(root, log_widget, status_bar): + """绑定键盘快捷键""" + root.bind('', lambda e: process_single_image_with_status(log_widget, status_bar)) + root.bind('', lambda e: process_excel_file_with_status(log_widget, status_bar)) + root.bind('', lambda e: batch_ocr_with_status(log_widget, status_bar)) + root.bind('', lambda e: run_pipeline_directly(log_widget, status_bar)) + root.bind('', lambda e: merge_orders_with_status(log_widget, status_bar)) + root.bind('', lambda e: clean_cache(log_widget)) + root.bind('', lambda e: root.quit() if messagebox.askyesno("确认退出", "确定要退出程序吗?") else None) + root.bind('', lambda e: show_shortcuts_help()) + + +def show_shortcuts_help(): + """显示快捷键帮助对话框""" + help_dialog = tk.Toplevel() + help_dialog.title("快捷键帮助") + help_dialog.geometry("400x450") + center_window(help_dialog) + + tk.Label(help_dialog, text="键盘快捷键", font=("Arial", 16, "bold")).pack(pady=10) + + help_text = tk.Text(help_dialog, wrap=tk.WORD, width=50, height=20) + help_text.pack(padx=20, pady=10, fill=tk.BOTH, expand=True) + + shortcuts = """ + Ctrl+O: 处理单个图片 + Ctrl+E: 处理Excel文件 + Ctrl+B: OCR批量识别 + Ctrl+P: 完整处理流程 + Ctrl+M: 合并采购单 + F5: 清除处理缓存 + Esc: 退出程序 + """ + + help_text.insert(tk.END, shortcuts) + help_text.configure(state=tk.DISABLED) + + tk.Button(help_dialog, text="确定", command=help_dialog.destroy).pack(pady=10) + + help_dialog.lift() + help_dialog.attributes('-topmost', True) + help_dialog.after_idle(lambda: help_dialog.attributes('-topmost', False)) diff --git a/app/ui/theme.py b/app/ui/theme.py new file mode 100644 index 0000000..6a80618 --- /dev/null +++ b/app/ui/theme.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +"""主题管理模块""" + +import tkinter as tk +from tkinter import scrolledtext, ttk + +# 私有主题模式变量 +_theme_mode = "light" + +# 浅色和深色主题颜色 +THEMES = { + "light": { + "bg": "#f8f9fa", + "fg": "#212529", + "button_bg": "#ffffff", + "button_fg": "#495057", + "button_hover": "#e9ecef", + 
"primary_bg": "#007bff", + "primary_fg": "#ffffff", + "secondary_bg": "#6c757d", + "secondary_fg": "#ffffff", + "log_bg": "#ffffff", + "log_fg": "#212529", + "highlight_bg": "#007bff", + "highlight_fg": "#ffffff", + "border": "#dee2e6", + "success": "#28a745", + "error": "#dc3545", + "warning": "#ffc107", + "info": "#17a2b8", + "card_bg": "#ffffff", + "shadow": "#00000010" + }, + "dark": { + "bg": "#1a1a1a", + "fg": "#e9ecef", + "button_bg": "#343a40", + "button_fg": "#e9ecef", + "button_hover": "#495057", + "primary_bg": "#0d6efd", + "primary_fg": "#ffffff", + "secondary_bg": "#6c757d", + "secondary_fg": "#ffffff", + "log_bg": "#212529", + "log_fg": "#e9ecef", + "highlight_bg": "#0d6efd", + "highlight_fg": "#ffffff", + "border": "#495057", + "success": "#198754", + "error": "#dc3545", + "warning": "#ffc107", + "info": "#0dcaf0", + "card_bg": "#2d3748", + "shadow": "#00000030" + } +} + + +def get_theme_mode() -> str: + return _theme_mode + + +def set_theme_mode(mode: str): + global _theme_mode + _theme_mode = mode + + +def create_modern_button(parent, text, command, style="primary", width=None, height=None, px_width=None, px_height=None): + """创建现代化样式的按钮""" + theme = THEMES[_theme_mode] + + if style == "primary": + bg_color = "white" + fg_color = theme["primary_bg"] + hover_color = "#f0f8ff" + border_color = theme["primary_bg"] + elif style == "secondary": + bg_color = theme["secondary_bg"] + fg_color = theme["secondary_fg"] + hover_color = theme["button_hover"] + border_color = theme["secondary_bg"] + else: + bg_color = "white" + fg_color = theme["primary_bg"] + hover_color = "#f0f8ff" + border_color = theme["primary_bg"] + + button_frame = tk.Frame(parent, bg=border_color, highlightthickness=0) + button_frame.configure(relief="flat", bd=0) + if px_width or px_height: + try: + w = px_width if px_width else button_frame.winfo_reqwidth() + h = px_height if px_height else 32 + button_frame.configure(width=w, height=h) + button_frame.pack_propagate(False) + except 
Exception: + pass + + button = tk.Button( + button_frame, + text=text, + command=command, + bg=bg_color, + fg=fg_color, + font=("Microsoft YaHei UI", 8), + relief="flat", + bd=0, + padx=14, + pady=4, + anchor="center", + cursor="hand2", + activebackground=hover_color, + activeforeground=fg_color + ) + + if width: + button.configure(width=width) + else: + button.configure(width=12) + if height is not None: + button.configure(height=height) + else: + button.configure(height=1) + if height: + button.configure(height=height) + + # 悬停效果 + def on_enter(e): + button.configure(bg=hover_color) + + def on_leave(e): + button.configure(bg=bg_color) + + button.bind("", on_enter) + button.bind("", on_leave) + button_frame.bind("", on_enter) + button_frame.bind("", on_leave) + + button.pack(fill=tk.BOTH, expand=True, padx=1, pady=1) + return button_frame + + +def create_card_frame(parent, title=None): + """创建卡片样式的框架""" + theme = THEMES[_theme_mode] + + card = tk.Frame( + parent, + bg=theme["card_bg"], + relief="flat", + borderwidth=1, + highlightbackground=theme["border"], + highlightthickness=1 + ) + + if title: + title_label = tk.Label( + card, + text=title, + bg=theme["card_bg"], + fg=theme["fg"], + font=("Microsoft YaHei UI", 10, "bold") + ) + title_label.pack(pady=(6, 3)) + + return card + + +def apply_theme(widget, theme_mode=None): + """应用主题到小部件""" + if theme_mode is None: + theme_mode = _theme_mode + + theme = THEMES[theme_mode] + + try: + widget.configure(bg=theme["bg"], fg=theme["fg"]) + except Exception: + pass + + for child in widget.winfo_children(): + if isinstance(child, tk.Button) and not isinstance(child, ttk.Button): + child.configure(bg=theme["button_bg"], fg=theme["button_fg"]) + elif isinstance(child, scrolledtext.ScrolledText): + child.configure(bg=theme["log_bg"], fg=theme["log_fg"]) + else: + try: + child.configure(bg=theme["bg"], fg=theme["fg"]) + except Exception: + pass + + apply_theme(child, theme_mode) diff --git a/app/ui/ui_widgets.py 
class StatusBar(tk.Frame):
    """Status bar with a message label and an optional progress bar.

    The progress bar is hidden while idle. It is shown either with a concrete
    percentage (``set_status``) or as an indeterminate animation
    (``set_running``).
    """

    def __init__(self, master, **kwargs):
        super().__init__(master, **kwargs)
        self.configure(height=25, relief=tk.SUNKEN, borderwidth=1)

        self.status_label = tk.Label(self, text="就绪", anchor=tk.W, padx=5)
        self.status_label.pack(side=tk.LEFT, fill=tk.X, expand=True)

        # Created but not packed: the bar only becomes visible on demand.
        self.progress = ttk.Progressbar(
            self, orient=tk.HORIZONTAL, length=200, mode='determinate'
        )

    def _show_progress(self):
        """Make the progress bar visible on the right-hand side."""
        self.progress.pack(side=tk.RIGHT, padx=5, pady=2)

    def set_status(self, text, progress=None):
        """Set the status text and, optionally, a percentage.

        Args:
            text: Message shown in the label.
            progress: Number in ``[0, 100]`` to display on the bar. ``None``
                or an out-of-range value hides the bar.
        """
        self.status_label.config(text=text)

        if progress is not None and 0 <= progress <= 100:
            # A previous set_running(True) leaves the bar animating in
            # indeterminate mode; switch back so the value is what is shown.
            self.progress.stop()
            self.progress.config(mode='determinate', value=progress)
            self._show_progress()
        else:
            self.progress.pack_forget()

    def set_running(self, is_running=True):
        """Switch between the busy (animated) state and the idle state."""
        theme = THEMES[get_theme_mode()]
        if is_running:
            self.status_label.config(text="处理中...", foreground=theme["info"])
            self._show_progress()
            self.progress.config(mode='indeterminate')
            self.progress.start()
        else:
            self.status_label.config(text="就绪", foreground=theme["fg"])
            self.progress.stop()
            # Restore the constructor's mode so later percentage updates
            # render correctly.
            self.progress.config(mode='determinate', value=0)
            self.progress.pack_forget()
def create_collapsible_frame(parent, title, initial_state=True):
    """Create a collapsible panel inside ``parent``.

    The panel is a header row (arrow indicator plus bold title) above a
    content frame. Left-clicking anywhere on the header toggles the content
    frame's visibility.

    Args:
        parent: Container widget; the panel packs itself into it.
        title: Text shown in the header.
        initial_state: ``True`` to start expanded, ``False`` to start collapsed.

    Returns:
        ``(content_frame, state_var)``: the frame callers populate, and a
        ``tk.BooleanVar`` that is ``True`` while the panel is expanded.
    """
    content_pack = {"fill": tk.X, "padx": 20, "pady": 5}

    frame = tk.Frame(parent)
    frame.pack(fill=tk.X, pady=5)

    title_frame = tk.Frame(frame)
    title_frame.pack(fill=tk.X)

    state_var = tk.BooleanVar(value=initial_state)
    indicator = "▼" if initial_state else "►"
    state_label = tk.Label(title_frame, text=indicator, font=("Arial", 10, "bold"))
    state_label.pack(side=tk.LEFT, padx=5)

    title_label = tk.Label(title_frame, text=title, font=("Arial", 11, "bold"))
    title_label.pack(side=tk.LEFT, padx=5)

    content_frame = tk.Frame(frame)
    if initial_state:
        content_frame.pack(**content_pack)

    def toggle_collapse(event=None):
        expanded = not state_var.get()
        state_var.set(expanded)
        state_label.config(text="▼" if expanded else "►")
        if expanded:
            content_frame.pack(**content_pack)
        else:
            content_frame.pack_forget()

    # Tk rejects an empty event sequence, so the header must be bound to a
    # real event. Bind the left mouse button on every header widget so the
    # whole row is clickable.
    for header_widget in (title_frame, state_label, title_label):
        header_widget.bind("<Button-1>", toggle_collapse)

    return content_frame, state_var
def save_user_settings(settings: Dict[str, Any]):
    """Persist ``settings`` as UTF-8 JSON in ``data/user_settings.json``.

    The JSON is written to a sibling temporary file and then moved over the
    real file with ``os.replace``. Opening the target directly truncates it
    first, so a serialisation or I/O error halfway through would leave a
    corrupt file that the loader silently treats as "no settings".

    Saving stays best-effort: failures are logged at debug level and never
    raised to the caller.

    Args:
        settings: JSON-serialisable mapping of user settings.
    """
    tmp_path = None
    try:
        os.makedirs('data', exist_ok=True)
        path = os.path.abspath(os.path.join('data', 'user_settings.json'))
        tmp_path = path + '.tmp'
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(settings, f, ensure_ascii=False, indent=2)
        # Atomic on both POSIX and Windows: readers see the old or the new
        # file, never a partial one.
        os.replace(tmp_path, path)
    except Exception as e:
        logger.debug(f"保存用户设置失败: {e}")
        if tmp_path is not None:
            # Do not leave a half-written temporary file behind.
            try:
                os.remove(tmp_path)
            except OSError:
                pass
def add_recent_file(path: str) -> None:
    """Record ``path`` as the most recently used file.

    The path is stored only if it is an existing regular file with a
    supported Excel or image extension. It is moved to the front of the
    persisted list, which is capped at 20 entries, and the recent-files list
    widget is refreshed afterwards.

    The operation is best-effort: any failure is logged at debug level and
    never raised to the caller.

    Args:
        path: File-system path of the file that was just used.
    """
    allowed_exts = {'.xlsx', '.xls', '.jpg', '.jpeg', '.png', '.bmp'}
    try:
        if not path or not isinstance(path, str):
            return
        if not os.path.isfile(path):
            return
        if os.path.splitext(path)[1].lower() not in allowed_exts:
            return

        s = load_user_settings()
        if not isinstance(s, dict):
            # A settings file whose top level is not an object would
            # otherwise block every future add.
            s = {}
        items = s.get('recent_files', [])
        if not isinstance(items, list):
            # Same guard as get_recent_files: discard a malformed value
            # instead of failing on it forever.
            items = []

        # Move-to-front without duplicates, then cap the stored history.
        items = [p for p in items if p != path]
        items.insert(0, path)
        s['recent_files'] = items[:20]
        save_user_settings(s)
        refresh_recent_list_widget()
    except Exception as e:
        logger.debug(f"添加最近文件失败: {e}")
def _set_app_version(lines, version):
    """Return a copy of ``lines`` with ``version = <version>`` set inside ``[App]``."""
    version_line = f'version = {version}'
    updated = []
    in_app = False
    written = False

    def insert_version():
        # Keep the key attached to its section: place it before any blank
        # lines trailing the section body.
        idx = len(updated)
        while idx > 0 and not updated[idx - 1].strip():
            idx -= 1
        updated.insert(idx, version_line)

    for line in lines:
        stripped = line.strip()
        if stripped.startswith('[') and stripped.endswith(']'):
            # Leaving [App] without having seen a version key: add it here
            # rather than at the end of the file, where it would land in
            # whatever section happens to be last.
            if in_app and not written:
                insert_version()
                written = True
            in_app = stripped.lower() == '[app]'
            updated.append(line)
            continue
        if in_app:
            # Compare the exact key name ('=' or ':' delimited) so keys such
            # as "version_suffix" are left alone.
            key = stripped.replace(':', '=', 1).partition('=')[0].strip().lower()
            if key == 'version':
                updated.append(version_line)
                written = True
                continue
        updated.append(line)

    if not written:
        if in_app:
            # [App] is the last section and has no version key yet.
            insert_version()
        else:
            # No [App] section at all.
            updated.append('[App]')
            updated.append(version_line)
    return updated


def _copy_runtime_configs(dist_dir):
    """Copy the configuration files the packaged EXE needs into ``dist_dir``."""
    # NOTE(review): config/config.ini may contain real API keys; they end up
    # in the distributable on purpose -- confirm before publishing builds.
    config_file = Path('config/config.ini')
    barcode_mapping_file = Path('config/barcode_mappings.json')

    if config_file.exists() or barcode_mapping_file.exists():
        # Create the directory up front. Otherwise copying the mapping file
        # without config.ini present would create a *file* named "config".
        (dist_dir / 'config').mkdir(parents=True, exist_ok=True)

    if config_file.exists():
        shutil.copy2(config_file, dist_dir / 'config')
        print(f"已复制配置文件到dist: {config_file} -> {dist_dir / 'config'}")

    if barcode_mapping_file.exists():
        shutil.copy2(barcode_mapping_file, dist_dir / 'config')
        print(f"已复制条码映射文件到dist: {barcode_mapping_file} -> {dist_dir / 'config'}")

    root_config_file = Path('config.ini')
    if root_config_file.exists():
        shutil.copy2(root_config_file, dist_dir)
        print(f"已复制根配置文件到dist: {root_config_file} -> {dist_dir}")
    else:
        print("警告: 根配置文件不存在,将创建缺省版本")
        (dist_dir / 'config.ini').write_text('[App]\nversion = dev\n', encoding='utf-8')


def build_exe():
    """Build the EXE with PyInstaller from the generated spec file.

    Steps:
      1. Stamp a timestamp-based version into the ``[App]`` section of the
         root ``config.ini``. This is best-effort; a failure is only printed.
      2. Run ``pyinstaller`` on the spec file.
      3. Copy the runtime configuration files next to the built executable.

    Returns:
        ``True`` on success, ``False`` when PyInstaller exits with an error.
    """
    print("开始构建EXE文件...")

    try:
        from datetime import datetime
        root_cfg = Path('config.ini')
        version_str = datetime.now().strftime('%Y.%m.%d.%H%M')
        if root_cfg.exists():
            lines = root_cfg.read_text(encoding='utf-8').splitlines()
            lines = _set_app_version(lines, version_str)
            root_cfg.write_text('\n'.join(lines) + '\n', encoding='utf-8')
            print(f"已写入版本号: {version_str}")
    except (OSError, ValueError) as e:
        # ValueError covers UnicodeDecodeError for a config not in UTF-8.
        print(f"版本信息注入失败: {e}")

    try:
        result = subprocess.run([
            'pyinstaller',
            'OCR订单处理系统.spec'
        ], check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"构建失败: {e}")
        print(f"错误输出: {e.stderr}")
        return False

    print("构建成功!")
    print(result.stdout)

    _copy_runtime_configs(Path('dist'))
    return True
def main():
    """Run the whole packaging pipeline and return a process exit code.

    Checks that PyInstaller is callable, cleans previous build output,
    generates the spec file, builds the EXE and assembles the portable
    release.

    Returns:
        ``0`` on success; ``1`` if PyInstaller is unavailable or the build
        step fails.
    """
    rule = "=" * 50
    print(rule)
    print("OCR订单处理系统 - EXE打包工具")
    print(rule)

    # PyInstaller must be callable before anything is cleaned or generated.
    try:
        subprocess.run(['pyinstaller', '--version'], check=True, capture_output=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("错误: 未安装PyInstaller")
        print("请运行: pip install pyinstaller")
        return 1

    clean_build()
    create_spec_file()

    built = build_exe()
    if not built:
        return 1

    create_portable_package()

    for message in (
        "\n" + rule,
        "打包完成!",
        "EXE文件位置: dist/OCR订单处理系统.exe",
        "便携版位置: release/",
        rule,
    ):
        print(message)

    return 0
+[Paths] +input_folder = data/input +output_folder = data/output +temp_folder = data/temp +template_folder = templates +template_file = templates\银豹-采购单模板.xls +processed_record = data/processed_files.json +data_dir = data +product_db = data/product_cache.db + +[Performance] +max_workers = 4 +batch_size = 5 +skip_existing = true + +[File] +allowed_extensions = .jpg,.jpeg,.png,.bmp +excel_extension = .xlsx +max_file_size_mb = 4 + +[Templates] +purchase_order = 银豹-采购单模板.xls +item_data = 商品资料.xlsx + +[App] +version = 2026.03.30.1036 + +[Gitea] +base_url = https://gitea.94kan.cn +owner = houhuan +repo = yixuan-sync-data +token = + diff --git a/config/barcode_mappings.json b/config/barcode_mappings.json new file mode 100644 index 0000000..9d58fbb --- /dev/null +++ b/config/barcode_mappings.json @@ -0,0 +1,273 @@ +{ + "6920584471055": { + "map_to": "6920584471017", + "description": "条码映射:6920584471055 -> 6920584471017" + }, + "6925861571159": { + "map_to": "69021824", + "description": "条码映射:6925861571159 -> 69021824" + }, + "6923644268923": { + "map_to": "6923644268480", + "description": "条码映射:6923644268923 -> 6923644268480" + }, + "6925861571466": { + "map_to": "6925861571459", + "description": "条码映射:6925861571466 -> 6925861571459" + }, + "6907992508344": { + "map_to": "6907992508191", + "description": "条码映射:6907992508344 -> 6907992508191" + }, + "6903979000979": { + "map_to": "6903979000962", + "description": "条码映射:6903979000979 -> 6903979000962" + }, + "6923644283582": { + "map_to": "6923644283575", + "description": "条码映射:6923644283582 -> 6923644283575" + }, + "6923644268930": { + "map_to": "6923644268497", + "description": "条码映射:6923644268930 -> 6923644268497" + }, + "6923644268916": { + "map_to": "6923644268503", + "description": "条码映射:6923644268916 -> 6923644268503" + }, + "6923644268909": { + "map_to": "6923644268510", + "description": "条码映射:6923644268909 -> 6923644268510" + }, + "6923644299804": { + "map_to": "6923644299774", + "description": "条码映射:6923644299804 
-> 6923644299774" + }, + "6923644266318": { + "map_to": "6923644266066", + "description": "条码映射:6923644266318 -> 6923644266066" + }, + "6923644210151": { + "map_to": "6923644223458", + "description": "条码映射:6923644210151 -> 6923644223458" + }, + "6907992501819": { + "map_to": "6907992500133", + "description": "条码映射:6907992501819 -> 6907992500133" + }, + "6907992502052": { + "map_to": "6907992100272", + "description": "条码映射:6907992502052 -> 6907992100272" + }, + "6907992507385": { + "map_to": "6907992507095", + "description": "条码映射:6907992507385 -> 6907992507095" + }, + "6973726149671": { + "map_to": "6973726149657", + "description": "条码映射:6973726149671 -> 6973726149657" + }, + "6977426410574": { + "map_to": "6977426410567", + "description": "条码映射:6977426410574 -> 6977426410567" + }, + "6973726149688": { + "map_to": "6973726149664", + "description": "条码映射:6973726149688 -> 6973726149664" + }, + "6935205322012": { + "map_to": "6935205320018", + "description": "条码映射:6935205322012 -> 6935205320018" + }, + "6943497411024": { + "map_to": "6943497411017", + "description": "条码映射:6943497411024 -> 6943497411017" + }, + "6921734968821": { + "map_to": "6921734968814", + "description": "条码映射:6921734968821 -> 6921734968814" + }, + "6921734968258": { + "map_to": "6921734968241", + "description": "条码映射:6921734968258 -> 6921734968241" + }, + "6921734968180": { + "map_to": "6921734968173", + "description": "条码映射:6921734968180 -> 6921734968173" + }, + "6921734908735": { + "map_to": "6935205372772", + "description": "条码映射:6921734908735 -> 6935205372772" + }, + "6923644248222": { + "map_to": "6923644248208", + "description": "条码映射:6923644248222 -> 6923644248208" + }, + "6902083881122": { + "map_to": "6902083881085", + "description": "条码映射:6902083881122 -> 6902083881085" + }, + "6907992501857": { + "map_to": "6907992500010", + "description": "条码映射:6907992501857 -> 6907992500010" + }, + "6902083891015": { + "map_to": "6902083890636", + "description": "条码映射:6902083891015 -> 6902083890636" + 
}, + "6923450605240": { + "map_to": "6923450605226", + "description": "条码映射:6923450605240 -> 6923450605226" + }, + "6923450605196": { + "map_to": "6923450614624", + "description": "条码映射:6923450605196 -> 6923450614624" + }, + "6923450665213": { + "map_to": "6923450665206", + "description": "条码映射:6923450665213 -> 6923450665206" + }, + "6923450666821": { + "map_to": "6923450666838", + "description": "条码映射:6923450666821 -> 6923450666838" + }, + "6923450661505": { + "map_to": "6923450661499", + "description": "条码映射:6923450661505 -> 6923450661499" + }, + "6923450676103": { + "map_to": "6923450676097", + "description": "条码映射:6923450676103 -> 6923450676097" + }, + "6923450614631": { + "map_to": "6923450614624", + "description": "条码映射:6923450614631 -> 6923450614624" + }, + "6901424334174": { + "map_to": "6973730760015", + "description": "条码映射:6901424334174 -> 6973730760015" + }, + "6958620703716": { + "map_to": "6958620703907", + "description": "条码映射:6958620703716 -> 6958620703907" + }, + "6937003706322": { + "map_to": "6937003703833", + "description": "条码映射:6937003706322 -> 6937003703833" + }, + "6950783203494": { + "map_to": "6950873203494", + "description": "条码映射:6950783203494 -> 6950873203494" + }, + "6907992501871": { + "map_to": "6907992500010", + "description": "条码映射:6907992501871 -> 6907992500010" + }, + "6907992501864": { + "map_to": "6907992100012", + "description": "条码映射:6907992501864 -> 6907992100012" + }, + "6923644264192": { + "map_to": "6923644264116", + "description": "条码映射:6923644264192 -> 6923644264116" + }, + "6923450667316": { + "map_to": "69042386", + "description": "条码映射:6923450667316 -> 69042386" + }, + "6923450653012": { + "map_to": "69021343", + "description": "条码映射:6923450653012 -> 69021343" + }, + "6923644295844": { + "map_to": "6923644285036", + "description": "条码映射:6923644295844 -> 6923644285036" + }, + "6907992513157": { + "map_to": "6907992513195", + "description": "条码映射:6907992513157 -> 6907992513195" + }, + "6902083893842": { + "map_to": 
"6902083907150", + "description": "条码映射:6902083893842 -> 6902083907150" + }, + "6902083904685": { + "map_to": "6902083905217", + "description": "条码映射:6902083904685 -> 6902083905217" + }, + "6917878036849": { + "map_to": "6917878036847", + "description": "条码映射:6917878036849 -> 6917878036847" + }, + "6903979000078": { + "map_to": "6903979000061", + "description": "条码映射:6903979000078 -> 6903979000061" + }, + "6937003706353": { + "map_to": "6937003706360", + "description": "条码映射:6937003706353 -> 6937003706360" + }, + "6923644242961": { + "map_to": "6907992100043", + "description": "条码映射:6923644242961 -> 6907992100043" + }, + "6923644258382": { + "map_to": "6923644252823", + "description": "条码映射:6923644258382 -> 6923644252823" + }, + "6923450657430": { + "map_to": "69029110", + "description": "条码映射:6923450657430 -> 69029110" + }, + "6923450660232": { + "map_to": "6923450690123", + "description": "条码映射:6923450660232 -> 6923450690123" + }, + "6923450657614": { + "map_to": "6923450657607", + "description": "条码映射:6923450657614 -> 6923450657607" + }, + "6972556000022": { + "map_to": "6977826050028", + "description": "条码映射:6972556000022 -> 6977826050028" + }, + "6949352266280": { + "map_to": "6949352266273", + "description": "条码映射:6949352266280 -> 6949352266273" + }, + "6925019900087": { + "multiplier": 10, + "target_unit": "瓶", + "description": "特殊处理:数量*10,单位转换为瓶" + }, + "6921168593804": { + "multiplier": 30, + "target_unit": "瓶", + "description": "NFC产品特殊处理:每箱30瓶" + }, + "6901826888138": { + "multiplier": 30, + "target_unit": "瓶", + "fixed_price": 3.7333333333333334, + "specification": "1*30", + "description": "特殊处理: 规格1*30,数量*30,单价=112/30" + }, + "6958620703907": { + "multiplier": 14, + "target_unit": "个", + "specification": "1*14", + "description": "友臣肉松,1盒14个" + }, + "6921734933485": { + "multiplier": 12, + "target_unit": "支", + "specification": "1*12", + "description": "得力铅笔" + }, + "6901826888244": { + "multiplier": 30, + "target_unit": "对", + "specification": "1*30", 
+ "description": "南孚电池" + } +} \ No newline at end of file diff --git a/config/config.ini b/config/config.ini new file mode 100644 index 0000000..453df88 --- /dev/null +++ b/config/config.ini @@ -0,0 +1,40 @@ +[API] +api_key = +secret_key = +timeout = 30 +max_retries = 3 +retry_delay = 2 +api_url = https://aip.baidubce.com/rest/2.0/ocr/v1/table +token_url = https://aip.baidubce.com/oauth/2.0/token +form_ocr_url = https://aip.baidubce.com/rest/2.0/solution/v1/form_ocr/get_request_result + +[Paths] +input_folder = data/input +output_folder = data/output +temp_folder = data/temp +template_folder = templates +template_file = 银豹-采购单模板.xls +processed_record = data/processed_files.json +data_dir = data +product_db = data/product_cache.db + +[Performance] +max_workers = 4 +batch_size = 5 +skip_existing = true + +[File] +allowed_extensions = .jpg,.jpeg,.png,.bmp +excel_extension = .xlsx +max_file_size_mb = 4 + +[Templates] +purchase_order = 银豹-采购单模板.xls +item_data = 商品资料.xlsx + +[Gitea] +base_url = https://gitea.94kan.cn +owner = houhuan +repo = yixuan-sync-data +token = + diff --git a/config/suppliers_config.json b/config/suppliers_config.json new file mode 100644 index 0000000..6ad452b --- /dev/null +++ b/config/suppliers_config.json @@ -0,0 +1,237 @@ +{ + "suppliers": [ + { + "name": "蓉城易购", + "description": "蓉城易购供应商订单处理", + "filename_patterns": [ + "*蓉城*", + "*rongcheng*", + "*易*" + ], + "content_indicators": [ + "蓉城易购", + "商品编码", + "订货数量" + ], + "column_mapping": { + "商品条码(小条码)": "barcode", + "商品名称": "name", + "规格": "specification", + "订购数量(小单位)": "quantity", + "单位": "unit", + "单价(小单位)": "unit_price", + "优惠后金额(小单位)": "total_price", + "备注": "category", + "行号": "supplier" + }, + "cleaning_rules": [ + { + "type": "remove_rows", + "condition": "订货数量 == 0 or 订货数量.isna()" + }, + { + "type": "fill_na", + "columns": [ + "unit_price" + ], + "value": 0 + } + ], + "calculations": [ + { + "type": "multiply", + "source_column": "quantity", + "target_column": "quantity", + "factor": 
1 + } + ], + "output_suffix": "_蓉城易购_银豹采购单", + "header_row": 2, + "rules": [ + { + "type": "split_quantity_unit", + "source": "订购数量(小单位)" + }, + { + "type": "extract_spec_from_name", + "source": "商品名称" + }, + { + "type": "normalize_unit", + "target": "unit", + "map": { + "箱": "件", + "提": "件", + "盒": "件" + } + }, + { + "type": "compute_quantity_from_total" + }, + { + "type": "mark_gift" + }, + { + "type": "fill_missing", + "fills": { + "unit": "瓶" + } + } + ], + "output_templates": [ + "templates/银豹-采购单模板.xls" + ], + "current_template_index": 0 + }, + { + "name": "通用食品供应商", + "description": "通用食品类供应商订单", + "filename_patterns": [ + "*食品*", + "*配送*", + "*供货*" + ], + "content_indicators": [ + "产品条码", + "订购量", + "进货价" + ], + "column_mapping": { + "产品条码": "barcode", + "产品名称": "name", + "订购量": "quantity", + "进货价": "unit_price" + }, + "cleaning_rules": [ + { + "type": "convert_type", + "columns": [ + "unit_price" + ], + "target_type": "float" + }, + { + "type": "fill_na", + "columns": [ + "barcode", + "name", + "quantity" + ], + "value": 0 + } + ], + "output_suffix": "_食品供应商_银豹采购单", + "rules": [ + { + "type": "split_quantity_unit", + "source": "订购量" + }, + { + "type": "extract_spec_from_name", + "source": "产品名称" + }, + { + "type": "normalize_unit", + "target": "unit", + "map": { + "箱": "件", + "提": "件", + "盒": "件" + } + }, + { + "type": "compute_quantity_from_total" + }, + { + "type": "mark_gift" + }, + { + "type": "fill_missing", + "fills": { + "unit": "瓶" + } + } + ], + "output_templates": [ + "templates/银豹-采购单模板.xls" + ], + "current_template_index": 0 + }, + { + "name": "农夫山泉", + "description": "", + "filename_patterns": [], + "content_indicators": [], + "column_mapping": { + "条形码": "barcode", + "商品名称": "name", + "销售价": "unit_price", + "订单金额": "total_price", + "Unnamed: 0": "supplier", + "备注": "brand" + }, + "header_row": 0, + "rules": [ + { + "type": "split_quantity_unit", + "source": "订单数量" + }, + { + "type": "extract_spec_from_name", + "source": "name" + }, + { + 
"type": "normalize_unit", + "target": "unit", + "map": { + "箱": "件", + "提": "件", + "盒": "件" + } + }, + { + "type": "compute_quantity_from_total" + }, + { + "type": "mark_gift" + }, + { + "type": "fill_missing", + "fills": { + "unit": "瓶" + } + } + ], + "dictionary": { + "ignore_words": [ + "白膜", + "彩膜", + "赠品" + ], + "unit_synonyms": { + "箱": "件", + "提": "件", + "盒": "件", + "瓶": "瓶" + }, + "pack_multipliers": { + "件": 24, + "箱": 24, + "提": 12, + "盒": 10 + }, + "name_patterns": [ + "(\\d+(?:\\.\\d+)?)(ml|mL|ML|l|L|升|毫升)[*×xX](\\d+)", + "(\\d+)[*×xX](\\d+)瓶", + "(\\d{2,3}).*?(\\d{1,3})" + ], + "default_unit": "瓶", + "default_package_quantity": 1 + }, + "output_templates": [ + "templates/银豹-采购单模板.xls" + ], + "current_template_index": 0 + } + ] +} diff --git a/docs/SYSTEM_ARCHITECTURE.md b/docs/SYSTEM_ARCHITECTURE.md new file mode 100644 index 0000000..c6d3521 --- /dev/null +++ b/docs/SYSTEM_ARCHITECTURE.md @@ -0,0 +1,208 @@ +# OCR 订单处理系统 - 系统架构文档 (v2.2) + +本文件详述了“OCR 订单处理系统”的技术架构、业务流向、数据模型及部署方案。 + +## 1. 系统整体架构图 (System Overall Architecture) + +```mermaid +graph TB + subgraph 用户交互层 + UI[启动器.py / Tkinter GUI] + CLI[headless_api.py / CLI] + end + + subgraph 核心业务逻辑层 + OS[OrderService / 订单调度] + OCR[OCRService / 图片识别] + SSS[SpecialSuppliersService / 特殊供应商处理] + TS[TobaccoService / 烟草处理] + EP[ExcelProcessor / 标准化转换] + end + + subgraph 基础设施与存储 + CONFIG[ConfigManager / JSON 配置] + FS[FileSystem / Excel 数据存储] + LOG[QueueLogger / 异步日志队列] + end + + subgraph 第三方集成 + OPENCLAW[OpenClaw 自动化平台] + POSPAL[银豹 POS 系统 (导出模板)] + end + + UI --> OS + CLI --> OS + OPENCLAW -- 调用 --> CLI + OS --> OCR + OS --> SSS + OS --> TS + OS --> EP + EP --> FS + EP --> CONFIG + SSS --> EP + TS --> EP + OS -- 验证 --> FS +``` + +### 图例说明 +- **用户交互层**:支持桌面 GUI 操作及专为 OpenClaw 设计的无界面 API 接入。 +- **核心业务层**:各服务高度解耦,通过 `OrderService` 进行智能路由分发。 +- **存储层**:系统采用“文件即数据库”的设计,利用 Excel 存储模板和商品资料,JSON 存储映射关系。 +- **第三方集成**:与 OpenClaw 平台通过 CLI 接口对接,最终生成符合银豹 POS 要求的采购单。 + +--- + +## 2. 
核心业务逻辑流程图 (Core Business Logic) + +以“智能订单识别与预处理”为例: + +```mermaid +sequenceDiagram + participant User as 用户/OpenClaw + participant OS as OrderService + participant SSS as SpecialSuppliersService + participant TS as TobaccoService + participant EP as ExcelProcessor + + User->>OS: 提交 Excel 文件 + OS->>OS: 扫描前50行内容特征 + + alt 包含 "RCDH" + OS->>SSS: 路由至蓉城易购预处理 + SSS->>SSS: 按 E, N, Q, S 列强制清洗 + SSS-->>OS: 返回清洗后的临时文件 + else 包含 "专卖证号" + OS->>TS: 路由至烟草专用预处理 + TS->>TS: 数量*10 / 单价/10 / B,E,G,H列映射 + TS-->>OS: 返回清洗后的临时文件 + else 包含 "杨碧月" + OS->>SSS: 路由至杨碧月列对齐流程 + SSS-->>OS: 返回标准列临时文件 + else 通用格式 + OS->>OS: 直接跳过预处理 + end + + OS->>EP: 执行标准条码映射与模板填充 + EP->>EP: 校验单价 (与商品资料比对) + EP-->>User: 输出最终银豹采购单 (data/result) +``` + +### 技术注解 +- **智能指纹识别**:通过 `header=None` 读取前 50 行,避免了因标题行位置不固定导致的识别失败。 +- **原子化预处理**:每个供应商逻辑独立,预处理结果均为统一格式的中间文件,确立了系统的可扩展性。 + +--- + +## 3. 技术架构分层图 (Layered Architecture) + +| 分层 | 技术栈 / 组件 | 功能描述 | +| :--- | :--- | :--- | +| **表现层 (Presentation)** | Tkinter, headless_api.py | 桌面 GUI 交互与 OpenClaw 命令行接口 | +| **业务逻辑层 (Business)** | Python 3.x, Pandas, OCRService | 核心数据清洗、条码分裂、供应商特征识别 | +| **数据访问层 (Data)** | Pandas (Excel Engine), Json | 对 Excel 模板、映射表、用户设置的读写 | +| **基础设施层 (Infrastructure)** | Queue, Logging, PyInstaller | 异步日志分发、全局错误处理、EXE 打包工具 | + +--- + +## 4. 数据架构设计 (Data Architecture) + +系统未采用传统关系型数据库,而是基于 **JSON + Excel** 的混合存储架构。 + +### 4.1 表间关系示意 (JSON Mapping) +```mermaid +erDiagram + CONFIG_JSON ||--o{ BARCODE_MAPPING_JSON : "存储映射" + BARCODE_MAPPING_JSON { + string original_barcode "OCR识别出的原始条码" + string target_barcode "系统目标条码" + } + ITEM_DATA_EXCEL ||--o{ PURCHASE_ORDER_EXCEL : "验证单价" + ITEM_DATA_EXCEL { + string barcode "条码 (主键)" + float cost_price "进货价" + } +``` + +### 4.2 存储方案 +- **映射关系**:`barcode_mappings.json`。支持运行时动态更新,通过 `headless_api.py --update-mapping` 修改。 +- **业务数据**:`templates/商品资料.xlsx`。作为单价校验的权威数据源。 + +--- + +## 5. 
微服务与模块化设计 (Microservices & Modularity) + +虽然系统目前采用单体架构(Monolithic Architecture)以适配桌面部署环境,但在逻辑上采用了**微服务式的模块化设计**: + +- **服务拆分**:每个供应商逻辑(Rongcheng, Tobacco, YangBiyue)都是独立的类,具备高度自治性。 +- **解耦机制**:通过统一的 `preprocess` 契约(输入:原始文件,输出:清洗后文件)进行交互,未来可轻松迁移至独立服务。 +- **进程隔离**:GUI 主进程与业务处理线程通过 `queue.Queue` 进行解耦,确保处理逻辑不阻塞用户界面。 + +--- + +## 6. 部署架构图 (Deployment) + +```mermaid +graph LR + subgraph 生产服务器 (Windows) + APP[orc-order-v2.exe] + DATA[data/ 目录] + LOGS[logs/ 目录] + end + + subgraph 自动化平台 + OC[OpenClaw] + end + + OC -- 命令行调用 --> APP + APP -- 读写 --> DATA + APP -- 记录 --> LOGS +``` + +### 部署要点 +- **便携化**:通过 PyInstaller 将 Python 运行环境与依赖打包,实现单文件/单目录部署。 +- **路径无关性**:系统内部通过 `os.path.abspath` 动态计算路径,支持安装在任意盘符。 + +--- + +## 7. 安全架构图 (Security) + +```mermaid +graph TD + A[外部输入] --> B{文件类型校验} + B -- 非图片/Excel --> C[拒绝处理] + B -- 图片/Excel --> D[清洗逻辑] + D --> E{单价偏差校验} + E -- 差值 > 1.0 --> F[生成警告日志/弹窗] + E -- 正常 --> G[生成采购单] + G --> H[日志埋点与审计] +``` + +### 安全策略 +- **数据隔离**:所有处理后的文件存放在 `data/output` 和 `data/result`,不修改原始输入文件。 +- **权限控制**:系统运行于用户权限下,利用 Windows 文件系统权限保护配置文件。 + +--- + +## 8. 技术债务与优化建议 (Tech Debt & Optimization) + +### 8.1 当前技术债务 +1. **并发限制**:目前为单进程串行处理,面对超大规模订单(万行级)可能存在阻塞。 +2. **持久化局限**:使用 JSON 存储映射关系在条码量达到万级时,查询性能会下降。 +3. **环境依赖**:OCR 引擎高度依赖 Tesseract/PaddleOCR 等本地二进制库,部署复杂。 + +### 8.2 单点故障风险 (SPOF Analysis) +1. **本地环境强依赖**:所有 OCR 与 Excel 处理均在单一 Windows 节点,若该节点故障,OpenClaw 对接将完全中断。 +2. **核心模板丢失**:`templates/` 下的商品资料或采购单模板缺失会导致全流程崩溃。 +3. 
**OCR 精度波动**:OCR 结果受图片质量影响,若 OCR 识别条码错误且无映射表,则该行数据将丢失。 + +### 8.3 架构优化建议方案 +- **容灾备份**:建议将 `templates/` 和 `barcode_mappings.json` 定期备份至远程 Git 仓库(如 Gitea)。 +- **分布式识别**:引入 PaddleOCR 服务端,支持多节点并发 OCR 识别,减少本地算力依赖。 +- **配置热更新**:支持从远程 URL 加载 `barcode_mappings.json`,实现多机条码库同步。 +- **数据回退机制**:增加中间文件持久化策略,允许在处理失败时手动干预已清洗的 Excel。 + +--- +*附注:本文档图表均采用 Mermaid 标准编写,可直接在 VS Code (需安装 Mermaid 插件) 或 [Mermaid Live Editor](https://mermaid.live/) 中实时渲染并导出为高清 PNG/SVG 格式。* + +--- +*文档版本:2.2.0 | 生成日期:2026-03-31* diff --git a/headless_api.py b/headless_api.py new file mode 100644 index 0000000..48adf1e --- /dev/null +++ b/headless_api.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +OCR订单处理系统 - 无界面自动化接口 +----------------------------- +专为与 openclaw 等自动化平台对接设计。 +处理流程:输入图片 -> OCR识别 -> 数据清洗 -> 价格校验 -> 输出结果路径。 +""" + +import os +import sys +import time +import argparse +import json +from pathlib import Path +from typing import Optional, List, Dict + +# 添加当前目录到路径 +sys.path.append(os.path.dirname(os.path.abspath(__file__))) + +from app.config.settings import ConfigManager +from app.services.ocr_service import OCRService +from app.services.order_service import OrderService +from app.services.tobacco_service import TobaccoService +from app.services.special_suppliers_service import SpecialSuppliersService +from app.core.utils.log_utils import get_logger, set_log_level + +logger = get_logger("HeadlessAPI") + +def get_latest_file(directory: str, extensions: List[str]) -> Optional[str]: + """获取目录中最新的指定后缀文件""" + dir_path = Path(directory) + if not dir_path.exists(): + return None + + files = [] + for ext in extensions: + files.extend(dir_path.glob(f"*{ext}")) + files.extend(dir_path.glob(f"*{ext.upper()}")) + + if not files: + return None + + latest_file = max(files, key=lambda p: p.stat().st_mtime) + return str(latest_file) + +def update_barcode_mapping(barcode: str, target_barcode: str = None, multiplier: float = None, unit: str = None, price: float = None, spec: str = 
None): + """更新条码映射或特殊处理配置""" + try: + config_path = os.path.join("config", "barcode_mappings.json") + mappings = {} + if os.path.exists(config_path): + with open(config_path, 'r', encoding='utf-8') as f: + mappings = json.load(f) + + # 获取或创建该条码的配置 + config = mappings.get(barcode, {}) + + if target_barcode: + config["map_to"] = target_barcode + config["description"] = config.get("description", "") + f" 条码映射 -> {target_barcode}" + + if multiplier is not None: + config["multiplier"] = multiplier + config["description"] = config.get("description", "") + f" 数量倍数*{multiplier}" + + if unit: + config["target_unit"] = unit + + if price is not None: + config["fixed_price"] = price + + if spec: + config["specification"] = spec + + if not config.get("description"): + config["description"] = f"特殊条码配置: {barcode}" + + mappings[barcode] = config + + with open(config_path, 'w', encoding='utf-8') as f: + json.dump(mappings, f, ensure_ascii=False, indent=2) + + logger.info(f"成功更新条码配置: {barcode} -> {config}") + return True + except Exception as e: + logger.error(f"更新条码配置失败: {e}") + return False + +def run_pipeline(args): + """运行处理流水线""" + try: + config_manager = ConfigManager() + order_service = OrderService(config_manager) + start_time = time.perf_counter() + final_excel = None + + input_folder = config_manager.get('Paths', 'input_folder', fallback='data/input') + output_folder = config_manager.get('Paths', 'output_folder', fallback='data/output') + + # 1. 
处理条码映射更新 + if args.update_mapping: + if not args.barcode: + print("ERROR: --barcode is required for --update-mapping", file=sys.stderr) + return None + + # 至少需要一个更新项 + if not any([args.target, args.multiplier, args.unit, args.price, args.spec]): + print("ERROR: At least one update option (--target, --multiplier, --unit, --price, --spec) is required", file=sys.stderr) + return None + + if update_barcode_mapping(args.barcode, args.target, args.multiplier, args.unit, args.price, args.spec): + print(f"SUCCESS: Barcode configuration updated for {args.barcode}") + return "MAPPING_UPDATED" + return None + + # 2. 烟草公司处理 (显式指定) + if args.tobacco: + input_path = args.input or get_latest_file(output_folder, [".xlsx", ".xls"]) + if not input_path: + print("ERROR: No tobacco order file found.", file=sys.stderr) + return None + logger.info(f"开始显式处理烟草订单: {input_path}") + # 这里的 process_tobacco_order 会调用 preprocess 并生成银豹格式 + tobacco_service = TobaccoService(config_manager) + final_excel = tobacco_service.process_tobacco_order(input_path) + + # 3. 蓉城易购处理 (显式指定) + elif args.rongcheng: + input_path = args.input or get_latest_file(output_folder, [".xlsx", ".xls"]) + if not input_path: + print("ERROR: No Rongcheng Yigou order file found.", file=sys.stderr) + return None + logger.info(f"开始显式处理蓉城易购订单: {input_path}") + special_service = SpecialSuppliersService(config_manager) + final_excel = special_service.process_rongcheng_yigou(input_path) + + # 4. 普通 Excel 处理 (支持自动识别烟草/蓉城/杨碧月) + elif args.excel: + input_path = args.input or get_latest_file(input_folder, [".xlsx", ".xls"]) + if not input_path: + print("ERROR: No Excel file found in input.", file=sys.stderr) + return None + logger.info(f"开始处理 Excel (支持智能识别): {input_path}") + # OrderService.process_excel 内部会自动调用 _check_special_preprocess + final_excel = order_service.process_excel(input_path) + + # 5. 
智能处理 (默认逻辑:自动判断图片还是 Excel) + else: + input_path = args.input or get_latest_file(input_folder, [".jpg", ".jpeg", ".png", ".bmp", ".xlsx", ".xls"]) + if not input_path: + print(f"ERROR: No input file found in {input_folder}.", file=sys.stderr) + return None + + ext = os.path.splitext(input_path)[1].lower() + if ext in [".xlsx", ".xls"]: + logger.info(f"智能识别为 Excel 文件,开始处理: {input_path}") + final_excel = order_service.process_excel(input_path) + else: + logger.info(f"智能识别为图片文件,开始 OCR 处理: {input_path}") + ocr_service = OCRService(config_manager) + excel_intermediate = ocr_service.process_image(input_path) + if excel_intermediate: + final_excel = order_service.process_excel(excel_intermediate) + + # 6. 后续处理 (校验与输出) + if final_excel: + # 单价校验 + discrepancies = order_service.validate_unit_price(final_excel) + if discrepancies: + print(f"WARNING: Price validation found {len(discrepancies)} issues:", file=sys.stderr) + for d in discrepancies: + print(f" - {d}", file=sys.stderr) + + duration = time.perf_counter() - start_time + logger.info(f"处理完成,耗时: {duration:.2f}s") + + # 输出最终路径 + abs_path = os.path.abspath(final_excel) + print(abs_path) + return abs_path + else: + print("ERROR: Processing failed.", file=sys.stderr) + return None + + except Exception as e: + import traceback + print(f"CRITICAL ERROR: {str(e)}", file=sys.stderr) + traceback.print_exc(file=sys.stderr) + return None + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="OCR订单处理系统 - 无界面自动化接口") + parser.add_argument('input', nargs='?', help='输入文件路径 (图片或Excel)') + + group = parser.add_mutually_exclusive_group() + group.add_argument('--excel', action='store_true', help='处理普通 Excel 文件') + group.add_argument('--tobacco', action='store_true', help='处理烟草公司订单') + group.add_argument('--rongcheng', action='store_true', help='处理蓉城易购订单') + group.add_argument('--update-mapping', action='store_true', help='更新条码映射') + + parser.add_argument('--barcode', help='待映射的原始条码 (用于 --update-mapping)') + 
parser.add_argument('--target', help='目标条码 (用于 --update-mapping)') + parser.add_argument('--multiplier', type=float, help='数量倍数 (例如箱转瓶填写30)') + parser.add_argument('--unit', help='目标单位 (例如"瓶")') + parser.add_argument('--price', type=float, help='固定单价') + parser.add_argument('--spec', help='固定规格 (例如"1*30")') + + args = parser.parse_args() + result = run_pipeline(args) + sys.exit(0 if result else 1) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..609355e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +configparser>=5.0.0 +python-dotenv>=1.0.0 +numpy>=1.19.0 +openpyxl>=3.0.0 +pandas>=1.3.0 +pathlib>=1.0.1 +requests>=2.25.0 +xlrd>=2.0.0,<2.1.0 +xlutils>=2.0.0 +xlwt>=1.3.0 \ No newline at end of file diff --git a/templates/商品资料.xlsx b/templates/商品资料.xlsx new file mode 100644 index 0000000..3da82e9 Binary files /dev/null and b/templates/商品资料.xlsx differ diff --git a/templates/银豹-采购单模板.xls b/templates/银豹-采购单模板.xls new file mode 100644 index 0000000..a8fb1bc Binary files /dev/null and b/templates/银豹-采购单模板.xls differ diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_calculator.py b/tests/test_calculator.py new file mode 100644 index 0000000..b6e734f --- /dev/null +++ b/tests/test_calculator.py @@ -0,0 +1,222 @@ +"""app.core.handlers.calculator 单元测试""" + +import pytest +import pandas as pd +import numpy as np + +from app.core.handlers.calculator import DataCalculator + + +@pytest.fixture +def sample_df(): + return pd.DataFrame({ + 'price': [10.0, 20.0, 30.0], + 'quantity': [2, 5, 10], + 'name': ['A', 'B', 'C'], + }) + + +class TestMultiply: + def test_basic_multiply(self, sample_df): + calc = DataCalculator() + calc.add_rule('multiply', source_column='price', target_column='total', factor=2) + result = calc.calculate(sample_df) + assert list(result['total']) == [20.0, 40.0, 60.0] + + def test_multiply_missing_source(self, sample_df): + calc = DataCalculator() + 
calc.add_rule('multiply', source_column='nonexistent', target_column='total', factor=2) + result = calc.calculate(sample_df) + assert 'total' not in result.columns + + def test_multiply_default_factor(self, sample_df): + calc = DataCalculator() + calc.add_rule('multiply', source_column='price', target_column='copy', factor=1) + result = calc.calculate(sample_df) + assert list(result['copy']) == [10.0, 20.0, 30.0] + + def test_convenience_method(self, sample_df): + calc = DataCalculator() + calc.multiply('price', 'total', 3) + result = calc.calculate(sample_df) + assert list(result['total']) == [30.0, 60.0, 90.0] + + +class TestDivide: + def test_basic_divide(self, sample_df): + calc = DataCalculator() + calc.add_rule('divide', source_column='price', target_column='half', divisor=2) + result = calc.calculate(sample_df) + assert list(result['half']) == [5.0, 10.0, 15.0] + + def test_divide_by_zero(self, sample_df): + calc = DataCalculator() + calc.add_rule('divide', source_column='price', target_column='half', divisor=0) + result = calc.calculate(sample_df) + assert 'half' not in result.columns + + def test_divide_missing_source(self, sample_df): + calc = DataCalculator() + calc.add_rule('divide', source_column='nonexistent', target_column='x', divisor=2) + result = calc.calculate(sample_df) + assert 'x' not in result.columns + + +class TestAdd: + def test_add_columns(self): + df = pd.DataFrame({'a': [1, 2, 3], 'b': [10, 20, 30]}) + calc = DataCalculator() + calc.add_rule('add', columns=['a', 'b'], target_column='sum') + result = calc.calculate(df) + assert list(result['sum']) == [11, 22, 33] + + def test_add_constant(self): + df = pd.DataFrame({'a': [1, 2, 3]}) + calc = DataCalculator() + calc.add_rule('add', target_column='a', constant=100) + result = calc.calculate(df) + assert list(result['a']) == [101, 102, 103] + + def test_add_columns_with_constant(self): + df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}) + calc = DataCalculator() + calc.add_rule('add', 
columns=['a', 'b'], target_column='total', constant=10) + result = calc.calculate(df) + assert list(result['total']) == [14, 16] + + def test_add_string_column(self): + df = pd.DataFrame({'a': [1, 2]}) + calc = DataCalculator() + calc.add_rule('add', columns='a', target_column='total') + result = calc.calculate(df) + assert list(result['total']) == [1, 2] + + +class TestSubtract: + def test_subtract_two_columns(self): + df = pd.DataFrame({'income': [100, 200], 'cost': [30, 80]}) + calc = DataCalculator() + calc.add_rule('subtract', minuend='income', subtrahend='cost', target_column='profit') + result = calc.calculate(df) + assert list(result['profit']) == [70, 120] + + def test_subtract_constant(self): + df = pd.DataFrame({'price': [100, 200]}) + calc = DataCalculator() + calc.add_rule('subtract', minuend='price', target_column='discounted', constant=10) + result = calc.calculate(df) + assert list(result['discounted']) == [90, 190] + + def test_subtract_missing_minuend(self): + df = pd.DataFrame({'a': [1, 2]}) + calc = DataCalculator() + calc.add_rule('subtract', minuend='nonexistent', target_column='x', constant=1) + result = calc.calculate(df) + assert 'x' not in result.columns + + +class TestFormula: + def test_basic_formula(self, sample_df): + calc = DataCalculator() + calc.add_rule('formula', formula='price * quantity', target_column='total') + result = calc.calculate(sample_df) + assert list(result['total']) == [20.0, 100.0, 300.0] + + def test_invalid_formula(self, sample_df): + calc = DataCalculator() + calc.add_rule('formula', formula='nonexistent + 1', target_column='x') + result = calc.calculate(sample_df) + # formula fails, original df returned + assert 'x' not in result.columns + + def test_formula_missing_target(self, sample_df): + calc = DataCalculator() + calc.add_rule('formula', formula='price * 2') + result = calc.calculate(sample_df) + # no target_column, nothing happens + assert list(result['price']) == [10.0, 20.0, 30.0] + + +class TestRound: + 
def test_round_specific_columns(self): + df = pd.DataFrame({'a': [1.234, 2.567], 'b': [3.1, 4.9]}) + calc = DataCalculator() + calc.add_rule('round', columns=['a'], decimals=1) + result = calc.calculate(df) + assert list(result['a']) == [1.2, 2.6] + assert list(result['b']) == [3.1, 4.9] # unchanged + + def test_round_all_numeric(self): + df = pd.DataFrame({'a': [1.234, 2.567], 'b': [3.111, 4.999]}) + calc = DataCalculator() + calc.add_rule('round', decimals=0) + result = calc.calculate(df) + assert list(result['a']) == [1.0, 3.0] + assert list(result['b']) == [3.0, 5.0] + + def test_round_string_column_skipped(self): + df = pd.DataFrame({'name': ['a', 'b'], 'val': [1.5, 2.5]}) + calc = DataCalculator() + calc.add_rule('round', columns=['name', 'val'], decimals=0) + result = calc.calculate(df) + assert list(result['val']) == [2.0, 2.0] + + +class TestSum: + def test_sum_columns_to_target(self): + df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}) + calc = DataCalculator() + calc.add_rule('sum', columns=['a', 'b'], target_column='total') + result = calc.calculate(df) + assert list(result['total']) == [4, 6] + + def test_sum_missing_columns(self): + df = pd.DataFrame({'a': [1, 2]}) + calc = DataCalculator() + calc.add_rule('sum', columns=['a', 'missing'], target_column='total') + result = calc.calculate(df) + assert list(result['total']) == [1, 2] + + +class TestChaining: + def test_multiple_rules(self, sample_df): + calc = DataCalculator() + calc.add_rule('multiply', source_column='price', target_column='total', factor=2) + calc.add_rule('add', columns=['total', 'quantity'], target_column='grand') + result = calc.calculate(sample_df) + assert list(result['total']) == [20.0, 40.0, 60.0] + assert list(result['grand']) == [22.0, 45.0, 70.0] + + def test_chaining_convenience(self, sample_df): + calc = DataCalculator() + calc.multiply('price', 'total', 2).round_columns('total', 0) + result = calc.calculate(sample_df) + assert list(result['total']) == [20.0, 40.0, 
60.0] + + +class TestEdgeCases: + def test_empty_dataframe(self): + df = pd.DataFrame({'a': pd.Series([], dtype=float)}) + calc = DataCalculator() + calc.add_rule('multiply', source_column='a', target_column='b', factor=2) + result = calc.calculate(df) + assert len(result) == 0 + + def test_no_rules(self, sample_df): + calc = DataCalculator() + result = calc.calculate(sample_df) + assert list(result['price']) == [10.0, 20.0, 30.0] + + def test_unknown_rule_type(self, sample_df): + calc = DataCalculator() + calc.add_rule('unknown_op', source_column='price', target_column='x') + result = calc.calculate(sample_df) + # unknown rule is skipped, df unchanged + assert list(result['price']) == [10.0, 20.0, 30.0] + + def test_rule_failure_continues(self, sample_df): + calc = DataCalculator() + calc.add_rule('formula', formula='nonexistent + 1', target_column='x') + calc.add_rule('multiply', source_column='price', target_column='y', factor=2) + result = calc.calculate(sample_df) + assert list(result['y']) == [20.0, 40.0, 60.0] diff --git a/tests/test_column_mapper.py b/tests/test_column_mapper.py new file mode 100644 index 0000000..e2e0019 --- /dev/null +++ b/tests/test_column_mapper.py @@ -0,0 +1,154 @@ +"""app.core.handlers.column_mapper 单元测试""" + +import pytest +import pandas as pd +from app.core.handlers.column_mapper import ColumnMapper + + +class TestStandardColumns: + """STANDARD_COLUMNS 完整性测试""" + + def test_has_all_standard_fields(self): + expected = {'barcode', 'name', 'specification', 'quantity', 'unit', + 'unit_price', 'total_price', 'gift_quantity', + 'category', 'brand', 'supplier'} + assert set(ColumnMapper.STANDARD_COLUMNS.keys()) == expected + + def test_no_empty_alias_lists(self): + for field, aliases in ColumnMapper.STANDARD_COLUMNS.items(): + assert len(aliases) > 0, f"{field} has no aliases" + + def test_barcode_includes_key_names(self): + bc = ColumnMapper.STANDARD_COLUMNS['barcode'] + assert '条码' in bc + assert '商品条码' in bc + assert 'barcode' in bc + + 
def test_gift_quantity_includes_common_names(self): + gq = ColumnMapper.STANDARD_COLUMNS['gift_quantity'] + assert '赠送量' in gq + assert '赠品数量' in gq + + +class TestFindColumn: + """ColumnMapper.find_column 列查找测试""" + + def test_exact_match(self): + cols = ['商品条码', '商品名称', '数量', '单价'] + assert ColumnMapper.find_column(cols, 'barcode') == '商品条码' + + def test_exact_match_standard_english(self): + cols = ['barcode', 'name', 'quantity'] + assert ColumnMapper.find_column(cols, 'barcode') == 'barcode' + + def test_whitespace_match(self): + """列名含空格时应匹配""" + cols = ['名 称', '数 量'] + assert ColumnMapper.find_column(cols, 'name') == '名 称' + assert ColumnMapper.find_column(cols, 'quantity') == '数 量' + + def test_partial_match_substring(self): + """列名包含候选名时应匹配""" + cols = ['商品条码(小条码)', '商品名称'] + assert ColumnMapper.find_column(cols, 'barcode') == '商品条码(小条码)' + + def test_not_found_returns_none(self): + cols = ['日期', '备注', '编号'] + assert ColumnMapper.find_column(cols, 'barcode') is None + + def test_unknown_standard_name_returns_none(self): + cols = ['商品条码'] + assert ColumnMapper.find_column(cols, 'nonexistent_field') is None + + def test_first_match_wins(self): + """多个列都能匹配时返回第一个""" + cols = ['条码', '商品条码', 'barcode'] + assert ColumnMapper.find_column(cols, 'barcode') == '条码' + + def test_case_insensitive(self): + cols = ['Barcode', 'Name'] + assert ColumnMapper.find_column(cols, 'barcode') == 'Barcode' + + def test_all_fields_matchable(self): + """每个标准字段都能找到至少一个匹配""" + cols = [ + '商品条码', '商品名称', '规格', '数量', '单位', + '单价', '金额', '赠送量', '类别', '品牌', '供应商', + ] + for std_name in ColumnMapper.STANDARD_COLUMNS: + result = ColumnMapper.find_column(cols, std_name) + assert result is not None, f"Could not find {std_name} in {cols}" + + +class TestDetectHeaderRow: + """ColumnMapper.detect_header_row 表头检测测试""" + + def test_header_on_first_row(self): + df = pd.DataFrame({ + 'A': ['条码', '123456', '789012'], + 'B': ['数量', '10', '20'], + 'C': ['单价', '5.5', '3.0'], + }) + assert 
ColumnMapper.detect_header_row(df, min_matches=2) == 0 + + def test_header_on_second_row(self): + df = pd.DataFrame({ + 'A': ['备注', '条码', '123456'], + 'B': ['日期', '数量', '10'], + 'C': ['时间', '单价', '5.5'], + }) + assert ColumnMapper.detect_header_row(df, min_matches=2) == 1 + + def test_no_header_returns_minus_one(self): + df = pd.DataFrame({ + 'A': ['aaa', 'bbb', 'ccc'], + 'B': ['ddd', 'eee', 'fff'], + }) + assert ColumnMapper.detect_header_row(df, min_matches=3) == -1 + + def test_empty_dataframe(self): + df = pd.DataFrame() + assert ColumnMapper.detect_header_row(df) == -1 + + def test_max_rows_limits_scan(self): + """表头在第 10 行但 max_rows=5 时应返回 -1""" + data = {f'col{i}': ['x'] * 15 for i in range(3)} + data['col0'][10] = '条码' + data['col1'][10] = '数量' + data['col2'][10] = '单价' + df = pd.DataFrame(data) + assert ColumnMapper.detect_header_row(df, max_rows=5, min_matches=2) == -1 + + +class TestColumnMapperInstance: + """ColumnMapper 实例方法测试""" + + def test_init_with_no_config(self): + mapper = ColumnMapper() + assert mapper.mapping_config == {} + + def test_init_with_custom_config(self): + mapper = ColumnMapper(mapping_config={'barcode': ['我的条码']}) + assert '我的条码' in mapper.custom_mappings + + def test_map_columns_renames(self): + mapper = ColumnMapper() + df = pd.DataFrame({'商品条码': ['123'], '商品名称': ['测试'], '数量': [10]}) + result = mapper.map_columns(df, target_columns=['barcode', 'name', 'quantity']) + assert 'barcode' in result.columns + assert 'name' in result.columns + assert 'quantity' in result.columns + + def test_map_columns_fills_missing(self): + mapper = ColumnMapper() + df = pd.DataFrame({'商品条码': ['123']}) + result = mapper.map_columns(df, target_columns=['barcode', 'quantity']) + assert 'barcode' in result.columns + assert 'quantity' in result.columns + assert result['quantity'].iloc[0] == 0 # default value + + def test_add_custom_mapping(self): + mapper = ColumnMapper() + mapper.add_custom_mapping('barcode', '自定义条码列') + assert '自定义条码列' in 
mapper.reverse_mapping + assert mapper.reverse_mapping['自定义条码列'] == 'barcode' diff --git a/tests/test_data_cleaner.py b/tests/test_data_cleaner.py new file mode 100644 index 0000000..630e220 --- /dev/null +++ b/tests/test_data_cleaner.py @@ -0,0 +1,236 @@ +"""app.core.handlers.data_cleaner 单元测试""" + +import pytest +import pandas as pd + +from app.core.handlers.data_cleaner import DataCleaner + + +@pytest.fixture +def sample_df(): + return pd.DataFrame({ + 'name': [' Alice ', 'Bob', 'Charlie', 'Dave'], + 'age': [25, 30, None, 40], + 'score': [80.5, 90.0, 70.0, 85.0], + 'city': ['Beijing', 'Shanghai', 'Beijing', 'Guangzhou'], + }) + + +class TestFillNa: + def test_fill_na_with_value(self, sample_df): + cleaner = DataCleaner() + cleaner.add_rule('fill_na', columns=['age'], value=0) + result = cleaner.clean(sample_df) + assert result['age'].isna().sum() == 0 + assert result.loc[2, 'age'] == 0 + + def test_fill_na_all_columns(self, sample_df): + cleaner = DataCleaner() + cleaner.add_rule('fill_na', value=-1) + result = cleaner.clean(sample_df) + assert result.isna().sum().sum() == 0 + + def test_fill_na_string_column(self): + df = pd.DataFrame({'a': ['x', None, 'z']}) + cleaner = DataCleaner() + cleaner.add_rule('fill_na', columns=['a'], value='unknown') + result = cleaner.clean(df) + assert result.loc[1, 'a'] == 'unknown' + + def test_convenience_method(self, sample_df): + cleaner = DataCleaner() + cleaner.fill_na(columns='age', value=99) + result = cleaner.clean(sample_df) + assert result.loc[2, 'age'] == 99 + + +class TestRemoveDuplicates: + def test_remove_by_subset(self): + df = pd.DataFrame({ + 'name': ['A', 'B', 'A', 'C'], + 'val': [1, 2, 3, 4], + }) + cleaner = DataCleaner() + cleaner.add_rule('remove_duplicates', subset=['name'], keep='first') + result = cleaner.clean(df) + assert len(result) == 3 + assert list(result['name']) == ['A', 'B', 'C'] + + def test_remove_all_columns(self): + df = pd.DataFrame({ + 'a': [1, 1, 2], + 'b': [10, 10, 20], + }) + cleaner = 
DataCleaner() + cleaner.add_rule('remove_duplicates') + result = cleaner.clean(df) + assert len(result) == 2 + + def test_no_duplicates(self, sample_df): + cleaner = DataCleaner() + cleaner.add_rule('remove_duplicates', subset=['name']) + result = cleaner.clean(sample_df) + assert len(result) == 4 + + +class TestRemoveRows: + def test_remove_by_condition(self, sample_df): + cleaner = DataCleaner() + cleaner.add_rule('remove_rows', condition='age > 25') + result = cleaner.clean(sample_df) + assert len(result) == 2 + + def test_remove_by_values(self, sample_df): + cleaner = DataCleaner() + cleaner.add_rule('remove_rows', columns=['city'], values=['Beijing']) + result = cleaner.clean(sample_df) + assert len(result) == 2 + assert 'Beijing' not in result['city'].values + + def test_remove_no_match(self, sample_df): + cleaner = DataCleaner() + cleaner.add_rule('remove_rows', condition='age > 100') + result = cleaner.clean(sample_df) + assert len(result) == 0 # condition filter: no rows match age > 100 + + def test_convenience_method(self, sample_df): + cleaner = DataCleaner() + cleaner.remove_rows(condition='score < 75') + result = cleaner.clean(sample_df) + assert len(result) == 1 # condition filter: keeps only Charlie (score=70.0) + + +class TestConvertType: + def test_to_float(self): + df = pd.DataFrame({'val': ['1.5', '2.7', 'abc']}) + cleaner = DataCleaner() + cleaner.add_rule('convert_type', columns=['val'], target_type='float') + result = cleaner.clean(df) + assert result['val'].dtype.kind == 'f' + assert result.loc[0, 'val'] == 1.5 + assert pd.isna(result.loc[2, 'val']) + + def test_to_int(self): + df = pd.DataFrame({'val': ['1', '2', '3']}) + cleaner = DataCleaner() + cleaner.add_rule('convert_type', columns=['val'], target_type='int') + result = cleaner.clean(df) + assert result.loc[0, 'val'] == 1 + + def test_to_string(self): + df = pd.DataFrame({'val': [1, 2, 3]}) + cleaner = DataCleaner() + cleaner.add_rule('convert_type', columns=['val'], 
target_type='string') + result = cleaner.clean(df) + assert result.loc[0, 'val'] == '1' + + def test_missing_column_skipped(self, sample_df): + cleaner = DataCleaner() + cleaner.add_rule('convert_type', columns=['nonexistent'], target_type='float') + result = cleaner.clean(sample_df) + assert len(result) == 4 + + +class TestStripWhitespace: + def test_strip_specific_columns(self, sample_df): + cleaner = DataCleaner() + cleaner.add_rule('strip_whitespace', columns=['name']) + result = cleaner.clean(sample_df) + assert result.loc[0, 'name'] == 'Alice' + + def test_strip_all_text(self, sample_df): + cleaner = DataCleaner() + cleaner.add_rule('strip_whitespace') + result = cleaner.clean(sample_df) + assert result.loc[0, 'name'] == 'Alice' + + def test_strip_non_text_skipped(self): + df = pd.DataFrame({'val': [1, 2, 3]}) + cleaner = DataCleaner() + cleaner.add_rule('strip_whitespace', columns=['val']) + result = cleaner.clean(df) + assert list(result['val']) == [1, 2, 3] + + +class TestNormalizeText: + def test_lowercase(self): + df = pd.DataFrame({'name': ['ALICE', 'BOB']}) + cleaner = DataCleaner() + cleaner.add_rule('normalize_text', columns=['name'], lowercase=True) + result = cleaner.clean(df) + assert list(result['name']) == ['alice', 'bob'] + + def test_uppercase(self): + df = pd.DataFrame({'name': ['alice', 'bob']}) + cleaner = DataCleaner() + cleaner.add_rule('normalize_text', columns=['name'], uppercase=True) + result = cleaner.clean(df) + assert list(result['name']) == ['ALICE', 'BOB'] + + def test_replace_map(self): + df = pd.DataFrame({'city': ['BJ', 'SH']}) + cleaner = DataCleaner() + cleaner.add_rule('normalize_text', columns=['city'], replace_map={'BJ': 'Beijing', 'SH': 'Shanghai'}) + result = cleaner.clean(df) + assert list(result['city']) == ['Beijing', 'Shanghai'] + + +class TestValidateData: + def test_validate_logs_but_does_not_modify(self, sample_df): + cleaner = DataCleaner() + cleaner.add_rule('validate_data', columns=['score'], min_value=0, 
max_value=100) + result = cleaner.clean(sample_df) + assert len(result) == 4 + + def test_validate_required(self, sample_df): + cleaner = DataCleaner() + cleaner.add_rule('validate_data', columns=['age'], required=True) + result = cleaner.clean(sample_df) + assert len(result) == 4 + + +class TestChaining: + def test_multiple_rules(self, sample_df): + cleaner = DataCleaner() + cleaner.add_rule('strip_whitespace', columns=['name']) + cleaner.add_rule('fill_na', columns=['age'], value=0) + cleaner.add_rule('convert_type', columns=['age'], target_type='int') + result = cleaner.clean(sample_df) + assert result.loc[0, 'name'] == 'Alice' + assert result['age'].isna().sum() == 0 + assert result.loc[2, 'age'] == 0 + + def test_convenience_chaining(self, sample_df): + cleaner = DataCleaner() + cleaner.strip_whitespace('name').fill_na('age', value=0) + result = cleaner.clean(sample_df) + assert result.loc[0, 'name'] == 'Alice' + assert result.loc[2, 'age'] == 0 + + +class TestEdgeCases: + def test_empty_dataframe(self): + df = pd.DataFrame({'a': pd.Series([], dtype=float)}) + cleaner = DataCleaner() + cleaner.add_rule('fill_na', value=0) + result = cleaner.clean(df) + assert len(result) == 0 + + def test_no_rules(self, sample_df): + cleaner = DataCleaner() + result = cleaner.clean(sample_df) + assert len(result) == 4 + + def test_unknown_rule_type(self, sample_df): + cleaner = DataCleaner() + cleaner.add_rule('unknown_op', columns=['name']) + result = cleaner.clean(sample_df) + assert len(result) == 4 + + def test_rule_failure_continues(self, sample_df): + """A failing rule should not block subsequent rules.""" + cleaner = DataCleaner() + cleaner.add_rule('convert_type', columns=['nonexistent'], target_type='float') + cleaner.add_rule('fill_na', columns=['age'], value=0) + result = cleaner.clean(sample_df) + assert result.loc[2, 'age'] == 0 diff --git a/tests/test_product_db.py b/tests/test_product_db.py new file mode 100644 index 0000000..5b94cbc --- /dev/null +++ 
b/tests/test_product_db.py @@ -0,0 +1,187 @@ +"""app.core.db.product_db 单元测试""" + +import os +import tempfile + +import pytest +import pandas as pd + +from app.core.db.product_db import ProductDatabase + + +@pytest.fixture +def db_dir(): + """临时目录""" + with tempfile.TemporaryDirectory() as d: + yield d + + +@pytest.fixture +def sample_excel(db_dir): + """创建测试用 Excel 文件""" + path = os.path.join(db_dir, '商品资料.xlsx') + df = pd.DataFrame({ + '商品条码': ['6920584471055', '6901028001133', '6925303800013'], + '商品名称': ['农夫山泉550ml', '蒙牛纯牛奶', '可口可乐330ml'], + '进货价': [1.2, 3.5, 1.8], + '单位': ['瓶', '盒', '罐'], + }) + df.to_excel(path, index=False) + return path + + +@pytest.fixture +def db_with_data(db_dir, sample_excel): + """已导入数据的数据库""" + db_path = os.path.join(db_dir, 'product_cache.db') + db = ProductDatabase(db_path, sample_excel) + return db + + +class TestProductDatabaseInit: + """数据库初始化测试""" + + def test_auto_import_on_first_run(self, db_dir, sample_excel): + """首次运行自动从 Excel 导入""" + db_path = os.path.join(db_dir, 'product_cache.db') + assert not os.path.exists(db_path) + + db = ProductDatabase(db_path, sample_excel) + + assert os.path.exists(db_path) + assert db.count() == 3 + + def test_no_reimport_on_existing_db(self, db_dir, sample_excel): + """数据库已存在时不重新导入""" + db_path = os.path.join(db_dir, 'product_cache.db') + + db1 = ProductDatabase(db_path, sample_excel) + assert db1.count() == 3 + + # 删除 Excel 后仍能打开已有数据库 + os.remove(sample_excel) + db2 = ProductDatabase(db_path, sample_excel) + assert db2.count() == 3 + + def test_missing_excel_creates_empty_db(self, db_dir): + """Excel 不存在时创建空数据库""" + db_path = os.path.join(db_dir, 'product_cache.db') + fake_excel = os.path.join(db_dir, '不存在.xlsx') + + db = ProductDatabase(db_path, fake_excel) + + assert os.path.exists(db_path) + assert db.count() == 0 + + def test_missing_dir_created(self, db_dir, sample_excel): + """数据库目录不存在时自动创建""" + db_path = os.path.join(db_dir, 'subdir', 'product_cache.db') + db = ProductDatabase(db_path, 
sample_excel) + assert os.path.exists(db_path) + assert db.count() == 3 + + +class TestGetPrice: + """单条查询测试""" + + def test_existing_barcode(self, db_with_data): + price = db_with_data.get_price('6920584471055') + assert price == pytest.approx(1.2) + + def test_nonexistent_barcode(self, db_with_data): + price = db_with_data.get_price('0000000000000') + assert price is None + + def test_empty_barcode(self, db_with_data): + price = db_with_data.get_price('') + assert price is None + + def test_barcode_with_spaces(self, db_with_data): + """条码前后空格应能匹配""" + price = db_with_data.get_price(' 6920584471055 ') + assert price == pytest.approx(1.2) + + +class TestGetPrices: + """批量查询测试""" + + def test_multiple_barcodes(self, db_with_data): + result = db_with_data.get_prices(['6920584471055', '6901028001133']) + assert len(result) == 2 + assert result['6920584471055'] == pytest.approx(1.2) + assert result['6901028001133'] == pytest.approx(3.5) + + def test_partial_match(self, db_with_data): + """部分条码存在,部分不存在""" + result = db_with_data.get_prices(['6920584471055', '0000000000000']) + assert len(result) == 1 + assert '6920584471055' in result + + def test_empty_list(self, db_with_data): + result = db_with_data.get_prices([]) + assert result == {} + + def test_all_nonexistent(self, db_with_data): + result = db_with_data.get_prices(['0000000000000', '1111111111111']) + assert result == {} + + +class TestReimport: + """重新导入测试""" + + def test_reimport_clears_and_reloads(self, db_dir, sample_excel): + db_path = os.path.join(db_dir, 'product_cache.db') + db = ProductDatabase(db_path, sample_excel) + assert db.count() == 3 + + # 修改 Excel,添加一行 + df = pd.read_excel(sample_excel) + df = pd.concat([df, pd.DataFrame({ + '商品条码': ['6954365200123'], + '商品名称': ['测试商品'], + '进货价': [5.0], + '单位': ['个'], + })]) + df.to_excel(sample_excel, index=False) + + count = db.reimport() + assert count == 4 + assert db.count() == 4 + assert db.get_price('6954365200123') == pytest.approx(5.0) + + +class 
TestEdgeCases: + """边界条件测试""" + + def test_excel_with_missing_price(self, db_dir): + """Excel 中价格列为空的行""" + path = os.path.join(db_dir, '商品资料.xlsx') + df = pd.DataFrame({ + '商品条码': ['6920584471055', '6901028001133'], + '商品名称': ['商品A', '商品B'], + '进货价': [1.5, None], + }) + df.to_excel(path, index=False) + + db_path = os.path.join(db_dir, 'product_cache.db') + db = ProductDatabase(db_path, path) + + assert db.count() == 2 + assert db.get_price('6920584471055') == pytest.approx(1.5) + assert db.get_price('6901028001133') == pytest.approx(0.0) + + def test_excel_with_duplicate_barcodes(self, db_dir): + """重复条码取最后一条 (INSERT OR REPLACE)""" + path = os.path.join(db_dir, '商品资料.xlsx') + df = pd.DataFrame({ + '商品条码': ['6920584471055', '6920584471055'], + '商品名称': ['商品A', '商品A-新'], + '进货价': [1.0, 2.0], + }) + df.to_excel(path, index=False) + + db_path = os.path.join(db_dir, 'product_cache.db') + db = ProductDatabase(db_path, path) + + assert db.count() == 1 + assert db.get_price('6920584471055') == pytest.approx(2.0) diff --git a/tests/test_rule_engine.py b/tests/test_rule_engine.py new file mode 100644 index 0000000..1a40ebc --- /dev/null +++ b/tests/test_rule_engine.py @@ -0,0 +1,223 @@ +"""app.core.handlers.rule_engine 单元测试""" + +import pytest +import pandas as pd + +from app.core.handlers.rule_engine import ( + apply_rules, + _split_quantity_unit, + _extract_spec_from_name, + _normalize_unit, + _compute_quantity_from_total, + _fill_missing, + _mark_gift, +) + + +@pytest.fixture +def sample_df(): + return pd.DataFrame({ + 'name': ['农夫山泉550ml*24', '蒙牛纯牛奶', '可口可乐330ml*6'], + 'quantity_raw': ['2箱', '5', '3提'], + 'unit_price': [28.8, 3.5, 10.8], + 'total_price': [57.6, 17.5, 32.4], + }) + + +class TestSplitQuantityUnit: + def test_split_with_unit(self): + df = pd.DataFrame({'quantity_raw': ['2箱', '5瓶', '3提']}) + result = _split_quantity_unit(df, 'quantity_raw') + assert list(result['quantity']) == [2.0, 5.0, 3.0] + assert list(result['unit']) == ['箱', '瓶', '提'] + + def 
test_split_number_only(self): + df = pd.DataFrame({'quantity_raw': ['10', '20']}) + result = _split_quantity_unit(df, 'quantity_raw') + assert list(result['quantity']) == [10.0, 20.0] + + def test_split_with_synonyms(self): + df = pd.DataFrame({'quantity_raw': ['2件']}) + dictionary = {'unit_synonyms': {'件': '箱'}, 'default_unit': '瓶'} + result = _split_quantity_unit(df, 'quantity_raw', dictionary) + assert result.loc[0, 'unit'] == '箱' + + def test_split_missing_column(self): + df = pd.DataFrame({'other': [1, 2]}) + result = _split_quantity_unit(df, 'quantity_raw') + assert 'quantity' not in result.columns + + def test_split_invalid_value(self): + df = pd.DataFrame({'quantity_raw': ['abc']}) + result = _split_quantity_unit(df, 'quantity_raw') + assert result.loc[0, 'quantity'] == 0.0 + + +class TestExtractSpecFromName: + def test_extract_550ml_24(self): + df = pd.DataFrame({'name': ['农夫山泉550ml*24']}) + result = _extract_spec_from_name(df, 'name') + assert result.loc[0, 'package_quantity'] == 24 + + def test_extract_330ml_6(self): + df = pd.DataFrame({'name': ['可口可乐330ml*6']}) + result = _extract_spec_from_name(df, 'name') + assert result.loc[0, 'package_quantity'] == 6 + + def test_extract_1_star_pattern(self): + df = pd.DataFrame({'name': ['啤酒1*12']}) + result = _extract_spec_from_name(df, 'name') + assert result.loc[0, 'package_quantity'] == 12 + + def test_no_spec(self): + df = pd.DataFrame({'name': ['蒙牛纯牛奶']}) + result = _extract_spec_from_name(df, 'name') + assert result.loc[0, 'package_quantity'] is None + + def test_missing_column(self): + df = pd.DataFrame({'other': ['test']}) + result = _extract_spec_from_name(df, 'name') + assert 'package_quantity' not in result.columns + + def test_with_ignore_words(self): + df = pd.DataFrame({'name': ['新品 农夫山泉550ml*24']}) + dictionary = {'ignore_words': ['新品'], 'name_patterns': []} + result = _extract_spec_from_name(df, 'name', dictionary) + assert result.loc[0, 'package_quantity'] == 24 + + +class TestNormalizeUnit: + 
def test_map_units(self): + df = pd.DataFrame({'unit': ['箱', '提', '盒', '瓶'], 'quantity': [1, 2, 3, 4]}) + unit_map = {'箱': '件', '提': '件', '盒': '件'} + result = _normalize_unit(df, 'unit', unit_map) + # _normalize_unit maps via unit_map, then converts 件→瓶 as packed unit + assert list(result['unit']) == ['瓶', '瓶', '瓶', '瓶'] + + def test_convert_quantity_for_packed_units(self): + df = pd.DataFrame({ + 'unit': ['箱', '瓶'], + 'quantity': [2, 5], + 'package_quantity': [12, None], + }) + unit_map = {'箱': '件'} + result = _normalize_unit(df, 'unit', unit_map) + assert result.loc[0, 'quantity'] == 24 # 2 * 12 + assert result.loc[1, 'quantity'] == 5 # unchanged + + def test_missing_column(self): + df = pd.DataFrame({'other': [1]}) + result = _normalize_unit(df, 'unit', {}) + assert 'unit' not in result.columns + + +class TestComputeQuantityFromTotal: + def test_compute_when_qty_zero(self): + df = pd.DataFrame({ + 'quantity': [0, 5, 0], + 'unit_price': [10.0, 20.0, 0.0], + 'total_price': [50.0, 100.0, 30.0], + }) + result = _compute_quantity_from_total(df) + assert result.loc[0, 'quantity'] == 5.0 # 50 / 10 + assert result.loc[1, 'quantity'] == 5 # unchanged + + def test_no_compute_when_qty_positive(self): + df = pd.DataFrame({ + 'quantity': [3, 5], + 'unit_price': [10.0, 20.0], + 'total_price': [50.0, 100.0], + }) + result = _compute_quantity_from_total(df) + assert list(result['quantity']) == [3, 5] + + +class TestFillMissing: + def test_fill_existing_column(self): + df = pd.DataFrame({'a': [1, None, 3], 'b': [None, 2, None]}) + result = _fill_missing(df, {'a': 0, 'b': 99}) + assert result.loc[1, 'a'] == 0 + assert result.loc[0, 'b'] == 99 + + def test_fill_new_column(self): + df = pd.DataFrame({'a': [1, 2]}) + result = _fill_missing(df, {'new_col': 'default'}) + assert list(result['new_col']) == ['default', 'default'] + + +class TestMarkGift: + def test_gift_by_zero_price(self): + df = pd.DataFrame({ + 'name': ['商品A', '商品B'], + 'unit_price': [10.0, 0.0], + 'total_price': 
[20.0, 0.0], + }) + result = _mark_gift(df) + assert result.loc[0, 'is_gift'] == False + assert result.loc[1, 'is_gift'] == True + + def test_gift_by_name(self): + df = pd.DataFrame({ + 'name': ['赠品-杯子', '商品A'], + 'unit_price': [0.0, 10.0], + 'total_price': [0.0, 20.0], + }) + result = _mark_gift(df) + assert result.loc[0, 'is_gift'] == True + assert result.loc[1, 'is_gift'] == False + + def test_gift_no_price_columns(self): + df = pd.DataFrame({'name': ['赠品', '正常']}) + result = _mark_gift(df) + assert result.loc[0, 'is_gift'] == True + assert result.loc[1, 'is_gift'] == False + + +class TestApplyRules: + def test_multiple_rules(self, sample_df): + rules = [ + {'type': 'split_quantity_unit', 'source': 'quantity_raw'}, + {'type': 'extract_spec_from_name', 'source': 'name'}, + {'type': 'mark_gift'}, + {'type': 'fill_missing', 'fills': {'unit': '瓶'}}, + ] + result = apply_rules(sample_df, rules) + assert 'quantity' in result.columns + assert 'unit' in result.columns + assert 'package_quantity' in result.columns + assert 'is_gift' in result.columns + + def test_empty_rules(self, sample_df): + result = apply_rules(sample_df, []) + assert len(result) == len(sample_df) + + def test_none_rules(self, sample_df): + result = apply_rules(sample_df, None) + assert len(result) == len(sample_df) + + def test_unknown_rule_type(self, sample_df): + rules = [{'type': 'unknown_operation'}] + result = apply_rules(sample_df, rules) + assert len(result) == len(sample_df) + + def test_with_dictionary(self): + df = pd.DataFrame({ + 'name': ['农夫山泉550ml*24'], + 'quantity_raw': ['2箱'], + }) + dictionary = { + 'unit_synonyms': {'箱': '件'}, + 'default_unit': '瓶', + 'ignore_words': [], + 'name_patterns': [], + 'pack_multipliers': {'件': 12}, + 'default_package_quantity': 1, + } + rules = [ + {'type': 'split_quantity_unit', 'source': 'quantity_raw'}, + {'type': 'extract_spec_from_name', 'source': 'name'}, + {'type': 'normalize_unit', 'target': 'unit', 'map': {'箱': '件'}}, + ] + result = 
apply_rules(df, rules, dictionary) + assert 'quantity' in result.columns + assert 'unit' in result.columns diff --git a/tests/test_string_utils.py b/tests/test_string_utils.py new file mode 100644 index 0000000..0a5a622 --- /dev/null +++ b/tests/test_string_utils.py @@ -0,0 +1,124 @@ +"""app.core.utils.string_utils 单元测试""" + +import pytest +from app.core.utils.string_utils import parse_monetary_string, format_barcode + + +class TestParseMonetaryString: + """parse_monetary_string 金额/数量字符串解析测试""" + + # --- 基本类型 --- + def test_none_returns_none(self): + assert parse_monetary_string(None) is None + + def test_int_passthrough(self): + assert parse_monetary_string(42) == 42.0 + + def test_float_passthrough(self): + assert parse_monetary_string(3.14) == 3.14 + + def test_zero_int(self): + assert parse_monetary_string(0) == 0.0 + + # --- 正常字符串 --- + def test_plain_number(self): + assert parse_monetary_string("123.45") == 123.45 + + def test_integer_string(self): + assert parse_monetary_string("100") == 100.0 + + # --- 货币符号 --- + def test_yen_prefix(self): + assert parse_monetary_string("¥1234.56") == 1234.56 + + def test_dollar_prefix(self): + assert parse_monetary_string("$99.9") == 99.9 + + def test_yuan_suffix(self): + assert parse_monetary_string("100元") == 100.0 + + # --- 逗号处理 --- + def test_comma_as_decimal_point(self): + """逗号当小数点: "1,5" = 1.5""" + assert parse_monetary_string("1,5") == 1.5 + + def test_comma_as_thousands_sep(self): + """逗号当千位分隔符: "1,234.56" = 1234.56""" + assert parse_monetary_string("1,234.56") == 1234.56 + + def test_multiple_commas_thousands(self): + """多个逗号: "1,234,567" = 1234567""" + assert parse_monetary_string("1,234,567") == 1234567.0 + + # --- 空值/无效值 --- + def test_empty_string(self): + assert parse_monetary_string("") is None + + def test_whitespace_only(self): + assert parse_monetary_string(" ") is None + + def test_o_string(self): + """OCR 常见误识别: 字母 o 当数字 0""" + assert parse_monetary_string("o") is None + + def test_none_string(self): + 
assert parse_monetary_string("none") is None + + def test_null_string(self): + assert parse_monetary_string("null") is None + + def test_dash(self): + assert parse_monetary_string("-") is None + + def test_double_dash(self): + assert parse_monetary_string("--") is None + + def test_no_digits(self): + assert parse_monetary_string("赠品") is None + + # --- 负数 --- + def test_negative_number(self): + assert parse_monetary_string("-5.5") == -5.5 + + # --- 非字符串非数字类型 --- + def test_list_returns_none(self): + assert parse_monetary_string([1, 2]) is None + + def test_dict_returns_none(self): + assert parse_monetary_string({"a": 1}) is None + + +class TestFormatBarcode: + """format_barcode 条码格式化测试""" + + def test_none_returns_empty(self): + assert format_barcode(None) == "" + + def test_normal_digit_string(self): + assert format_barcode("6920584471055") == "6920584471055" + + def test_integer_input(self): + assert format_barcode(6920584471055) == "6920584471055" + + def test_float_with_zero_decimal(self): + assert format_barcode(6920584471055.0) == "6920584471055" + + def test_scientific_notation(self): + assert format_barcode("6.920584e+12") == "6920584000000" + + def test_trailing_zeros_stripped(self): + assert format_barcode("123456.0") == "123456" + + def test_long_barcode_with_trailing_zeros(self): + """14位条码末尾是0时应截断到13位""" + assert format_barcode("69205844710550") == "6920584471055" + + def test_long_barcode_without_trailing_zeros(self): + """14位条码末尾不是0时不截断""" + assert format_barcode("69205844710551") == "69205844710551" + + def test_non_digit_chars_removed(self): + assert format_barcode("692-058-4471055") == "6920584471055" + + def test_empty_string(self): + assert format_barcode("") == "" diff --git a/tests/test_validators.py b/tests/test_validators.py new file mode 100644 index 0000000..e7dabe9 --- /dev/null +++ b/tests/test_validators.py @@ -0,0 +1,251 @@ +"""app.core.excel.validators 单元测试""" + +import pytest +from app.core.excel.validators import ProductValidator + 
+ +@pytest.fixture +def validator(): + return ProductValidator() + + +class TestValidateBarcode: + """条码验证测试""" + + def test_valid_barcode_13_digits(self, validator): + ok, val, err = validator.validate_barcode("6920584471055") + assert ok is True + assert val == "6920584471055" + assert err is None + + def test_valid_barcode_8_digits(self, validator): + ok, val, err = validator.validate_barcode("12345678") + assert ok is True + assert val == "12345678" + + def test_valid_barcode_12_digits(self, validator): + ok, val, err = validator.validate_barcode("692058447105") + assert ok is True + + def test_none_returns_invalid(self, validator): + ok, val, err = validator.validate_barcode(None) + assert ok is False + assert err == "条码为空" + + def test_warehouse_identifier(self, validator): + ok, val, err = validator.validate_barcode("仓库") + assert ok is False + assert val == "仓库" + assert err == "条码为仓库标识" + + def test_warehouse_full_name(self, validator): + ok, val, err = validator.validate_barcode("仓库全名") + assert ok is False + + def test_prefix_5_to_6_correction(self, validator): + """5开头(非53)的长条码应修正为6开头""" + ok, val, err = validator.validate_barcode("5920584471055") + assert ok is True + assert val.startswith("6") + assert val == "6920584471055" + + def test_prefix_53_not_corrected(self, validator): + """53开头的条码不修正""" + ok, val, err = validator.validate_barcode("5321545613000") + assert ok is True + assert val.startswith("53") + + def test_14_digit_trailing_zero_truncated(self, validator): + """14位条码末尾是0时截断到13位""" + ok, val, err = validator.validate_barcode("69205844710550") + assert ok is True + assert len(val) == 13 + + def test_14_digit_no_trailing_zero_invalid(self, validator): + """14位条码末尾不是0时报错""" + ok, val, err = validator.validate_barcode("69205844710551") + assert ok is False + assert "长度异常" in err + + def test_too_short_invalid(self, validator): + ok, val, err = validator.validate_barcode("1234567") + assert ok is False + assert "长度异常" in err + + def 
test_too_long_invalid(self, validator): + ok, val, err = validator.validate_barcode("1" * 14) + # 14 digits with trailing 0s gets truncated, but "111...1" has no trailing 0 + ok2, val2, err2 = validator.validate_barcode("1" * 15) + assert ok2 is False + + def test_no_digits_invalid(self, validator): + ok, val, err = validator.validate_barcode("abc") + assert ok is False + assert err == "条码不包含数字" + + def test_float_input_cleaned(self, validator): + """浮点数输入应清理为整数字符串""" + ok, val, err = validator.validate_barcode(6920584471055.0) + assert ok is True + assert val == "6920584471055" + + def test_special_barcode_5321545613(self, validator): + """特殊条码 5321545613 应通过验证""" + ok, val, err = validator.validate_barcode("5321545613") + assert ok is True + assert val == "5321545613" + + +class TestValidatePrice: + """单价验证测试""" + + def test_valid_price(self, validator): + ok, val, is_gift, err = validator.validate_price(10.5) + assert ok is True + assert val == 10.5 + assert is_gift is False + + def test_zero_price_is_gift(self, validator): + ok, val, is_gift, err = validator.validate_price(0) + assert ok is True + assert val == 0.0 + assert is_gift is True + + def test_none_is_gift(self, validator): + ok, val, is_gift, err = validator.validate_price(None) + assert ok is False + assert is_gift is True + + def test_gift_string(self, validator): + ok, val, is_gift, err = validator.validate_price("赠品") + assert ok is True + assert is_gift is True + + def test_gift_english(self, validator): + ok, val, is_gift, err = validator.validate_price("gift") + assert ok is True + assert is_gift is True + + def test_price_string_with_yen(self, validator): + ok, val, is_gift, err = validator.validate_price("¥123.45") + assert ok is True + assert val == 123.45 + assert is_gift is False + + def test_price_string_with_comma(self, validator): + ok, val, is_gift, err = validator.validate_price("1,234.56") + assert ok is True + assert val == 1234.56 + + def test_negative_price_invalid(self, 
validator): + ok, val, is_gift, err = validator.validate_price(-5) + assert ok is False + assert is_gift is True + + def test_empty_string_is_gift(self, validator): + ok, val, is_gift, err = validator.validate_price("") + assert ok is True + assert is_gift is True + + +class TestValidateQuantity: + """数量验证测试""" + + def test_valid_quantity(self, validator): + ok, val, err = validator.validate_quantity(10) + assert ok is True + assert val == 10.0 + + def test_float_quantity(self, validator): + ok, val, err = validator.validate_quantity(2.5) + assert ok is True + assert val == 2.5 + + def test_string_quantity(self, validator): + ok, val, err = validator.validate_quantity("15") + assert ok is True + assert val == 15.0 + + def test_string_with_unit(self, validator): + ok, val, err = validator.validate_quantity("10瓶") + assert ok is True + assert val == 10.0 + + def test_none_invalid(self, validator): + ok, val, err = validator.validate_quantity(None) + assert ok is False + assert err == "数量为空" + + def test_zero_invalid(self, validator): + ok, val, err = validator.validate_quantity(0) + assert ok is False + assert "必须大于0" in err + + def test_negative_invalid(self, validator): + ok, val, err = validator.validate_quantity(-3) + assert ok is False + assert "必须大于0" in err + + def test_non_numeric_string_invalid(self, validator): + ok, val, err = validator.validate_quantity("abc") + assert ok is False + assert err == "数量不包含数字" + + +class TestValidateProduct: + """商品数据整体验证测试""" + + def test_valid_product(self, validator): + product = { + 'barcode': '6920584471055', + 'price': 10.5, + 'quantity': 5, + 'amount': 52.5, + } + result = validator.validate_product(product) + assert result['barcode'] == '6920584471055' + assert result['price'] == 10.5 + assert result['quantity'] == 5.0 + assert result.get('is_gift') is None or result.get('is_gift') is False + + def test_gift_product(self, validator): + product = { + 'barcode': '6920584471055', + 'price': '赠品', + 'quantity': 5, + } + 
result = validator.validate_product(product) + assert result['is_gift'] is True + assert result['price'] == 0.0 + + def test_quantity_from_amount_and_price(self, validator): + """数量为空时,通过金额/单价计算""" + product = { + 'barcode': '6920584471055', + 'price': 10.0, + 'amount': 50.0, + 'quantity': None, + } + result = validator.validate_product(product) + assert result['quantity'] == 5.0 # 50 / 10 + + def test_invalid_barcode_still_uses_fixed(self, validator): + """条码验证失败但有修复值时仍使用修复值""" + product = { + 'barcode': '5920584471055', # 5开头, 会被修正为6开头 + 'price': 10.0, + 'quantity': 5, + } + result = validator.validate_product(product) + assert result['barcode'] == '6920584471055' + + def test_amount_zero_marks_gift(self, validator): + """金额为0时标记为赠品""" + product = { + 'barcode': '6920584471055', + 'price': 10.0, + 'quantity': 5, + 'amount': 0, + } + result = validator.validate_product(product) + assert result.get('is_gift') is True diff --git a/启动器.py b/启动器.py new file mode 100644 index 0000000..940c84a --- /dev/null +++ b/启动器.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +"""益选-OCR订单处理系统启动器""" + +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +from app.ui.main_window import main + +if __name__ == "__main__": + main()