@

feat: 商品记忆库 — 从OCR结果学习，逐步替代OCR识别 - 扩展 product_db.py: schema迁移(specification/source/confidence/usage_count/last_seen) + 学习逻辑(learn_from_product)、置信度系统、批量查询、导入导出、云端同步 - 注入处理管线: processor.py 在提取产品后调用 _apply_memory() 用记忆补全OCR + _is_spec_suspicious() 检测OCR规格质量，处理完后自动学习 - order_service.py 创建共享 ProductDatabase 实例 - dialog_utils.py 新增商品记忆库云端同步条目 - 新建 memory_editor.py: Treeview查看/编辑/搜索/删除/重新导入 - main_window.py 系统设置区新增"商品记忆库"按钮 - build_exe.py 添加 memory_editor 到 hidden_imports @
2026-05-05 02:40:48 +08:00
parent 5cf9a98d9a
commit d267a1d1fa
8 changed files with 656 additions and 44 deletions
@@ -40,12 +40,13 @@ class ExcelProcessor:
    提取条码、单价和数量，并按照采购单模板的格式填充
    """
    
-    def __init__(self, config):
+    def __init__(self, config, product_db=None):
        """
        初始化Excel处理器
-        
+
        Args:
            config: 配置信息
+            product_db: 商品数据库实例（可选，由外部传入以共享）
        """
        self.config = config
        
@@ -74,6 +75,18 @@ class ExcelProcessor:
            
            # 加载单位转换器和配置
            self.unit_converter = UnitConverter()
+
+            # 商品记忆库
+            if product_db is not None:
+                self.product_db = product_db
+            else:
+                from ..db.product_db import ProductDatabase
+                db_path = config.get_path('Paths', 'product_db', fallback='data/product_cache.db') if hasattr(config, 'get_path') else 'data/product_cache.db'
+                tpl_folder = config.get('Paths', 'template_folder', fallback='templates')
+                item_data = config.get('Templates', 'item_data', fallback='商品资料.xlsx')
+                tpl_path = os.path.join(tpl_folder, item_data)
+                self.product_db = ProductDatabase(db_path, tpl_path)
+
            logger.info(f"初始化ExcelProcessor完成，模板文件: {self.template_path}")
        except Exception as e:
            logger.error(f"初始化ExcelProcessor失败: {e}")
@@ -371,14 +384,70 @@ class ExcelProcessor:
                    except Exception as e:
                        logger.warning(f"通过金额和单价计算数量失败: {e}")
                
+                # 应用记忆库补全
+                product = self._apply_memory(product)
+
                products.append(product)
            except Exception as e:
                logger.error(f"提取第{idx+1}行商品信息时出错: {e}", exc_info=True)
                continue
-                
+
        logger.info(f"提取到 {len(products)} 个商品信息")
        return products
-    
+
+    def _apply_memory(self, product: Dict) -> Dict:
+        """Fill in fields that OCR left empty or garbled, using the product memory.
+
+        The memory entry for the product's barcode is only trusted when its
+        confidence is at least 80. The specification is replaced when the OCR
+        value is blank or looks like OCR noise; name and unit are only filled
+        in when blank. Quantity and unit price are never touched because they
+        differ from order to order.
+
+        Args:
+            product: Extracted product dict; it is updated in place.
+
+        Returns:
+            The same ``product`` dict, possibly with corrected fields.
+        """
+        barcode = product.get('barcode', '')
+        if not barcode:
+            return product
+
+        try:
+            memory = self.product_db.get_memory(barcode)
+        except Exception as e:
+            # Memory is a best-effort enhancement: a failed lookup must not
+            # affect the product, but it should not vanish silently either.
+            logger.warning(f"查询商品记忆失败: {barcode}: {e}")
+            return product
+
+        # A missing or None confidence counts as 0 instead of raising on
+        # `None < 80`.
+        if not memory or (memory.get('confidence') or 0) < 80:
+            return product
+
+        def _text(value) -> str:
+            """Return the field as stripped text; falsy values mean "missing"."""
+            return str(value).strip() if value else ''
+
+        # Specification: replace when blank or when it looks like OCR garbage.
+        # The value is normalised to text first so a non-string cell cannot
+        # make the regex checks raise.
+        ocr_spec = product.get('specification', '')
+        spec_text = _text(ocr_spec)
+        mem_spec = memory.get('specification', '') or ''
+        if mem_spec and (not spec_text or self._is_spec_suspicious(spec_text)):
+            product['specification'] = mem_spec
+            logger.info(f"记忆修正规格: {barcode} '{ocr_spec}' -> '{mem_spec}'")
+
+        # Name and unit: only fill in when OCR produced nothing usable.
+        for field, label in (('name', '名称'), ('unit', '单位')):
+            remembered = memory.get(field, '') or ''
+            if remembered and not _text(product.get(field, '')):
+                product[field] = remembered
+                logger.info(f"记忆修正{label}: {barcode} -> '{remembered}'")
+
+        # Quantity and unit price are intentionally left alone (per-order values).
+        return product
+
+    def _is_spec_suspicious(self, spec: str) -> bool:
+        """Return True when a specification string looks like OCR garbage.
+
+        A spec is considered suspicious when it is empty or whitespace-only,
+        starts with ``I``/``i`` followed by ``L``/``l``/``*`` (a leading digit
+        1 misread as a letter, e.g. ``IL*12``), has a decimal ending in ``1``
+        right before ``*`` (``L`` misread as ``1``, e.g. ``4.51*4``), or
+        contains any character outside the whitelist of digits, ``.``, ``*``,
+        ``x``/``X``, ``-``, the unit letters L/K/G/M and the listed Chinese
+        unit/package words.
+
+        Args:
+            spec: Specification text from OCR. Non-string values are converted
+                with ``str()`` so they cannot make the regex checks raise.
+
+        Returns:
+            True if the spec should not be trusted, False otherwise.
+        """
+        if not spec:
+            return True
+        # Normalise first: re.search raises TypeError on non-strings, and a
+        # whitespace-only value is effectively an empty spec.
+        text = str(spec).strip()
+        if not text:
+            return True
+
+        suspicious_patterns = (
+            r'^[Ii][Ll*]',        # IL*12: leading "1" read as "I"
+            r'\d+\.\d+1\*\d+',    # 4.51*4: "L" read as "1"
+            # Any character outside the usual specification alphabet.
+            r'[^\d.*xX\-LlKkGgMm升毫瓶桶盒箱件提\s]',
+        )
+        return any(re.search(pattern, text) for pattern in suspicious_patterns)
+
    def fill_template(self, products: List[Dict], output_file_path: str) -> bool:
        """
        填充采购单模板
@@ -599,6 +668,14 @@ class ExcelProcessor:
            
            # 填充模板并保存
            if self.fill_template(products, output_file):
+                # 从处理结果中学习商品记忆
+                try:
+                    self.product_db.learn_from_products(products, source='ocr')
+                    self.product_db._export_memory_json()
+                    logger.info(f"已从处理结果学习 {len(products)} 条商品记忆")
+                except Exception as e:
+                    logger.warning(f"学习商品记忆失败: {e}")
+
                # 记录已处理文件
                self.processed_files[file_path] = output_file
                self._save_processed_files()