feat: 商品记忆库 — 从OCR结果学习,逐步替代OCR识别

- 扩展 product_db.py: schema迁移(specification/source/confidence/usage_count/last_seen)
  + 学习逻辑(learn_from_product)、置信度系统、批量查询、导入导出、云端同步
- 注入处理管线: processor.py 在提取产品后调用 _apply_memory() 用记忆补全OCR
  + _is_spec_suspicious() 检测OCR规格质量,处理完后自动学习
- order_service.py 创建共享 ProductDatabase 实例
- dialog_utils.py 新增商品记忆库云端同步条目
- 新建 memory_editor.py: Treeview查看/编辑/搜索/删除/重新导入
- main_window.py 系统设置区新增"商品记忆库"按钮
- build_exe.py 添加 memory_editor 到 hidden_imports
@
This commit is contained in:
2026-05-05 02:40:48 +08:00
parent 5cf9a98d9a
commit d267a1d1fa
8 changed files with 656 additions and 44 deletions
+81 -4
View File
@@ -40,12 +40,13 @@ class ExcelProcessor:
提取条码、单价和数量,并按照采购单模板的格式填充
"""
def __init__(self, config):
def __init__(self, config, product_db=None):
"""
初始化Excel处理器
Args:
config: 配置信息
product_db: 商品数据库实例(可选,由外部传入以共享)
"""
self.config = config
@@ -74,6 +75,18 @@ class ExcelProcessor:
# 加载单位转换器和配置
self.unit_converter = UnitConverter()
# 商品记忆库
if product_db is not None:
self.product_db = product_db
else:
from ..db.product_db import ProductDatabase
db_path = config.get_path('Paths', 'product_db', fallback='data/product_cache.db') if hasattr(config, 'get_path') else 'data/product_cache.db'
tpl_folder = config.get('Paths', 'template_folder', fallback='templates')
item_data = config.get('Templates', 'item_data', fallback='商品资料.xlsx')
tpl_path = os.path.join(tpl_folder, item_data)
self.product_db = ProductDatabase(db_path, tpl_path)
logger.info(f"初始化ExcelProcessor完成,模板文件: {self.template_path}")
except Exception as e:
logger.error(f"初始化ExcelProcessor失败: {e}")
@@ -371,14 +384,70 @@ class ExcelProcessor:
except Exception as e:
logger.warning(f"通过金额和单价计算数量失败: {e}")
# 应用记忆库补全
product = self._apply_memory(product)
products.append(product)
except Exception as e:
logger.error(f"提取第{idx+1}行商品信息时出错: {e}", exc_info=True)
continue
logger.info(f"提取到 {len(products)} 个商品信息")
return products
def _apply_memory(self, product: Dict, min_confidence: float = 80) -> Dict:
    """Fill in missing or suspicious OCR fields from the product memory store.

    The memory record is looked up by barcode. When a record exists and its
    confidence reaches ``min_confidence``, the specification is replaced if
    the OCR value is blank or looks like OCR garbage, and the name and unit
    are filled in only when the OCR value is blank. Quantity and unit price
    are never touched because they differ from order to order.

    Args:
        product: Product dict extracted from the OCR sheet. It is updated in
            place.
        min_confidence: Minimum confidence a memory record needs before it
            is allowed to override or fill OCR fields. Defaults to 80.

    Returns:
        The same ``product`` dict, possibly with ``specification``, ``name``
        and ``unit`` updated.
    """
    def _is_blank(value) -> bool:
        # Whitespace-only text carries no information, so treat it as missing.
        return not value or (isinstance(value, str) and not value.strip())

    barcode = product.get('barcode', '')
    if not barcode:
        return product

    try:
        memory = self.product_db.get_memory(barcode)
    except Exception as e:
        # Best-effort enrichment: a failed lookup must not lose the OCR row,
        # but it should be visible in the log rather than silently ignored.
        logger.warning(f"查询商品记忆失败: {barcode}: {e}")
        return product
    if memory is None:
        return product

    # The stored confidence may be None or non-numeric; comparing that
    # directly with a number would raise TypeError, so normalise it first.
    try:
        confidence = float(memory.get('confidence') or 0)
    except (TypeError, ValueError):
        confidence = 0.0
    if confidence < min_confidence:
        return product

    # Specification: override when the OCR value is blank or suspicious.
    ocr_spec = product.get('specification', '')
    mem_spec = memory.get('specification', '') or ''
    # str() keeps the regex-based check safe if the OCR value is numeric.
    if mem_spec and (_is_blank(ocr_spec) or self._is_spec_suspicious(str(ocr_spec))):
        product['specification'] = mem_spec
        logger.info(f"记忆修正规格: {barcode} '{ocr_spec}' -> '{mem_spec}'")

    # Name and unit: only fill when OCR produced nothing usable.
    for field, label in (('name', '名称'), ('unit', '单位')):
        mem_value = memory.get(field, '') or ''
        if mem_value and _is_blank(product.get(field, '')):
            product[field] = mem_value
            logger.info(f"记忆修正{label}: {barcode} -> '{mem_value}'")

    return product
def _is_spec_suspicious(self, spec: str) -> bool:
    """Return True when a specification value looks like OCR garbage.

    A value is suspicious when it is empty or blank, matches one of the
    known misread patterns, or contains a character outside the set of
    characters accepted in a specification.

    Args:
        spec: Specification text from the OCR result. A non-string value is
            converted with ``str()`` so that ``re.search`` cannot raise
            ``TypeError``.

    Returns:
        True if the specification should not be trusted, False otherwise.
    """
    if not spec:
        return True
    text = spec if isinstance(spec, str) else str(spec)
    # Whitespace-only text would pass every pattern below, yet it is empty.
    if not text.strip():
        return True
    # Starts with "I" followed by "L" or "*", e.g. "IL*12"
    # (the letter I and the digit 1 confused by OCR).
    if re.search(r'^[Ii][Ll*]', text):
        return True
    # A decimal ending in "1" right before "*", e.g. "4.51*4"
    # (the "L" of "4.5L*4" recognised as "1").
    if re.search(r'\d+\.\d+1\*\d+', text):
        return True
    # Any character outside the accepted specification characters.
    if re.search(r'[^\d.*xX\-LlKkGgMm升毫瓶桶盒箱件提\s]', text):
        return True
    return False
def fill_template(self, products: List[Dict], output_file_path: str) -> bool:
"""
填充采购单模板
@@ -599,6 +668,14 @@ class ExcelProcessor:
# 填充模板并保存
if self.fill_template(products, output_file):
# 从处理结果中学习商品记忆
try:
self.product_db.learn_from_products(products, source='ocr')
self.product_db._export_memory_json()
logger.info(f"已从处理结果学习 {len(products)} 条商品记忆")
except Exception as e:
logger.warning(f"学习商品记忆失败: {e}")
# 记录已处理文件
self.processed_files[file_path] = output_file
self._save_processed_files()