feat: 益选 OCR 订单处理系统初始提交
- 智能供应商识别(蓉城易购/烟草/杨碧月/通用) - 百度 OCR 表格识别集成 - 规则引擎(列映射/数据清洗/单位转换/规格推断) - 条码映射管理与云端同步(Gitea REST API) - 云端同步支持:条码映射、供应商配置、商品资料、采购模板 - 拖拽一键处理(图片→OCR→Excel→合并) - 191 个单元测试 - 移除无用的模板管理功能 - 清理 IDE 产物目录 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,214 @@
|
||||
"""
|
||||
商品资料 SQLite 数据库
|
||||
|
||||
将商品资料 (条码/名称/进货价/单位) 存储在 SQLite 中,
|
||||
支持从 Excel 自动导入和按条码快速查询。
|
||||
"""
|
||||
|
||||
import os
|
||||
import sqlite3
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from ..utils.log_utils import get_logger
|
||||
from ..utils.file_utils import smart_read_excel
|
||||
from ...core.handlers.column_mapper import ColumnMapper
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class ProductDatabase:
    """SQLite-backed store of product master data.

    Each row of the ``products`` table holds a barcode (primary key), a
    name, a purchase price, a unit and the ISO timestamp of the import that
    wrote it.  Rows are imported from an Excel sheet and are afterwards
    looked up by barcode.
    """

    SCHEMA = """
    CREATE TABLE IF NOT EXISTS products (
        barcode TEXT PRIMARY KEY,
        name TEXT DEFAULT '',
        price REAL DEFAULT 0.0,
        unit TEXT DEFAULT '',
        updated_at TEXT
    );
    """

    _UPSERT_SQL = (
        "INSERT OR REPLACE INTO products (barcode, name, price, unit, updated_at) "
        "VALUES (?, ?, ?, ?, ?)"
    )

    # Number of "?" placeholders per IN (...) query.  Kept below 999, the
    # default bound-parameter limit of older SQLite builds.
    _MAX_SQL_VARIABLES = 900

    def __init__(self, db_path: str, excel_source: str):
        """Open the database, importing from Excel when it does not exist yet.

        Args:
            db_path: Path of the SQLite database file.
            excel_source: Path of the Excel file holding the product data.
        """
        self.db_path = db_path
        self.excel_source = excel_source
        self._ensure_db()

    def _connect(self) -> sqlite3.Connection:
        """Return a new connection to the database file."""
        return sqlite3.connect(self.db_path)

    def _ensure_db(self):
        """Create the database on first use and seed it from Excel if possible."""
        if os.path.exists(self.db_path):
            return

        if not os.path.exists(self.excel_source):
            logger.warning(f"商品资料 Excel 不存在,跳过导入: {self.excel_source}")
            self._create_empty_db()
            return

        logger.info(f"首次运行,从 Excel 导入商品资料: {self.excel_source}")
        self._create_empty_db()
        count = self.import_from_excel(self.excel_source)
        logger.info(f"商品资料导入完成: {count} 条记录")

    def _create_empty_db(self):
        """Create the database file (and its parent directory) with the schema."""
        # sqlite3.connect() cannot create missing directories, and
        # os.makedirs('') raises, so only create a non-empty parent path.
        parent = os.path.dirname(self.db_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        conn = self._connect()
        try:
            conn.executescript(self.SCHEMA)
            conn.commit()
        finally:
            conn.close()

    @staticmethod
    def _normalize_barcode(value) -> str:
        """Turn a raw cell or lookup value into a barcode string.

        Integral floats (which is how pandas reports a numeric column that
        contains blanks) are rendered without the trailing ``.0``.  ``None``
        and NaN become the empty string.
        """
        if value is None:
            return ''
        if isinstance(value, float):
            if value != value:  # NaN is the only value unequal to itself
                return ''
            if value.is_integer():
                return str(int(value))
        text = str(value).strip()
        return '' if text == 'nan' else text

    @staticmethod
    def _parse_price(value) -> float:
        """Convert a raw price cell to float; blank or invalid cells give 0.0."""
        if value is None or str(value).strip() in ('', 'nan', 'None'):
            return 0.0
        try:
            return float(value)
        except (ValueError, TypeError):
            return 0.0

    @staticmethod
    def _clean_text(value) -> str:
        """Stringify and strip a cell, mapping pandas' 'nan' to ''."""
        text = str(value).strip()
        return '' if text == 'nan' else text

    def _read_rows(self, excel_path: str) -> list:
        """Parse the Excel file into ``(barcode, name, price, unit, now)`` tuples.

        Returns an empty list (after logging the reason) when the file is
        unreadable or empty, has no barcode column, or has no valid rows.
        """
        df = smart_read_excel(excel_path)
        if df is None or df.empty:
            logger.warning(f"Excel 文件为空或读取失败: {excel_path}")
            return []

        columns = list(df.columns)

        barcode_col = ColumnMapper.find_column(columns, 'barcode')
        if not barcode_col:
            logger.error(f"Excel 中未找到条码列: {columns}")
            return []

        price_col = ColumnMapper.find_column(columns, 'unit_price')
        if not price_col:
            # The purchase-price header may not be a registered alias, so
            # fall back to a substring search over the headers.
            for col in df.columns:
                if '进货价' in str(col).strip():
                    price_col = col
                    break

        # Name and unit columns are optional.
        name_col = ColumnMapper.find_column(columns, 'name')
        unit_col = ColumnMapper.find_column(columns, 'unit')

        now = datetime.now().isoformat()
        rows = []
        for _, row in df.iterrows():
            barcode = self._normalize_barcode(row.get(barcode_col, ''))
            if not barcode:
                continue
            price = self._parse_price(row.get(price_col)) if price_col else 0.0
            name = self._clean_text(row.get(name_col, '')) if name_col else ''
            unit = self._clean_text(row.get(unit_col, '')) if unit_col else ''
            rows.append((barcode, name, price, unit, now))

        if not rows:
            logger.warning(f"Excel 中未解析出有效记录: {excel_path}")
        return rows

    def import_from_excel(self, excel_path: str) -> int:
        """Insert or replace products from an Excel file.

        Args:
            excel_path: Path of the Excel file.

        Returns:
            Number of records written (0 when nothing could be parsed).
        """
        rows = self._read_rows(excel_path)
        if not rows:
            return 0

        conn = self._connect()
        try:
            conn.executemany(self._UPSERT_SQL, rows)
            conn.commit()
        finally:
            conn.close()

        return len(rows)

    def reimport(self) -> int:
        """Replace the whole table with the current content of the Excel source.

        The Excel file is parsed before anything is deleted, and the delete
        and insert run in one transaction, so a failed or empty read leaves
        the existing data untouched.

        Returns:
            Number of records written (0 when nothing could be parsed).
        """
        rows = self._read_rows(self.excel_source)
        if not rows:
            return 0

        conn = self._connect()
        try:
            conn.execute("DELETE FROM products")
            conn.executemany(self._UPSERT_SQL, rows)
            conn.commit()
        finally:
            # Closing without commit rolls the open transaction back.
            conn.close()

        return len(rows)

    def get_price(self, barcode: str) -> Optional[float]:
        """Look up the purchase price for one barcode.

        Args:
            barcode: Product barcode.

        Returns:
            The stored price, or None when the barcode is unknown.
        """
        conn = self._connect()
        try:
            row = conn.execute(
                "SELECT price FROM products WHERE barcode = ?",
                (self._normalize_barcode(barcode),)
            ).fetchone()
            return row[0] if row else None
        finally:
            conn.close()

    def get_prices(self, barcodes: List[str]) -> Dict[str, float]:
        """Look up purchase prices for many barcodes.

        Args:
            barcodes: Barcodes to look up.

        Returns:
            Mapping of barcode to price; unknown barcodes are omitted.
        """
        if not barcodes:
            return {}

        keys = [self._normalize_barcode(b) for b in barcodes]
        prices: Dict[str, float] = {}

        conn = self._connect()
        try:
            # Query in chunks so an arbitrarily long list never exceeds
            # SQLite's bound-parameter limit.
            for start in range(0, len(keys), self._MAX_SQL_VARIABLES):
                chunk = keys[start:start + self._MAX_SQL_VARIABLES]
                placeholders = ','.join('?' * len(chunk))
                cursor = conn.execute(
                    f"SELECT barcode, price FROM products WHERE barcode IN ({placeholders})",
                    chunk
                )
                prices.update(cursor.fetchall())
            return prices
        finally:
            conn.close()

    def count(self) -> int:
        """Return the number of products stored."""
        conn = self._connect()
        try:
            return conn.execute("SELECT COUNT(*) FROM products").fetchone()[0]
        finally:
            conn.close()
|
||||
@@ -0,0 +1,5 @@
|
||||
"""
|
||||
OCR订单处理系统 - Excel处理模块
|
||||
----------------------------
|
||||
提供Excel文件处理、数据提取和转换功能。
|
||||
"""
|
||||
@@ -0,0 +1,535 @@
|
||||
"""
|
||||
单位转换模块
|
||||
----------
|
||||
提供单位转换功能,支持规格推断和单位自动提取。
|
||||
"""
|
||||
|
||||
import re
|
||||
import logging
|
||||
import os
|
||||
import json
|
||||
from typing import Dict, Tuple, Optional, Any, List, Union
|
||||
|
||||
from ..utils.log_utils import get_logger
|
||||
from .handlers.barcode_mapper import BarcodeMapper
|
||||
from .handlers.unit_converter_handlers import (
|
||||
JianUnitHandler, BoxUnitHandler, TiHeUnitHandler,
|
||||
GiftUnitHandler, UnitHandler
|
||||
)
|
||||
from .validators import ProductValidator
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
# 条码映射配置文件路径
|
||||
BARCODE_MAPPING_CONFIG = "config/barcode_mappings.json"
|
||||
|
||||
class UnitConverter:
    """Convert purchase units (case, box, pack ...) into single-item units.

    The converter extracts or infers a packaging specification such as
    ``1*12`` or ``1*5*12``, parses it into packaging levels and delegates
    the quantity/price conversion to a prioritised list of unit handlers.
    Special barcodes are remapped first through a ``BarcodeMapper``.
    """

    # (regex, template) pairs tried in order by extract_specification().
    # A template placeholder "\N" is replaced by capture group N; a group
    # that did not take part in the match (an omitted count) becomes "1".
    # The three-level pattern must precede the two-level one, otherwise
    # "1*5*12" would already be consumed as "1*5".
    _SPEC_PATTERNS = [
        # 1*5*12, 1x5x12
        (r'(\d+)[*xX×](\d+)[*xX×](\d+)', r'\1*\2*\3'),
        # 1*6, 1x12, 1X20
        (r'(\d+)[*xX×](\d+)', r'\1*\2'),
        # "12入", "24入"
        (r'(\d+)入', r'1*\1'),
        # litres, optionally followed by a count
        (r'([\d\.]+)[L升][*xX×]?(\d+)?', r'\1L*\2'),
        # kilograms, optionally followed by a count
        (r'([\d\.]+)(?:kg|公斤)[*xX×]?(\d+)?', r'\1kg*\2'),
        # grams, optionally followed by a count
        (r'([\d\.]+)(?:g|克)[*xX×]?(\d+)?', r'\1g*\2'),
        # millilitres, optionally followed by a count
        (r'([\d\.]+)(?:mL|毫升)[*xX×]?(\d+)?', r'\1mL*\2'),
    ]

    # (regex, log label, result format) rules tried in order by
    # infer_specification_from_name(); "{0}", "{1}" are the capture groups.
    _NAME_RULES = (
        # "450g*15", "450ml*15": ignore the weight/volume, keep the count
        (r'\d+(?:g|ml|毫升|克)[*xX×](\d+)', '重量/容量*数量', '1*{0}'),
        # "550纯净水24入白膜" -> 1*24
        (r'(\d+)入白膜', '入白膜', '1*{0}'),
        # "445水溶C血橙15入纸箱" -> 1*15
        (r'(\d+)入纸箱', '入纸箱', '1*{0}'),
        # "500-东方树叶-乌龙茶1*15-纸箱装" -> 1*15
        (r'(\d+)[*xX×](\d+)', '直接格式', '{0}*{1}'),
        # "500茶π蜜桃乌龙15纸箱" -> 1*15
        (r'(\d+)纸箱', '纸箱', '1*{0}'),
        # "550水24白膜" -> 1*24
        (r'(\d+)白膜', '白膜', '1*{0}'),
        # "1.8L*8瓶" -> 1.8L*8
        (r'([\d\.]+)[Ll升][*×xX](\d+)', '容量*数量', '{0}L*{1}'),
        # "12.9L桶装水" -> 12.9L*1
        (r'([\d\.]+)[Ll升]', '简单容量', '{0}L*1'),
    )

    # Leading noise removed from quantity strings before parsing.  Both the
    # ASCII and the full-width colon are accepted.
    _QUANTITY_PREFIXES = ('数量:', '数量:', '×', 'x', 'X', '*')

    # Units recognised inside free text.  Longer alternatives come first so
    # that "kg" is not read as "g" and "毫升" is not read as "升".
    _UNIT_ALTERNATION = r'箱|件|瓶|提|盒|袋|桶|包|kg|g|毫升|升|ml|L|个'

    def __init__(self):
        """Load the barcode mappings and set up patterns, handlers and validator."""
        self.special_barcodes = self.load_barcode_mappings()
        self.spec_patterns = list(self._SPEC_PATTERNS)
        self._init_handlers()
        self.validator = ProductValidator()

    def _init_handlers(self):
        """Create the barcode mapper and the unit handlers in priority order."""
        self.barcode_mapper = BarcodeMapper(self.special_barcodes)

        # The first handler whose can_handle() accepts the product wins.
        self.unit_handlers: List[UnitHandler] = [
            GiftUnitHandler(),   # gifts first: their price must stay 0
            JianUnitHandler(),   # unit "件"
            BoxUnitHandler(),    # unit "箱"
            TiHeUnitHandler()    # units "提" and "盒"
        ]

    def extract_unit_from_quantity(self, quantity_str: str) -> Tuple[Optional[float], Optional[str]]:
        """Split a quantity string such as "2箱" into number and unit.

        Supported shapes include "2箱", "1.5提", "数量: 5盒", "× 2瓶" and a
        number followed by a known unit embedded in other text.

        Args:
            quantity_str: Quantity text, e.g. "2箱" or "5件".

        Returns:
            ``(number, unit)``, or ``(None, None)`` when nothing can be
            extracted.
        """
        if not quantity_str or not isinstance(quantity_str, str):
            return None, None

        # Strip known prefixes from the front only; replacing them anywhere
        # would glue digits together ("2*12瓶" would become "212瓶").
        cleaned = quantity_str.strip()
        while True:
            for prefix in self._QUANTITY_PREFIXES:
                if cleaned.startswith(prefix):
                    cleaned = cleaned[len(prefix):].strip()
                    break
            else:
                break

        candidates = (
            # the whole string is "<number><unit>"
            ("基本格式", re.match(r'^([\d\.]+)\s*([^\d\s\.]+)$', cleaned)),
            # "<number><known unit>" somewhere inside other text
            ("复杂格式", re.search(rf'([\d\.]+)\s*({self._UNIT_ALTERNATION})', cleaned)),
        )
        for label, match in candidates:
            if not match:
                continue
            try:
                num = float(match.group(1))
            except ValueError:
                # e.g. "1.2.3": not a number, try the next shape
                continue
            unit = match.group(2)
            logger.info(f"从数量提取单位({label}): {quantity_str} -> 数量={num}, 单位={unit}")
            return num, unit

        return None, None

    @staticmethod
    def _render_spec(match, template: str) -> str:
        """Fill a spec template from a match, defaulting missing groups to "1"."""
        rendered = template
        for index, value in enumerate(match.groups(), start=1):
            rendered = rendered.replace(f"\\{index}", value if value is not None else "1")
        return rendered

    def extract_specification(self, text: str) -> Optional[str]:
        """Extract a packaging specification from free text.

        Args:
            text: Text that may contain a specification.

        Returns:
            The specification only (never the surrounding text), e.g.
            "1*12", "1*5*12" or "500g*1"; None when no pattern matches.
        """
        if not text or not isinstance(text, str):
            return None

        # "550纯净水24入白膜" -> 1*24
        film = re.search(r'(\d+)入白膜', text)
        if film:
            result = f"1*{film.group(1)}"
            logger.info(f"提取规格(入白膜): {text} -> {result}")
            return result

        for pattern, template in self.spec_patterns:
            match = re.search(pattern, text)
            if not match:
                continue
            result = self._render_spec(match, template)
            label = "提取三级规格" if template.count('*') == 2 else "提取规格"
            logger.info(f"{label}: {text} -> {result}")
            return result

        return None

    def infer_specification_from_name(self, name: str) -> Optional[str]:
        """Infer a packaging specification from a product name.

        Rules, tried in this order:
            1. "<weight or volume>*N" (e.g. "450g*15")  -> 1*N
            2. "N入白膜" / "N入纸箱"                      -> 1*N
            3. an explicit "A*B"                         -> A*B
            4. "N纸箱" / "N白膜"                          -> 1*N
            5. "<volume>L*N"                             -> <volume>L*N
            6. "<volume>L"                               -> <volume>L*1
            7. the generic patterns of extract_specification()

        Args:
            name: Product name.

        Returns:
            The inferred specification, or None when nothing matches.
        """
        if not name or not isinstance(name, str):
            return None

        for pattern, label, template in self._NAME_RULES:
            match = re.search(pattern, name)
            if match:
                inferred_spec = template.format(*match.groups())
                logger.info(f"从名称推断规格({label}): {name} -> {inferred_spec}")
                return inferred_spec

        spec = self.extract_specification(name)
        if spec:
            logger.info(f"从名称推断规格(通用模式): {name} -> {spec}")
            return spec

        return None

    def parse_specification(self, spec: str) -> Tuple[int, int, Optional[int]]:
        """Parse a specification string into packaging levels.

        Args:
            spec: Specification such as "1*12", "1*5*12", "500g*12" or
                "1件=12桶".

        Returns:
            ``(level1, level2, level3)``; ``level3`` is None for two-level
            specifications.  Unparseable input yields ``(1, 1, None)``.
        """
        if not spec or not isinstance(spec, str):
            return 1, 1, None

        try:
            # Normalise: drop whitespace and unify every separator to "*".
            spec = re.sub(r'\s+', '', spec)
            spec = re.sub(r'[xX×]', '*', spec)

            logger.debug(f"解析规格: {spec}")

            # Equation form, "1件=12桶" -> 1*12 (ASCII or full-width "=").
            eq_match = re.match(
                r'(\d+(?:\.\d+)?)(?:件|箱|提|盒)[==](\d+)(?:瓶|桶|盒|支|个|袋|罐|包|卷)',
                spec
            )
            if eq_match:
                level2 = int(eq_match.group(2))
                logger.info(f"解析等式规格: {spec} -> 1*{level2}")
                return 1, level2, None

            # Three levels, "1*5*12".
            three_level_match = re.match(r'(\d+)[*](\d+)[*](\d+)', spec)
            if three_level_match:
                level1, level2, level3 = (int(g) for g in three_level_match.groups())
                logger.info(f"解析三级规格: {spec} -> {level1}*{level2}*{level3}")
                return level1, level2, level3

            # "<amount><unit>*N": the amount describes a single item, so
            # only N counts as packaging quantity.
            unit_rules = (
                (r'([\d\.]+)(?:kg|g|克|千克|公斤)[*](\d+)', re.IGNORECASE, "解析重量规格"),
                (r'(\d+)(?:ml|毫升)[*](\d+)', re.IGNORECASE, "解析容量(ml)规格"),
                (r'(\d+(?:\.\d+)?)[Ll升][*](\d+)', 0, "解析容量(L)规格"),
            )
            for pattern, flags, label in unit_rules:
                unit_match = re.match(pattern, spec, flags)
                if unit_match:
                    level2 = int(unit_match.group(2))
                    logger.info(f"{label}: {spec} -> 1*{level2}")
                    return 1, level2, None

            # Two levels, "1*12".
            two_level_match = re.match(r'(\d+)[*](\d+)', spec)
            if two_level_match:
                level1 = int(two_level_match.group(1))
                level2 = int(two_level_match.group(2))
                logger.info(f"解析二级规格: {spec} -> {level1}*{level2}")
                return level1, level2, None

            # Irregular OCR output such as "IL*12" or "6oo*12": take the
            # digits after the first "*" that is followed by a digit.  This
            # also covers malformed litre amounts like ".5L*6".
            irregular_match = re.search(r'[^0-9]*\*(\d+)', spec)
            if irregular_match:
                level2 = int(irregular_match.group(1))
                logger.info(f"解析不规范规格: {spec} -> 1*{level2}")
                return 1, level2, None

            logger.warning(f"无法解析规格: {spec},使用默认值1*1")
            return 1, 1, None

        except Exception as e:
            # Parsing is best effort: never let a bad spec abort an order.
            logger.error(f"解析规格时出错: {e}")
            return 1, 1, None

    def process_unit_conversion(self, product: Dict) -> Dict:
        """Convert one product's quantity, price and unit.

        Steps:
            1. validate the product;
            2. apply special-barcode handling (multiplier, fixed spec,
               barcode remapping);
            3. if there is still no specification, infer one from the name;
            4. let the first matching unit handler convert the product.

        Args:
            product: Product dictionary.

        Returns:
            A converted copy; the input dictionary is not modified here.
        """
        product = self.validator.validate_product(product)
        result = product.copy()

        if not result.get('barcode', ''):
            return result

        result = self.barcode_mapper.map_barcode(result)

        # Read the specification after mapping, so that a fixed spec set for
        # a special barcode is neither ignored nor overwritten by inference.
        if not result.get('specification', ''):
            name = result.get('name', '')
            inferred_spec = self.infer_specification_from_name(name)
            if not inferred_spec:
                # Without a specification no unit conversion is possible.
                return result
            result['specification'] = inferred_spec
            logger.info(f"从商品名称推断规格: {name} -> {inferred_spec}")

        level1, level2, level3 = self.parse_specification(result.get('specification', ''))

        for handler in self.unit_handlers:
            if handler.can_handle(result):
                return handler.handle(result, level1, level2, level3)

        logger.info(f"其他单位处理: 保持原样 数量: {result.get('quantity', 0)}, 单价: {result.get('price', 0)}, 单位: {result.get('unit', '')}")
        return result

    @staticmethod
    def _default_barcode_mappings() -> Dict[str, Dict[str, Any]]:
        """Return a fresh copy of the built-in special-barcode configuration."""
        return {
            '6925019900087': {
                'multiplier': 10,
                'target_unit': '瓶',
                'description': '特殊处理:数量*10,单位转换为瓶'
            },
            '6921168593804': {
                'multiplier': 30,
                'target_unit': '瓶',
                'description': 'NFC产品特殊处理:每箱30瓶'
            },
            '6901826888138': {
                'multiplier': 30,
                'target_unit': '瓶',
                'fixed_price': 112/30,
                'specification': '1*30',
                'description': '特殊处理: 规格1*30,数量*30,单价=112/30'
            },
            # plain barcode remappings
            '6920584471055': {
                'map_to': '6920584471017',
                'description': '条码映射:6920584471055 -> 6920584471017'
            },
            '6925861571159': {
                'map_to': '69021824',
                'description': '条码映射:6925861571159 -> 69021824'
            },
            '6923644268923': {
                'map_to': '6923644268480',
                'description': '条码映射:6923644268923 -> 6923644268480'
            },
            # needs both a fixed specification and a remapping
            '6958620703716': {
                'specification': '1*14',
                'map_to': '6958620703907',
                'description': '特殊处理: 规格1*14,同时映射到6958620703907'
            }
        }

    def load_barcode_mappings(self) -> Dict[str, Dict[str, Any]]:
        """Load the barcode mappings from the JSON configuration file.

        When the file does not exist it is created from the built-in
        defaults.  When it cannot be read, or does not contain a JSON
        object, the defaults are returned.

        Returns:
            The barcode mapping dictionary.
        """
        default_mappings = self._default_barcode_mappings()

        try:
            if os.path.exists(BARCODE_MAPPING_CONFIG):
                with open(BARCODE_MAPPING_CONFIG, 'r', encoding='utf-8') as file:
                    mappings = json.load(file)
                if not isinstance(mappings, dict):
                    # A list or scalar would break every barcode lookup.
                    logger.error(f"加载条码映射配置失败: 配置内容不是字典 ({type(mappings).__name__})")
                    return default_mappings
                logger.info(f"成功加载条码映射配置,共{len(mappings)}项")
                return mappings

            if self.save_barcode_mappings(default_mappings):
                logger.info(f"创建默认条码映射配置,共{len(default_mappings)}项")
            return default_mappings
        except Exception as e:
            logger.error(f"加载条码映射配置失败: {e}")
            return default_mappings

    def save_barcode_mappings(self, mappings: Dict[str, Dict[str, Any]]) -> bool:
        """Write the barcode mappings to the JSON configuration file.

        Args:
            mappings: Barcode mapping dictionary.

        Returns:
            True on success, False when writing failed (the error is logged).
        """
        try:
            # os.makedirs('') raises, so skip it for a bare file name.
            config_dir = os.path.dirname(BARCODE_MAPPING_CONFIG)
            if config_dir:
                os.makedirs(config_dir, exist_ok=True)

            with open(BARCODE_MAPPING_CONFIG, 'w', encoding='utf-8') as file:
                json.dump(mappings, file, ensure_ascii=False, indent=2)

            logger.info(f"条码映射配置保存成功,共{len(mappings)}项")
            return True
        except Exception as e:
            logger.error(f"保存条码映射配置失败: {e}")
            return False

    def update_barcode_mappings(self, new_mappings: Dict[str, Dict[str, Any]]) -> bool:
        """Replace the active barcode mappings and persist them.

        Args:
            new_mappings: New barcode mapping dictionary.

        Returns:
            True when the configuration file was written successfully.
        """
        self.special_barcodes = new_mappings

        # The mapper keeps its own reference to the mapping dict; without
        # this it would keep applying the previous configuration.
        mapper = getattr(self, 'barcode_mapper', None)
        if mapper is not None:
            mapper.special_barcodes = new_mappings

        return self.save_barcode_mappings(new_mappings)
|
||||
@@ -0,0 +1,11 @@
|
||||
"""
|
||||
单位转换处理程序包
|
||||
-----------------
|
||||
提供单位转换和条码处理的各种处理程序
|
||||
"""
|
||||
|
||||
from typing import Dict, Any
|
||||
|
||||
# 导出所有处理程序类
|
||||
from .barcode_mapper import BarcodeMapper
|
||||
from .unit_converter_handlers import JianUnitHandler, BoxUnitHandler, TiHeUnitHandler, GiftUnitHandler, UnitHandler
|
||||
@@ -0,0 +1,83 @@
|
||||
"""
|
||||
条码映射处理程序
|
||||
-------------
|
||||
处理特殊条码的映射和转换
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, Optional, Any
|
||||
|
||||
from ...utils.log_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class BarcodeMapper:
    """Apply special-barcode rules to a product.

    A rule (the value stored under a barcode in ``special_barcodes``) may
    contain any of these keys:

    * ``multiplier`` (with optional ``target_unit`` and ``fixed_price``):
      multiply the quantity, divide the price and switch the unit;
    * ``specification``: force a packaging specification;
    * ``map_to``: replace the barcode with another one.
    """

    def __init__(self, special_barcodes: Dict[str, Dict[str, Any]]):
        """Store the special-barcode configuration.

        Args:
            special_barcodes: Mapping of barcode to rule dictionary; a falsy
                value is treated as "no rules".
        """
        self.special_barcodes = special_barcodes or {}

    def _find_config(self, barcode: Any) -> Optional[Dict[str, Any]]:
        """Return the rule for a barcode, or None when there is none.

        The exact value is tried first; as a fallback the barcode is looked
        up as a stripped string, so an int or padded barcode still matches
        the string keys of the configuration.
        """
        if not barcode:
            return None
        try:
            if barcode in self.special_barcodes:
                return self.special_barcodes[barcode]
        except TypeError:
            # unhashable barcode value: fall through to the string form
            pass
        return self.special_barcodes.get(str(barcode).strip())

    def map_barcode(self, product: Dict[str, Any]) -> Dict[str, Any]:
        """Apply the special-barcode rule, if any, to a product.

        Args:
            product: Product dictionary containing a ``barcode``.

        Returns:
            A processed copy of the product; the input is not modified.
        """
        result = product.copy()
        barcode = result.get('barcode', '')

        special_config = self._find_config(barcode)
        if not special_config:
            return result

        # A fixed specification applies on its own, not only together with
        # a multiplier (some rules carry only a specification and a map_to).
        if 'specification' in special_config:
            result['specification'] = special_config['specification']
            logger.info(f"特殊条码({barcode})使用固定规格: {special_config['specification']}")

        if 'multiplier' in special_config:
            multiplier = special_config.get('multiplier', 1)
            valid = (
                isinstance(multiplier, (int, float))
                and not isinstance(multiplier, bool)
                and multiplier > 0
            )
            if valid:
                target_unit = special_config.get('target_unit', '瓶')

                quantity = result.get('quantity', 0) or 0
                new_quantity = quantity * multiplier

                price = result.get('price', 0)
                new_price = price / multiplier if price else 0

                # A fixed unit price takes precedence over the computed one.
                if 'fixed_price' in special_config:
                    new_price = special_config['fixed_price']
                    logger.info(f"特殊条码({barcode})使用固定单价: {new_price}")

                logger.info(f"特殊条码处理: {barcode}, 数量: {quantity} -> {new_quantity}, 单价: {price} -> {new_price}, 单位: {result.get('unit', '')} -> {target_unit}")

                result['quantity'] = new_quantity
                result['price'] = new_price
                result['unit'] = target_unit
            else:
                # A zero, negative or non-numeric multiplier would raise or
                # zero out the quantity; skip the conversion instead.
                logger.warning(f"特殊条码({barcode})的倍数配置无效: {multiplier!r},跳过数量转换")

        # Remap last so the rule is looked up by the original barcode.
        if 'map_to' in special_config:
            new_barcode = special_config['map_to']
            logger.info(f"条码映射: {barcode} -> {new_barcode}")
            result['barcode'] = new_barcode

        return result
|
||||
@@ -0,0 +1,286 @@
|
||||
"""
|
||||
单位转换处理程序
|
||||
-------------
|
||||
处理不同单位的转换逻辑
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, Optional, Any, Tuple, Protocol
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from ...utils.log_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class UnitHandler(ABC):
    """Interface of a unit handler, plus helpers shared by the handlers.

    A handler decides through can_handle() whether it is responsible for a
    product and converts quantity, price and unit in handle().
    """

    @abstractmethod
    def can_handle(self, product: Dict[str, Any]) -> bool:
        """Tell whether this handler is responsible for the product.

        Args:
            product: Product dictionary.

        Returns:
            True when handle() should be used for this product.
        """
        pass

    @abstractmethod
    def handle(self, product: Dict[str, Any], level1: int, level2: int, level3: Optional[int]) -> Dict[str, Any]:
        """Convert the product's quantity, price and unit.

        Args:
            product: Product dictionary.
            level1: First packaging level of the specification.
            level2: Second packaging level of the specification.
            level3: Third packaging level, or None for two-level specs.

        Returns:
            A converted copy of the product.
        """
        pass

    @staticmethod
    def _unit_text(product: Dict[str, Any]) -> str:
        """Return the product's unit as a stripped string."""
        return str(product.get('unit', '')).strip()

    @staticmethod
    def _convert_to_bottles(product: Dict[str, Any], packaging_count: int,
                            log_label: str, source_unit: str) -> Dict[str, Any]:
        """Return a copy with quantity multiplied and price divided.

        The unit of the copy becomes "瓶".  A non-positive packaging count
        (e.g. from a specification like "1*0") would divide by zero and
        erase the quantity, so in that case the copy is returned unchanged.
        """
        result = product.copy()
        quantity = result.get('quantity', 0)
        price = result.get('price', 0)

        if packaging_count <= 0:
            logger.warning(f"{log_label}: 包装数量无效({packaging_count}),保持原样 数量: {quantity}, 单价: {price}, 单位: {source_unit}")
            return result

        new_quantity = quantity * packaging_count
        new_price = price / packaging_count if price else 0

        logger.info(f"{log_label}: 数量: {quantity} -> {new_quantity}, 单价: {price} -> {new_price}, 单位: {source_unit} -> 瓶")

        result['quantity'] = new_quantity
        result['price'] = new_price
        result['unit'] = '瓶'
        return result


class JianUnitHandler(UnitHandler):
    """Handler for the unit "件" (and variants starting with it)."""

    def can_handle(self, product: Dict[str, Any]) -> bool:
        """Accept products whose stripped unit starts with "件".

        Args:
            product: Product dictionary.

        Returns:
            True for units such as "件" or "件装".
        """
        return self._unit_text(product).startswith('件')

    def handle(self, product: Dict[str, Any], level1: int, level2: int, level3: Optional[int]) -> Dict[str, Any]:
        """Multiply quantity and divide price by the items per "件"; unit -> "瓶".

        Args:
            product: Product dictionary.
            level1: First packaging level (not used).
            level2: Second packaging level.
            level3: Third packaging level, or None.

        Returns:
            A converted copy of the product.
        """
        # items per package: level2 * level3, or just level2 for two levels
        packaging_count = level2 * (level3 or 1)
        return self._convert_to_bottles(product, packaging_count, "件单位处理", "件")


class BoxUnitHandler(UnitHandler):
    """Handler for the unit "箱" (and variants starting with it)."""

    def can_handle(self, product: Dict[str, Any]) -> bool:
        """Accept products whose stripped unit starts with "箱".

        Args:
            product: Product dictionary.

        Returns:
            True for units such as "箱" or "箱装".
        """
        return self._unit_text(product).startswith('箱')

    def handle(self, product: Dict[str, Any], level1: int, level2: int, level3: Optional[int]) -> Dict[str, Any]:
        """Multiply quantity and divide price by the items per "箱"; unit -> "瓶".

        Args:
            product: Product dictionary.
            level1: First packaging level (not used).
            level2: Second packaging level.
            level3: Third packaging level, or None.

        Returns:
            A converted copy of the product.
        """
        # items per package: level2 * level3, or just level2 for two levels
        packaging_count = level2 * (level3 or 1)
        return self._convert_to_bottles(product, packaging_count, "箱单位处理", "箱")


class TiHeUnitHandler(UnitHandler):
    """Handler for the units "提" and "盒" (and variants starting with them)."""

    def can_handle(self, product: Dict[str, Any]) -> bool:
        """Accept products whose stripped unit starts with "提" or "盒".

        Args:
            product: Product dictionary.

        Returns:
            True when this handler is responsible for the product.
        """
        return self._unit_text(product).startswith(('提', '盒'))

    def handle(self, product: Dict[str, Any], level1: int, level2: int, level3: Optional[int]) -> Dict[str, Any]:
        """Convert "提"/"盒" products.

        * Three-level specification: quantity is multiplied and price is
          divided by the last level only; the unit becomes "瓶".
        * Two-level specification: the product is returned unchanged.

        Args:
            product: Product dictionary.
            level1: First packaging level (not used).
            level2: Second packaging level (not used).
            level3: Third packaging level, or None.

        Returns:
            A converted (or unchanged) copy of the product.
        """
        unit = product.get('unit', '')

        if level3 is not None:
            return self._convert_to_bottles(product, level3, "提/盒单位(三级规格)处理", unit)

        result = product.copy()
        logger.info(f"提/盒单位(二级规格)处理: 保持原样 数量: {result.get('quantity', 0)}, 单价: {result.get('price', 0)}, 单位: {unit}")
        return result


class GiftUnitHandler(UnitHandler):
    """Handler for gift items: quantity is converted, the price is always 0."""

    def can_handle(self, product: Dict[str, Any]) -> bool:
        """Accept products explicitly flagged with ``is_gift`` being True.

        Args:
            product: Product dictionary.

        Returns:
            True only when ``product['is_gift']`` is the boolean True.
        """
        return product.get('is_gift', False) is True

    def handle(self, product: Dict[str, Any], level1: int, level2: int, level3: Optional[int]) -> Dict[str, Any]:
        """Convert a gift's quantity like the regular handlers; price is 0.

        Units are matched the same way as in the regular handlers
        (stripped, prefix match), so a gift in "件装" is converted just like
        a purchased item in "件装".

        Args:
            product: Product dictionary.
            level1: First packaging level (not used).
            level2: Second packaging level.
            level3: Third packaging level, or None.

        Returns:
            A converted copy of the product with ``price`` set to 0.
        """
        result = product.copy()

        unit = result.get('unit', '')
        unit_text = str(unit).strip()
        quantity = result.get('quantity', 0)

        if unit_text.startswith(('件', '箱')):
            # items per package: level2 * level3, or just level2
            packaging_count = level2 * (level3 or 1)
            log_label = f"赠品{unit}单位处理"
        elif unit_text.startswith(('提', '盒')) and level3 is not None:
            # three-level "提"/"盒": only the last level counts
            packaging_count = level3
            log_label = f"赠品{unit}单位(三级规格)处理"
        else:
            packaging_count = None
            log_label = f"赠品{unit}单位处理"

        # A non-positive count would erase the quantity; keep it instead.
        if packaging_count is not None and packaging_count > 0:
            new_quantity = quantity * packaging_count
            logger.info(f"{log_label}: 数量: {quantity} -> {new_quantity}, 单价: 0, 单位: {unit} -> 瓶")
            result['quantity'] = new_quantity
            result['unit'] = '瓶'
        else:
            logger.info(f"赠品{unit}单位处理: 保持原样 数量: {quantity}, 单价: 0, 单位: {unit}")

        # gifts never carry a price
        result['price'] = 0

        return result
|
||||
@@ -0,0 +1,423 @@
|
||||
"""
|
||||
订单合并模块
|
||||
----------
|
||||
提供采购单合并功能,将多个采购单合并为一个。
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import xlrd
|
||||
import xlwt
|
||||
from xlutils.copy import copy as xlcopy
|
||||
from typing import Dict, List, Optional, Tuple, Union, Any, Callable
|
||||
from datetime import datetime
|
||||
|
||||
from ...config.settings import ConfigManager
|
||||
from ..utils.log_utils import get_logger
|
||||
from ..handlers.column_mapper import ColumnMapper
|
||||
from ..utils.file_utils import (
|
||||
ensure_dir,
|
||||
get_file_extension,
|
||||
get_files_by_extensions,
|
||||
load_json,
|
||||
save_json
|
||||
)
|
||||
from ..utils.string_utils import (
|
||||
clean_string,
|
||||
clean_barcode,
|
||||
format_barcode
|
||||
)
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
class PurchaseOrderMerger:
|
||||
"""
|
||||
采购单合并器:将多个采购单Excel文件合并成一个文件
|
||||
"""
|
||||
|
||||
def __init__(self, config):
    """
    Initialise the purchase-order merger.

    Reads the output folder and template location from ``config`` (queried
    with ``config.get(section, option, fallback=...)``), creates the output
    folder and loads the record of already merged files.

    Args:
        config: configuration object

    Raises:
        Exception: any error raised during set-up is logged and re-raised
    """
    self.config = config

    try:
        # Output folder: resolve, create, and log the absolute location.
        out_dir = config.get('Paths', 'output_folder', fallback='data/output')
        self.output_dir = out_dir
        os.makedirs(out_dir, exist_ok=True)
        logger.info(f"使用输出目录: {os.path.abspath(self.output_dir)}")

        # Template file = <template_folder>/<purchase_order template name>.
        self.template_path = os.path.join(
            config.get('Paths', 'template_folder', fallback='templates'),
            config.get('Templates', 'purchase_order', fallback='银豹-采购单模板.xls'),
        )
        template_found = os.path.exists(self.template_path)
        if not template_found:
            # Only a warning here; the path is used later when a file is created.
            logger.warning(f"模板文件不存在: {self.template_path}")

        # Merge history lives next to the outputs.
        self.merged_files_json = os.path.join(self.output_dir, "merged_files.json")
        self.merged_files = self._load_merged_files()

        logger.info(f"初始化PurchaseOrderMerger完成,模板文件: {self.template_path}")
    except Exception as e:
        logger.error(f"初始化PurchaseOrderMerger失败: {e}")
        raise
|
||||
|
||||
def _load_merged_files(self) -> Dict[str, str]:
    """
    Load the merge-history cache from ``self.merged_files_json``.

    Returns:
        Whatever ``load_json`` yields for that path, with ``{}`` passed as
        its default argument
    """
    history = load_json(self.merged_files_json, {})
    return history
|
||||
|
||||
def _save_merged_files(self) -> None:
    """Write the in-memory merge history to ``self.merged_files_json``."""
    history = self.merged_files
    target = self.merged_files_json
    save_json(history, target)
|
||||
|
||||
def get_purchase_orders(self) -> List[str]:
    """
    Collect the purchase-order workbooks found in ``data/result``.

    Only ``.xls`` / ``.xlsx`` files whose base name starts with ``采购单_``
    are kept. The folder is created when it does not exist.

    Returns:
        File paths ordered by modification time, newest first
        (an empty list when nothing matches)
    """
    result_dir = "data/result"
    logger.info(f"搜索目录 {result_dir} 中的采购单Excel文件")

    os.makedirs(result_dir, exist_ok=True)

    # Keep only the files produced as purchase orders.
    candidates = []
    for path in get_files_by_extensions(result_dir, ['.xls', '.xlsx']):
        if os.path.basename(path).startswith('采购单_'):
            candidates.append(path)

    if candidates:
        newest_first = sorted(candidates, key=os.path.getmtime, reverse=True)
        logger.info(f"找到 {len(newest_first)} 个采购单Excel文件")
        return newest_first

    logger.warning(f"未在 {result_dir} 目录下找到采购单Excel文件")
    return []
|
||||
|
||||
def read_purchase_order(self, file_path: str) -> Optional[pd.DataFrame]:
    """
    Load one purchase-order workbook and normalise its column names.

    The sheet is read with pandas; ``ColumnMapper.detect_header_row`` is then
    asked whether one of the first rows is the real header. If it reports a
    row index >= 0, that row becomes the column labels (cells whose text is
    'nan' or 'None' are named ``Col_<position>``) and only the rows below it
    are kept. Finally the columns that ``ColumnMapper.find_column`` matches
    for barcode / quantity / unit_price / gift_quantity are renamed to
    '条码' / '采购量' / '采购单价' / '赠送量', the keys downstream code uses.

    Args:
        file_path: path of the purchase-order Excel file

    Returns:
        The DataFrame, or None when any exception is raised along the way
    """
    try:
        df = pd.read_excel(file_path)
        logger.info(f"成功读取采购单文件: {file_path}")
        logger.debug(f"Excel文件的列名: {df.columns.tolist()}")

        # The real header may sit a few rows down; promote it if detected.
        header_row_idx = ColumnMapper.detect_header_row(df, max_rows=5, min_matches=3)
        if header_row_idx >= 0:
            logger.info(f"检测到表头在第 {header_row_idx+1} 行")

            header_cells = df.iloc[header_row_idx].astype(str)
            body = df.iloc[header_row_idx+1:].reset_index(drop=True)

            # Blank header cells get a positional placeholder name.
            labels = []
            for position, cell in enumerate(header_cells):
                text = str(cell)
                is_blank = text in ('nan', 'None') or pd.isna(cell)
                labels.append(f"Col_{position}" if is_blank else text)

            body.columns = labels
            df = body
            logger.debug(f"重新构建的数据帧列名: {df.columns.tolist()}")

        all_columns = df.columns.tolist()
        logger.info(f"列名: {all_columns}")

        # Standard field name -> Chinese column name expected downstream.
        targets = (
            ('barcode', '条码'),
            ('quantity', '采购量'),
            ('unit_price', '采购单价'),
            ('gift_quantity', '赠送量'),
        )

        rename_dict = {}
        for std_name, chinese_name in targets:
            matched = ColumnMapper.find_column(all_columns, std_name)
            if not matched:
                continue
            rename_dict[matched] = chinese_name
            logger.info(f"列名映射: {matched} -> {chinese_name}")

        if not rename_dict:
            logger.warning(f"未找到可映射的列名: {file_path}")
        else:
            logger.info(f"列名重命名映射: {rename_dict}")
            df = df.rename(columns=rename_dict)
            logger.info(f"重命名后的列名: {df.columns.tolist()}")

        return df

    except Exception as e:
        logger.error(f"读取采购单文件失败: {file_path}, 错误: {str(e)}")
        return None
|
||||
|
||||
def merge_purchase_orders(self, file_paths: List[str]) -> Optional[pd.DataFrame]:
    """
    Merge several purchase-order files into one aggregated table.

    Each file is read with ``read_purchase_order``; frames lacking any of
    '条码', '采购量', '采购单价' are skipped. Barcodes are passed through
    ``format_barcode``, numeric columns are coerced (unparseable values
    become 0) and prices are rounded to 4 decimals. Rows are then grouped
    by (barcode, unit price) and their purchase and gift quantities summed.

    A row is kept when its barcode is non-empty and it has a positive
    purchase quantity OR a positive gift quantity. Gift-only rows (purchase
    quantity 0) are therefore preserved instead of being silently dropped.

    Args:
        file_paths: purchase-order file paths

    Returns:
        DataFrame with columns '条码', '采购单价', '采购量', '赠送量' sorted by
        barcode, where a gift quantity of 0 is stored as NaN; None when
        there is nothing to merge
    """
    if not file_paths:
        logger.warning("没有需要合并的采购单文件")
        return None

    # Read every file; unreadable ones come back as None and are ignored.
    dfs = []
    for file_path in file_paths:
        df = self.read_purchase_order(file_path)
        if df is not None:
            dfs.append(df)

    if not dfs:
        logger.warning("没有成功读取的采购单文件")
        return None

    logger.info(f"开始合并 {len(dfs)} 个采购单文件")

    required_columns = ['条码', '采购量', '采购单价']
    processed_dfs = []
    for i, df in enumerate(dfs):
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            logger.warning(f"数据帧 {i} 缺少必要的列: {missing_columns}")
            continue

        # Build a clean frame instead of touching the frame that was read.
        cleaned_df = pd.DataFrame()
        cleaned_df['条码'] = df['条码'].apply(lambda x: format_barcode(x) if pd.notna(x) else '')
        cleaned_df['采购量'] = pd.to_numeric(df['采购量'], errors='coerce').fillna(0)
        cleaned_df['采购单价'] = pd.to_numeric(df['采购单价'], errors='coerce').fillna(0).round(4)
        if '赠送量' in df.columns:
            cleaned_df['赠送量'] = pd.to_numeric(df['赠送量'], errors='coerce').fillna(0)
        else:
            # No gift column in this file: treat every row as having no gift.
            cleaned_df['赠送量'] = 0

        # Keep rows that carry a barcode and either a purchase or a gift quantity.
        has_barcode = cleaned_df['条码'] != ''
        has_quantity = (cleaned_df['采购量'] > 0) | (cleaned_df['赠送量'] > 0)
        valid_df = cleaned_df[has_barcode & has_quantity]

        if len(valid_df) > 0:
            processed_dfs.append(valid_df)
            logger.info(f"处理文件 {i+1}: 有效记录 {len(valid_df)} 行")
        else:
            logger.warning(f"处理文件 {i+1}: 没有有效记录")

    if not processed_dfs:
        logger.warning("没有有效的数据帧用于合并")
        return None

    merged_df = pd.concat(processed_dfs, ignore_index=True)

    # Round again after concatenation so float noise cannot split one price
    # into two groups.
    merged_df['采购单价'] = merged_df['采购单价'].round(4)

    result = merged_df.groupby(['条码', '采购单价'], as_index=False).agg({
        '采购量': 'sum',
        '赠送量': 'sum'
    })

    result = result.sort_values('条码').reset_index(drop=True)

    # A gift quantity of 0 is stored as missing; Series.where keeps the
    # column numeric regardless of whether it was integer or float.
    result['赠送量'] = result['赠送量'].where(result['赠送量'] != 0)

    logger.info(f"合并完成,共 {len(result)} 条商品记录")
    return result
|
||||
|
||||
def create_merged_purchase_order(self, df: pd.DataFrame) -> Optional[str]:
    """
    Write the merged table into a copy of the template workbook.

    Rows are written from sheet row index 1 downwards into fixed columns:
    B (1) barcode, C (2) purchase quantity, D (3) gift quantity (only when
    present and > 0), E (4) unit price with four decimals. The result is
    saved as ``data/result/合并采购单_<timestamp>.xls``.

    Args:
        df: merged DataFrame with columns '条码', '采购量', '赠送量', '采购单价'

    Returns:
        Path of the saved file, or None when any exception occurs
    """
    try:
        template_workbook = xlrd.open_workbook(self.template_path, formatting_info=True)
        template_sheet = template_workbook.sheet_by_index(0)

        # Dump the first template rows to the debug log for troubleshooting.
        logger.info("分析模板结构")
        preview_rows = min(5, template_sheet.nrows)
        for row_no in range(preview_rows):
            cell_texts = [str(cell.value).strip() for cell in template_sheet.row(row_no)]
            logger.debug(f"模板第{row_no+1}行: {cell_texts}")

        # Fixed template layout (0-based column indexes) and first data row.
        barcode_col, quantity_col, gift_col, price_col = 1, 2, 3, 4
        data_start_row = 1

        output_workbook = xlcopy(template_workbook)
        output_sheet = output_workbook.get_sheet(0)

        price_style = xlwt.XFStyle()
        price_style.num_format_str = '0.0000'

        quantity_style = xlwt.XFStyle()
        quantity_style.num_format_str = '0'

        for sheet_row, (_, record) in enumerate(df.iterrows(), start=data_start_row):
            output_sheet.write(sheet_row, barcode_col, record['条码'])
            output_sheet.write(sheet_row, quantity_col, float(record['采购量']), quantity_style)

            # The gift cell stays empty unless there is a positive gift quantity.
            gift_value = record['赠送量']
            if pd.notna(gift_value) and float(gift_value) > 0:
                output_sheet.write(sheet_row, gift_col, float(gift_value), quantity_style)

            output_sheet.write(sheet_row, price_col, float(record['采购单价']), price_style)

        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
        result_dir = "data/result"
        os.makedirs(result_dir, exist_ok=True)
        output_file = os.path.join(result_dir, f"合并采购单_{timestamp}.xls")

        output_workbook.save(output_file)
        logger.info(f"合并采购单已保存到: {output_file},共{len(df)}条记录")
        return output_file

    except Exception as e:
        logger.error(f"创建合并采购单时出错: {e}")
        return None
|
||||
|
||||
def process(self, file_paths: Optional[List[str]] = None, progress_cb: Optional[Callable[[int], None]] = None) -> Optional[str]:
    """
    Run the whole merge: collect files, merge them, write the output file.

    Args:
        file_paths: files to merge; when None they are looked up with
            ``get_purchase_orders``
        progress_cb: optional callback receiving 97, 98 and 100 as the steps
            complete; any exception it raises is ignored

    Returns:
        Path of the merged file, or None when a step fails or there is
        nothing to merge
    """
    def _notify(percent: int) -> None:
        # Progress reporting is best-effort: a failing callback must never
        # abort the merge.
        try:
            if progress_cb:
                progress_cb(percent)
        except Exception:
            pass

    if file_paths is None:
        file_paths = self.get_purchase_orders()
    _notify(97)

    if not file_paths:
        logger.warning("没有找到可合并的采购单文件")
        return None

    merged_df = self.merge_purchase_orders(file_paths)
    if merged_df is None:
        logger.error("合并采购单失败")
        return None
    _notify(98)

    output_file = self.create_merged_purchase_order(merged_df)
    if output_file is None:
        logger.error("创建合并采购单文件失败")
        return None
    _notify(100)

    # Remember which output each source file ended up in, then persist it.
    for source_path in file_paths:
        self.merged_files[source_path] = output_file
    self._save_merged_files()

    return output_file
|
||||
@@ -0,0 +1,860 @@
|
||||
"""
|
||||
Excel处理核心模块
|
||||
--------------
|
||||
提供Excel文件处理功能,包括表格解析、数据提取和处理。
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import xlrd
|
||||
import xlwt
|
||||
from xlutils.copy import copy as xlcopy
|
||||
from typing import Dict, List, Optional, Tuple, Union, Any, Callable
|
||||
from datetime import datetime
|
||||
|
||||
from ...config.settings import ConfigManager
|
||||
from ..utils.log_utils import get_logger
|
||||
from ..utils.file_utils import (
|
||||
ensure_dir,
|
||||
get_file_extension,
|
||||
get_latest_file,
|
||||
load_json,
|
||||
save_json
|
||||
)
|
||||
from ..utils.string_utils import (
|
||||
clean_string,
|
||||
extract_number,
|
||||
format_barcode,
|
||||
parse_monetary_string
|
||||
)
|
||||
from .converter import UnitConverter
|
||||
from ..handlers.column_mapper import ColumnMapper
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
class ExcelProcessor:
|
||||
"""
|
||||
Excel处理器:处理OCR识别后的Excel文件,
|
||||
提取条码、单价和数量,并按照采购单模板的格式填充
|
||||
"""
|
||||
|
||||
def __init__(self, config):
    """
    Initialise the Excel processor.

    Reads the output folder, temp folder and template file from ``config``
    (queried with ``config.get(section, option, fallback=...)``), loads the
    processed-files cache, creates both folders and builds a UnitConverter.

    Args:
        config: configuration object

    Raises:
        Exception: any error raised during set-up is logged and re-raised
    """
    self.config = config

    try:
        self.output_dir = config.get('Paths', 'output_folder', fallback='data/output')
        self.temp_dir = config.get('Paths', 'temp_folder', fallback='data/temp')

        self.template_path = config.get('Paths', 'template_file', fallback='templates/银豹-采购单模板.xls')
        template_found = os.path.exists(self.template_path)
        if not template_found:
            # Only a warning here; the template is opened later when filling.
            logger.warning(f"模板文件不存在: {self.template_path}")

        # Cache of already processed files, stored next to the outputs.
        self.cache_file = os.path.join(self.output_dir, "processed_files.json")
        self.processed_files = self._load_processed_files()

        for folder in (self.output_dir, self.temp_dir):
            os.makedirs(folder, exist_ok=True)

        logger.info(f"使用输出目录: {os.path.abspath(self.output_dir)}")
        logger.info(f"使用临时目录: {os.path.abspath(self.temp_dir)}")

        self.unit_converter = UnitConverter()
        logger.info(f"初始化ExcelProcessor完成,模板文件: {self.template_path}")
    except Exception as e:
        logger.error(f"初始化ExcelProcessor失败: {e}")
        raise
|
||||
|
||||
def _load_processed_files(self) -> Dict[str, str]:
    """
    Load the processed-files cache from ``self.cache_file``.

    Returns:
        Whatever ``load_json`` yields for that path, with ``{}`` passed as
        its default argument
    """
    cache = load_json(self.cache_file, {})
    return cache
|
||||
|
||||
def _save_processed_files(self) -> None:
    """Write the in-memory processed-files cache to ``self.cache_file``."""
    cache = self.processed_files
    target = self.cache_file
    save_json(cache, target)
|
||||
|
||||
def get_latest_excel(self) -> Optional[str]:
    """
    Return the newest Excel file in the output folder.

    The lookup is delegated to ``get_latest_file`` (any file name,
    extensions ``.xlsx`` / ``.xls``). If the file it returns is itself a
    purchase order (base name starts with ``采购单_``), nothing is returned;
    older files are not considered in that case.

    Returns:
        Path of the newest Excel file, or None when there is none or the
        newest one is a purchase order
    """
    logger.info(f"搜索目录 {self.output_dir} 中的Excel文件")

    newest = get_latest_file(
        self.output_dir,
        pattern="",
        extensions=['.xlsx', '.xls']
    )

    if newest:
        if os.path.basename(newest).startswith('采购单_'):
            logger.warning(f"找到的最新文件是采购单,不作处理: {newest}")
            return None

        logger.info(f"找到最新的Excel文件: {newest}")
        return newest

    logger.warning(f"未在 {self.output_dir} 目录下找到未处理的Excel文件")
    return None
|
||||
|
||||
def extract_barcode(self, df: pd.DataFrame) -> List[str]:
    """
    Find the column(s) of the frame that hold barcodes.

    Three strategies are tried; a later one runs only when the earlier ones
    found nothing:
      1. the stripped column name is one of
         ``ColumnMapper.STANDARD_COLUMNS['barcode']``
      2. the lower-cased column name contains 条码 / 条形码 / barcode / 编码
      3. the first (up to 10) non-null values of the column, as strings,
         are all digits and all 8 to 14 characters long

    Args:
        df: data frame to inspect

    Returns:
        The matching column labels (possibly empty)
    """
    known_names = ColumnMapper.STANDARD_COLUMNS['barcode']
    hits = []

    # 1) exact name match
    for column in df.columns:
        name = str(column).strip()
        if name in known_names:
            hits.append(column)
            logger.info(f"找到精确匹配的条码列: {name}")

    # 2) keyword contained in the name
    if not hits:
        keywords = ['条码', '条形码', 'barcode', '编码']
        for column in df.columns:
            lowered = str(column).strip().lower()
            keyword = next((kw for kw in keywords if kw.lower() in lowered), None)
            if keyword is not None:
                hits.append(column)
                logger.info(f"找到部分匹配的条码列: {column} (包含关键词: {keyword})")

    # 3) values that look like barcodes
    if not hits and len(df) > 0:
        for column in df.columns:
            samples = df[column].dropna().astype(str).tolist()[:10]
            if not samples:
                continue
            plausible_length = all(8 <= len(text) <= 14 for text in samples)
            if plausible_length and all(text.isdigit() for text in samples):
                hits.append(column)
                logger.info(f"基于数据特征识别的可能条码列: {column}")

    return hits
|
||||
|
||||
def extract_product_info(self, df: pd.DataFrame) -> List[Dict]:
    """
    Extract one product dict per usable row of the frame.

    For every row the method reads barcode, name, unit, price, amount,
    quantity and specification, preferring well-known Chinese column names
    and falling back to the columns found by ``_detect_column_mapping``.
    Rows without a barcode are skipped, as are rows whose remark-like
    columns mention an exchange/return/void style keyword. The product is
    then passed through ``self.unit_converter.process_unit_conversion``.

    A row that raises while being processed is logged and skipped; it does
    not abort the extraction.

    Args:
        df: data frame whose columns are the sheet's header row

    Returns:
        List of product dicts with keys barcode, name, specification,
        quantity, unit, price, amount, is_gift and, when it could be
        determined, package_quantity
    """
    products = []

    column_mapping = self._detect_column_mapping(df)
    logger.info(f"检测到列映射: {column_mapping}")

    for idx, row in df.iterrows():
        try:
            product = {
                'barcode': '',
                'name': '',
                'specification': '',
                'quantity': 0,
                'unit': '',
                'price': 0,
                'amount': 0,
                'is_gift': False
            }

            # --- barcode ---
            if '条码' in df.columns and not pd.isna(row['条码']):
                product['barcode'] = str(row['条码']).strip()
            elif column_mapping.get('barcode') and not pd.isna(row[column_mapping['barcode']]):
                product['barcode'] = str(row[column_mapping['barcode']]).strip()

            # Rows without a barcode cannot be ordered.
            if not product['barcode']:
                continue

            # --- skip non-purchase rows flagged in remark-like columns ---
            skip_row = False
            for col in df.columns:
                col_str = str(col)
                if any(k in col_str for k in ['备注', '说明', '类型', '备注1']):
                    val = str(row[col]).strip()
                    if any(k in val for k in ['换货', '退货', '作废', '减钱', '冲减', '赠品单', '补货']):
                        logger.info(f"过滤非采购行: {product['barcode']} - {product.get('name', '')}, 原因: {col_str}包含 '{val}'")
                        skip_row = True
                        break
            if skip_row:
                continue

            # --- name ---
            if '商品名称' in df.columns and not pd.isna(row['商品名称']):
                product['name'] = str(row['商品名称']).strip()
            elif '名称' in df.columns and not pd.isna(row['名称']):
                product['name'] = str(row['名称']).strip()
            elif column_mapping.get('name') and not pd.isna(row[column_mapping['name']]):
                product['name'] = str(row[column_mapping['name']]).strip()

            # --- unit ---
            if '单位' in df.columns and not pd.isna(row['单位']):
                product['unit'] = str(row['单位']).strip()
            elif column_mapping.get('unit') and not pd.isna(row[column_mapping['unit']]):
                product['unit'] = str(row[column_mapping['unit']]).strip()

            # --- unit price (kept as read from the sheet) ---
            if '单价' in df.columns and not pd.isna(row['单价']):
                product['price'] = row['单价']
            elif column_mapping.get('price') and not pd.isna(row[column_mapping['price']]):
                product['price'] = row[column_mapping['price']]

            # --- amount (kept as read from the sheet) ---
            if '金额' in df.columns and not pd.isna(row['金额']):
                product['amount'] = row['金额']
            elif '小计' in df.columns and not pd.isna(row['小计']):
                product['amount'] = row['小计']
            elif column_mapping.get('amount') and not pd.isna(row[column_mapping['amount']]):
                product['amount'] = row[column_mapping['amount']]

            # Gift detection from the amount: a string amount that does not
            # parse or parses to 0, or a non-string amount that parses to 0,
            # marks the row as a gift.
            # NOTE(review): 'amount' defaults to 0 above, so a sheet without
            # any amount column presumably flags every row as a gift --
            # depends on parse_monetary_string(0); confirm this is intended.
            amt = product.get('amount', None)
            try:
                is_amt_gift = False
                if amt is None:
                    is_amt_gift = True
                elif isinstance(amt, str):
                    parsed = parse_monetary_string(amt)
                    is_amt_gift = (parsed is None or parsed == 0.0)
                else:
                    parsed = parse_monetary_string(amt)
                    is_amt_gift = (parsed is not None and parsed == 0.0)
                if is_amt_gift:
                    product['is_gift'] = True
            except Exception:
                # Best effort: a failure here just leaves is_gift unchanged.
                pass

            # --- quantity ---
            if '数量' in df.columns and not pd.isna(row['数量']):
                product['quantity'] = row['数量']
            elif column_mapping.get('quantity') and not pd.isna(row[column_mapping['quantity']]):
                product['quantity'] = row[column_mapping['quantity']]

            # Compound quantity text such as "2箱" / "3件": split number and unit.
            if isinstance(product['quantity'], str) and product['quantity']:
                num, unit = self.unit_converter.extract_unit_from_quantity(product['quantity'])
                if unit:
                    product['unit'] = unit
                if num is not None:
                    product['quantity'] = num

            # --- specification and package quantity ---
            raw_spec = None
            spec_log_label = ""
            if '规格' in df.columns and not pd.isna(row['规格']):
                raw_spec = row['规格']
                spec_log_label = "解析规格"
            elif column_mapping.get('specification') and not pd.isna(row[column_mapping['specification']]):
                raw_spec = row[column_mapping['specification']]
                spec_log_label = "从映射列解析规格"

            if raw_spec is not None:
                # Repair the OCR misread "4.51*4" (digit 1 instead of L) -> "4.5L*4".
                product['specification'] = re.sub(r'(\d+\.\d+)1\*(\d+)', r'\1L*\2', str(raw_spec))
                package_quantity = self.parse_specification(product['specification'])
                if package_quantity:
                    product['package_quantity'] = package_quantity
                    logger.info(f"{spec_log_label}: {product['specification']} -> 包装数量={package_quantity}")
            elif product['name']:
                # No specification in the sheet: try to infer one from the name.
                # First choice: "<volume><unit>*<count>", e.g. "1.8L*8瓶".
                container_pattern = r'.*?(\d+(?:\.\d+)?)\s*(?:ml|[mM][lL]|[lL]|升|毫升)[*×xX](\d+).*'
                match = re.search(container_pattern, product['name'])
                if match:
                    volume = match.group(1)
                    count = match.group(2)
                    # NOTE(review): the inferred text always uses "L", even
                    # when the name said ml/毫升; only the count is used as
                    # package quantity here -- confirm downstream does not
                    # rely on the volume unit.
                    inferred_spec = f"{volume}L*{count}"
                    inferred_qty = int(count)
                    product['specification'] = inferred_spec
                    product['package_quantity'] = inferred_qty
                    logger.info(f"从商品名称提取容量*数量格式: {product['name']} -> {inferred_spec}, 包装数量={inferred_qty}")
                else:
                    # Second choice: "<number>(g|ml|毫升|克)*<count>".
                    weight_volume_pattern = r'.*?\d+(?:g|ml|毫升|克)[*xX×](\d+)'
                    match = re.search(weight_volume_pattern, product['name'])
                    if match:
                        inferred_spec = f"1*{match.group(1)}"
                        inferred_qty = int(match.group(1))
                        product['specification'] = inferred_spec
                        product['package_quantity'] = inferred_qty
                        logger.info(f"从商品名称提取重量/容量规格: {product['name']} -> {inferred_spec}, 包装数量={inferred_qty}")
                    else:
                        # Last resort: the converter's generic name-based inference.
                        inferred_spec = self.unit_converter.infer_specification_from_name(product['name'])
                        if inferred_spec:
                            product['specification'] = inferred_spec
                            package_quantity = self.parse_specification(inferred_spec)
                            if package_quantity:
                                product['package_quantity'] = package_quantity
                                logger.info(f"从商品名称推断规格: {product['name']} -> {inferred_spec}, 包装数量={package_quantity}")

            # A specification is set but no package quantity yet: parse it once more.
            if product.get('specification') and not product.get('package_quantity'):
                package_quantity = self.parse_specification(product['specification'])
                if package_quantity:
                    product['package_quantity'] = package_quantity
                    logger.info(f"解析已设置的规格: {product['specification']} -> 包装数量={package_quantity}")

            # --- infer unit "件" from a "<volume>*<count>" or "<n>*<m>" specification ---
            if not product['unit'] and product.get('barcode') and product.get('specification') and product.get('quantity') and product.get('price') is not None:
                # Same unit alternatives as container_pattern above, so that
                # ML / Ml / mL / ml are all recognised.
                volume_pattern = r'(\d+(?:\.\d+)?)\s*(?:ml|[mM][lL]|[lL]|升|毫升)[*×xX](\d+)'
                simple_pattern = r'(\d+)[*×xX](\d+)'
                spec_text = product['specification']
                if re.search(volume_pattern, spec_text) or re.search(simple_pattern, spec_text):
                    product['unit'] = '件'
                    logger.info(f"根据规格推断单位: {product['specification']} -> 单位=件")

            # --- unit conversion rules ---
            product = self.unit_converter.process_unit_conversion(product)

            # Quantity missing/0 but price and amount known: quantity = amount / price.
            if (product['quantity'] == 0 or product['quantity'] is None) and product['price'] > 0 and product['amount']:
                try:
                    amount = parse_monetary_string(product['amount'])
                    if amount is not None and amount > 0:
                        quantity = amount / product['price']
                        logger.info(f"数量为空或为0,通过金额({amount})和单价({product['price']})计算得出数量: {quantity}")
                        product['quantity'] = quantity
                except Exception as e:
                    logger.warning(f"通过金额和单价计算数量失败: {e}")

            products.append(product)
        except Exception as e:
            logger.error(f"提取第{idx+1}行商品信息时出错: {e}", exc_info=True)
            continue

    logger.info(f"提取到 {len(products)} 个商品信息")
    return products
|
||||
|
||||
def fill_template(self, products: List[Dict], output_file_path: str) -> bool:
    """
    Fill the purchase-order template with the products and save it.

    Products are grouped by formatted barcode. Priced records of a barcode
    are summed into one purchase line; records flagged ``is_gift`` or priced
    0 are summed into that line's gift quantity. A barcode that only has
    gift records is written with purchase quantity 0 and unit price 0.

    When priced records of one barcode carry different unit prices, the
    line price is their quantity-weighted mean, so the result does not
    depend on record order. If the summed quantity is 0 the plain mean of
    the two prices is used instead.

    Rows are written from sheet row index 1 downwards: column B (1) barcode,
    C (2) purchase quantity, D (3) gift quantity, E (4) unit price rounded
    to four decimals.

    Args:
        products: product dicts (barcode, quantity, price, amount, is_gift)
        output_file_path: where the filled workbook is saved

    Returns:
        True on success, False when any exception occurs
    """
    try:
        template_workbook = xlrd.open_workbook(self.template_path, formatting_info=True)

        # xlrd workbooks are read-only; write into an xlutils copy.
        output_workbook = xlcopy(template_workbook)
        output_sheet = output_workbook.get_sheet(0)

        # barcode -> {'normal': {...} or None, 'gift_quantity': number}
        barcode_groups = {}

        logger.info(f"开始处理{len(products)} 个产品信息")
        for product in products:
            barcode = format_barcode(product.get('barcode', ''))

            if not barcode:
                logger.warning(f"跳过无条码商品")
                continue

            quantity = product.get('quantity', 0)
            price = product.get('price', 0)
            amount = product.get('amount', 0)

            # Quantity missing/0 but price and amount known: quantity = amount / price.
            if (quantity == 0 or quantity is None) and price > 0 and amount:
                try:
                    amount = parse_monetary_string(amount)
                    if amount is not None and amount > 0:
                        quantity = amount / price
                        logger.info(f"数量为空或为0,通过金额({amount})和单价({price})计算得出数量: {quantity}")
                        product['quantity'] = quantity
                except Exception as e:
                    logger.warning(f"通过金额和单价计算数量失败: {e}")

            # A record is a gift when flagged as one or when its price is 0.
            is_gift = bool(product.get('is_gift', False)) or (price == 0)

            logger.info(f"处理商品: 条码={barcode}, 数量={quantity}, 单价={price}, 是否赠品={is_gift}")

            group = barcode_groups.setdefault(barcode, {'normal': None, 'gift_quantity': 0})

            if is_gift:
                group['gift_quantity'] += quantity
                logger.info(f"发现赠品:条码{barcode}, 数量={quantity}")
                continue

            normal = group['normal']
            if normal is None:
                group['normal'] = {
                    'product': product,
                    'quantity': quantity,
                    'price': price
                }
                logger.info(f"发现正常商品:条码{barcode}, 数量={quantity}, 单价={price}")
                continue

            # Further priced record for the same barcode: accumulate.
            previous_quantity = normal['quantity']
            previous_price = normal['price']
            normal['quantity'] = previous_quantity + quantity
            logger.info(f"累加正常商品数量:条码{barcode}, 新增={quantity}, 累计={normal['quantity']}")

            if price != previous_price:
                total_quantity = normal['quantity']
                if total_quantity:
                    # Incremental quantity-weighted mean of all prices so far.
                    avg_price = (previous_price * previous_quantity + price * quantity) / total_quantity
                else:
                    avg_price = (previous_price + price) / 2
                normal['price'] = avg_price
                logger.info(f"调整单价(取加权平均):条码{barcode}, 原价={previous_price}, 新价={price}, 平均={avg_price}")

        # Summary of the grouping, for troubleshooting.
        logger.info(f"分组后共{len(barcode_groups)} 个不同条码的商品")
        for barcode, group in barcode_groups.items():
            if group['normal'] is not None:
                logger.info(f"条码 {barcode} 处理结果:正常商品数量{group['normal']['quantity']},单价{group['normal']['price']},赠品数量{group['gift_quantity']}")
            else:
                logger.info(f"条码 {barcode} 处理结果:只有赠品,数量={group['gift_quantity']}")

        # One shared style for every price cell (four decimals).
        price_style = xlwt.XFStyle()
        price_style.num_format_str = '0.0000'

        row_index = 1  # first data row; row 0 is left to the template
        for barcode, group in barcode_groups.items():
            output_sheet.write(row_index, 1, barcode)

            if group['normal'] is not None:
                normal_quantity = group['normal']['quantity']
                output_sheet.write(row_index, 2, normal_quantity)

                if group['gift_quantity'] > 0:
                    output_sheet.write(row_index, 3, group['gift_quantity'])
                    logger.info(f"条码 {barcode} 填充:采购量={normal_quantity},赠品数量{group['gift_quantity']}")

                purchase_price = group['normal']['price']
                output_sheet.write(row_index, 4, round(purchase_price, 4), price_style)
            else:
                # Gift-only barcode: purchase quantity 0, gift quantity, price 0.
                output_sheet.write(row_index, 2, 0)
                output_sheet.write(row_index, 3, group['gift_quantity'])
                output_sheet.write(row_index, 4, 0)

                logger.info(f"条码 {barcode} 填充:仅有赠品,采购量=0,赠品数量={group['gift_quantity']}")

            row_index += 1

        output_workbook.save(output_file_path)
        logger.info(f"采购单已保存到: {output_file_path}")
        return True

    except Exception as e:
        # Keep the traceback: this handler covers template I/O, grouping and writing.
        logger.error(f"填充模板时出错: {e}", exc_info=True)
        return False
|
||||
|
||||
def _find_header_row(self, df: pd.DataFrame) -> Optional[int]:
    """
    Locate the header row of a sheet that was read without a header.

    ``ColumnMapper.detect_header_row`` (first 30 rows) is tried first. If it
    reports no header (negative result), the first row holding more than
    three non-null cells is used instead.

    Returns:
        0-based row position, or None when no row qualifies
    """
    detected = ColumnMapper.detect_header_row(df, max_rows=30)
    if detected >= 0:
        logger.info(f"找到表头行: 第{detected+1}行")
        return detected

    # Fallback: first row with enough filled cells.
    fallback = next(
        (pos for pos in range(len(df)) if df.iloc[pos].notna().sum() > 3),
        None,
    )
    if fallback is None:
        logger.warning("无法识别表头行")
        return None

    logger.info(f"未找到明确表头,使用第一个有效行: 第{fallback+1}行")
    return fallback
|
||||
|
||||
def process_specific_file(self, file_path: str, progress_cb: Optional[Callable[[int], None]] = None) -> Optional[str]:
    """
    Turn one Excel file into a purchase-order workbook.

    Steps: read the sheet without a header, locate the header row, promote
    it to column labels, extract the products, and fill the template into
    ``data/result/采购单_<input base name>.xls``. On success the input/output
    pair is recorded in the processed-files cache.

    Args:
        file_path: path of the Excel file to process
        progress_cb: optional callback receiving 92, 94, 96 and 100 as the
            steps complete; exceptions it raises are ignored

    Returns:
        Path of the generated purchase order, or None on any failure
    """
    def _notify(percent: int) -> None:
        # Progress reporting is best-effort; a failing callback is ignored.
        if progress_cb:
            try:
                progress_cb(percent)
            except Exception:
                pass

    logger.info(f"开始处理Excel文件: {file_path}")

    if not os.path.exists(file_path):
        logger.error(f"文件不存在: {file_path}")
        return None

    try:
        _notify(92)
        # No header yet: its position is detected from the data below.
        df = pd.read_excel(file_path, header=None)
        logger.info(f"成功读取Excel文件: {file_path}, 共 {len(df)} 行")

        header_row = self._find_header_row(df)
        if header_row is None:
            logger.error("无法识别表头行")
            return None

        logger.info(f"识别到表头在第 {header_row+1} 行")

        _notify(94)

        # Promote the header row to column labels and drop everything above it.
        df.columns = df.iloc[header_row]
        df = df.iloc[header_row + 1:].reset_index(drop=True)

        logger.info(f"重新整理数据结构,共 {len(df)} 行有效数据")

        _notify(96)
        products = self.extract_product_info(df)

        if not products:
            logger.warning("未提取到有效商品信息")
            return None

        base_name = os.path.splitext(os.path.basename(file_path))[0]
        result_dir = "data/result"
        os.makedirs(result_dir, exist_ok=True)
        output_file = os.path.join(result_dir, f"采购单_{base_name}.xls")

        if not self.fill_template(products, output_file):
            return None

        # Remember the processed file, then report completion.
        self.processed_files[file_path] = output_file
        self._save_processed_files()

        logger.info(f"采购单已保存到: {output_file}")
        _notify(100)

        return output_file

    except Exception as e:
        logger.error(f"处理Excel文件时出错: {file_path}, 错误: {e}")
        return None
|
||||
|
||||
def process_latest_file(self, progress_cb: Optional[Callable[[int], None]] = None) -> Optional[str]:
    """Process the Excel file returned by ``self.get_latest_excel()``.

    Args:
        progress_cb: Optional progress callback; forwarded unchanged to
            ``process_specific_file``.

    Returns:
        The value returned by ``process_specific_file`` for that file, or
        None when ``get_latest_excel`` reports nothing to process.
    """
    newest = self.get_latest_excel()
    if newest:
        # Delegate the real work; this method only selects the file.
        return self.process_specific_file(newest, progress_cb=progress_cb)

    logger.warning("未找到可处理的Excel文件")
    return None
|
||||
|
||||
def _detect_column_mapping(self, df: pd.DataFrame) -> Dict[str, str]:
|
||||
"""
|
||||
自动检测列名映射
|
||||
|
||||
Args:
|
||||
df: 数据框
|
||||
|
||||
Returns:
|
||||
列名映射字典,键为标准列名,值为实际列名
|
||||
"""
|
||||
# 提取有用的列
|
||||
barcode_cols = self.extract_barcode(df)
|
||||
|
||||
# 如果没有找到条码列,无法继续处理
|
||||
if not barcode_cols:
|
||||
logger.error("未找到条码列,无法处理")
|
||||
return {}
|
||||
|
||||
# 使用 ColumnMapper 统一查找列名
|
||||
mapped_columns = {'barcode': barcode_cols[0]}
|
||||
logger.info(f"使用条码列: {mapped_columns['barcode']}")
|
||||
|
||||
# 内部键名 -> 标准列名映射 (processor.py 使用 price/amount 作为内部键名)
|
||||
field_map = [
|
||||
('name', 'name'),
|
||||
('specification', 'specification'),
|
||||
('quantity', 'quantity'),
|
||||
('unit', 'unit'),
|
||||
('price', 'unit_price'),
|
||||
('amount', 'total_price'),
|
||||
]
|
||||
|
||||
for internal_key, standard_name in field_map:
|
||||
matched = ColumnMapper.find_column(list(df.columns), standard_name)
|
||||
if matched:
|
||||
mapped_columns[internal_key] = matched
|
||||
logger.info(f"找到{internal_key}列: {matched}")
|
||||
|
||||
return mapped_columns
|
||||
|
||||
def infer_specification_from_name(self, product_name: str) -> Tuple[Optional[str], Optional[int]]:
    """Guess a packaging specification from a product name.

    Strategies are tried in this order and the first hit wins:
      1. ``<digits><g|ml|毫升|克><*|x|X|×><count>`` inside the name.
      2. ``self.unit_converter.infer_specification_from_name`` followed by
         ``self.parse_specification`` on its result.
      3. A fixed list of product-naming rules (see ``naming_rules``).
      4. A bare ``<digits>*<digits>`` token anywhere in the name.
      5. The first standalone number equal to 12, 15, 24 or 30.

    Args:
        product_name: Product name text.

    Returns:
        ``(specification, package_quantity)``, or ``(None, None)`` when the
        name is empty, not a string, or no strategy matches.
    """
    if not (product_name and isinstance(product_name, str)):
        logger.warning(f"无效的商品名: {product_name}")
        return None, None

    product_name = product_name.strip()

    # Strategy 1: weight/volume followed by a multiplier, e.g. "330ml*24".
    hit = re.search(r'.*?\d+(?:g|ml|毫升|克)[*xX×](\d+)', product_name)
    if hit:
        count_text = hit.group(1)
        inferred_spec = f"1*{count_text}"
        inferred_qty = int(count_text)
        logger.info(f"从商品名称提取重量/容量规格: {product_name} -> {inferred_spec}, 包装数量={inferred_qty}")
        return inferred_spec, inferred_qty

    # Strategy 2: ask the unit converter, accept only if the spec parses.
    inferred_spec = self.unit_converter.infer_specification_from_name(product_name)
    if inferred_spec:
        package_quantity = self.parse_specification(inferred_spec)
        if package_quantity:
            logger.info(f"从商品名称推断规格: {product_name} -> {inferred_spec}, 包装数量={package_quantity}")
            return inferred_spec, package_quantity

    # Strategy 3: naming rules. Each builder turns a match into (spec, qty).
    def per_case(m):
        # The captured number is the per-case count: spec "1*<n>".
        return f"1*{m.group(1)}", int(m.group(1))

    def explicit_spec(m):
        # The name already carries "<a>*<b>"; quantity is the part after '*'.
        return m.group(1), int(m.group(1).split('*')[1])

    def barrel(m):
        # Barrel products: "<volume>L*1", always quantity 1.
        return f"{m.group(1)}*1", 1

    naming_rules = (
        (r'.*?(\d+)入白膜', per_case),                      # e.g. "550纯净水24入白膜"
        (r'.*?(\d+)白膜', per_case),                        # e.g. "550水24白膜"
        (r'445水溶C.*?(\d+)[入个]纸箱', per_case),
        (r'东方树叶.*?(\d+\*\d+).*纸箱', explicit_spec),
        (r'(\d+\.?\d*L)桶装', barrel),
        (r'树叶.*?(\d+)[入个]纸箱', per_case),
        (r'茶[πΠπ].*?(\d+)纸箱', per_case),
        (r'.*?(\d+)[入个](?:纸箱|箱装|白膜)', per_case),    # generic "<n>入/个 ..."
        (r'.*?(\d+)纸箱', per_case),                        # generic "<n>纸箱"
    )
    for regex, build in naming_rules:
        hit = re.search(regex, product_name)
        if hit is None:
            continue
        spec, qty = build(hit)
        logger.info(f"根据特定规则推断规格: {product_name} -> {spec}, 包装数量={qty}")
        return spec, qty

    # Strategy 4: a literal "<digits>*<digits>" token.
    hit = re.search(r'(\d+\*\d+)', product_name)
    if hit:
        spec = hit.group(1)
        package_quantity = self.parse_specification(spec)
        if package_quantity:
            logger.info(f"从名称中直接提取规格: {spec}, 包装数量={package_quantity}")
            return spec, package_quantity

    # Strategy 5: fall back to the first number that looks like a typical
    # per-case count.
    typical_counts = ('12', '15', '24', '30')
    for token in re.findall(r'\d+', product_name):
        if token in typical_counts:
            spec = f"1*{token}"
            logger.info(f"从名称中提取可能的件装数: {spec}, 包装数量={int(token)}")
            return spec, int(token)

    logger.warning(f"无法从商品名'{product_name}' 推断规格")
    return None, None
|
||||
|
||||
def parse_specification(self, spec_str: str) -> Optional[int]:
    """Extract the package quantity from a specification string.

    The string is first cleaned with ``clean_string`` and repaired for two
    common OCR misreads, then matched against a series of patterns in a
    fixed order; the first pattern that matches decides the result.

    Args:
        spec_str: Specification text such as ``1*15``, ``1x15``,
            ``1*5*10``, ``5kg*6`` or ``24瓶/件``.

    Returns:
        The package quantity as an int, or None when ``spec_str`` is empty,
        not a string, matches no pattern, or parsing raises.
    """
    if not spec_str or not isinstance(spec_str, str):
        return None

    try:
        spec_str = clean_string(spec_str)

        # OCR repair 1: an I/i/l/L at a word start that is glued to digits
        # is a misread "1" (e.g. "l5" -> "15").
        spec_str = re.sub(r'(\b|^)[iIlL](\d+)', r'1\2', spec_str)
        # OCR repair 2: a run of two or more o/O/0 after digits is a run of
        # zeros. Emit one '0' per character so that "6ooo" becomes "6000"
        # (a fixed '00' replacement would silently drop a digit).
        spec_str = re.sub(
            r'(\d+)([oO0]{2,})',
            lambda m: m.group(1) + '0' * len(m.group(2)),
            spec_str,
        )
        # Use '*' as the only multiplication sign from here on.
        spec_str = spec_str.replace('×', '*').replace('x', '*').replace('X', '*')

        logger.debug(f"清理后的规格字符串: {spec_str}")

        # "1件=12桶" style equations: the right-hand count is the quantity.
        eq_match = re.search(r'(\d+(?:\.\d+)?)\s*(?:件|箱|提|盒)\s*[==]\s*(\d+)\s*(?:瓶|桶|盒|支|个|袋|罐|包|卷)', spec_str)
        if eq_match:
            return int(eq_match.group(2))

        # Weight in kilograms times a count, e.g. "5kg*6".
        match = re.search(r'(\d+(?:\.\d+)?)\s*(?:kg|KG|千克|公斤)[*×](\d+)', spec_str)
        if match:
            return int(match.group(2))

        # Grams / millilitres times a count, e.g. "450g*15".
        match = re.search(r'\d+(?:\.\d+)?(?:g|G|ml|ML|mL|毫升|克)[*×](\d+)', spec_str)
        if match:
            return int(match.group(1))

        # Three-level spec "1*5*10": the last number is used.
        match = re.search(r'(\d+(?:\.\d+)?)[*×](\d+(?:\.\d+)?)[*×](\d+(?:\.\d+)?)', spec_str)
        if match:
            return int(float(match.group(3)))

        # Two-level spec "1*15": the second number is used.
        match = re.search(r'(\d+(?:\.\d+)?)[*×](\d+(?:\.\d+)?)', spec_str)
        if match:
            return int(float(match.group(2)))

        # "24瓶/件" style.
        match = re.search(r'(\d+(?:\.\d+)?)[瓶个支袋][//](件|箱)', spec_str)
        if match:
            return int(float(match.group(1)))

        # Litre volume, optionally followed by a count ("4L", "4L*6").
        match = re.search(r'(\d+(?:\.\d+)?)\s*[Ll升][*×]?(\d+(?:\.\d+)?)?', spec_str)
        if match:
            # No trailing count means a single container.
            return int(float(match.group(2))) if match.group(2) else 1

        # "<n><unit>" optionally followed by 装 or /箱, e.g. "12瓶装".
        match = re.search(r'(\d+(?:\.\d+)?)[瓶个支袋包盒罐箱](?:装|\/箱)?', spec_str)
        if match:
            return int(float(match.group(1)))

        # Last resort: look at every number in the string.
        numbers = re.findall(r'\d+(?:\.\d+)?', spec_str)
        if numbers:
            if len(numbers) == 1:
                return int(float(numbers[0]))

            # Prefer a value that is a typical per-case count.
            for num in numbers:
                if float(num) in [6.0, 12.0, 24.0, 30.0]:
                    return int(float(num))

            # Otherwise take the last number.
            return int(float(numbers[-1]))

    except Exception as e:
        # Best-effort parser: any failure is logged and reported as None.
        logger.warning(f"解析规格'{spec_str}'时出错: {e}")

    return None
|
||||
@@ -0,0 +1,259 @@
|
||||
"""
|
||||
数据验证器模块
|
||||
----------
|
||||
提供对商品数据的验证和修复功能
|
||||
"""
|
||||
|
||||
import re
|
||||
import logging
|
||||
from typing import Dict, Any, Optional, List, Tuple, Union
|
||||
|
||||
from ..utils.log_utils import get_logger
|
||||
from ..utils.string_utils import parse_monetary_string
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class ProductValidator:
    """Validate and repair individual product fields (barcode, quantity, price)."""

    def __init__(self):
        """Set up the list of cell values that mark a warehouse row."""
        # A "barcode" cell holding one of these is a warehouse label,
        # not a product barcode.
        self.warehouse_identifiers = ["仓库", "仓库全名", "warehouse"]

    @staticmethod
    def _is_nan(value: Any) -> bool:
        """Return True when ``value`` is a float NaN (pandas' empty cell)."""
        # NaN is the only float that is not equal to itself.
        return isinstance(value, float) and value != value

    def validate_barcode(self, barcode: Any) -> Tuple[bool, str, Optional[str]]:
        """Validate a barcode and repair known defects.

        Repairs applied, in order: integral floats and ``"<digits>.0"`` text
        lose their decimal part; non-digit characters are removed; a leading
        ``5`` (but not ``53``) on codes longer than 8 digits becomes ``6``;
        trailing zeros are trimmed from codes longer than 13 digits.

        Args:
            barcode: Raw barcode value of any type.

        Returns:
            ``(is_valid, barcode_text, error_message)``. ``error_message``
            is None when the barcode is valid.
        """
        if barcode is None:
            return False, "", "条码为空"

        # Numeric spreadsheet cells arrive as floats. Without this step
        # 12345678.0 would become "123456780", because the digit of ".0"
        # survives the non-digit stripping below.
        if isinstance(barcode, float) and barcode.is_integer():
            barcode = int(barcode)

        barcode_str = str(barcode).strip()

        if barcode_str in self.warehouse_identifiers:
            return False, barcode_str, "条码为仓库标识"

        # Same repair for text that already looks like "6901234567890.0".
        digits_source = re.sub(r'^(\d+)\.0+$', r'\1', barcode_str)
        barcode_clean = re.sub(r'\D', '', digits_source)

        if not barcode_clean:
            return False, barcode_str, "条码不包含数字"

        # Known misread: a leading 5 that should be 6 ("53..." is left alone).
        if len(barcode_clean) > 8 and barcode_clean.startswith('5') and not barcode_clean.startswith('53'):
            original_barcode = barcode_clean
            barcode_clean = '6' + barcode_clean[1:]
            logger.info(f"修正条码前缀 5->6: {original_barcode} -> {barcode_clean}")

        # Over-long codes: drop trailing zeros down to 13 digits if possible.
        if len(barcode_clean) > 13:
            original_length = len(barcode_clean)
            if barcode_clean.endswith('0'):
                while len(barcode_clean) > 13 and barcode_clean.endswith('0'):
                    barcode_clean = barcode_clean[:-1]
                logger.info(f"修正条码长度: 从{original_length}位截断到{len(barcode_clean)}位")
            else:
                error_message = f"条码长度异常: {barcode_clean}, 长度={len(barcode_clean)}"
                logger.warning(error_message)
                return False, barcode_clean, error_message

        if len(barcode_clean) < 8 or len(barcode_clean) > 13:
            error_message = f"条码长度异常: {barcode_clean}, 长度={len(barcode_clean)}"
            logger.warning(error_message)
            return False, barcode_clean, error_message

        # Defensive: everything but digits was stripped above.
        if not barcode_clean.isdigit():
            error_message = f"条码包含非数字字符: {barcode_clean}"
            logger.warning(error_message)
            return False, barcode_clean, error_message

        # One specific barcode is explicitly allowed and logged at info level.
        if barcode_clean == "5321545613":
            logger.info(f"特殊条码验证通过: {barcode_clean}")
            return True, barcode_clean, None

        logger.debug(f"条码验证通过: {barcode_clean}")
        return True, barcode_clean, None

    def validate_quantity(self, quantity: Any) -> Tuple[bool, float, Optional[str]]:
        """Validate a quantity and convert it to float.

        Args:
            quantity: Raw quantity. Strings are stripped of everything but
                digits and dots before conversion.

        Returns:
            ``(is_valid, quantity, error_message)``. The quantity is 0.0
            whenever ``is_valid`` is False. None and NaN both report
            ``"数量为空"``.
        """
        # NaN must be caught explicitly: "nan <= 0" is False, so it would
        # otherwise be accepted as a valid positive quantity.
        if quantity is None or self._is_nan(quantity):
            return False, 0.0, "数量为空"

        if isinstance(quantity, str):
            quantity_clean = re.sub(r'[^\d\.]', '', quantity.strip())
            if not quantity_clean:
                return False, 0.0, "数量不包含数字"
            try:
                quantity_value = float(quantity_clean)
            except ValueError:
                return False, 0.0, f"无法将数量 '{quantity}' 转换为数字"
        else:
            try:
                quantity_value = float(quantity)
            except (ValueError, TypeError):
                return False, 0.0, f"无法将数量 '{quantity}' 转换为数字"

        # Conversion itself can yield NaN (e.g. from a non-float NaN type).
        if self._is_nan(quantity_value):
            return False, 0.0, "数量为空"

        if quantity_value <= 0:
            return False, 0.0, f"数量必须大于0,当前值: {quantity_value}"

        return True, quantity_value, None

    def validate_price(self, price: Any) -> Tuple[bool, float, bool, Optional[str]]:
        """Validate a unit price and detect gift items.

        Args:
            price: Raw unit price. Strings are parsed with
                ``parse_monetary_string``.

        Returns:
            ``(is_valid, price, is_gift, error_message)``. A zero price or a
            gift marker is valid with ``is_gift`` True. A missing (None or
            NaN), unparseable or negative price is invalid and also
            reported as a gift.
        """
        # NaN compares False against both "== 0" and "< 0", so without this
        # check it would be returned as a valid, non-gift price.
        if price is None or self._is_nan(price):
            return False, 0.0, True, "单价为空,视为赠品"

        if isinstance(price, str):
            price_str = price.strip().lower()
            if price_str in ["赠品", "gift", "赠送", "0", ""]:
                return True, 0.0, True, None

            price_value = parse_monetary_string(price_str)
            if price_value is None:
                return False, 0.0, True, f"无法将单价 '{price}' 转换为数字,视为赠品"
        else:
            try:
                price_value = float(price)
            except (ValueError, TypeError):
                return False, 0.0, True, f"无法将单价 '{price}' 转换为数字,视为赠品"

        if self._is_nan(price_value):
            return False, 0.0, True, "单价为空,视为赠品"

        if price_value == 0:
            return True, 0.0, True, None

        if price_value < 0:
            return False, 0.0, True, f"单价不能为负数: {price_value},视为赠品"

        return True, price_value, False, None

    def validate_product(self, product: Dict[str, Any]) -> Dict[str, Any]:
        """Validate and repair one product record.

        Args:
            product: Product dict; the keys read are ``barcode``, ``price``,
                ``amount`` and ``quantity``.

        Returns:
            A shallow copy of ``product`` with repaired ``barcode``,
            ``price`` and ``quantity``, plus ``is_gift`` set to True when
            the price or the amount marks the row as a gift.
        """
        # Work on a copy so the caller's dict is not modified.
        validated_product = product.copy()

        # Barcode: a repaired value is kept even when validation fails.
        is_valid, fixed_barcode, error_msg = self.validate_barcode(product.get('barcode', ''))
        if is_valid:
            validated_product['barcode'] = fixed_barcode
        else:
            logger.warning(f"条码验证失败: {error_msg}")
            if fixed_barcode:
                validated_product['barcode'] = fixed_barcode

        # Price.
        is_valid, fixed_price, is_gift, error_msg = self.validate_price(product.get('price', 0))
        validated_product['price'] = fixed_price
        if is_gift:
            validated_product['is_gift'] = True
            if error_msg:
                logger.info(error_msg)

        # Amount: an unparseable or zero amount also marks the row as a gift.
        # NOTE(review): a row with no 'amount' key is flagged as a gift if
        # parse_monetary_string(None) returns None - confirm this is intended.
        try:
            parsed_amount = parse_monetary_string(product.get('amount', None))
            if parsed_amount is None or parsed_amount == 0.0:
                validated_product['is_gift'] = True
        except Exception as exc:
            # Best-effort check: keep going, but leave a trace of the failure.
            logger.debug(f"解析金额失败,跳过赠品判断: {exc}")

        # Quantity.
        is_valid, fixed_quantity, error_msg = self.validate_quantity(product.get('quantity', None))

        # Missing quantity can be recovered as amount / price.
        if not is_valid and error_msg == "数量为空":
            amount = product.get('amount', None)
            if fixed_price > 0 and amount is not None:
                try:
                    amount = parse_monetary_string(amount)
                    if amount is None:
                        raise ValueError("无法解析金额")

                    if amount > 0:
                        calculated_quantity = amount / fixed_price
                        logger.info(f"数量为空,通过金额({amount})和单价({fixed_price})计算得出数量: {calculated_quantity}")
                        validated_product['quantity'] = calculated_quantity
                        is_valid = True
                except (ValueError, TypeError, ZeroDivisionError) as e:
                    logger.warning(f"通过金额和单价计算数量失败: {e}")

        if is_valid:
            # fixed_quantity is 0.0 when the value was derived from the
            # amount; in that case keep what was stored above.
            validated_product['quantity'] = fixed_quantity if fixed_quantity > 0 else validated_product.get('quantity', 0)
        else:
            logger.warning(f"数量验证失败: {error_msg}")
            validated_product['quantity'] = 0.0

        return validated_product
|
||||
@@ -0,0 +1,9 @@
|
||||
"""
|
||||
数据处理handlers模块初始化文件
|
||||
"""
|
||||
|
||||
from .data_cleaner import DataCleaner
|
||||
from .column_mapper import ColumnMapper
|
||||
from .calculator import DataCalculator
|
||||
|
||||
__all__ = ['DataCleaner', 'ColumnMapper', 'DataCalculator']
|
||||
@@ -0,0 +1,378 @@
|
||||
"""
|
||||
数据计算处理器
|
||||
|
||||
提供各种数据计算功能,如数量计算、价格计算、汇总统计等
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from typing import Dict, Any, Optional, List, Union
|
||||
from ...core.utils.log_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class DataCalculator:
    """Rule-driven column calculator for pandas data frames.

    Rules are queued with ``add_rule`` (or the chainable convenience
    methods at the bottom of the class) and executed in order by
    ``calculate``.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Create a calculator with an empty rule list.

        Args:
            config: Optional configuration dict; stored as ``self.config``.
        """
        self.config = config or {}
        self.calculation_rules = []

    def add_rule(self, rule_type: str, **kwargs):
        """Queue a calculation rule.

        Args:
            rule_type: Rule name understood by ``_apply_rule``.
            **kwargs: Rule parameters, stored alongside the type.
        """
        rule = {'type': rule_type, **kwargs}
        self.calculation_rules.append(rule)
        logger.debug(f"添加计算规则: {rule_type}")

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """Run every queued rule against a copy of ``df``.

        A rule that raises is logged and skipped; later rules still run.

        Args:
            df: Input data; it is copied, not modified.

        Returns:
            The data frame produced by the last successful rule.
        """
        logger.info(f"开始数据计算,原始数据形状: {df.shape}")

        result_df = df.copy()
        total = len(self.calculation_rules)

        for position, rule in enumerate(self.calculation_rules, start=1):
            try:
                logger.debug(f"执行计算规则 {position}/{total}: {rule['type']}")
                result_df = self._apply_rule(result_df, rule)
                logger.debug(f"规则执行完成,数据形状: {result_df.shape}")
            except Exception as e:
                # One broken rule must not abort the whole pipeline.
                logger.error(f"计算规则执行失败: {rule}, 错误: {e}")
                continue

        logger.info(f"数据计算完成,最终数据形状: {result_df.shape}")
        return result_df

    def _apply_rule(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Dispatch one rule to its handler.

        Args:
            df: Data to transform.
            rule: Rule dict; ``rule['type']`` selects the handler.

        Returns:
            The handler's result, or ``df`` unchanged for an unknown type.
        """
        handlers = {
            'multiply': self._multiply,
            'divide': self._divide,
            'add': self._add,
            'subtract': self._subtract,
            'formula': self._formula,
            'round': self._round,
            'sum': self._sum,
            'aggregate': self._aggregate,
        }
        rule_type = rule.get('type')
        handler = handlers.get(rule_type)
        if handler is None:
            logger.warning(f"未知的计算规则类型: {rule_type}")
            return df
        return handler(df, rule)

    def _multiply(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Write ``source_column * factor`` into ``target_column``.

        Args:
            df: Data to transform (modified in place).
            rule: Uses ``source_column``, ``target_column``, ``factor``
                (default 1).

        Returns:
            ``df``.
        """
        source_column = rule.get('source_column')
        target_column = rule.get('target_column')
        factor = rule.get('factor', 1)

        if source_column and target_column:
            if source_column in df.columns:
                df[target_column] = df[source_column] * factor
                logger.debug(f"乘法计算: {source_column} * {factor} -> {target_column}")
            else:
                logger.warning(f"源列不存在: {source_column}")

        return df

    def _divide(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Write ``source_column / divisor`` into ``target_column``.

        Args:
            df: Data to transform (modified in place).
            rule: Uses ``source_column``, ``target_column``, ``divisor``
                (default 1). A zero divisor is logged and ignored.

        Returns:
            ``df``.
        """
        source_column = rule.get('source_column')
        target_column = rule.get('target_column')
        divisor = rule.get('divisor', 1)

        if source_column and target_column and divisor != 0:
            if source_column in df.columns:
                df[target_column] = df[source_column] / divisor
                logger.debug(f"除法计算: {source_column} / {divisor} -> {target_column}")
            else:
                logger.warning(f"源列不存在: {source_column}")
        elif divisor == 0:
            logger.error("除数不能为0")

        return df

    def _add(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Row-wise sum of columns plus a constant, or add a constant in place.

        Args:
            df: Data to transform (modified in place).
            rule: Uses ``columns`` (name or list), ``target_column`` and
                ``constant`` (default 0). With no ``columns`` the constant
                is added to an existing ``target_column``.

        Returns:
            ``df``.
        """
        columns = rule.get('columns', [])
        target_column = rule.get('target_column')
        constant = rule.get('constant', 0)

        if target_column:
            if isinstance(columns, str):
                columns = [columns]

            if columns:
                valid_columns = [col for col in columns if col in df.columns]
                if valid_columns:
                    df[target_column] = df[valid_columns].sum(axis=1) + constant
                    logger.debug(f"加法计算: {valid_columns} + {constant} -> {target_column}")
                else:
                    logger.warning(f"没有有效的列用于加法计算: {columns}")
            elif target_column in df.columns:
                df[target_column] = df[target_column] + constant
                logger.debug(f"加法计算: {target_column} + {constant}")
            else:
                logger.warning(f"目标列不存在: {target_column}")

        return df

    def _subtract(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Write ``minuend - subtrahend - constant`` into ``target_column``.

        Args:
            df: Data to transform (modified in place).
            rule: Uses ``minuend`` (column), optional ``subtrahend``
                (column), ``target_column`` and ``constant`` (default 0).
                A missing or unknown subtrahend is treated as absent.

        Returns:
            ``df``.
        """
        minuend = rule.get('minuend')
        subtrahend = rule.get('subtrahend')
        target_column = rule.get('target_column')
        constant = rule.get('constant', 0)

        if target_column and minuend and minuend in df.columns:
            if subtrahend and subtrahend in df.columns:
                df[target_column] = df[minuend] - df[subtrahend] - constant
                logger.debug(f"减法计算: {minuend} - {subtrahend} - {constant} -> {target_column}")
            else:
                df[target_column] = df[minuend] - constant
                logger.debug(f"减法计算: {minuend} - {constant} -> {target_column}")
        else:
            logger.warning("减法计算参数不完整或列不存在")

        return df

    def _formula(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Evaluate ``formula`` with ``DataFrame.eval`` into ``target_column``.

        Args:
            df: Data to transform (modified in place).
            rule: Uses ``formula`` (expression string) and ``target_column``.

        Returns:
            ``df``; evaluation errors are logged and leave it unchanged.
        """
        formula = rule.get('formula')
        target_column = rule.get('target_column')

        if formula and target_column:
            try:
                # NOTE(security): DataFrame.eval executes the expression
                # text; formulas must come from trusted configuration only.
                df[target_column] = df.eval(formula)
                logger.debug(f"公式计算: {formula} -> {target_column}")
            except Exception as e:
                logger.error(f"公式计算失败: {formula}, 错误: {e}")
        else:
            logger.warning("公式计算缺少公式或目标列")

        return df

    def _round(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Round numeric columns.

        Args:
            df: Data to transform (modified in place).
            rule: Uses ``columns`` (name or list; empty means every numeric
                column) and ``decimals`` (default 0).

        Returns:
            ``df``.
        """
        columns = rule.get('columns', [])
        decimals = rule.get('decimals', 0)

        if isinstance(columns, str):
            columns = [columns]

        target_columns = columns or df.select_dtypes(include=[np.number]).columns

        for col in target_columns:
            # Non-numeric or missing columns are skipped silently.
            if col in df.columns and pd.api.types.is_numeric_dtype(df[col]):
                df[col] = df[col].round(decimals)
                logger.debug(f"四舍五入: {col} 保留 {decimals} 位小数")

        return df

    def _sum(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Sum columns row-wise, optionally totalled per group.

        Without ``group_by`` the row-wise sum of the selected columns is
        written to ``target_column``. With ``group_by`` each row receives
        the total of those row sums over its group, so the frame keeps its
        shape. Nothing is written when ``target_column`` is missing.

        Args:
            df: Data to transform (modified in place).
            rule: Uses ``columns`` (name or list; empty means every numeric
                column), ``target_column`` and optional ``group_by``.

        Returns:
            ``df``.
        """
        columns = rule.get('columns', [])
        target_column = rule.get('target_column')
        group_by = rule.get('group_by')

        if isinstance(columns, str):
            columns = [columns]

        if group_by and group_by in df.columns:
            if columns:
                value_columns = [col for col in columns if col in df.columns]
                label = value_columns
            else:
                # Every numeric column except the grouping key itself.
                numeric_columns = df.select_dtypes(include=[np.number]).columns
                value_columns = [col for col in numeric_columns if col != group_by]
                label = "所有数值列"

            if not target_column:
                # There is nowhere to store the result; say so instead of
                # computing a value and throwing it away.
                logger.warning("分组求和缺少目标列,结果未写入")
            elif not value_columns:
                logger.warning(f"没有有效的列用于分组求和: {columns}")
            else:
                row_totals = df[value_columns].sum(axis=1)
                # transform('sum') broadcasts each group's total to its rows.
                df[target_column] = row_totals.groupby(df[group_by]).transform('sum')
                logger.debug(f"分组求和: {label} 按 {group_by} 分组 -> {target_column}")
        elif columns:
            valid_columns = [col for col in columns if col in df.columns]
            if valid_columns and target_column:
                df[target_column] = df[valid_columns].sum(axis=1)
                logger.debug(f"求和计算: {valid_columns} -> {target_column}")
        else:
            numeric_columns = df.select_dtypes(include=[np.number]).columns
            if target_column and len(numeric_columns) > 0:
                df[target_column] = df[numeric_columns].sum(axis=1)
                logger.debug(f"求和计算: {list(numeric_columns)} -> {target_column}")

        return df

    def _aggregate(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Group by a column and aggregate others.

        Args:
            df: Data to aggregate.
            rule: Uses ``group_by`` (column) and ``aggregations`` (column ->
                function name or list of names). Unknown columns and
                entries of other types are ignored.

        Returns:
            The grouped frame with the index reset, or ``df`` unchanged when
            nothing can be aggregated.
        """
        group_by = rule.get('group_by')
        aggregations = rule.get('aggregations', {})

        if group_by and group_by in df.columns:
            agg_dict = {
                column: func
                for column, func in aggregations.items()
                if column in df.columns and isinstance(func, (str, list))
            }

            if agg_dict:
                result = df.groupby(group_by).agg(agg_dict)
                logger.debug(f"聚合计算: 按 {group_by} 分组, 聚合: {agg_dict}")
                return result.reset_index()

        return df

    # Chainable convenience methods: each queues one rule and returns self.

    def multiply(self, source_column: str, target_column: str, factor: float):
        """Queue a ``multiply`` rule."""
        self.add_rule('multiply', source_column=source_column,
                      target_column=target_column, factor=factor)
        return self

    def divide(self, source_column: str, target_column: str, divisor: float):
        """Queue a ``divide`` rule."""
        self.add_rule('divide', source_column=source_column,
                      target_column=target_column, divisor=divisor)
        return self

    def add(self, columns: Union[str, List[str]], target_column: str, constant: float = 0):
        """Queue an ``add`` rule."""
        self.add_rule('add', columns=columns, target_column=target_column, constant=constant)
        return self

    def subtract(self, minuend: str, target_column: str,
                 subtrahend: Optional[str] = None, constant: float = 0):
        """Queue a ``subtract`` rule."""
        self.add_rule('subtract', minuend=minuend, target_column=target_column,
                      subtrahend=subtrahend, constant=constant)
        return self

    def formula(self, formula: str, target_column: str):
        """Queue a ``formula`` rule."""
        self.add_rule('formula', formula=formula, target_column=target_column)
        return self

    def round_columns(self, columns: Optional[Union[str, List[str]]] = None, decimals: int = 0):
        """Queue a ``round`` rule."""
        self.add_rule('round', columns=columns, decimals=decimals)
        return self

    def sum_columns(self, columns: Optional[Union[str, List[str]]] = None,
                    target_column: Optional[str] = None, group_by: Optional[str] = None):
        """Queue a ``sum`` rule."""
        self.add_rule('sum', columns=columns, target_column=target_column, group_by=group_by)
        return self

    def aggregate(self, group_by: str, aggregations: Dict[str, Union[str, List[str]]]):
        """Queue an ``aggregate`` rule."""
        self.add_rule('aggregate', group_by=group_by, aggregations=aggregations)
        return self
|
||||
@@ -0,0 +1,382 @@
|
||||
"""
|
||||
列映射处理器
|
||||
|
||||
提供列名映射和转换功能,支持不同供应商的列名标准化
|
||||
"""
|
||||
|
||||
import re
|
||||
import pandas as pd
|
||||
from typing import Dict, Any, Optional, List, Union
|
||||
from ...core.utils.log_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class ColumnMapper:
|
||||
"""列映射处理器
|
||||
|
||||
提供列名标准化功能,将不同供应商的列名映射到标准列名
|
||||
"""
|
||||
|
||||
# 标准列名定义(所有列名别名的唯一来源)
|
||||
STANDARD_COLUMNS = {
|
||||
'barcode': [
|
||||
'条码', '条形码', '商品条码', '商品条形码', '产品条码', '商品编码',
|
||||
'商品编号', '条码(必填)', '电脑条码', '条码ID',
|
||||
'barcode', 'Barcode', 'BarCode', 'code', '编码',
|
||||
],
|
||||
'name': [
|
||||
'商品名称', '产品名称', '名称', '商品', '产品', '商品名', '品名',
|
||||
'品项名', '商品或服务名称', '品项', '名 称',
|
||||
'name', 'product_name',
|
||||
],
|
||||
'specification': [
|
||||
'规格', '规格型号', '型号', '商品规格', '产品规格', '包装规格', '规 格',
|
||||
'specification', 'spec', 'model',
|
||||
],
|
||||
'quantity': [
|
||||
'数量', '采购量', '订货数量', '订单量', '需求量', '采购数量', '购买数量',
|
||||
'订单数量', '数量(必填)', '采购量(必填)', '入库数', '入库数量', '数 量',
|
||||
'quantity', 'qty',
|
||||
],
|
||||
'unit': [
|
||||
'单位', '计量单位', '采购单位', '单位(必填)', '单位名称', '计价单位', '单 位',
|
||||
'unit', 'units',
|
||||
],
|
||||
'unit_price': [
|
||||
'单价', '价格', '采购单价', '进货价', '销售价', '采购价', '参考价',
|
||||
'入库单价', '单价(必填)', '采购单价(必填)', '价格(必填)', '单 价',
|
||||
'unit_price', 'price',
|
||||
],
|
||||
'total_price': [
|
||||
'总价', '金额', '小计', '合计金额', '小计金额', '金额(元)',
|
||||
'金额合计', '合计', '总额',
|
||||
'total_price', 'total', 'amount',
|
||||
],
|
||||
'gift_quantity': [
|
||||
'赠送量', '赠品数量', '赠送数量', '赠品',
|
||||
],
|
||||
'category': ['类别', '分类', '商品类别', 'category', 'type'],
|
||||
'brand': ['品牌', '商标', 'brand'],
|
||||
'supplier': ['供应商', '供货商', 'supplier', 'vendor'],
|
||||
}
|
||||
|
||||
def __init__(self, mapping_config: Optional[Dict[str, Any]] = None):
    """Create a mapper, optionally seeded with extra column aliases.

    Args:
        mapping_config: Optional dict of standard column name -> custom
            alias (string) or aliases (list). A falsy value becomes ``{}``.
    """
    self.mapping_config = mapping_config if mapping_config else {}
    # Filled by _build_reverse_mapping / add_custom_mapping.
    self.custom_mappings = {}
    self._build_reverse_mapping()
|
||||
|
||||
def _build_reverse_mapping(self):
|
||||
"""构建反向映射表"""
|
||||
self.reverse_mapping = {}
|
||||
|
||||
# 添加标准列的反向映射
|
||||
for standard_name, variations in self.STANDARD_COLUMNS.items():
|
||||
for variation in variations:
|
||||
self.reverse_mapping[variation.lower()] = standard_name
|
||||
|
||||
# 添加自定义映射
|
||||
for standard_name, custom_names in self.mapping_config.items():
|
||||
if isinstance(custom_names, str):
|
||||
custom_names = [custom_names]
|
||||
|
||||
for custom_name in custom_names:
|
||||
self.reverse_mapping[custom_name.lower()] = standard_name
|
||||
self.custom_mappings[custom_name.lower()] = standard_name
|
||||
|
||||
def map_columns(self, df: pd.DataFrame, target_columns: Optional[List[str]] = None) -> pd.DataFrame:
    """Rename the columns of ``df`` to standard names.

    Each target is matched against the columns no earlier target has
    already claimed, so one source column is never assigned to two targets.

    Args:
        df: Input data.
        target_columns: Standard names to produce; None means every key of
            ``STANDARD_COLUMNS``.

    Returns:
        A frame holding exactly ``target_columns`` (unmatched targets are
        filled with ``_get_default_value``). When no target matches at all,
        ``df`` is returned unchanged.
    """
    if target_columns is None:
        target_columns = list(self.STANDARD_COLUMNS.keys())

    logger.info(f"开始列名映射,目标列: {target_columns}")
    logger.info(f"原始列名: {list(df.columns)}")

    # Source column label -> standard name.
    column_mapping = {}
    used_columns = set()

    for target_col in target_columns:
        # Offer only unclaimed columns. Otherwise a later target could
        # fuzzy-match an already mapped column and silently overwrite its
        # entry in column_mapping.
        available = [col for col in df.columns if col not in used_columns]
        matched_column = self._find_matching_column(available, target_col)
        if matched_column:
            column_mapping[matched_column] = target_col
            used_columns.add(matched_column)
            logger.debug(f"列名映射: {matched_column} -> {target_col}")

    if not column_mapping:
        logger.warning("没有找到可映射的列名")
        return df

    df_mapped = df.rename(columns=column_mapping)

    # Targets with no source column get a typed default.
    for target_col in target_columns:
        if target_col not in df_mapped.columns:
            df_mapped[target_col] = self._get_default_value(target_col)
            logger.debug(f"添加缺失列: {target_col}")

    # Keep the targets only, in the requested order.
    existing_target_columns = [col for col in target_columns if col in df_mapped.columns]
    df_result = df_mapped[existing_target_columns]

    logger.info(f"列名映射完成,结果列名: {list(df_result.columns)}")
    return df_result
|
||||
|
||||
def _find_matching_column(self, columns: List[str], target_column: str) -> Optional[str]:
|
||||
"""查找匹配的列名
|
||||
|
||||
Args:
|
||||
columns: 原始列名列表
|
||||
target_column: 目标标准列名
|
||||
|
||||
Returns:
|
||||
匹配的原始列名或None
|
||||
"""
|
||||
# 获取目标列的所有可能变体
|
||||
possible_names = []
|
||||
|
||||
# 标准列名变体
|
||||
if target_column in self.STANDARD_COLUMNS:
|
||||
possible_names.extend(self.STANDARD_COLUMNS[target_column])
|
||||
|
||||
# 自定义映射
|
||||
for standard_name, custom_names in self.mapping_config.items():
|
||||
if standard_name == target_column:
|
||||
if isinstance(custom_names, str):
|
||||
possible_names.append(custom_names)
|
||||
else:
|
||||
possible_names.extend(custom_names)
|
||||
|
||||
# 查找匹配
|
||||
for possible_name in possible_names:
|
||||
# 精确匹配(忽略大小写)
|
||||
for column in columns:
|
||||
if column.lower() == possible_name.lower():
|
||||
return column
|
||||
|
||||
# 模糊匹配
|
||||
for column in columns:
|
||||
if possible_name.lower() in column.lower() or column.lower() in possible_name.lower():
|
||||
return column
|
||||
|
||||
return None
|
||||
|
||||
def _get_default_value(self, column_name: str) -> Any:
    """Return the placeholder used to fill a standard column.

    Args:
        column_name: Standard column name.

    Returns:
        ``0`` for the numeric columns (quantity and prices), ``''`` for the
        known text columns, and ``None`` for any other name.
    """
    numeric_columns = ('quantity', 'unit_price', 'total_price')
    text_columns = ('barcode', 'name', 'specification', 'unit', 'category', 'brand', 'supplier')

    if column_name in numeric_columns:
        return 0
    return '' if column_name in text_columns else None
|
||||
|
||||
def add_custom_mapping(self, standard_name: str, custom_names: Union[str, List[str]]):
    """Register custom header names for a standard column.

    The entry for ``standard_name`` in ``self.mapping_config`` is replaced by
    the given names, and each name (lower-cased) is recorded in both
    ``self.reverse_mapping`` and ``self.custom_mappings``.

    Args:
        standard_name: Standard column name.
        custom_names: One custom header name or a list of them.
    """
    # Normalise the single-name form to a one-element list.
    custom_names = [custom_names] if isinstance(custom_names, str) else custom_names

    self.mapping_config[standard_name] = custom_names

    for alias in custom_names:
        key = alias.lower()
        self.reverse_mapping[key] = standard_name
        self.custom_mappings[key] = standard_name

    logger.info(f"添加自定义映射: {standard_name} <- {custom_names}")
|
||||
|
||||
def detect_column_types(self, df: pd.DataFrame) -> Dict[str, str]:
    """Classify every column of ``df`` by its dtype.

    Args:
        df: Data to inspect.

    Returns:
        Mapping of column label to one of ``'boolean'``, ``'numeric'``,
        ``'datetime'`` or ``'text'``.
    """
    column_types = {}

    for column in df.columns:
        series = df[column]
        # Bool must be tested before numeric: pandas reports bool dtype as
        # numeric, so the boolean branch would otherwise never be reached.
        if pd.api.types.is_bool_dtype(series):
            column_types[column] = 'boolean'
        elif pd.api.types.is_numeric_dtype(series):
            column_types[column] = 'numeric'
        elif pd.api.types.is_datetime64_any_dtype(series):
            column_types[column] = 'datetime'
        else:
            column_types[column] = 'text'

    return column_types
|
||||
|
||||
def suggest_column_mapping(self, df: pd.DataFrame) -> Dict[str, List[str]]:
    """Suggest standard column names for each column of ``df``.

    A standard name is suggested when the lower-cased column label contains,
    or is contained in, one of its variants in ``self.STANDARD_COLUMNS`` or
    one of the aliases in ``self.custom_mappings``.

    Args:
        df: Data whose column labels should be analysed. Non-string labels
            are compared through ``str()``; blank labels are skipped.

    Returns:
        Mapping of column label to the suggested standard names, in the order
        they were found and without duplicates. Columns with no suggestion
        are omitted.
    """
    suggestions = {}

    for column in df.columns:
        column_lower = str(column).lower()
        if not column_lower.strip():
            # A blank label is a substring of everything; it carries no signal.
            continue

        found = []

        # Built-in variants.
        for standard_name, variations in self.STANDARD_COLUMNS.items():
            for variation in variations:
                variation_lower = variation.lower()
                if column_lower in variation_lower or variation_lower in column_lower:
                    found.append(standard_name)
                    break

        # Custom aliases (the visible writer stores these keys lower-cased).
        for custom_name, standard_name in self.custom_mappings.items():
            if not custom_name:
                continue
            if column_lower in custom_name or custom_name in column_lower:
                found.append(standard_name)

        # De-duplicate while keeping first-seen order so results are stable.
        unique = list(dict.fromkeys(found))
        if unique:
            suggestions[column] = unique

    return suggestions
|
||||
|
||||
def validate_mapping(self, df: pd.DataFrame, required_columns: List[str]) -> Dict[str, Any]:
    """Validate a DataFrame after column mapping.

    Args:
        df: Mapped data.
        required_columns: Column names that must be present.

    Returns:
        Dict with the keys ``valid`` (False when a required column is
        missing), ``missing_columns``, ``empty_columns`` (columns whose
        values are all null) and ``warnings`` (one message per empty column,
        then one per quantity/price column that is not numeric).
    """
    missing = [col for col in required_columns if col not in df.columns]
    empty = [col for col in df.columns if df[col].isnull().all()]

    warnings = [f"列 '{col}' 全部为空值" for col in empty]
    for col in ('quantity', 'unit_price', 'total_price'):
        if col in df.columns and not pd.api.types.is_numeric_dtype(df[col]):
            warnings.append(f"列 '{col}' 不是数值类型")

    return {
        'valid': not missing,
        'missing_columns': missing,
        'empty_columns': empty,
        'warnings': warnings,
    }
|
||||
|
||||
@classmethod
def find_column(cls, columns: List[str], standard_name: str) -> Optional[str]:
    """Find the column that matches a standard column name.

    Three strategies are tried in order, each over every column before the
    next one starts: exact match, match ignoring all whitespace, and
    case-insensitive "candidate is contained in the column name".

    Args:
        columns: Actual column labels.
        standard_name: Standard column name (a key of ``STANDARD_COLUMNS``).

    Returns:
        The matched label converted with ``str()``, or ``None`` when the
        standard name has no candidates or nothing matches.
    """
    candidates = cls.STANDARD_COLUMNS.get(standard_name, [])
    if not candidates:
        return None

    def squash(text):
        # Remove every run of whitespace.
        return re.sub(r'\s+', '', text)

    strategies = (
        lambda col, cand: col.strip() == cand,
        lambda col, cand: squash(col.strip()) == squash(cand),
        lambda col, cand: cand.lower() in col.strip().lower(),
    )

    labels = [str(c) for c in columns]
    for is_match in strategies:
        for label in labels:
            if any(is_match(label, cand) for cand in candidates):
                return label

    return None
|
||||
|
||||
@staticmethod
def detect_header_row(df: pd.DataFrame, max_rows: int = 10, min_matches: int = 3) -> int:
    """Locate the header row of a sheet.

    Scans the first ``max_rows`` rows and counts, per row, how many header
    keywords occur in at least one cell. The row with the highest count wins;
    on a tie the earliest row is kept.

    Args:
        df: Data frame to scan.
        max_rows: Maximum number of leading rows to inspect.
        min_matches: Minimum keyword count a row needs to qualify.

    Returns:
        Positional index of the header row, or ``-1`` when no row qualifies.
    """
    header_keywords = [
        '条码', '条形码', '商品条码', '商品名称', '名称', '规格',
        '单价', '数量', '金额', '单位', '必填', '编码',
    ]

    best_row, best_matches = -1, 0
    scan_limit = min(max_rows, len(df))

    for row_idx in range(scan_limit):
        cells = [str(val) for val in df.iloc[row_idx].astype(str).values]

        matches = 0
        for kw in header_keywords:
            if any(kw in cell for cell in cells):
                matches += 1

        # Keep only rows that qualify and strictly beat the current best.
        if matches < min_matches or matches <= best_matches:
            continue
        best_row, best_matches = row_idx, matches

    return best_row
|
||||
@@ -0,0 +1,401 @@
|
||||
"""
|
||||
数据清洗处理器
|
||||
|
||||
提供各种数据清洗功能,如空值处理、重复项处理、数据类型转换等
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from typing import Dict, Any, Optional, List, Union
|
||||
from ...core.utils.log_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class DataCleaner:
    """Rule-based DataFrame cleaner with a chainable configuration API.

    Rules are queued with :meth:`add_rule` or one of the fluent shortcuts
    (``remove_duplicates``, ``fill_na``, ...) and executed in insertion order
    by :meth:`clean`.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Create a cleaner with an empty rule list.

        Args:
            config: Optional cleaning configuration, stored on ``self.config``
                (an empty dict when omitted).
        """
        self.config = config or {}
        self.cleaning_rules = []

    def add_rule(self, rule_type: str, **kwargs):
        """Append a cleaning rule.

        Args:
            rule_type: Rule type name understood by :meth:`_apply_rule`.
            **kwargs: Rule parameters, stored next to the type.
        """
        rule = {'type': rule_type, **kwargs}
        self.cleaning_rules.append(rule)
        logger.debug(f"添加清洗规则: {rule_type}")

    def clean(self, df: pd.DataFrame) -> pd.DataFrame:
        """Run every queued rule against a copy of ``df``.

        A rule that raises is logged and skipped; the remaining rules still
        run.

        Args:
            df: Input data (not modified).

        Returns:
            The cleaned copy.
        """
        logger.info(f"开始数据清洗,原始数据形状: {df.shape}")

        result_df = df.copy()

        for i, rule in enumerate(self.cleaning_rules):
            try:
                logger.debug(f"执行清洗规则 {i+1}/{len(self.cleaning_rules)}: {rule['type']}")
                result_df = self._apply_rule(result_df, rule)
                logger.debug(f"规则执行完成,数据形状: {result_df.shape}")
            except Exception as e:
                # Best effort: one failing rule must not abort the pipeline.
                logger.error(f"清洗规则执行失败: {rule}, 错误: {e}")
                continue

        logger.info(f"数据清洗完成,最终数据形状: {result_df.shape}")
        return result_df

    def _apply_rule(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Dispatch one rule to its handler.

        Args:
            df: Data to process.
            rule: Rule configuration; ``rule['type']`` selects the handler.

        Returns:
            The processed data, or ``df`` unchanged for an unknown type.
        """
        handlers = {
            'remove_duplicates': self._remove_duplicates,
            'fill_na': self._fill_na,
            'remove_rows': self._remove_rows,
            'convert_type': self._convert_type,
            'strip_whitespace': self._strip_whitespace,
            'normalize_text': self._normalize_text,
            'validate_data': self._validate_data,
        }
        rule_type = rule.get('type')
        handler = handlers.get(rule_type)
        if handler is None:
            logger.warning(f"未知的清洗规则类型: {rule_type}")
            return df
        return handler(df, rule)

    @staticmethod
    def _is_text_column(series: pd.Series) -> bool:
        """Return True for object-dtype or string-dtype columns."""
        return bool(
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )

    @staticmethod
    def _map_text(series: pd.Series, func) -> pd.Series:
        """Apply ``func`` to the ``str`` cells of ``series`` only.

        The ``.str`` accessor turns every non-string cell of a mixed object
        column (numbers, for instance) into NaN; mapping cell by cell keeps
        those values intact.
        """
        mapped = series.map(lambda cell: func(cell) if isinstance(cell, str) else cell)
        if not pd.api.types.is_object_dtype(series):
            # Keep dedicated string dtypes instead of decaying to object.
            mapped = mapped.astype(series.dtype)
        return mapped

    @staticmethod
    def _fill_series(series: pd.Series, value: Any, method: Optional[str]) -> pd.Series:
        """Fill the nulls of one column using ``method`` or the constant ``value``."""
        # ffill()/bfill() replace fillna(method=...), which newer pandas
        # versions deprecate.
        if method == 'ffill':
            return series.ffill()
        if method == 'bfill':
            return series.bfill()
        if method == 'mean':
            return series.fillna(series.mean())
        if method == 'median':
            return series.fillna(series.median())
        return series.fillna(value)

    def _remove_duplicates(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Drop duplicate rows.

        Args:
            df: Data to process.
            rule: ``subset`` (columns used to detect duplicates) and ``keep``
                (which duplicate to keep, default ``'first'``).

        Returns:
            Data without the duplicate rows.
        """
        subset = rule.get('subset')
        keep = rule.get('keep', 'first')

        before_count = len(df)
        df_cleaned = df.drop_duplicates(subset=subset, keep=keep)
        after_count = len(df_cleaned)

        logger.info(f"移除重复项: {before_count - after_count} 行被移除")
        return df_cleaned

    def _fill_na(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Fill null values.

        Args:
            df: Data to process (listed columns are updated in place).
            rule: ``columns`` (a name, a list, or empty for all columns),
                ``value`` (constant fill, default 0) and ``method``
                (``'ffill'``, ``'bfill'``, ``'mean'`` or ``'median'``).
                Without ``columns`` only ``'ffill'``/``'bfill'`` are honoured;
                any other method falls back to the constant ``value``.

        Returns:
            The processed data.
        """
        columns = rule.get('columns')
        value = rule.get('value', 0)
        method = rule.get('method')

        if columns:
            if isinstance(columns, str):
                columns = [columns]

            for col in columns:
                if col in df.columns:
                    df[col] = self._fill_series(df[col], value, method)
                    logger.debug(f"填充列 {col} 的空值: {method or value}")
        else:
            if method == 'ffill':
                df = df.ffill()
            elif method == 'bfill':
                df = df.bfill()
            else:
                df = df.fillna(value)

            logger.debug(f"填充所有列的空值: {method or value}")

        return df

    def _remove_rows(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Filter rows by a query expression or by column values.

        Args:
            df: Data to process.
            rule: Either ``condition`` (an expression passed to
                ``DataFrame.query``; the rows for which it is true are the
                ones *returned*) or ``columns`` plus ``values`` (rows whose
                column value is in ``values`` are dropped).

        Returns:
            The filtered data, or ``df`` unchanged when the expression fails
            or the rule is incomplete.
        """
        condition = rule.get('condition')
        columns = rule.get('columns')
        values = rule.get('values')

        if condition:
            # NOTE(review): query() evaluates the expression; rule conditions
            # must come from trusted configuration only.
            try:
                before_count = len(df)
                df_filtered = df.query(condition)
                after_count = len(df_filtered)
                logger.info(f"条件过滤: {condition}, 移除了 {before_count - after_count} 行")
                return df_filtered
            except Exception as e:
                logger.error(f"条件表达式执行失败: {condition}, 错误: {e}")
                return df

        # "is not None" rather than truthiness, so falsy scalars such as 0 or
        # '' can be used as the value to remove.
        if columns and values is not None:
            if isinstance(columns, str):
                columns = [columns]
            if not isinstance(values, list):
                values = [values]

            df_filtered = df.copy()
            for col in columns:
                if col in df_filtered.columns:
                    mask = ~df_filtered[col].isin(values)
                    df_filtered = df_filtered[mask]
                    logger.debug(f"列 {col} 过滤值 {values}")

            return df_filtered

        logger.warning("移除行规则缺少条件或列配置")
        return df

    def _convert_type(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Convert column dtypes.

        Args:
            df: Data to process (updated in place).
            rule: ``columns`` (a name or a list), ``target_type`` (``'int'``,
                ``'float'``, ``'datetime'``, ``'string'`` or any dtype
                accepted by ``astype``; default ``'float'``) and ``errors``
                (passed to the pandas converters, default ``'coerce'``).

        Returns:
            The processed data. A column that fails to convert is logged and
            left as it was.
        """
        columns = rule.get('columns')
        target_type = rule.get('target_type', 'float')
        errors = rule.get('errors', 'coerce')

        if columns is None:
            logger.warning("类型转换规则缺少列配置")
            return df
        if isinstance(columns, str):
            columns = [columns]

        for col in columns:
            if col in df.columns:
                try:
                    if target_type == 'int':
                        # Nullable Int64 so coerced failures can stay null.
                        df[col] = pd.to_numeric(df[col], errors=errors).astype('Int64')
                    elif target_type == 'float':
                        df[col] = pd.to_numeric(df[col], errors=errors)
                    elif target_type == 'datetime':
                        df[col] = pd.to_datetime(df[col], errors=errors)
                    elif target_type == 'string':
                        df[col] = df[col].astype(str)
                    else:
                        df[col] = df[col].astype(target_type)

                    logger.debug(f"列 {col} 类型转换: {target_type}")
                except Exception as e:
                    logger.error(f"列 {col} 类型转换失败: {e}")

        return df

    def _strip_whitespace(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Strip leading and trailing whitespace from text cells.

        Args:
            df: Data to process (updated in place).
            rule: ``columns`` (a name, a list, or empty for every text
                column). Non-string cells are left untouched.

        Returns:
            The processed data.
        """
        columns = rule.get('columns')

        if columns:
            if isinstance(columns, str):
                columns = [columns]

            for col in columns:
                if col in df.columns and self._is_text_column(df[col]):
                    df[col] = self._map_text(df[col], str.strip)
                    logger.debug(f"列 {col} 去除空白字符")
        else:
            text_columns = [col for col in df.columns if self._is_text_column(df[col])]
            for col in text_columns:
                df[col] = self._map_text(df[col], str.strip)

            logger.debug(f"所有文本列去除空白字符: {list(text_columns)}")

        return df

    def _normalize_text(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Normalise the text of string cells.

        Args:
            df: Data to process (updated in place).
            rule: ``columns`` (a name, a list, or empty for every text
                column), ``lowercase`` / ``uppercase`` (lowercase wins when
                both are set) and ``replace_map`` (``{old: new}`` pairs,
                applied as literal, non-regex replacements).

        Returns:
            The processed data.
        """
        columns = rule.get('columns')
        lowercase = rule.get('lowercase', False)
        uppercase = rule.get('uppercase', False)
        replace_map = rule.get('replace_map', {}) or {}

        if isinstance(columns, str):
            columns = [columns]

        if columns:
            target_columns = columns
        else:
            target_columns = [col for col in df.columns if self._is_text_column(df[col])]

        for col in target_columns:
            if col in df.columns and self._is_text_column(df[col]):
                if lowercase:
                    df[col] = self._map_text(df[col], str.lower)
                elif uppercase:
                    df[col] = self._map_text(df[col], str.upper)

                for old, new in replace_map.items():
                    # Bind the pair as defaults to avoid late-binding closures.
                    df[col] = self._map_text(
                        df[col], lambda text, old=old, new=new: text.replace(old, new)
                    )

                logger.debug(f"列 {col} 文本标准化完成")

        return df

    def _validate_data(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Check columns for nulls and out-of-range values (report only).

        Args:
            df: Data to check; it is never modified.
            rule: ``columns`` (a name or a list), ``required`` (report null
                counts) and ``min_value`` / ``max_value`` (report numeric
                values outside the range).

        Returns:
            ``df`` unchanged; findings are only logged.
        """
        columns = rule.get('columns')
        min_value = rule.get('min_value')
        max_value = rule.get('max_value')
        required = rule.get('required', False)

        if columns is None:
            logger.warning("数据验证规则缺少列配置")
            return df
        if isinstance(columns, str):
            columns = [columns]

        validation_results = []

        for col in columns:
            if col not in df.columns:
                continue

            if required:
                null_count = df[col].isnull().sum()
                if null_count > 0:
                    validation_results.append(f"{col}: {null_count} 个空值")

            has_range = min_value is not None or max_value is not None
            if has_range and pd.api.types.is_numeric_dtype(df[col]):
                invalid_mask = pd.Series(False, index=df.index)
                if min_value is not None:
                    invalid_mask |= df[col] < min_value
                if max_value is not None:
                    invalid_mask |= df[col] > max_value

                invalid_count = invalid_mask.sum()
                if invalid_count > 0:
                    validation_results.append(f"{col}: {invalid_count} 个值超出范围")

        if validation_results:
            logger.warning(f"数据验证发现问题: {', '.join(validation_results)}")
        else:
            logger.debug("数据验证通过")

        return df

    # Fluent shortcuts: each queues one rule and returns self for chaining.
    def remove_duplicates(self, subset: Optional[List[str]] = None, keep: str = 'first'):
        """Queue a ``remove_duplicates`` rule and return ``self``."""
        self.add_rule('remove_duplicates', subset=subset, keep=keep)
        return self

    def fill_na(self, columns: Optional[Union[str, List[str]]] = None,
                value: Any = 0, method: Optional[str] = None):
        """Queue a ``fill_na`` rule and return ``self``."""
        self.add_rule('fill_na', columns=columns, value=value, method=method)
        return self

    def remove_rows(self, condition: Optional[str] = None,
                    columns: Optional[Union[str, List[str]]] = None,
                    values: Optional[Any] = None):
        """Queue a ``remove_rows`` rule and return ``self``."""
        self.add_rule('remove_rows', condition=condition, columns=columns, values=values)
        return self

    def convert_type(self, columns: Union[str, List[str]], target_type: str, errors: str = 'coerce'):
        """Queue a ``convert_type`` rule and return ``self``."""
        self.add_rule('convert_type', columns=columns, target_type=target_type, errors=errors)
        return self

    def strip_whitespace(self, columns: Optional[Union[str, List[str]]] = None):
        """Queue a ``strip_whitespace`` rule and return ``self``."""
        self.add_rule('strip_whitespace', columns=columns)
        return self

    def normalize_text(self, columns: Optional[Union[str, List[str]]] = None,
                       lowercase: bool = False, uppercase: bool = False,
                       replace_map: Optional[Dict[str, str]] = None):
        """Queue a ``normalize_text`` rule and return ``self``."""
        self.add_rule('normalize_text', columns=columns, lowercase=lowercase,
                      uppercase=uppercase, replace_map=replace_map or {})
        return self

    def validate_data(self, columns: Union[str, List[str]],
                      min_value: Optional[float] = None,
                      max_value: Optional[float] = None,
                      required: bool = False):
        """Queue a ``validate_data`` rule and return ``self``."""
        self.add_rule('validate_data', columns=columns, min_value=min_value,
                      max_value=max_value, required=required)
        return self
|
||||
@@ -0,0 +1,150 @@
|
||||
import re
|
||||
import pandas as pd
|
||||
from typing import List, Dict, Any, Optional
|
||||
|
||||
def _split_quantity_unit(df: pd.DataFrame, source: str, dictionary: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
    """Split a combined "number + unit" column into ``quantity`` and ``unit``.

    Cells such as ``"2箱"`` give the number and the unit (one of 箱/件/提/盒/瓶,
    translated through ``unit_synonyms``). Cells that are plain numbers get
    the default unit. Missing or unparseable cells give ``0.0`` with the
    default unit.

    Args:
        df: Data to process (updated in place).
        source: Name of the column holding the raw quantity text.
        dictionary: Optional settings; reads ``default_unit`` and
            ``unit_synonyms``.

    Returns:
        ``df`` with ``quantity`` and ``unit`` columns written, or ``df``
        unchanged when ``source`` is absent.
    """
    if source not in df.columns:
        return df

    settings = dictionary or {}
    unit_synonyms = settings.get("unit_synonyms", {})
    default_unit = settings.get("default_unit", "")
    fallback_unit = unit_synonyms.get(default_unit, default_unit)

    # Blank out missing cells explicitly: astype(str) alone turns NaN into
    # the text "nan", which float() would then accept as a NaN quantity.
    raw = df[source]
    texts = raw.astype(str).mask(raw.isna(), "")

    nums = []
    units = []
    for text in texts:
        m = re.search(r"(\d+(?:\.\d+)?)(箱|件|提|盒|瓶)", text)
        if m:
            nums.append(float(m.group(1)))
            units.append(unit_synonyms.get(m.group(2), m.group(2)))
            continue
        try:
            nums.append(float(text))
        except ValueError:
            nums.append(0.0)
        units.append(fallback_unit)

    df["quantity"] = nums
    df["unit"] = units
    return df
|
||||
|
||||
def _extract_spec_from_name(df: pd.DataFrame, source: str, dictionary: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
    """Derive ``specification`` and ``package_quantity`` from product names.

    Each name (with the configured ``ignore_words`` removed) is tried against:

    1. the configured ``name_patterns`` - a pattern with at least two groups
       wins; the cleaned name becomes the spec and the last group, if it is
       an integer, the pack size;
    2. ``<volume><ml|l|升|毫升>*<count>`` - spec ``"<volume><unit>*<count>"``;
    3. ``<n>*<count>`` - spec ``"1*<count>"``;
    4. a loose "2-3 digits ... 1-3 digits" fallback - spec ``"1*<second>"``.
       NOTE(review): this also fires inside one number ("500" gives "1*0");
       confirm whether that is intended.

    Args:
        df: Data to process (updated in place).
        source: Name of the column holding product names.
        dictionary: Optional settings; reads ``ignore_words`` and
            ``name_patterns``.

    Returns:
        ``df`` with ``package_quantity`` written and ``specification`` added
        when that column did not exist yet (an existing one is kept as is).
        ``df`` is returned unchanged when ``source`` is absent.
    """
    if source not in df.columns:
        return df

    settings = dictionary or {}
    ignore_words = settings.get("ignore_words", [])
    name_patterns = settings.get("name_patterns", [])

    def parse(text):
        """Return ``(spec, pack_quantity)`` for one cleaned product name."""
        for pat in name_patterns:
            try:
                m = re.search(pat, text)
            except (re.error, TypeError):
                continue  # ignore an invalid configured pattern
            if m and len(m.groups()) >= 2:
                try:
                    qty = int(m.group(len(m.groups())))
                except (TypeError, ValueError):
                    qty = None
                return text, qty

        m = re.search(r"(\d+(?:\.\d+)?)(ml|l|升|毫升)[*×xX](\d+)", text, re.IGNORECASE)
        if m:
            return f"{m.group(1)}{m.group(2)}*{m.group(3)}", int(m.group(3))

        m2 = re.search(r"(\d+)[*×xX](\d+)", text)
        if m2:
            return f"1*{m2.group(2)}", int(m2.group(2))

        m3 = re.search(r"(\d{2,3})\D*(\d{1,3})\D*", text)
        if m3:
            return f"1*{m3.group(2)}", int(m3.group(2))

        return "", None

    # Blank out missing names explicitly; astype(str) alone yields "nan".
    raw = df[source]
    names = raw.astype(str).mask(raw.isna(), "")

    specs = []
    packs = []
    for text in names:
        for word in ignore_words:
            text = text.replace(word, "")
        spec, pack = parse(text)
        specs.append(spec)
        packs.append(pack)

    if "specification" not in df.columns:
        # Assign positionally: wrapping the list in a fresh Series would align
        # on a 0..n-1 index and give NaN for any other df.index.
        df["specification"] = specs
    df["package_quantity"] = packs
    return df
|
||||
|
||||
def _normalize_unit(df: pd.DataFrame, target: str, unit_map: Dict[str, str], dictionary: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
    """Normalise unit names and convert pack quantities to single items.

    Units are first translated through ``unit_map``. If a ``quantity`` column
    exists, rows whose unit is a pack unit (件/箱/提/盒) have their quantity
    multiplied by the row's ``package_quantity`` or, when that is missing or
    zero, by ``pack_multipliers[unit]`` / ``default_package_quantity``. All
    pack units are then rewritten to ``"瓶"``.

    Args:
        df: Data to process (updated in place).
        target: Name of the unit column.
        unit_map: Mapping from raw unit text to normalised unit.
        dictionary: Optional settings; reads ``pack_multipliers`` and
            ``default_package_quantity`` (default 1).

    Returns:
        The processed ``df``. If a quantity or multiplier cannot be converted
        to a number, the quantity conversion and the rewrite to "瓶" are both
        skipped for the whole frame.
    """
    if target not in df.columns:
        return df

    settings = dictionary or {}
    pack_multipliers = settings.get("pack_multipliers", {})
    default_pq = settings.get("default_package_quantity", 1)
    pack_units = ("件", "箱", "提", "盒")

    df[target] = df[target].astype(str).apply(lambda u: unit_map.get(u, u))

    if "quantity" not in df.columns or df.empty:
        return df

    def convert_qty(row):
        unit = row.get(target)
        qty = row.get("quantity")
        pack = row.get("package_quantity")
        if unit not in pack_units:
            return qty
        # A missing pack size arrives as None or NaN. NaN is truthy, so a bare
        # "pack or fallback" would keep NaN and never use the fallback.
        if pd.notna(pack) and pack:
            mult = pack
        else:
            mult = pack_multipliers.get(unit, default_pq)
        if pd.notna(qty) and pd.notna(mult) and float(mult) > 0:
            return float(qty) * float(mult)
        return qty

    try:
        converted = df.apply(convert_qty, axis=1)
    except (TypeError, ValueError):
        # Best effort: leave quantities and units untouched on bad values.
        return df

    df["quantity"] = converted
    df[target] = df[target].apply(lambda u: "瓶" if u in pack_units else u)
    return df
|
||||
|
||||
def _compute_quantity_from_total(df: pd.DataFrame) -> pd.DataFrame:
    """Back-fill missing quantities as ``total_price / unit_price``.

    A row is recomputed only when its quantity is missing or not positive
    while both its unit price and its total price are positive. The result is
    rounded to 6 decimals.

    Args:
        df: Data to process (updated in place).

    Returns:
        The processed ``df``; unchanged unless ``quantity``, ``unit_price``
        and ``total_price`` are all present.
    """
    required = ("quantity", "unit_price", "total_price")
    if any(col not in df.columns for col in required):
        # Without a total there is nothing to derive the quantity from.
        return df

    qty = pd.to_numeric(df["quantity"], errors="coerce").fillna(0)
    up = pd.to_numeric(df["unit_price"], errors="coerce").fillna(0)
    tp = pd.to_numeric(df["total_price"], errors="coerce").fillna(0)

    need = (qty <= 0) & (up > 0) & (tp > 0)
    if need.any():
        if pd.api.types.is_integer_dtype(df["quantity"]):
            # Fractional results do not fit an integer column.
            df["quantity"] = df["quantity"].astype("float64")
        df.loc[need, "quantity"] = (tp[need] / up[need]).round(6)
    return df
|
||||
|
||||
def _fill_missing(df: pd.DataFrame, fills: Dict[str, Any]) -> pd.DataFrame:
    """Fill nulls in existing columns and create absent ones.

    Args:
        df: Data to process (updated in place).
        fills: Mapping of column name to default value. An existing column
            has its nulls replaced; a missing column is created holding the
            default in every row.

    Returns:
        The processed ``df``.
    """
    for column, default in fills.items():
        df[column] = df[column].fillna(default) if column in df.columns else default
    return df
|
||||
|
||||
def _mark_gift(df: pd.DataFrame) -> pd.DataFrame:
    """Flag gift rows in a boolean ``is_gift`` column.

    A row is a gift when its total price or its unit price is zero, missing
    or not numeric, or when its name contains "赠品" or is exactly "o"/"O".

    Args:
        df: Data to process (updated in place).

    Returns:
        ``df`` with the ``is_gift`` column written.
    """
    # Build the mask on df's own index so the OR operations below line up row
    # by row even when the frame does not carry a default 0..n-1 index.
    flags = pd.Series(False, index=df.index)

    for price_column in ("total_price", "unit_price"):
        prices = df.get(price_column)
        if prices is not None:
            flags = flags | (pd.to_numeric(prices, errors="coerce").fillna(0) == 0)

    if "name" in df.columns:
        flags = flags | df["name"].astype(str).str.contains(r"赠品|^o$|^O$", regex=True)

    df["is_gift"] = flags.to_numpy(dtype=bool)
    return df
|
||||
|
||||
def apply_rules(df: pd.DataFrame, rules: List[Dict[str, Any]], dictionary: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
    """Apply a list of transformation rules to a copy of ``df``.

    Args:
        df: Input data (not modified).
        rules: Rule dicts, applied in order; ``rule["type"]`` selects the
            transformation and rules of an unknown type are ignored.
            ``None`` is treated as no rules.
        dictionary: Optional shared settings forwarded to the rules that
            accept them.

    Returns:
        The transformed copy.
    """
    # (type name, handler) pairs. Handlers resolve the helper functions
    # lazily, only when a rule of that type is actually applied.
    handlers = (
        ("split_quantity_unit",
         lambda frame, rule: _split_quantity_unit(frame, rule.get("source", "quantity"), dictionary)),
        ("extract_spec_from_name",
         lambda frame, rule: _extract_spec_from_name(frame, rule.get("source", "name"), dictionary)),
        ("normalize_unit",
         lambda frame, rule: _normalize_unit(frame, rule.get("target", "unit"), rule.get("map", {}), dictionary)),
        ("compute_quantity_from_total",
         lambda frame, rule: _compute_quantity_from_total(frame)),
        ("fill_missing",
         lambda frame, rule: _fill_missing(frame, rule.get("fills", {}))),
        ("mark_gift",
         lambda frame, rule: _mark_gift(frame)),
    )

    out = df.copy()
    for rule in rules or []:
        rule_type = rule.get("type")
        handler = next((func for name, func in handlers if rule_type == name), None)
        if handler is not None:
            out = handler(out, rule)
    return out
|
||||
@@ -0,0 +1,5 @@
|
||||
"""
|
||||
OCR订单处理系统 - OCR核心模块
|
||||
---------------------------
|
||||
提供OCR识别相关功能,包括图片预处理、文字识别和表格识别。
|
||||
"""
|
||||
@@ -0,0 +1,368 @@
|
||||
"""
|
||||
百度OCR客户端模块
|
||||
---------------
|
||||
提供百度OCR API的访问和调用功能。
|
||||
"""
|
||||
|
||||
import time
|
||||
import base64
|
||||
import requests
|
||||
from typing import Dict, Optional, Union
|
||||
|
||||
from ..utils.log_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
# Token 过期相关常量
|
||||
_DEFAULT_TOKEN_LIFETIME = 30 * 24 * 3600 # 30天(秒)
|
||||
_TOKEN_EARLY_EXPIRY = 3600 # 提前1小时刷新(秒)
|
||||
|
||||
class TokenManager:
    """Obtains and caches the Baidu API access token."""

    def __init__(self, api_key: str, secret_key: str, max_retries: int = 3, retry_delay: int = 2, token_url: Optional[str] = None):
        """Initialise the token manager.

        Args:
            api_key: Baidu API key.
            secret_key: Baidu secret key.
            max_retries: Maximum number of token requests per refresh.
            retry_delay: Base delay between retries, in seconds.
            token_url: Token endpoint; the public Baidu OAuth endpoint is
                used when omitted or empty.
        """
        self.api_key = api_key
        self.secret_key = secret_key
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        self.token_url = token_url or 'https://aip.baidubce.com/oauth/2.0/token'
        self.access_token = None
        self.token_expiry = 0

    def get_token(self) -> Optional[str]:
        """Return a usable access token, refreshing it when necessary.

        Returns:
            The access token, or ``None`` if it could not be obtained.
        """
        if self.is_token_valid():
            return self.access_token

        return self.refresh_token()

    def is_token_valid(self) -> bool:
        """Tell whether the cached token can still be used.

        Returns:
            True when a token is cached and its recorded expiry is more than
            60 seconds away.
        """
        return (
            self.access_token is not None and
            self.token_expiry > time.time() + 60  # one-minute safety margin
        )

    def refresh_token(self) -> Optional[str]:
        """Request a new access token from the token endpoint.

        Up to ``max_retries`` requests are made, waiting
        ``retry_delay * 2 ** attempt`` seconds between them - the same
        exponential backoff used for OCR requests.

        Returns:
            The new access token, or ``None`` if every attempt failed.
        """
        url = self.token_url
        params = {
            "grant_type": "client_credentials",
            "client_id": self.api_key,
            "client_secret": self.secret_key
        }

        for attempt in range(self.max_retries):
            try:
                response = requests.post(url, params=params, timeout=10)
                if response.status_code == 200:
                    result = response.json()
                    if "access_token" in result:
                        self.access_token = result["access_token"]
                        # Fall back to the default lifetime when the reported
                        # one is not numeric; otherwise a fresh token would
                        # keep an expiry of 0 and never count as valid.
                        try:
                            lifetime = float(result.get("expires_in", _DEFAULT_TOKEN_LIFETIME))
                        except (TypeError, ValueError):
                            lifetime = _DEFAULT_TOKEN_LIFETIME
                        # Expire early so a token is never used at the very
                        # end of its lifetime.
                        self.token_expiry = time.time() + lifetime - _TOKEN_EARLY_EXPIRY
                        logger.info("成功获取访问令牌")
                        return self.access_token

                logger.warning(f"获取访问令牌失败 (尝试 {attempt+1}/{self.max_retries}): {response.text}")

            except Exception as e:
                logger.warning(f"获取访问令牌时发生错误 (尝试 {attempt+1}/{self.max_retries}): {e}")

            # Wait before the next attempt, but not after the last one.
            if attempt < self.max_retries - 1:
                time.sleep(self.retry_delay * (2 ** attempt))  # exponential backoff

        logger.error("无法获取访问令牌")
        return None
|
||||
|
||||
class BaiduOCRClient:
|
||||
"""
|
||||
百度OCR API客户端
|
||||
"""
|
||||
|
||||
def __init__(self, config):
    """Initialise the Baidu OCR client from configuration.

    Reads ``timeout``, ``api_key``, ``secret_key``, ``max_retries``,
    ``retry_delay``, ``api_url`` and ``token_url`` from the ``API`` section
    and creates the token manager. Any error raised while doing so is logged
    rather than propagated.

    Args:
        config: Configuration object queried as
            ``config.get(section, option, fallback=...)`` and, where
            supported, ``config.getint(...)``.
    """
    self.config = config

    def read_int(option, default):
        # Prefer getint; use get + int() for config objects whose getint is
        # missing or does not accept the fallback keyword.
        try:
            return config.getint('API', option, fallback=default)
        except (TypeError, AttributeError):
            return int(config.get('API', option, fallback=str(default)))

    try:
        # The timeout may come back as text; convert it after storing it.
        self.timeout = config.get('API', 'timeout', fallback=30)
        if isinstance(self.timeout, str):
            self.timeout = int(self.timeout)

        self.api_key = config.get('API', 'api_key', fallback='')
        self.secret_key = config.get('API', 'secret_key', fallback='')

        self.max_retries = read_int('max_retries', 3)
        self.retry_delay = read_int('retry_delay', 2)

        self.api_url = config.get('API', 'api_url', fallback='https://aip.baidubce.com/rest/2.0/ocr/v1/table')

        # Token manager shares the retry settings of the client.
        token_url = config.get('API', 'token_url', fallback='https://aip.baidubce.com/oauth/2.0/token')
        self.token_manager = TokenManager(
            self.api_key,
            self.secret_key,
            self.max_retries,
            self.retry_delay,
            token_url=token_url
        )

        # Warn early when the credentials are not configured.
        if not (self.api_key and self.secret_key):
            logger.warning("API密钥未设置,请在配置文件中设置API密钥")
    except Exception as e:
        logger.error(f"初始化失败: {e}")
|
||||
|
||||
def read_image(self, image_path: str) -> Optional[bytes]:
    """Read an image file into memory.

    Args:
        image_path: Path of the image file.

    Returns:
        The raw file content, or ``None`` (after logging the error) when the
        file cannot be read.
    """
    try:
        with open(image_path, 'rb') as image_file:
            content = image_file.read()
    except Exception as e:
        logger.error(f"读取图片文件失败: {image_path}, 错误: {e}")
        return None
    return content
|
||||
|
||||
def recognize_table(self, image_data: Union[str, bytes]) -> Optional[Dict]:
    """Send an image to the table-recognition API.

    The request is tried up to ``self.max_retries`` times. HTTP failures and
    exceptions are retried with exponential backoff. When the API reports an
    authorisation error (code 110 or 111), the access token is refreshed and
    the request is repeated with the new token instead of giving up. Any
    other API error ends the call.

    Args:
        image_data: Path of an image file, or the image content as bytes.

    Returns:
        The decoded JSON response, or ``None`` when no token is available,
        the image cannot be read, the API reports an error, or every attempt
        fails.
    """
    access_token = self.token_manager.get_token()
    if not access_token:
        logger.error("无法获取访问令牌,无法进行表格识别")
        return None

    # A string is treated as a file path.
    if isinstance(image_data, str):
        image_data = self.read_image(image_data)
        if image_data is None:
            return None

    image_base64 = base64.b64encode(image_data).decode('utf-8')

    # Request a synchronous answer that carries the Excel data directly.
    payload = {
        'image': image_base64,
        'is_sync': 'true',
        'request_type': 'excel',
        'return_excel': 'true'
    }

    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'application/json'
    }

    for attempt in range(self.max_retries):
        # Built inside the loop so a refreshed token is picked up.
        url = f"{self.api_url}?access_token={access_token}"
        try:
            response = requests.post(
                url,
                data=payload,
                headers=headers,
                timeout=self.timeout
            )

            if response.status_code == 200:
                result = response.json()
                logger.debug(f"百度OCR API返回结果: {result}")

                if 'error_code' not in result:
                    # Returned as is; no particular structure is enforced.
                    return result

                error_msg = result.get('error_msg', '未知错误')
                logger.error(f"百度OCR API错误: {error_msg}")

                if result.get('error_code') not in [110, 111]:
                    return None

                # Authorisation error: fetch a new token and try again.
                logger.info("尝试刷新访问令牌...")
                access_token = self.token_manager.refresh_token()
                if not access_token:
                    return None
                continue
            else:
                logger.warning(f"表格识别请求失败 (尝试 {attempt+1}/{self.max_retries}): {response.text}")

        except Exception as e:
            logger.warning(f"表格识别时发生错误 (尝试 {attempt+1}/{self.max_retries}): {e}")

        # Wait before the next attempt, but not after the last one.
        if attempt < self.max_retries - 1:
            wait_time = self.retry_delay * (2 ** attempt)  # exponential backoff
            logger.info(f"将在 {wait_time} 秒后重试...")
            time.sleep(wait_time)

    logger.error("表格识别失败")
    return None
|
||||
|
||||
def get_excel_result(self, request_id_or_result: Union[str, Dict]) -> Optional[bytes]:
    """Fetch the Excel workbook for a table-recognition request.

    Accepts either a request id string or the dict returned by the
    recognition call. When the dict already carries base64 Excel data under
    ``result.result_data`` it is decoded and returned without any HTTP call.
    Otherwise a request id is taken from ``result.request_id`` or, failing
    that, from the top-level ``request_id`` key, and the result endpoint is
    queried with retries.

    Args:
        request_id_or_result: Request id, or the full recognition result.

    Returns:
        The decoded Excel bytes, or None when no access token is available,
        no string request id can be found, the API reports an error, the
        payload is empty, or every retry fails.
    """
    access_token = self.token_manager.get_token()
    if not access_token:
        logger.error("无法获取访问令牌,无法获取Excel结果")
        return None

    request_id = request_id_or_result
    if isinstance(request_id_or_result, dict):
        nested = request_id_or_result.get('result')
        # Only a dict-shaped ``result`` can be inspected by key; a None value
        # would raise TypeError on the membership tests below.
        if isinstance(nested, dict):
            excel_content = nested.get('result_data')
            if excel_content:
                try:
                    return base64.b64decode(excel_content)
                except (ValueError, TypeError) as e:
                    # Undecodable inline data: fall through and try the
                    # request-id route instead.
                    logger.error(f"解析Excel数据失败: {e}")

            if 'request_id' in nested:
                request_id = nested['request_id']
                logger.debug(f"从result子对象中提取request_id: {request_id}")
            elif nested.get('tables_result'):
                # Table content was returned inline and there is no id to
                # query with.
                logger.info("检测到API直接返回了表格内容,但没有request_id")
                return None

        # Consult the top-level id whenever the nested lookup produced none,
        # not only when the ``result`` key is absent.
        if request_id is request_id_or_result and 'request_id' in request_id_or_result:
            request_id = request_id_or_result['request_id']
            logger.debug(f"从顶层对象中提取request_id: {request_id}")

    if not isinstance(request_id, str):
        logger.error(f"无法从结果中提取有效的request_id: {request_id_or_result}")
        return None

    base_url = self.config.get(
        'API',
        'form_ocr_url',
        fallback='https://aip.baidubce.com/rest/2.0/solution/v1/form_ocr/get_request_result',
    )
    url = f"{base_url}?access_token={access_token}"

    payload = {
        'request_id': request_id,
        'result_type': 'excel',
    }
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'application/json',
    }

    for attempt in range(self.max_retries):
        try:
            response = requests.post(
                url,
                data=payload,
                headers=headers,
                timeout=self.timeout,
            )

            if response.status_code == 200:
                try:
                    result = response.json()
                    logger.debug(f"获取Excel结果返回: {result}")

                    # ret_code 3 is treated as "still processing": wait and
                    # use up one retry.
                    if result.get('result', {}).get('ret_code') == 3:
                        logger.info(f"Excel结果正在处理中,等待后重试 (尝试 {attempt+1}/{self.max_retries})")
                        time.sleep(2)
                        continue

                    if 'error_code' in result or result.get('result', {}).get('ret_code') != 0:
                        error_msg = result.get('error_msg') or result.get('result', {}).get('ret_msg', '未知错误')
                        logger.error(f"获取Excel结果失败: {error_msg}")
                        return None

                    excel_content = result.get('result', {}).get('result_data')
                    if excel_content:
                        return base64.b64decode(excel_content)

                    logger.error("Excel结果为空")
                    return None

                except Exception as e:
                    # A malformed body is not retried.
                    logger.error(f"解析Excel结果时出错: {e}")
                    return None

            else:
                logger.warning(f"获取Excel结果请求失败 (尝试 {attempt+1}/{self.max_retries}): {response.text}")

        except Exception as e:
            logger.warning(f"获取Excel结果时发生错误 (尝试 {attempt+1}/{self.max_retries}): {e}")

        # Linear back-off between attempts; no wait after the last one.
        if attempt < self.max_retries - 1:
            time.sleep(self.retry_delay * (attempt + 1))

    logger.error("获取Excel结果失败")
    return None
|
||||
@@ -0,0 +1,389 @@
|
||||
"""
|
||||
表格OCR处理模块
|
||||
-------------
|
||||
处理图片并提取表格内容,保存为Excel文件。
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
import base64
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from typing import Dict, List, Optional, Tuple, Callable
|
||||
|
||||
from ..utils.log_utils import get_logger
|
||||
from ..utils.file_utils import (
|
||||
ensure_dir,
|
||||
get_file_extension,
|
||||
get_files_by_extensions,
|
||||
generate_timestamp_filename,
|
||||
is_file_size_valid,
|
||||
load_json,
|
||||
save_json
|
||||
)
|
||||
from .baidu_ocr import BaiduOCRClient
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
class ProcessedRecordManager:
    """Keeps a record of which input files have already been processed.

    The record is a mapping of input file path to output file path. It is
    read through ``load_json`` on construction and written back through
    ``save_json`` every time an entry is added.
    """

    def __init__(self, record_file: str):
        """Load the existing record.

        Args:
            record_file: Path of the record file.
        """
        self.record_file = record_file
        self.processed_files = self._load_record()

    def _load_record(self) -> Dict[str, str]:
        """Read the record file, using an empty dict as the default.

        Returns:
            Mapping of input file path to output file path.
        """
        records = load_json(self.record_file, {})
        return records

    def save_record(self) -> None:
        """Write the current mapping to the record file."""
        save_json(self.processed_files, self.record_file)

    def is_processed(self, image_file: str) -> bool:
        """Tell whether an image already has an entry in the record.

        Args:
            image_file: Image file path.

        Returns:
            True when the path is a key of the record.
        """
        return image_file in self.processed_files

    def mark_as_processed(self, image_file: str, output_file: str) -> None:
        """Record an image as processed and save the record immediately.

        Args:
            image_file: Image file path.
            output_file: Output file path produced for that image.
        """
        self.processed_files.update({image_file: output_file})
        self.save_record()

    def get_output_file(self, image_file: str) -> Optional[str]:
        """Look up the output file recorded for an image.

        Args:
            image_file: Image file path.

        Returns:
            The recorded output path, or None when the image has no entry.
        """
        try:
            return self.processed_files[image_file]
        except KeyError:
            return None

    def get_unprocessed_files(self, files: List[str]) -> List[str]:
        """Filter a list of files down to those without a record entry.

        Args:
            files: Candidate file paths.

        Returns:
            The paths, in their original order, for which ``is_processed``
            is false.
        """
        pending: List[str] = []
        for candidate in files:
            if self.is_processed(candidate):
                continue
            pending.append(candidate)
        return pending
|
||||
|
||||
class OCRProcessor:
    """Coordinates table OCR for image files and stores the Excel results.

    Images are read from the configured input folder, sent to the OCR client
    and the returned workbook is written to the output folder. A
    ``ProcessedRecordManager`` remembers which images were already handled.
    """

    def __init__(self, config):
        """Read paths and file settings from ``config`` and build collaborators.

        Args:
            config: Configuration object; it is read with
                ``get(section, option, fallback=...)``.

        Raises:
            Exception: Any error raised during initialisation is logged and
                re-raised unchanged.
        """
        self.config = config

        try:
            self.input_folder = config.get('Paths', 'input_folder', fallback='data/input')
            self.output_folder = config.get('Paths', 'output_folder', fallback='data/output')
            self.temp_folder = config.get('Paths', 'temp_folder', fallback='data/temp')

            for folder in (self.input_folder, self.output_folder, self.temp_folder):
                os.makedirs(folder, exist_ok=True)

            allowed_extensions_str = config.get('File', 'allowed_extensions', fallback='.jpg,.jpeg,.png,.bmp')
            self.file_types = [ext.strip() for ext in allowed_extensions_str.split(',') if ext.strip()]
            if not self.file_types:
                # The option was present but contained no usable entries.
                self.file_types = ['.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tif', '.tiff']

            self.ocr_client = BaiduOCRClient(self.config)

            logger.info(f"使用输入目录: {os.path.abspath(self.input_folder)}")
            logger.info(f"使用输出目录: {os.path.abspath(self.output_folder)}")
            logger.info(f"使用临时目录: {os.path.abspath(self.temp_folder)}")
            logger.info(f"允许的文件类型: {self.file_types}")

            self.processed_files_json = os.path.join(self.output_folder, 'processed_files.json')
            self.record_manager = ProcessedRecordManager(self.processed_files_json)

            # Kept as a public attribute; it is a snapshot taken at start-up.
            self.processed_files = self._load_processed_files()

            logger.info(f"初始化OCRProcessor完成:输入目录={self.input_folder}, 输出目录={self.output_folder}")
        except Exception as e:
            logger.error(f"初始化OCRProcessor失败: {e}")
            raise

    def _load_processed_files(self) -> Dict[str, str]:
        """Load the processed-file record.

        Returns:
            Mapping of input file path to output file path.
        """
        return load_json(self.processed_files_json, {})

    def _skip_existing(self) -> bool:
        """Read ``Performance.skip_existing``; any read failure yields True."""
        try:
            return self.config.getboolean('Performance', 'skip_existing', fallback=True)
        except Exception:
            # Best-effort: an unreadable option keeps the default.
            return True

    def _performance_int(self, option: str, default: int) -> int:
        """Read an integer from the ``Performance`` section, or ``default``."""
        try:
            return self.config.getint('Performance', option, fallback=default)
        except Exception:
            # Best-effort: an unreadable option keeps the default.
            return default

    @staticmethod
    def _find_excel_payload(ocr_result) -> Tuple[Optional[str], Optional[str]]:
        """Locate base64 Excel data inside an OCR response.

        The fields are probed in a fixed order and the first key that is
        present wins, even if its value is empty.

        Returns:
            ``(payload, debug_message)``; both are None when no known field
            is present.
        """
        if 'excel_file' in ocr_result:
            return ocr_result['excel_file'], "从excel_file字段获取Excel数据"
        if 'result' not in ocr_result:
            return None, None

        result = ocr_result['result']
        if 'result_data' in result:
            return result['result_data'], "从result.result_data字段获取Excel数据"
        if 'excel_file' in result:
            return result['excel_file'], "从result.excel_file字段获取Excel数据"
        if 'tables_result' in result and result['tables_result']:
            for table in result['tables_result']:
                if 'excel_file' in table:
                    return table['excel_file'], "从tables_result中获取Excel数据"
        return None, None

    @staticmethod
    def _write_excel(output_file: str, excel_data: bytes) -> None:
        """Write workbook bytes to ``output_file``, creating its directory."""
        parent = os.path.dirname(output_file)
        # os.makedirs('') raises, so only create a directory when there is one.
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(output_file, 'wb') as f:
            f.write(excel_data)

    @staticmethod
    def _notify_progress(progress_cb: Optional[Callable[[int], None]], percent: int) -> None:
        """Invoke the progress callback without letting it break processing."""
        if not progress_cb:
            return
        try:
            progress_cb(percent)
        except Exception as e:
            # Best-effort: a failing callback must not abort the batch.
            logger.debug(f"进度回调失败: {e}")

    def get_unprocessed_images(self) -> List[str]:
        """List the images in the input folder that still need processing.

        Returns:
            Image paths. Already-recorded files are removed only when
            ``skip_existing`` is enabled.
        """
        image_files = get_files_by_extensions(self.input_folder, self.file_types)

        if self._skip_existing():
            unprocessed_files = self.record_manager.get_unprocessed_files(image_files)
            logger.info(f"找到 {len(image_files)} 个图片文件,其中 {len(unprocessed_files)} 个未处理")
            return unprocessed_files

        logger.info(f"找到 {len(image_files)} 个图片文件(不跳过已处理的文件)")
        return image_files

    def validate_image(self, image_path: str) -> bool:
        """Check that an image exists, has an allowed extension and size.

        Args:
            image_path: Image file path.

        Returns:
            True when every check passes.
        """
        if not os.path.exists(image_path):
            logger.warning(f"图片文件不存在: {image_path}")
            return False

        ext = get_file_extension(image_path)
        if ext not in self.file_types:
            logger.warning(f"不支持的文件类型: {ext}, 文件: {image_path}")
            return False

        max_size_mb = 4.0
        try:
            max_size_mb = float(self.config.get('File', 'max_file_size_mb', fallback='4.0'))
        except Exception:
            # Best-effort: an unreadable or non-numeric limit keeps 4.0 MB.
            pass

        if not is_file_size_valid(image_path, max_size_mb):
            logger.warning(f"文件大小超过限制 ({max_size_mb}MB): {image_path}")
            return False

        return True

    def process_image(self, image_path: str) -> Optional[str]:
        """Run OCR on one image and save the resulting workbook.

        Args:
            image_path: Image file path.

        Returns:
            Path of the Excel file, or None when validation, recognition or
            saving fails. When ``skip_existing`` is enabled, a previously
            recorded or already existing output path is returned without
            calling the OCR client.
        """
        if not self.validate_image(image_path):
            return None

        skip_existing = self._skip_existing()

        if skip_existing and self.record_manager.is_processed(image_path):
            output_file = self.record_manager.get_output_file(image_path)
            logger.info(f"图片已处理,跳过: {image_path}, 输出文件: {output_file}")
            return output_file

        logger.info(f"开始处理图片: {image_path}")

        try:
            excel_extension = '.xlsx'
            try:
                excel_extension = self.config.get('File', 'excel_extension', fallback='.xlsx')
            except Exception:
                # Best-effort: an unreadable option keeps '.xlsx'.
                pass

            file_name = os.path.splitext(os.path.basename(image_path))[0]
            output_file = os.path.join(self.output_folder, f"{file_name}{excel_extension}")

            if os.path.exists(output_file) and skip_existing:
                logger.info(f"已存在对应的Excel文件,跳过处理: {os.path.basename(image_path)} -> {os.path.basename(output_file)}")
                self.record_manager.mark_as_processed(image_path, output_file)
                return output_file

            ocr_result = self.ocr_client.recognize_table(image_path)
            if not ocr_result:
                logger.error(f"OCR识别失败: {image_path}")
                return None

            excel_base64, source_message = self._find_excel_payload(ocr_result)
            if source_message:
                logger.debug(source_message)

            if not excel_base64:
                # Nothing inline: ask the client to fetch the workbook.
                logger.info("无法从直接返回中获取Excel数据,尝试通过API获取...")
                excel_data = self.ocr_client.get_excel_result(ocr_result)
                if not excel_data:
                    logger.error(f"获取Excel结果失败: {image_path}")
                    return None
                self._write_excel(output_file, excel_data)
            else:
                try:
                    self._write_excel(output_file, base64.b64decode(excel_base64))
                except Exception as e:
                    logger.error(f"解码或保存Excel数据时出错: {e}")
                    return None

            logger.info(f"图片处理成功: {image_path}, 输出文件: {output_file}")
            self.record_manager.mark_as_processed(image_path, output_file)
            return output_file

        except Exception as e:
            logger.error(f"处理图片时出错: {image_path}, 错误: {e}")
            return None

    def process_images_batch(self, batch_size: Optional[int] = None, max_workers: Optional[int] = None, progress_cb: Optional[Callable[[int], None]] = None) -> Tuple[int, int]:
        """Process every pending image in batches using a thread pool.

        Args:
            batch_size: Images per batch; None reads
                ``Performance.batch_size`` (default 5). Values below 1 are
                raised to 1.
            max_workers: Thread count; None reads
                ``Performance.max_workers`` (default 4). Values below 1 are
                raised to 1.
            progress_cb: Optional callback receiving an integer percentage.
                It is called before each batch with a value between 10 and
                90, and once with 90 at the end. Errors it raises are
                ignored.

        Returns:
            ``(total, success_count)``; ``(0, 0)`` when nothing is pending.
        """
        if batch_size is None:
            batch_size = self._performance_int('batch_size', 5)
        if max_workers is None:
            max_workers = self._performance_int('max_workers', 4)

        # range() rejects a zero step and ThreadPoolExecutor rejects a
        # non-positive worker count, so clamp both.
        batch_size = max(1, batch_size)
        max_workers = max(1, max_workers)

        unprocessed_images = self.get_unprocessed_images()
        if not unprocessed_images:
            logger.warning("没有需要处理的图片")
            return 0, 0

        total = len(unprocessed_images)
        success_count = 0

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            for i in range(0, total, batch_size):
                batch = unprocessed_images[i:i+batch_size]
                logger.info(f"处理批次 {i//batch_size+1}/{(total+batch_size-1)//batch_size}: {len(batch)} 个文件")

                # Progress is estimated per batch and capped at 90%.
                percent = int(10 + (i / max(total, 1)) * 80)
                self._notify_progress(progress_cb, min(percent, 90))

                # list() waits for the whole batch before the next one starts.
                results = list(executor.map(self.process_image, batch))
                success_count += sum(1 for result in results if result is not None)

        logger.info(f"所有图片处理完成, 总计: {total}, 成功: {success_count}")
        self._notify_progress(progress_cb, 90)
        return total, success_count
|
||||
@@ -0,0 +1,9 @@
|
||||
"""
|
||||
处理器模块初始化文件
|
||||
"""
|
||||
|
||||
from .base import BaseProcessor
|
||||
from .ocr_processor import OCRProcessor
|
||||
from .tobacco_processor import TobaccoProcessor
|
||||
|
||||
__all__ = ['BaseProcessor', 'OCRProcessor', 'TobaccoProcessor']
|
||||
@@ -0,0 +1,167 @@
|
||||
"""
|
||||
基础处理器接口模块
|
||||
|
||||
定义所有处理器的基类,提供统一的处理接口
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, Any, Optional, List
|
||||
from pathlib import Path
|
||||
import logging
|
||||
import pandas as pd
|
||||
|
||||
from ...core.utils.log_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class BaseProcessor(ABC):
    """Abstract base class shared by all file processors.

    Each concrete processor handles one kind of file and must implement
    ``can_process``, ``process`` and ``get_required_columns``.
    """

    def __init__(self, config: Dict[str, Any]):
        """Store the configuration and set up a per-processor logger.

        Args:
            config: Processor configuration.
        """
        self.config = config
        self.name = type(self).__name__
        self.description = ""
        self._setup_logging()

    def _setup_logging(self):
        """Create a logger named after this module and the processor name."""
        logger_name = f"{__name__}.{self.name}"
        self.logger = logging.getLogger(logger_name)

    @abstractmethod
    def can_process(self, file_path: Path) -> bool:
        """Decide whether this processor can handle the file.

        Args:
            file_path: File path.

        Returns:
            True when the file can be handled.
        """
        pass

    @abstractmethod
    def process(self, input_file: Path, output_dir: Path) -> Optional[Path]:
        """Process a file and return the path of the output file.

        Args:
            input_file: Input file path.
            output_dir: Output directory path.

        Returns:
            Output file path, or None when processing fails.
        """
        pass

    @abstractmethod
    def get_required_columns(self) -> List[str]:
        """Return the list of column names the processor needs.

        Returns:
            Column names.
        """
        pass

    def validate_input(self, file_path: Path) -> bool:
        """Check that the path is an existing file of a supported type.

        Args:
            file_path: File path.

        Returns:
            True when the file exists, is a regular file and, if
            ``get_supported_extensions`` returns a non-empty list, its
            lower-cased suffix is in that list. Any exception raised during
            the checks is logged and reported as False.
        """
        try:
            if not file_path.exists():
                self.logger.warning(f"文件不存在: {file_path}")
                return False

            if not file_path.is_file():
                self.logger.warning(f"不是文件: {file_path}")
                return False

            allowed = self.get_supported_extensions()
            # An empty list means no restriction on the file type.
            if not allowed or file_path.suffix.lower() in allowed:
                return True

            self.logger.warning(f"不支持的文件类型: {file_path.suffix}, 支持的类型: {allowed}")
            return False
        except Exception as exc:
            self.logger.error(f"验证文件时出错: {exc}")
            return False

    def get_supported_extensions(self) -> List[str]:
        """Return the supported file extensions.

        Returns:
            Extension list; an empty list means every type is accepted.
        """
        return []

    def get_output_filename(self, input_file: Path, suffix: str = "_processed") -> str:
        """Build an output file name from the input file name.

        Args:
            input_file: Input file path.
            suffix: Text inserted between the stem and the extension.

        Returns:
            ``<stem><suffix><extension>`` of the input file.
        """
        stem, extension = input_file.stem, input_file.suffix
        return f"{stem}{suffix}{extension}"

    def _read_excel_safely(self, file_path: Path, **kwargs) -> pd.DataFrame:
        """Read an Excel file, choosing the pandas engine from the extension.

        ``.xlsx`` is read with ``openpyxl`` and ``.xls`` with ``xlrd``; any
        other suffix lets pandas pick the engine.

        Args:
            file_path: File path.
            **kwargs: Extra arguments passed to ``pd.read_excel``.

        Returns:
            The loaded DataFrame.

        Raises:
            Exception: Whatever ``pd.read_excel`` raises; for ``.xls`` a
                warning is logged before re-raising.
        """
        extension = file_path.suffix.lower()

        if extension == '.xls':
            try:
                return pd.read_excel(file_path, engine='xlrd', **kwargs)
            except Exception as exc:
                self.logger.warning(f"读取xls失败,可能缺少xlrd: {exc}")
                raise

        if extension == '.xlsx':
            return pd.read_excel(file_path, engine='openpyxl', **kwargs)

        return pd.read_excel(file_path, **kwargs)

    def log_processing_start(self, input_file: Path):
        """Log that processing of ``input_file`` has started."""
        self.logger.info(f"开始处理文件: {input_file}")
        self.logger.info(f"处理器: {self.name} - {self.description}")

    def log_processing_end(self, input_file: Path, output_file: Optional[Path] = None, success: bool = True):
        """Log the outcome of processing ``input_file``."""
        if not success:
            self.logger.error(f"处理失败: {input_file}")
            return

        self.logger.info(f"处理完成: {input_file}")
        if output_file:
            self.logger.info(f"输出文件: {output_file}")

    def __str__(self) -> str:
        """Short form: ``name(description)``."""
        return f"{self.name}({self.description})"

    def __repr__(self) -> str:
        """Detailed form including the module and class name."""
        cls = self.__class__
        return f"{cls.__module__}.{cls.__name__}(name='{self.name}', description='{self.description}')"
|
||||
@@ -0,0 +1,192 @@
|
||||
"""
|
||||
OCR处理器
|
||||
|
||||
处理图片文件的OCR识别完整流程:图片识别 → Excel处理 → 标准采购单生成
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Any, List
|
||||
|
||||
from .base import BaseProcessor
|
||||
from ...services.ocr_service import OCRService
|
||||
from ...services.order_service import OrderService
|
||||
from ...core.utils.log_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class OCRProcessor(BaseProcessor):
    """Processor that turns an image into a purchase-order workbook.

    The flow has three steps:
    1. OCR the table in the image through ``OCRService``.
    2. Process the resulting Excel file through ``OrderService``.
    3. Confirm that the processed file exists and return it.
    """

    # Single source of truth for the accepted image suffixes, used by both
    # can_process and get_supported_extensions.
    _IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp')

    def __init__(self, config: Dict[str, Any]):
        """Create the processor and the services it delegates to.

        Args:
            config: Configuration passed to the base class and both services.
        """
        super().__init__(config)
        self.description = "OCR识别完整流程(图片→识别→Excel→采购单)"

        self.ocr_service = OCRService(config)
        self.order_service = OrderService(config)

    def can_process(self, file_path: Path) -> bool:
        """Decide whether the file is a supported image.

        Args:
            file_path: File path.

        Returns:
            True when the file passes ``validate_input`` and its lower-cased
            suffix is one of the supported extensions.
        """
        if not self.validate_input(file_path):
            return False

        if file_path.suffix.lower() in self.get_supported_extensions():
            self.logger.info(f"识别为图片文件: {file_path.name}")
            return True

        return False

    def process(self, input_file: Path, output_dir: Path) -> Optional[Path]:
        """Run the complete OCR flow for one image.

        Args:
            input_file: Input image path.
            output_dir: Output directory path.

        Returns:
            Path of the final output file, or None when any step fails.
        """
        self.log_processing_start(input_file)

        try:
            self.logger.info("开始OCR识别流程...")

            self.logger.info("步骤1/3: OCR识别图片...")
            ocr_result = self._perform_ocr(input_file, output_dir)
            if not ocr_result:
                self.logger.error("OCR识别失败")
                self.log_processing_end(input_file, success=False)
                return None

            self.logger.info("步骤2/3: 处理Excel文件...")
            excel_result = self._process_excel(ocr_result, output_dir)
            if not excel_result:
                self.logger.error("Excel处理失败")
                self.log_processing_end(input_file, success=False)
                return None

            self.logger.info("步骤3/3: 生成标准采购单...")
            final_result = self._generate_purchase_order(excel_result, output_dir)
            if not final_result:
                self.logger.error("生成采购单失败")
                self.log_processing_end(input_file, success=False)
                return None

            self.logger.info(f"OCR处理流程完成,输出文件: {final_result}")
            self.log_processing_end(input_file, final_result, success=True)
            return final_result

        except Exception as e:
            self.logger.error(f"OCR处理流程出错: {e}", exc_info=True)
            self.log_processing_end(input_file, success=False)
            return None

    def get_required_columns(self) -> List[str]:
        """Return the required columns; this processor needs none itself."""
        return []

    def get_supported_extensions(self) -> List[str]:
        """Return the supported image extensions as a new list."""
        return list(self._IMAGE_EXTENSIONS)

    def _existing_result(self, raw_path, found_msg: str, missing_msg: str, none_msg: str) -> Optional[Path]:
        """Turn a service return value into a Path that exists on disk.

        Logs ``none_msg`` when the service returned nothing, ``missing_msg``
        when the returned path does not exist, and ``found_msg`` otherwise.
        """
        if not raw_path:
            self.logger.error(none_msg)
            return None

        result_path = Path(raw_path)
        if result_path.exists():
            self.logger.info(f"{found_msg}: {result_path}")
            return result_path

        self.logger.error(f"{missing_msg}: {result_path}")
        return None

    def _perform_ocr(self, input_file: Path, output_dir: Path) -> Optional[Path]:
        """Run OCR on the image.

        Args:
            input_file: Input image file.
            output_dir: Output directory (not used by this step).

        Returns:
            Path of the Excel file produced by the OCR service, or None.
        """
        try:
            self.logger.info(f"开始OCR识别: {input_file}")
            return self._existing_result(
                self.ocr_service.process_image(str(input_file)),
                "OCR识别成功,输出文件",
                "OCR结果文件不存在",
                "OCR服务返回None",
            )
        except Exception as e:
            self.logger.error(f"OCR识别失败: {e}", exc_info=True)
            return None

    def _process_excel(self, excel_file: Path, output_dir: Path) -> Optional[Path]:
        """Process the Excel file through the order service.

        Args:
            excel_file: Excel file path.
            output_dir: Output directory (not used by this step).

        Returns:
            Path of the processed file, or None.
        """
        try:
            self.logger.info(f"开始处理Excel文件: {excel_file}")
            return self._existing_result(
                self.order_service.process_excel(str(excel_file)),
                "Excel处理成功,输出文件",
                "Excel处理结果文件不存在",
                "Excel处理服务返回None",
            )
        except Exception as e:
            self.logger.error(f"Excel处理失败: {e}", exc_info=True)
            return None

    def _generate_purchase_order(self, processed_file: Path, output_dir: Path) -> Optional[Path]:
        """Return the processed file when it exists.

        The purchase order itself is produced by the order service in the
        previous step; this step only confirms the file is present.
        """
        try:
            if processed_file and processed_file.exists():
                return processed_file
            return None
        except Exception as e:
            # Report the failure instead of hiding it; the caller still
            # receives None.
            self.logger.error(f"检查采购单文件时出错: {e}")
            return None
|
||||
@@ -0,0 +1,7 @@
|
||||
"""
|
||||
供应商处理器模块初始化文件
|
||||
"""
|
||||
|
||||
from .generic_supplier_processor import GenericSupplierProcessor
|
||||
|
||||
__all__ = ['GenericSupplierProcessor']
|
||||
@@ -0,0 +1,340 @@
|
||||
"""
|
||||
通用供应商处理器
|
||||
|
||||
可配置化的供应商处理器,支持通过配置文件定义处理规则
|
||||
"""
|
||||
|
||||
import fnmatch
|
||||
import pandas as pd
|
||||
from typing import Optional, Dict, Any, List
|
||||
from pathlib import Path
|
||||
|
||||
from ..base import BaseProcessor
|
||||
from ...utils.log_utils import get_logger
|
||||
from ...handlers.rule_engine import apply_rules
|
||||
from ...handlers.column_mapper import ColumnMapper
|
||||
from ...handlers.data_cleaner import DataCleaner
|
||||
from ...handlers.calculator import DataCalculator
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class GenericSupplierProcessor(BaseProcessor):
|
||||
"""通用供应商处理器
|
||||
|
||||
基于配置文件处理不同供应商的Excel文件,支持:
|
||||
- 文件名模式匹配
|
||||
- 内容特征识别
|
||||
- 列映射配置
|
||||
- 数据清洗规则
|
||||
- 计算处理规则
|
||||
"""
|
||||
|
||||
def __init__(self, config: Dict[str, Any], supplier_config: Dict[str, Any]):
    """Set up the processor from system and supplier configuration.

    Every supplier option is optional; a missing key falls back to the
    default shown below.

    Args:
        config: System configuration, passed to the base class.
        supplier_config: Supplier-specific configuration.
    """
    super().__init__(config)
    self.supplier_config = supplier_config
    option = supplier_config.get

    # Identity (replaces the class-name default set by the base class).
    self.name = option('name', 'GenericSupplier')
    self.description = option('description', '通用供应商处理器')

    # Recognition rules.
    self.filename_patterns = option('filename_patterns', [])
    self.content_indicators = option('content_indicators', [])

    # Transformation rules.
    self.column_mapping = option('column_mapping', {})
    self.cleaning_rules = option('cleaning_rules', [])
    self.calculations = option('calculations', [])

    # Output settings.
    self.output_template = option('output_template', 'templates/银豹-采购单模板.xls')
    self.output_suffix = option('output_suffix', '_银豹采购单')
|
||||
|
||||
def can_process(self, file_path: Path) -> bool:
    """Decide whether this supplier processor applies to the file.

    The file must pass ``validate_input``. It is then accepted when a
    configured file-name pattern matches or, failing that, when a configured
    content indicator matches. A processor with neither kind of rule logs a
    warning and rejects the file.

    Args:
        file_path: File path.

    Returns:
        True when the file can be handled.
    """
    if not self.validate_input(file_path):
        return False

    if self.filename_patterns and self._check_filename_patterns(file_path):
        return True

    if self.content_indicators and self._check_content_indicators(file_path):
        return True

    if not (self.filename_patterns or self.content_indicators):
        self.logger.warning(f"处理器 {self.name} 没有配置识别规则")

    return False
|
||||
|
||||
def process(self, input_file: Path, output_dir: Path) -> Optional[Path]:
    """Convert a supplier file into the output workbook.

    Steps: read the data, apply the column mapping, clean the data, run the
    supplier rules, apply the calculations and generate the output file. A
    failure of the rule step is logged and the cleaned data is used as-is;
    a failure of any other step aborts processing.

    Args:
        input_file: Input file path.
        output_dir: Output directory path.

    Returns:
        Output file path, or None when processing fails.
    """
    self.log_processing_start(input_file)

    def abort(reason: str) -> None:
        # Shared failure exit: log the reason and record the failed run.
        self.logger.error(reason)
        self.log_processing_end(input_file, success=False)
        return None

    try:
        self.logger.info("步骤1/4: 读取数据...")
        raw_df = self._read_supplier_data(input_file)
        if raw_df is None or raw_df.empty:
            return abort("读取数据失败或数据为空")

        self.logger.info("步骤2/4: 应用列映射...")
        mapped_df = self._apply_column_mapping(raw_df)
        if mapped_df is None:
            return abort("列映射失败")

        self.logger.info("步骤3/4: 数据清洗...")
        cleaned_df = self._apply_data_cleaning(mapped_df)
        if cleaned_df is None:
            return abort("数据清洗失败")

        # Rule step is best-effort: on any error continue with cleaned data.
        try:
            standardized_df = apply_rules(
                cleaned_df,
                self.supplier_config.get('rules', []),
                self.supplier_config.get('dictionary'),
            )
        except Exception as exc:
            self.logger.warning(f"规则执行失败: {exc}")
            standardized_df = cleaned_df

        self.logger.info("步骤4/4: 计算处理...")
        calculated_df = self._apply_calculations(standardized_df)
        if calculated_df is None:
            return abort("计算处理失败")

        output_file = self._generate_output(calculated_df, input_file, output_dir)
        if not (output_file and output_file.exists()):
            return abort("输出文件生成失败")

        self.logger.info(f"处理完成,输出文件: {output_file}")
        self.log_processing_end(input_file, output_file, success=True)
        return output_file

    except Exception as exc:
        self.logger.error(f"处理文件时出错: {exc}", exc_info=True)
        self.log_processing_end(input_file, success=False)
        return None
|
||||
|
||||
def get_required_columns(self) -> List[str]:
    """Return the target column names taken from the column mapping.

    Returns:
        The values of ``column_mapping`` in mapping order, or an empty list
        when no mapping is configured.
    """
    mapping = self.column_mapping
    if not mapping:
        return []
    return list(mapping.values())
|
||||
|
||||
def _check_filename_patterns(self, file_path: Path) -> bool:
|
||||
"""检查文件名模式
|
||||
|
||||
Args:
|
||||
file_path: 文件路径
|
||||
|
||||
Returns:
|
||||
是否匹配
|
||||
"""
|
||||
try:
|
||||
filename = file_path.name
|
||||
for pattern in self.filename_patterns:
|
||||
if fnmatch.fnmatch(filename.lower(), pattern.lower()):
|
||||
self.logger.info(f"文件名匹配成功: {filename} -> {pattern}")
|
||||
return True
|
||||
return False
|
||||
except Exception as e:
|
||||
self.logger.error(f"检查文件名模式时出错: {e}")
|
||||
return False
|
||||
|
||||
def _check_content_indicators(self, file_path: Path) -> bool:
|
||||
"""检查文件内容特征
|
||||
|
||||
Args:
|
||||
file_path: 文件路径
|
||||
|
||||
Returns:
|
||||
是否匹配
|
||||
"""
|
||||
try:
|
||||
df = self._read_excel_safely(file_path, nrows=5)
|
||||
|
||||
# 检查列名中是否包含指定关键词
|
||||
columns_str = str(list(df.columns)).lower()
|
||||
|
||||
for indicator in self.content_indicators:
|
||||
if indicator.lower() in columns_str:
|
||||
self.logger.info(f"内容特征匹配成功: {indicator}")
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"检查内容特征时出错: {e}")
|
||||
return False
|
||||
|
||||
def _read_supplier_data(self, file_path: Path) -> Optional[pd.DataFrame]:
|
||||
"""读取供应商数据
|
||||
|
||||
Args:
|
||||
file_path: 文件路径
|
||||
|
||||
Returns:
|
||||
数据DataFrame或None
|
||||
"""
|
||||
try:
|
||||
specified = self.supplier_config.get('header_row')
|
||||
if specified is not None:
|
||||
try:
|
||||
df = self._read_excel_safely(file_path, header=int(specified))
|
||||
except Exception:
|
||||
df = self._read_excel_safely(file_path)
|
||||
else:
|
||||
df0 = self._read_excel_safely(file_path, header=None)
|
||||
if df0 is None:
|
||||
return None
|
||||
header_row = self._find_header_row(df0)
|
||||
if header_row is not None:
|
||||
df = self._read_excel_safely(file_path, header=header_row)
|
||||
else:
|
||||
df = self._read_excel_safely(file_path)
|
||||
if df is None or df.empty:
|
||||
self.logger.warning("数据文件为空")
|
||||
return None
|
||||
self.logger.info(f"成功读取数据,形状: {df.shape}")
|
||||
return df
|
||||
except Exception as e:
|
||||
self.logger.error(f"读取数据失败: {e}")
|
||||
return None
|
||||
|
||||
def _find_header_row(self, df: pd.DataFrame) -> Optional[int]:
    """Ask ``ColumnMapper.detect_header_row`` for the header position.

    Args:
        df: Sheet content read without a header row.

    Returns:
        The value reported by the detector (called with ``max_rows=30``)
        when it is non-negative, otherwise None.
    """
    detected = ColumnMapper.detect_header_row(df, max_rows=30)
    if detected >= 0:
        return detected
    return None
|
||||
|
||||
def _apply_column_mapping(self, df: pd.DataFrame) -> Optional[pd.DataFrame]:
|
||||
"""应用列映射
|
||||
|
||||
Args:
|
||||
df: 原始数据
|
||||
|
||||
Returns:
|
||||
映射后的数据或None
|
||||
"""
|
||||
if not self.column_mapping:
|
||||
self.logger.info("没有列映射配置")
|
||||
return df
|
||||
|
||||
try:
|
||||
# 应用列重命名
|
||||
df_renamed = df.rename(columns=self.column_mapping)
|
||||
|
||||
# 检查必需的列是否存在
|
||||
required_columns = self.get_required_columns()
|
||||
missing_columns = [col for col in required_columns if col not in df_renamed.columns]
|
||||
|
||||
if missing_columns:
|
||||
self.logger.warning(f"缺少必需的列: {missing_columns}")
|
||||
# 创建缺失的列并填充默认值
|
||||
for col in missing_columns:
|
||||
df_renamed[col] = 0 if '量' in col or '价' in col else ''
|
||||
self.logger.info(f"创建缺失列: {col},默认值: {df_renamed[col].iloc[0] if len(df_renamed) > 0 else 'N/A'}")
|
||||
|
||||
self.logger.info(f"列映射完成,列名: {list(df_renamed.columns)}")
|
||||
return df_renamed
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"列映射失败: {e}")
|
||||
return None
|
||||
|
||||
def _apply_data_cleaning(self, df: pd.DataFrame) -> Optional[pd.DataFrame]:
|
||||
"""应用数据清洗规则,委托给 DataCleaner"""
|
||||
if not self.cleaning_rules:
|
||||
self.logger.info("没有数据清洗规则")
|
||||
return df
|
||||
try:
|
||||
cleaner = DataCleaner()
|
||||
for rule in self.cleaning_rules:
|
||||
cleaner.add_rule(rule.get('type'), **{k: v for k, v in rule.items() if k != 'type'})
|
||||
result = cleaner.clean(df)
|
||||
self.logger.info(f"数据清洗完成,数据形状: {result.shape}")
|
||||
return result
|
||||
except Exception as e:
|
||||
self.logger.error(f"数据清洗失败: {e}")
|
||||
return None
|
||||
|
||||
def _apply_calculations(self, df: pd.DataFrame) -> Optional[pd.DataFrame]:
|
||||
"""应用计算处理,委托给 DataCalculator"""
|
||||
if not self.calculations:
|
||||
self.logger.info("没有计算规则")
|
||||
return df
|
||||
try:
|
||||
calculator = DataCalculator()
|
||||
for calc in self.calculations:
|
||||
calculator.add_rule(calc.get('type'), **{k: v for k, v in calc.items() if k != 'type'})
|
||||
result = calculator.calculate(df)
|
||||
self.logger.info(f"计算处理完成,数据形状: {result.shape}")
|
||||
return result
|
||||
except Exception as e:
|
||||
self.logger.error(f"计算处理失败: {e}")
|
||||
return None
|
||||
|
||||
def _generate_output(self, df: pd.DataFrame, input_file: Path, output_dir: Path) -> Optional[Path]:
|
||||
"""生成输出文件
|
||||
|
||||
Args:
|
||||
df: 最终数据
|
||||
input_file: 输入文件路径
|
||||
output_dir: 输出目录
|
||||
|
||||
Returns:
|
||||
输出文件路径或None
|
||||
"""
|
||||
try:
|
||||
# 生成输出文件名
|
||||
timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
|
||||
output_filename = f"{input_file.stem}{self.output_suffix}_{timestamp}.xls"
|
||||
output_file = output_dir / output_filename
|
||||
|
||||
# 这里应该使用实际的模板生成逻辑
|
||||
# 暂时直接保存为Excel文件
|
||||
df.to_excel(output_file, index=False)
|
||||
|
||||
self.logger.info(f"输出文件生成成功: {output_file}")
|
||||
return output_file
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"生成输出文件失败: {e}")
|
||||
return None
|
||||
@@ -0,0 +1,347 @@
|
||||
"""
|
||||
烟草订单处理器
|
||||
|
||||
处理烟草公司特定格式的订单明细文件,生成银豹采购单
|
||||
"""
|
||||
|
||||
import os
|
||||
import datetime
|
||||
import pandas as pd
|
||||
import xlrd
|
||||
import xlwt
|
||||
from xlutils.copy import copy
|
||||
from openpyxl import load_workbook
|
||||
from typing import Optional, Dict, Any, List, Tuple
|
||||
from pathlib import Path
|
||||
|
||||
from .base import BaseProcessor
|
||||
from ...core.utils.log_utils import get_logger
|
||||
from ...core.utils.string_utils import parse_monetary_string
|
||||
from ...core.utils.dialog_utils import show_custom_dialog
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class TobaccoProcessor(BaseProcessor):
    """Processor for tobacco-company order detail workbooks.

    Reads the order header and item rows from the supplier's order detail
    file and writes them into the Pospal (银豹) purchase-order template.
    """

    def __init__(self, config: Dict[str, Any]):
        """Initialise the processor.

        Args:
            config: Configuration object; it is queried with
                ``config.get('Paths', 'template_file', fallback=...)``.
        """
        super().__init__(config)
        self.description = "处理烟草公司订单明细文件"
        self.template_file = config.get('Paths', 'template_file', fallback='templates/银豹-采购单模板.xls')

        # Output directory. parents=True so that a missing "data" folder
        # does not make construction fail.
        self.result_dir = Path("data/result")
        self.result_dir.mkdir(parents=True, exist_ok=True)

        # Default output file name
        self.default_output_name = "银豹采购单_烟草公司.xls"

    def can_process(self, file_path: Path) -> bool:
        """Decide whether the file looks like a tobacco order.

        A file qualifies when its name contains one of the tobacco keywords
        or when its first rows expose the columns 商品 / 盒码 / 订单量. If
        the content cannot be read, only the file name is considered.

        Args:
            file_path: Path of the candidate file.

        Returns:
            True when this processor should handle the file.
        """
        if not self.validate_input(file_path):
            return False

        filename = file_path.name
        tobacco_keywords = ['烟草', '卷烟', '订单明细', 'tobacco', '烟']
        filename_match = any(keyword in filename for keyword in tobacco_keywords)

        try:
            df = self._read_excel_safely(file_path, nrows=5)
            required_columns = ['商品', '盒码', '订单量']
            content_match = all(col in df.columns for col in required_columns)
        except Exception as e:
            self.logger.warning(f"检查文件内容时出错: {e}")
            # Content unreadable: fall back to the file-name test alone.
            return filename_match

        if filename_match or content_match:
            self.logger.info(f"识别为烟草订单文件: {filename}")
            return True
        return False

    def process(self, input_file: Path, output_dir: Path) -> Optional[Path]:
        """Convert one tobacco order file into a Pospal purchase order.

        Args:
            input_file: Order detail workbook.
            output_dir: Directory that receives the generated file.

        Returns:
            Path of the generated purchase order, or None on failure.
        """
        self.log_processing_start(input_file)

        try:
            # Order header: time and total amount
            order_info = self._read_order_info(input_file)
            if not order_info:
                self.logger.error(f"读取订单信息失败: {input_file}")
                self.log_processing_end(input_file, success=False)
                return None

            order_time, total_amount = order_info
            self.logger.info(f"订单信息 - 时间: {order_time}, 总金额: {total_amount}")

            # Order rows
            order_data = self._read_order_data(input_file)
            if order_data is None or order_data.empty:
                self.logger.error(f"读取订单数据失败或数据为空: {input_file}")
                self.log_processing_end(input_file, success=False)
                return None

            self.logger.info(f"成功读取订单数据,共{len(order_data)}条记录")

            # Output path
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            output_filename = f"银豹采购单_烟草公司_{timestamp}.xls"
            output_file = output_dir / output_filename
            output_file.parent.mkdir(parents=True, exist_ok=True)

            result = self._generate_pospal_order(order_data, order_time, output_file)

            if result:
                self.logger.info(f"采购单生成成功: {output_file}")
                self.log_processing_end(input_file, output_file, success=True)
                self._show_processing_result(output_file, order_time, len(order_data), total_amount)
                return output_file

            self.logger.error("生成银豹采购单失败")
            self.log_processing_end(input_file, success=False)
            return None

        except Exception as e:
            self.logger.error(f"处理烟草订单时发生错误: {e}", exc_info=True)
            self.log_processing_end(input_file, success=False)
            return None

    def get_required_columns(self) -> List[str]:
        """Return the column names expected in the order detail sheet."""
        return ['商品', '盒码', '条码', '建议零售价', '批发价', '需求量', '订单量', '金额']

    def get_supported_extensions(self) -> List[str]:
        """Return the file extensions this processor accepts."""
        return ['.xlsx', '.xls']

    def _read_order_info(self, file_path: Path) -> Optional[Tuple[str, float]]:
        """Read the order time (cell H1) and total amount (cell H3).

        Args:
            file_path: Order detail workbook.

        Returns:
            ``(order_time, total_amount)``; empty cells yield "(空)" and 0.0
            respectively. None when the workbook cannot be read.
        """
        try:
            wb_info = load_workbook(file_path, data_only=True)
            try:
                ws_info = wb_info.active
                order_time = ws_info["H1"].value or "(空)"
                total_amount = ws_info["H3"].value or 0.0
            finally:
                # Release the file handle even when a cell lookup fails.
                wb_info.close()

            self.logger.info(f"成功读取订单信息: 时间={order_time}, 总金额={total_amount}")
            return (order_time, total_amount)

        except Exception as e:
            self.logger.error(f"读取订单信息出错: {e}")
            return None

    def _read_order_data(self, file_path: Path) -> Optional[pd.DataFrame]:
        """Read the item rows and derive purchase quantity and unit price.

        The first three rows are skipped and the fixed column names are
        applied. Rows whose order quantity is zero, blank or not numeric
        are dropped. 采购量 is 订单量 × 10 and 采购单价 is 金额 / 采购量.

        Args:
            file_path: Order detail workbook.

        Returns:
            The filtered DataFrame with a fresh 0..n-1 index, or None when
            nothing remains or reading fails.
        """
        columns = ['商品', '盒码', '条码', '建议零售价', '批发价', '需求量', '订单量', '金额']

        try:
            df_old = self._read_excel_safely(file_path, header=None, skiprows=3, names=columns)

            # Blank cells arrive as NaN, and NaN != 0 is True - without the
            # notna() test such rows survive and later break int(...) when
            # the purchase order is written.
            quantity = pd.to_numeric(df_old['订单量'], errors='coerce')
            df_filtered = df_old[quantity.notna() & (quantity != 0)].copy()

            if df_filtered.empty:
                self.logger.warning("没有订单量不为0的记录")
                return None

            df_filtered['订单量'] = quantity.loc[df_filtered.index]
            # Order quantity is multiplied by 10 to obtain the purchase quantity.
            df_filtered['采购量'] = df_filtered['订单量'] * 10
            df_filtered['采购单价'] = df_filtered['金额'] / df_filtered['采购量']
            df_filtered = df_filtered.reset_index(drop=True)

            self.logger.info(f"成功处理订单数据,有效记录数: {len(df_filtered)}")
            return df_filtered

        except Exception as e:
            self.logger.error(f"读取订单数据失败: {e}")
            return None

    def _generate_pospal_order(self, order_data: pd.DataFrame, order_time: str, output_file: Path) -> bool:
        """Fill the purchase-order template with the order rows.

        Args:
            order_data: Rows with 盒码, 采购量 and 采购单价 columns.
            order_time: Order time (not written to the sheet).
            output_file: Destination of the generated workbook.

        Returns:
            True when the file was written, False otherwise.
        """
        try:
            template_path = Path(self.template_file)
            if not template_path.exists():
                self.logger.error(f"采购单模板文件不存在: {template_path}")
                return False

            self.logger.info(f"使用模板文件: {template_path}")

            # Open the template read-only, then copy it into a writable book.
            template_rd = xlrd.open_workbook(str(template_path), formatting_info=True)
            template_wb = copy(template_rd)
            template_ws = template_wb.get_sheet(0)

            # Column positions are looked up from the template's header row.
            header_row = template_rd.sheet_by_index(0).row_values(0)
            try:
                barcode_col = header_row.index("条码(必填)")
                amount_col = header_row.index("采购量(必填)")
                gift_col = header_row.index("赠送量")
                price_col = header_row.index("采购单价(必填)")
            except ValueError as e:
                self.logger.error(f"模板列查找失败: {e}")
                return False

            self.logger.info(f"模板列索引 - 条码:{barcode_col}, 采购量:{amount_col}, 赠送量:{gift_col}, 单价:{price_col}")

            # Rows are placed by position (starting below the header), so
            # the result does not depend on the DataFrame's index labels.
            for sheet_row, (_, row) in enumerate(order_data.iterrows(), start=1):
                template_ws.write(sheet_row, barcode_col, row['盒码'])
                template_ws.write(sheet_row, amount_col, int(row['采购量']))
                template_ws.write(sheet_row, gift_col, "")
                template_ws.write(sheet_row, price_col, round(row['采购单价'], 2))

            output_file.parent.mkdir(parents=True, exist_ok=True)
            template_wb.save(str(output_file))

            self.logger.info(f"采购单生成成功: {output_file}")
            return True

        except Exception as e:
            self.logger.error(f"生成银豹采购单失败: {e}", exc_info=True)
            return False

    def _show_processing_result(self, output_file: Path, order_time: str, total_count: int, total_amount: float):
        """Show a dialog summarising the processed order.

        Failures while displaying are logged and otherwise ignored.

        Args:
            output_file: Generated purchase order.
            order_time: Order time read from the source file.
            total_count: Number of processed items.
            total_amount: Total amount read from the source file.
        """
        try:
            additional_info = {
                "订单来源": "烟草公司",
                "处理时间": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            }

            # Normalise the amount before formatting it.
            parsed = parse_monetary_string(total_amount)
            total_amount = parsed if parsed is not None else 0.0
            amount_display = f"¥{total_amount:.2f}"

            show_custom_dialog(
                title="烟草订单处理结果",
                message="烟草订单处理完成",
                result_file=str(output_file),
                time_info=order_time,
                count_info=f"{total_count}个商品",
                amount_info=amount_display,
                additional_info=additional_info
            )

            self.logger.info(f"显示处理结果 - 文件:{output_file}, 时间:{order_time}, 数量:{total_count}, 金额:{total_amount}")

        except Exception as e:
            self.logger.error(f"显示处理结果时出错: {e}")

    def get_latest_tobacco_order(self) -> Optional[Path]:
        """Return the newest order detail file (legacy interface).

        Files named ``订单明细*.xlsx`` in ``data/output`` are considered.
        Files whose ``st_ctime`` falls on today are preferred; when there
        are none, the newest file overall is returned.

        Returns:
            Path of the chosen file, or None when no file exists or an
            error occurs.
        """
        try:
            today = datetime.date.today()
            today_start = datetime.datetime.combine(today, datetime.time.min).timestamp()

            result_dir = Path("data/output")
            if not result_dir.exists():
                return None

            def created_at(path: Path) -> float:
                return path.stat().st_ctime

            all_files = list(result_dir.glob("订单明细*.xlsx"))
            candidates = [path for path in all_files if created_at(path) >= today_start]

            if not candidates:
                self.logger.warning("未找到今天创建的烟草订单明细文件")
                if all_files:
                    return max(all_files, key=created_at)
                return None

            latest_file = max(candidates, key=created_at)
            self.logger.info(f"找到最新烟草订单明细文件: {latest_file}")
            return latest_file

        except Exception as e:
            self.logger.error(f"获取最新烟草订单文件时出错: {e}")
            return None
|
||||
@@ -0,0 +1,5 @@
|
||||
"""
|
||||
OCR订单处理系统 - 工具模块
|
||||
------------------------
|
||||
提供系统通用工具和辅助函数。
|
||||
"""
|
||||
@@ -0,0 +1,184 @@
|
||||
"""云端同步模块 — 基于 Gitea REST API 的文件同步"""
|
||||
|
||||
import base64
|
||||
import json
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import requests
|
||||
|
||||
from .log_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class GiteaSync:
    """Read and write repository files through the Gitea REST contents API."""

    def __init__(self, base_url: str, owner: str, repo: str, token: str, timeout: int = 15):
        """Store connection settings.

        Args:
            base_url: Root URL of the Gitea server; a trailing "/" is removed.
            owner: Repository owner.
            repo: Repository name.
            token: Access token sent in the Authorization header.
            timeout: Per-request timeout in seconds.
        """
        self.base_url = base_url.rstrip("/")
        self.owner = owner
        self.repo = repo
        self.token = token
        self.timeout = timeout

    @property
    def _headers(self) -> dict:
        """Authorization header used for every request."""
        return {"Authorization": f"token {self.token}"}

    def _api_url(self, path: str) -> str:
        """Build the contents-API URL for a path inside the repository."""
        return f"{self.base_url}/api/v1/repos/{self.owner}/{self.repo}/contents/{path}"

    def pull_file(self, remote_path: str) -> Optional[Tuple[bytes, str]]:
        """Download a file from the repository.

        Args:
            remote_path: Path of the file inside the repository.

        Returns:
            ``(content_bytes, sha)``, or None when the file does not exist,
            the request fails, or the response cannot be decoded.
        """
        try:
            resp = requests.get(
                self._api_url(remote_path),
                headers=self._headers,
                timeout=self.timeout,
            )
        except requests.RequestException as e:
            logger.error(f"拉取文件网络错误: {e}")
            return None

        if resp.status_code == 404:
            logger.info(f"云端文件不存在: {remote_path}")
            return None
        if resp.status_code != 200:
            logger.warning(f"拉取文件失败: {resp.status_code} {resp.text[:200]}")
            return None

        try:
            data = resp.json()
            if not isinstance(data, dict):
                # e.g. a directory listing comes back as a JSON array
                raise ValueError("unexpected response shape")
            sha = data.get("sha", "")
            # "content" may be present but null; treat that as empty.
            content_b64 = data.get("content") or ""
            # The returned base64 may contain line breaks.
            content_bytes = base64.b64decode(content_b64.replace("\n", ""))
        except ValueError as e:
            # Covers invalid JSON as well as invalid base64 (binascii.Error).
            logger.warning(f"拉取文件失败: 响应内容无法解析 ({e})")
            return None

        logger.info(f"拉取文件成功: {remote_path} ({len(content_bytes)} bytes)")
        return content_bytes, sha

    def push_file(
        self,
        remote_path: str,
        content: bytes,
        message: str,
        sha: Optional[str] = None,
    ) -> Optional[str]:
        """Create or update a file in the repository.

        Args:
            remote_path: Path of the file inside the repository.
            content: File content.
            message: Commit message.
            sha: Current sha of the file; required for an update, omitted
                when creating a new file.

        Returns:
            The new sha, or None on failure.
        """
        payload = {
            "message": message,
            "content": base64.b64encode(content).decode("ascii"),
        }
        if sha:
            payload["sha"] = sha

        # The contents API creates files with POST and updates them with PUT
        # (which requires the current sha).
        send = requests.put if sha else requests.post

        try:
            resp = send(
                self._api_url(remote_path),
                headers={**self._headers, "Content-Type": "application/json"},
                json=payload,
                timeout=self.timeout,
            )
        except requests.RequestException as e:
            logger.error(f"推送文件网络错误: {e}")
            return None

        if resp.status_code not in (200, 201):
            logger.warning(f"推送文件失败: {resp.status_code} {resp.text[:200]}")
            return None

        try:
            body = resp.json()
        except ValueError:
            body = {}
        content_info = body.get("content") if isinstance(body, dict) else None
        new_sha = (content_info or {}).get("sha", "")
        logger.info(f"推送文件成功: {remote_path} (sha={new_sha[:12]})")
        return new_sha

    def file_exists(self, remote_path: str) -> Optional[str]:
        """Check whether a file exists.

        The sha is only available from the GET response body, so the file
        is fetched directly instead of probing with HEAD first.

        Returns:
            The file's sha when it exists, otherwise None.
        """
        result = self.pull_file(remote_path)
        return result[1] if result else None

    def pull_json(self, remote_path: str) -> Optional[Tuple[dict, str]]:
        """Download a file and parse it as JSON.

        Returns:
            ``(parsed_data, sha)``, or None when the download or the
            parsing fails.
        """
        result = self.pull_file(remote_path)
        if result is None:
            return None
        content_bytes, sha = result
        try:
            data = json.loads(content_bytes)
            return data, sha
        except json.JSONDecodeError as e:
            logger.error(f"解析 JSON 失败: {e}")
            return None

    def push_json(self, remote_path: str, data: dict, message: str, sha: Optional[str] = None) -> Optional[str]:
        """Serialise ``data`` as UTF-8 JSON and push it.

        Returns:
            The new sha, or None on failure.
        """
        content = json.dumps(data, ensure_ascii=False, indent=2).encode("utf-8")
        return self.push_file(remote_path, content, message, sha)

    def push_binary(self, remote_path: str, local_path: str, message: str) -> Optional[str]:
        """Read a local file in binary mode and push it.

        The remote sha is looked up first so that an existing file is
        updated rather than created.

        Returns:
            The new sha, or None on failure.
        """
        try:
            with open(local_path, "rb") as f:
                content = f.read()
        except OSError as e:
            logger.error(f"读取本地文件失败: {local_path} — {e}")
            return None

        existing_sha = self.file_exists(remote_path)
        return self.push_file(remote_path, content, message, sha=existing_sha)

    @classmethod
    def from_config(cls, config) -> Optional["GiteaSync"]:
        """Build an instance from the ``Gitea`` section of a config object.

        Args:
            config: Object exposing ``get(section, option, fallback=...)``.

        Returns:
            A GiteaSync instance, or None when base_url, owner, repo or
            token is missing or blank.
        """
        base_url = config.get("Gitea", "base_url", fallback="").strip()
        owner = config.get("Gitea", "owner", fallback="").strip()
        repo = config.get("Gitea", "repo", fallback="").strip()
        token = config.get("Gitea", "token", fallback="").strip()

        if not all([base_url, owner, repo, token]):
            logger.debug("Gitea 配置不完整,跳过云端同步")
            return None

        return cls(base_url=base_url, owner=owner, repo=repo, token=token)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,286 @@
|
||||
"""
|
||||
文件操作工具模块
|
||||
--------------
|
||||
提供文件处理、查找和管理功能。
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Union, Any
|
||||
|
||||
from .log_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
def ensure_dir(directory: str) -> bool:
    """Make sure a directory exists, creating it (and parents) if needed.

    Args:
        directory: Directory path. An empty string denotes the current
            directory (this is what ``os.path.dirname`` returns for a bare
            file name) and is reported as existing.

    Returns:
        True when the directory exists afterwards, False when creation
        failed (the error is logged).
    """
    if directory == "":
        # os.makedirs("") raises; the current directory needs no creation.
        return True
    try:
        os.makedirs(directory, exist_ok=True)
        return True
    except Exception as e:
        logger.error(f"创建目录失败: {directory}, 错误: {e}")
        return False
|
||||
|
||||
def get_file_extension(file_path: str) -> str:
    """Return the lower-cased extension of a path.

    Args:
        file_path: File path or name.

    Returns:
        The extension including its leading dot (e.g. ``.jpg``), or an
        empty string when the path has none.
    """
    _, extension = os.path.splitext(file_path)
    return extension.lower()
|
||||
|
||||
def is_valid_extension(file_path: str, allowed_extensions: List[str]) -> bool:
    """Check whether a file's extension is in the allowed list.

    The comparison ignores case on both sides, so ``photo.JPG`` matches
    ``['.jpg']`` and ``photo.jpg`` matches ``['.JPG']``.

    Args:
        file_path: File path or name.
        allowed_extensions: Allowed extensions including the dot
            (e.g. ``['.jpg', '.png']``).

    Returns:
        True when the extension is allowed.
    """
    ext = os.path.splitext(file_path)[1].lower()
    return ext in {allowed.lower() for allowed in allowed_extensions}
|
||||
|
||||
def get_files_by_extensions(directory: str, extensions: List[str], exclude_patterns: List[str] = None) -> List[str]:
    """List the files directly inside a directory that have a given extension.

    Args:
        directory: Directory to scan (not recursive).
        extensions: Accepted extensions, e.g. ``['.jpg', '.png']``.
        exclude_patterns: Substrings that disqualify a file name; defaults
            to ``['~$', '.tmp']``.

    Returns:
        Paths (``directory`` joined with the file name) of matching files.
    """
    skip_markers = ['~$', '.tmp'] if exclude_patterns is None else exclude_patterns

    matches = []
    for entry in os.listdir(directory):
        full_path = os.path.join(directory, entry)

        # Regular files only
        if not os.path.isfile(full_path):
            continue
        # Extension filter
        if not is_valid_extension(full_path, extensions):
            continue
        # Exclusion markers are tested against the bare file name
        if any(marker in entry for marker in skip_markers):
            continue

        matches.append(full_path)

    return matches
|
||||
|
||||
def get_latest_file(directory: str, pattern: str = "", extensions: List[str] = None) -> Optional[str]:
    """Return the most recently modified file in a directory.

    Args:
        directory: Directory to scan (not recursive).
        pattern: Substring the file name must contain; empty means any.
        extensions: Accepted extensions; None or empty means any.

    Returns:
        Path of the file with the greatest modification time, or None when
        the directory is missing or no file qualifies.
    """
    if not os.path.exists(directory):
        logger.warning(f"目录不存在: {directory}")
        return None

    dated = []
    for entry in os.listdir(directory):
        # Name filters first: substring, then extension
        if pattern and pattern not in entry:
            continue
        if extensions and not is_valid_extension(entry, extensions):
            continue

        full_path = os.path.join(directory, entry)
        if os.path.isfile(full_path):
            dated.append((full_path, os.path.getmtime(full_path)))

    if not dated:
        logger.warning(f"未在目录 {directory} 中找到符合条件的文件")
        return None

    # Newest by modification time
    newest_path, _ = max(dated, key=lambda item: item[1])
    return newest_path
|
||||
|
||||
def generate_timestamp_filename(original_path: str) -> str:
    """Build a sibling path whose file name is the current timestamp.

    Args:
        original_path: Existing path; its directory and extension are kept.

    Returns:
        ``<directory>/<YYYYmmddHHMMSS><extension>``.
    """
    folder, original_name = os.path.split(original_path)
    suffix = os.path.splitext(original_path)[1]
    stamp = datetime.now().strftime("%Y%m%d%H%M%S")
    return os.path.join(os.path.dirname(original_path), stamp + suffix)
|
||||
|
||||
def rename_file(source_path: str, target_path: str) -> bool:
    """Rename (move) a file, creating the target directory when needed.

    Args:
        source_path: Existing file.
        target_path: New path for the file.

    Returns:
        True on success, False when renaming fails (the error is logged).
    """
    try:
        # A bare file name has no directory part; there is nothing to
        # create in that case.
        target_dir = os.path.dirname(target_path)
        if target_dir:
            ensure_dir(target_dir)

        os.rename(source_path, target_path)
        logger.info(f"文件已重命名: {os.path.basename(source_path)} -> {os.path.basename(target_path)}")
        return True
    except Exception as e:
        logger.error(f"重命名文件失败: {e}")
        return False
|
||||
|
||||
def load_json(file_path: str, default: Any = None) -> Any:
    """Load a UTF-8 JSON file.

    Args:
        file_path: Path of the JSON file.
        default: Value returned when the file is missing or cannot be
            loaded.

    Returns:
        The parsed content, or ``default``. A load failure is logged; a
        missing file is not.
    """
    if os.path.exists(file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                parsed = json.load(handle)
            return parsed
        except Exception as e:
            logger.error(f"加载JSON文件失败: {file_path}, 错误: {e}")
    return default
|
||||
|
||||
def save_json(data: Any, file_path: str, ensure_ascii: bool = False, indent: int = 2) -> bool:
    """Write data to a UTF-8 JSON file, creating its directory when needed.

    Args:
        data: JSON-serialisable data.
        file_path: Destination path.
        ensure_ascii: Passed to ``json.dump``.
        indent: Passed to ``json.dump``.

    Returns:
        True on success, False when saving fails (the error is logged).
    """
    try:
        # A bare file name has no directory part; there is nothing to
        # create in that case.
        directory = os.path.dirname(file_path)
        if directory:
            ensure_dir(directory)

        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=ensure_ascii, indent=indent)
        logger.debug(f"JSON数据已保存到: {file_path}")
        return True
    except Exception as e:
        logger.error(f"保存JSON文件失败: {file_path}, 错误: {e}")
        return False
|
||||
|
||||
def smart_read_excel(file_path: Union[str, Path], **kwargs) -> Any:
    """Read an Excel file, choosing the engine from the extension.

    ``.xlsx`` defaults to the ``openpyxl`` engine and ``.xls`` to ``xlrd``;
    an ``engine`` passed by the caller is left untouched.

    Args:
        file_path: Path of the Excel file.
        **kwargs: Extra arguments forwarded to ``pd.read_excel``.

    Returns:
        Whatever ``pd.read_excel`` returns.

    Raises:
        Exception: Any read error is logged and re-raised.
    """
    import pandas as pd

    location = str(file_path)
    suffix = os.path.splitext(location)[1].lower()

    # Default engine per extension
    engine_by_suffix = {'.xlsx': 'openpyxl', '.xls': 'xlrd'}
    if suffix in engine_by_suffix:
        kwargs.setdefault('engine', engine_by_suffix[suffix])

    try:
        return pd.read_excel(location, **kwargs)
    except Exception as e:
        logger.error(f"读取 Excel 文件失败: {location}, 错误: {e}")
        raise
|
||||
|
||||
def get_file_size(file_path: str) -> int:
    """Return the size of a file in bytes.

    Args:
        file_path: Path of the file.

    Returns:
        The size in bytes, or 0 when it cannot be determined (the error is
        logged).
    """
    try:
        size = os.path.getsize(file_path)
    except Exception as e:
        logger.error(f"获取文件大小失败: {file_path}, 错误: {e}")
        return 0
    return size
|
||||
|
||||
def is_file_size_valid(file_path: str, max_size_mb: float) -> bool:
    """Check that a file does not exceed a size limit.

    Args:
        file_path: Path of the file.
        max_size_mb: Maximum allowed size in megabytes (1 MB = 1024 * 1024
            bytes).

    Returns:
        True when the size reported by ``get_file_size`` is within the
        limit.
    """
    actual_bytes = get_file_size(file_path)
    limit_bytes = max_size_mb * 1024 * 1024
    return not actual_bytes > limit_bytes
|
||||
|
||||
|
||||
def format_file_size(size_bytes: int) -> str:
    """Format a byte count as a readable size in KB or MB.

    Sizes below 1 MB (1024 * 1024 bytes) are shown in KB, everything else
    in MB, each with one decimal place.
    """
    one_mb = 1024 * 1024
    if size_bytes >= one_mb:
        return f"{size_bytes / one_mb:.1f} MB"
    return f"{size_bytes / 1024:.1f} KB"
|
||||
@@ -0,0 +1,180 @@
|
||||
"""
|
||||
日志工具模块
|
||||
----------
|
||||
提供统一的日志配置和管理功能。
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import logging
|
||||
from logging.handlers import RotatingFileHandler
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict
|
||||
|
||||
# 日志处理器字典,用于跟踪已创建的处理器
|
||||
_handlers: Dict[str, logging.Handler] = {}
|
||||
|
||||
def setup_logger(name: str,
                 log_file: Optional[str] = None,
                 level=logging.INFO,
                 console_output: bool = True,
                 file_output: bool = True,
                 log_format: str = '%(asctime)s - %(name)s - %(levelname)s - %(message)s') -> logging.Logger:
    """Configure and return a logger.

    A logger that already has handlers is returned unchanged. Otherwise a
    rotating file handler (5 MB per file, 3 backups) and/or a stdout
    handler are attached. If the file handler cannot be set up, a message
    is printed and the logger continues without it.

    Args:
        name: Logger name.
        log_file: Log file path; defaults to ``logs/<name>.log`` under the
            current working directory.
        level: Level applied to the logger and to the handlers created here.
        console_output: Whether to log to stdout.
        file_output: Whether to log to a file.
        log_format: Format string for log records.

    Returns:
        The configured logger.
    """
    logger = logging.getLogger(name)

    # Already configured: do not attach handlers twice.
    if logger.handlers:
        return logger

    logger.setLevel(level)
    formatter = logging.Formatter(log_format)

    if file_output:
        if log_file is None:
            log_file = os.path.join(os.path.abspath('logs'), f"{name}.log")

        try:
            # Create the directory for both the default and an explicitly
            # given log file; inside the try so that a failure here only
            # disables file logging instead of propagating to the caller.
            log_dir = os.path.dirname(log_file)
            if log_dir:
                os.makedirs(log_dir, exist_ok=True)

            file_handler = RotatingFileHandler(log_file, maxBytes=5 * 1024 * 1024, backupCount=3, encoding='utf-8')
            file_handler.setFormatter(formatter)
            file_handler.setLevel(level)
            logger.addHandler(file_handler)
            _handlers[f"{name}_file"] = file_handler

            # Marker file next to the log, recording when logging started.
            active_marker = os.path.join(os.path.dirname(log_file), f"{name}.active")
            with open(active_marker, 'w', encoding='utf-8') as f:
                f.write(f"Active since: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        except Exception as e:
            print(f"无法创建日志文件处理器: {e}")

    if console_output:
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setFormatter(formatter)
        console_handler.setLevel(level)
        logger.addHandler(console_handler)
        _handlers[f"{name}_console"] = console_handler

    return logger
|
||||
|
||||
def get_logger(name: str) -> logging.Logger:
    """Return the named logger, configuring it on first use.

    Args:
        name: Logger name.

    Returns:
        The existing logger when it already has handlers, otherwise the
        result of ``setup_logger(name)``.
    """
    existing = logging.getLogger(name)
    if existing.handlers:
        return existing
    return setup_logger(name)
|
||||
|
||||
def set_log_level(level: str) -> None:
    """Set the level of every registered logger and of its handlers.

    Handlers carry their own level, so changing only the logger would not
    let records below a handler's level through; the handlers attached to
    each named logger are therefore updated as well. The root logger's
    level is set too (its handlers are left as they are).

    Args:
        level: Level name (DEBUG, INFO, WARNING, ERROR, CRITICAL), case
            insensitive. Unknown names fall back to INFO.
    """
    level_map = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'critical': logging.CRITICAL
    }
    log_level = level_map.get(level.lower(), logging.INFO)

    # Snapshot the names first: getLogger may touch the registry.
    loggers = [logging.getLogger(name) for name in list(logging.root.manager.loggerDict)]

    for logger in loggers:
        logger.setLevel(log_level)
        for handler in logger.handlers:
            handler.setLevel(log_level)

    logging.getLogger().setLevel(log_level)

    print(f"所有日志记录器级别已设置为: {logging.getLevelName(log_level)}")
|
||||
|
||||
def close_logger(name: str) -> None:
    """Close and detach every handler of the named logger.

    Args:
        name: Logger name.
    """
    target = logging.getLogger(name)
    # Iterate over a copy: handlers are removed while looping.
    for attached in list(target.handlers):
        attached.close()
        target.removeHandler(attached)

    # Drop the entries recorded for this logger in the handler registry.
    for kind in ("file", "console"):
        _handlers.pop(f"{name}_{kind}", None)
|
||||
|
||||
def close_all_loggers() -> None:
    """Close and detach the handlers of every registered named logger."""
    registered = [logging.getLogger(name) for name in logging.root.manager.loggerDict]

    for current in registered:
        if not hasattr(current, 'handlers'):
            continue
        # Iterate over a copy: handlers are removed while looping.
        for attached in list(current.handlers):
            attached.close()
            current.removeHandler(attached)

    # Empty the handler registry
    _handlers.clear()

    print("所有日志记录器已关闭")
|
||||
|
||||
def cleanup_active_marker(name: str) -> None:
    """
    Delete the ``<name>.active`` marker file from the ``logs`` directory.

    The ``logs`` directory is resolved against the current working
    directory. A missing marker is ignored; any error is printed and not
    propagated.

    Args:
        name: Name of the logger.
    """
    try:
        marker_path = os.path.join(os.path.abspath('logs'), f"{name}.active")
        if os.path.exists(marker_path):
            os.remove(marker_path)
    except Exception as e:
        # Best-effort cleanup: report the failure and carry on.
        print(f"无法清理日志活跃标记: {e}")
|
||||
@@ -0,0 +1,279 @@
|
||||
"""
|
||||
字符串处理工具模块
|
||||
---------------
|
||||
提供字符串处理、正则表达式匹配等功能。
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Dict, List, Optional, Tuple, Any
|
||||
|
||||
def clean_string(text: str) -> str:
    """
    Trim a string and collapse each run of whitespace into a single space.

    Args:
        text: Source string.

    Returns:
        The cleaned string, or an empty string when ``text`` is not a str.
    """
    if isinstance(text, str):
        # Trim the ends first, then squeeze interior whitespace runs.
        return re.sub(r'\s+', ' ', text.strip())
    return ""
|
||||
|
||||
def remove_non_digits(text: str) -> str:
    """
    Keep only the digit characters of a string.

    Args:
        text: Source string.

    Returns:
        A string made of the digits of ``text`` in their original order, or
        an empty string when ``text`` is not a str.
    """
    if isinstance(text, str):
        # Collecting every \d match is the same as deleting every \D match.
        return "".join(re.findall(r'\d', text))
    return ""
|
||||
|
||||
def extract_number(text: str) -> Optional[float]:
    """
    Extract the first number found in a string.

    Args:
        text: Source string.

    Returns:
        The first integer or decimal (optionally negative) as a float, or
        None when ``text`` is not a str or contains no number.
    """
    if isinstance(text, str):
        # Optional minus sign, integer part, optional fractional part.
        found = re.search(r'-?\d+(\.\d+)?', text)
        if found is not None:
            return float(found.group(0))
    return None
|
||||
|
||||
def extract_unit(text: str, units: Optional[List[str]] = None) -> Optional[str]:
    """
    Extract a unit from a string.

    Args:
        text: Source string.
        units: Candidate units. When given (and non-empty), the first
            candidate that occurs anywhere in ``text`` is returned. When
            omitted, the unit is auto-detected as the non-digit,
            non-whitespace run that follows the first number.

    Returns:
        The unit, or None when ``text`` is not a str or no unit is found.
    """
    if not isinstance(text, str):
        return None

    # Explicit candidate list: first candidate contained in the text wins.
    if units:
        return next((unit for unit in units if unit in text), None)

    # Auto-detect. The number may carry a fractional part, which must be
    # consumed as part of the number; the lookahead stops a backtracked
    # match from treating the decimal point itself as the unit
    # ("12.5kg" -> "kg", "12.5" -> None).
    match = re.search(r'\d+(?:\.\d+)?\s*(?!\.\d)([^\d\s]+)', text)
    return match.group(1) if match else None
|
||||
|
||||
def extract_number_and_unit(text: str) -> Tuple[Optional[float], Optional[str]]:
    """
    Extract the first number in a string together with the unit after it.

    Args:
        text: Source string.

    Returns:
        A ``(number, unit)`` tuple. ``unit`` is None when nothing follows
        the number; both are None when ``text`` is not a str or holds no
        number.
    """
    if not isinstance(text, str):
        return None, None

    # Group 1: the number; group 2 (optional): the unit right after it.
    found = re.search(r'(-?\d+(?:\.\d+)?)\s*([^\d\s]+)?', text)
    if found is None:
        return None, None

    raw_number, raw_unit = found.groups()
    return float(raw_number), (raw_unit or None)
|
||||
|
||||
def parse_specification(spec_str: str) -> Optional[int]:
    """
    Parse a specification string and return the pack quantity.

    Supported shapes, tried in this order:
      * weight/volume times count, e.g. "450g*15", "450ml*15" -> the count
      * three levels, e.g. "1*5*10" -> the last number
      * two levels, e.g. "1*15", "1x15" -> the second number
      * count per case, e.g. "24瓶/件" -> the count
      * litre volume, e.g. "4L" -> the trailing multiplier if present, else 1

    Args:
        spec_str: Specification string.

    Returns:
        The pack quantity, or None when the input is empty, not a str, or
        matches none of the shapes.
    """
    if not (spec_str and isinstance(spec_str, str)):
        return None

    # (pattern, index of the capture group holding the quantity), in priority order.
    ordered_rules = (
        (r'\d+(?:g|ml|毫升|克)[*xX×](\d+)', 1),
        (r'(\d+)[\*xX×](\d+)[\*xX×](\d+)', 3),
        (r'(\d+)[\*xX×](\d+)', 2),
        (r'(\d+)[瓶个支袋][//](件|箱)', 1),
    )

    try:
        normalized = clean_string(spec_str)

        for pattern, group_index in ordered_rules:
            hit = re.search(pattern, normalized)
            if hit:
                return int(hit.group(group_index))

        # Litre form: a bare volume counts as one unit unless a multiplier follows.
        volume_hit = re.search(r'(\d+(?:\.\d+)?)\s*[Ll升][*×]?(\d+)?', normalized)
        if volume_hit:
            multiplier = volume_hit.group(2)
            return int(multiplier) if multiplier else 1
    except Exception:
        # Best-effort parsing: any failure is reported as "not parseable".
        return None

    return None
|
||||
|
||||
def clean_barcode(barcode: Any) -> str:
    """
    Normalise a barcode into a digits-only string.

    Args:
        barcode: Barcode value (a string, an int or a float).

    Returns:
        The cleaned barcode string.
    """
    if isinstance(barcode, (int, float)):
        # Numeric input is rendered with no fractional digits.
        text = f"{barcode:.0f}"
    else:
        text = str(barcode)

    # Drop a trailing ".0…" fraction, then every remaining non-digit.
    without_fraction = re.sub(r'\.0+$', '', text)
    return re.sub(r'\D', '', without_fraction)
|
||||
|
||||
def is_scientific_notation(value: str) -> bool:
    """
    Tell whether a value's string form is written in scientific notation.

    Args:
        value: Value to inspect; it is converted with ``str`` first.

    Returns:
        True for text such as "6.9E+12" or "1e5", False otherwise.
    """
    candidate = str(value)
    # Mantissa (optional sign and fraction), then e/E, then a signed exponent.
    return re.match(r'^-?\d+(\.\d+)?[eE][+-]?\d+$', candidate) is not None
|
||||
|
||||
def parse_monetary_string(value: Any) -> Optional[float]:
    """
    Parse an amount or quantity value into a float.

    Handles currency symbols (¥/$), a comma used as the decimal point, commas
    used as thousands separators, a trailing "元" suffix, and similar noise.

    Args:
        value: Amount value (a string, a number, or anything else).

    Returns:
        The parsed float, or None when the value cannot be parsed.
    """
    if isinstance(value, (int, float)):
        return float(value)
    if not isinstance(value, str):
        # Covers None as well as every other unsupported type.
        return None

    stripped = value.strip()
    placeholders = ('o', 'none', 'null', '-', '--')
    if stripped == "" or stripped.lower() in placeholders:
        return None

    # Keep only digits, the decimal point, commas and the minus sign.
    digits = re.sub(r'[^\d\.\-,]', '', stripped)
    if digits in ('', '-', '.', '-.', ','):
        return None

    # Comma policy:
    #   one comma and no decimal point -> the comma is the decimal point ("1,5" = 1.5)
    #   any other comma                -> thousands separator, removed
    #                                     ("1,234,567" = 1234567, "1,234.56" = 1234.56)
    commas = digits.count(',')
    if commas == 1 and '.' not in digits:
        digits = digits.replace(',', '.')
    elif commas >= 1:
        digits = digits.replace(',', '')

    try:
        return float(digits)
    except (ValueError, TypeError):
        return None
|
||||
|
||||
|
||||
def format_barcode(barcode: Any) -> str:
    """
    Format a barcode as a digit string, expanding scientific notation.

    Args:
        barcode: Barcode value.

    Returns:
        The formatted barcode string; an empty string when ``barcode`` is None.
    """
    if barcode is None:
        return ""

    digits = str(barcode).strip()

    # Expand scientific notation (e.g. "6.9E+12") into plain digits.
    if is_scientific_notation(digits):
        try:
            digits = f"{float(digits):.0f}"
        except (ValueError, TypeError):
            pass

    # Drop a trailing ".0…" fraction ("123456.0" -> "123456").
    digits = re.sub(r'\.0+$', '', digits)

    # Anything that is still not purely digits is reduced to its digits.
    if not digits.isdigit():
        digits = re.sub(r'\D', '', digits)

    # Strip surplus trailing zeros until the code is at most 13 characters
    # long or no longer ends in 0 (standard barcodes are usually 12-13 digits).
    while len(digits) > 13 and digits.endswith('0'):
        digits = digits[:-1]

    return digits
|
||||
Reference in New Issue
Block a user