新版本
This commit is contained in:
@@ -285,6 +285,16 @@ class UnitConverter:
|
||||
|
||||
logger.debug(f"解析规格: {spec}")
|
||||
|
||||
# 新增:处理“1件=12桶/袋/盒...”等等式规格,统一为1*12
|
||||
eq_match = re.match(r'(\d+(?:\.\d+)?)\s*(?:件|箱|提|盒)\s*[==]\s*(\d+)\s*(?:瓶|桶|盒|支|个|袋|罐|包|卷)', spec)
|
||||
if eq_match:
|
||||
try:
|
||||
level2 = int(eq_match.group(2))
|
||||
logger.info(f"解析等式规格: {spec} -> 1*{level2}")
|
||||
return 1, level2, None
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# 处理三级包装,如1*5*12
|
||||
three_level_match = re.match(r'(\d+)[*](\d+)[*](\d+)', spec)
|
||||
if three_level_match:
|
||||
@@ -522,4 +532,4 @@ class UnitConverter:
|
||||
更新是否成功
|
||||
"""
|
||||
self.special_barcodes = new_mappings
|
||||
return self.save_barcode_mappings(new_mappings)
|
||||
return self.save_barcode_mappings(new_mappings)
|
||||
|
||||
@@ -11,7 +11,7 @@ import numpy as np
|
||||
import xlrd
|
||||
import xlwt
|
||||
from xlutils.copy import copy as xlcopy
|
||||
from typing import Dict, List, Optional, Tuple, Union, Any
|
||||
from typing import Dict, List, Optional, Tuple, Union, Any, Callable
|
||||
from datetime import datetime
|
||||
|
||||
from ...config.settings import ConfigManager
|
||||
@@ -414,7 +414,7 @@ class PurchaseOrderMerger:
|
||||
logger.error(f"创建合并采购单时出错: {e}")
|
||||
return None
|
||||
|
||||
def process(self, file_paths: Optional[List[str]] = None) -> Optional[str]:
|
||||
def process(self, file_paths: Optional[List[str]] = None, progress_cb: Optional[Callable[[int], None]] = None) -> Optional[str]:
|
||||
"""
|
||||
处理采购单合并
|
||||
|
||||
@@ -427,6 +427,11 @@ class PurchaseOrderMerger:
|
||||
# 如果未指定文件路径,则获取所有采购单文件
|
||||
if file_paths is None:
|
||||
file_paths = self.get_purchase_orders()
|
||||
try:
|
||||
if progress_cb:
|
||||
progress_cb(97)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# 检查是否有文件需要合并
|
||||
if not file_paths:
|
||||
@@ -438,16 +443,26 @@ class PurchaseOrderMerger:
|
||||
if merged_df is None:
|
||||
logger.error("合并采购单失败")
|
||||
return None
|
||||
try:
|
||||
if progress_cb:
|
||||
progress_cb(98)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# 创建合并的采购单文件
|
||||
output_file = self.create_merged_purchase_order(merged_df)
|
||||
if output_file is None:
|
||||
logger.error("创建合并采购单文件失败")
|
||||
return None
|
||||
try:
|
||||
if progress_cb:
|
||||
progress_cb(100)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# 记录已合并文件
|
||||
for file_path in file_paths:
|
||||
self.merged_files[file_path] = output_file
|
||||
self._save_merged_files()
|
||||
|
||||
return output_file
|
||||
return output_file
|
||||
|
||||
@@ -11,7 +11,7 @@ import numpy as np
|
||||
import xlrd
|
||||
import xlwt
|
||||
from xlutils.copy import copy as xlcopy
|
||||
from typing import Dict, List, Optional, Tuple, Union, Any
|
||||
from typing import Dict, List, Optional, Tuple, Union, Any, Callable
|
||||
from datetime import datetime
|
||||
|
||||
from ...config.settings import ConfigManager
|
||||
@@ -281,6 +281,36 @@ class ExcelProcessor:
|
||||
product['amount'] = row['小计']
|
||||
elif column_mapping.get('amount') and not pd.isna(row[column_mapping['amount']]):
|
||||
product['amount'] = row[column_mapping['amount']]
|
||||
# 根据金额判断赠品:金额为0、为空、或为o/O
|
||||
amt = product.get('amount', None)
|
||||
try:
|
||||
is_amt_gift = False
|
||||
if amt is None:
|
||||
is_amt_gift = True
|
||||
elif isinstance(amt, str):
|
||||
s = amt.strip()
|
||||
if s == '' or s.lower() == 'o' or s == '0' or s == '○':
|
||||
is_amt_gift = True
|
||||
else:
|
||||
amt_clean = re.sub(r'[^\d\.,]', '', s)
|
||||
if ',' in amt_clean and '.' not in amt_clean:
|
||||
amt_clean = amt_clean.replace(',', '.')
|
||||
elif ',' in amt_clean and '.' in amt_clean:
|
||||
amt_clean = amt_clean.replace(',', '')
|
||||
if amt_clean:
|
||||
try:
|
||||
is_amt_gift = float(amt_clean) == 0.0
|
||||
except ValueError:
|
||||
pass
|
||||
else:
|
||||
try:
|
||||
is_amt_gift = float(amt) == 0.0
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
if is_amt_gift:
|
||||
product['is_gift'] = True
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# 提取数量
|
||||
if '数量' in df.columns and not pd.isna(row['数量']):
|
||||
@@ -472,7 +502,7 @@ class ExcelProcessor:
|
||||
logger.warning(f"通过金额和单价计算数量失败: {e}")
|
||||
|
||||
# 判断是否为赠品(价格为0)
|
||||
is_gift = price == 0
|
||||
is_gift = bool(product.get('is_gift', False)) or (price == 0)
|
||||
|
||||
logger.info(f"处理商品: 条码={barcode}, 数量={quantity}, 单价={price}, 是否赠品={is_gift}")
|
||||
|
||||
@@ -631,7 +661,7 @@ class ExcelProcessor:
|
||||
logger.warning("无法识别表头行")
|
||||
return None
|
||||
|
||||
def process_specific_file(self, file_path: str) -> Optional[str]:
|
||||
def process_specific_file(self, file_path: str, progress_cb: Optional[Callable[[int], None]] = None) -> Optional[str]:
|
||||
"""
|
||||
处理指定的Excel文件
|
||||
|
||||
@@ -649,6 +679,11 @@ class ExcelProcessor:
|
||||
|
||||
try:
|
||||
# 读取Excel文件时不立即指定表头
|
||||
if progress_cb:
|
||||
try:
|
||||
progress_cb(92)
|
||||
except Exception:
|
||||
pass
|
||||
df = pd.read_excel(file_path, header=None)
|
||||
logger.info(f"成功读取Excel文件: {file_path}, 共 {len(df)} 行")
|
||||
|
||||
@@ -661,10 +696,20 @@ class ExcelProcessor:
|
||||
logger.info(f"识别到表头在第 {header_row+1} 行")
|
||||
|
||||
# 重新读取Excel,正确指定表头行
|
||||
if progress_cb:
|
||||
try:
|
||||
progress_cb(94)
|
||||
except Exception:
|
||||
pass
|
||||
df = pd.read_excel(file_path, header=header_row)
|
||||
logger.info(f"使用表头行重新读取数据,共 {len(df)} 行有效数据")
|
||||
|
||||
# 提取商品信息
|
||||
if progress_cb:
|
||||
try:
|
||||
progress_cb(96)
|
||||
except Exception:
|
||||
pass
|
||||
products = self.extract_product_info(df)
|
||||
|
||||
if not products:
|
||||
@@ -685,6 +730,11 @@ class ExcelProcessor:
|
||||
|
||||
# 不再自动打开输出目录
|
||||
logger.info(f"采购单已保存到: {output_file}")
|
||||
if progress_cb:
|
||||
try:
|
||||
progress_cb(100)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return output_file
|
||||
|
||||
@@ -694,7 +744,7 @@ class ExcelProcessor:
|
||||
logger.error(f"处理Excel文件时出错: {file_path}, 错误: {e}")
|
||||
return None
|
||||
|
||||
def process_latest_file(self) -> Optional[str]:
|
||||
def process_latest_file(self, progress_cb: Optional[Callable[[int], None]] = None) -> Optional[str]:
|
||||
"""
|
||||
处理最新的Excel文件
|
||||
|
||||
@@ -708,7 +758,7 @@ class ExcelProcessor:
|
||||
return None
|
||||
|
||||
# 处理文件
|
||||
return self.process_specific_file(latest_file)
|
||||
return self.process_specific_file(latest_file, progress_cb=progress_cb)
|
||||
|
||||
def _detect_column_mapping(self, df: pd.DataFrame) -> Dict[str, str]:
|
||||
"""
|
||||
@@ -889,6 +939,11 @@ class ExcelProcessor:
|
||||
|
||||
logger.debug(f"清理后的规格字符串: {spec_str}")
|
||||
|
||||
# 新增:匹配“1件=12桶/袋/盒…”等等式规格,取右侧数量作为包装数量
|
||||
eq_match = re.search(r'(\d+(?:\.\d+)?)\s*(?:件|箱|提|盒)\s*[==]\s*(\d+)\s*(?:瓶|桶|盒|支|个|袋|罐|包|卷)', spec_str)
|
||||
if eq_match:
|
||||
return int(eq_match.group(2))
|
||||
|
||||
# 匹配带单位的格式,如"5kg*6"、"450g*15"、"450ml*15"
|
||||
weight_pattern = r'(\d+(?:\.\d+)?)\s*(?:kg|KG|千克|公斤)[*×](\d+)'
|
||||
match = re.search(weight_pattern, spec_str)
|
||||
@@ -946,4 +1001,4 @@ class ExcelProcessor:
|
||||
except Exception as e:
|
||||
logger.warning(f"解析规格'{spec_str}'时出错: {e}")
|
||||
|
||||
return None
|
||||
return None
|
||||
|
||||
@@ -1,355 +0,0 @@
|
||||
"""
|
||||
单位转换器测试模块
|
||||
---------------
|
||||
测试单位转换和条码映射逻辑
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import unittest
|
||||
from typing import Dict, Any
|
||||
|
||||
# 添加项目根目录到Python路径
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..')))
|
||||
|
||||
from app.core.excel.converter import UnitConverter
|
||||
from app.core.excel.validators import ProductValidator
|
||||
|
||||
|
||||
class TestUnitConverter(unittest.TestCase):
    """Tests for ``UnitConverter.process_unit_conversion``."""

    def setUp(self):
        """Create a fresh converter before every test."""
        self.converter = UnitConverter()

    def _convert(self, barcode, name, specification, quantity, unit, price):
        """Build a product record and return the converter's result for it."""
        record = {
            'barcode': barcode,
            'name': name,
            'specification': specification,
            'quantity': quantity,
            'unit': unit,
            'price': price,
        }
        return self.converter.process_unit_conversion(record)

    def _assert_converted(self, result, quantity, price, unit):
        """Check the quantity, price and unit of a converted record, in that order."""
        self.assertEqual(result['quantity'], quantity)
        self.assertEqual(result['price'], price)
        self.assertEqual(result['unit'], unit)

    def test_jian_unit_conversion(self):
        """Conversion of the "件" unit."""
        converted = self._convert(
            barcode='6954767400129',
            name='美汁源果粒橙1.8L*8瓶',
            specification='1.8L*8',
            quantity=1.0,
            unit='件',
            price=65.0,
        )
        self._assert_converted(converted, 8.0, 8.125, '瓶')

    def test_box_unit_conversion(self):
        """Conversion of the "箱" unit."""
        converted = self._convert(
            barcode='6925303721244',
            name='统一鲜橙多2L*6瓶',
            specification='2L*6',
            quantity=1.0,
            unit='箱',
            price=43.0,
        )
        self._assert_converted(converted, 6.0, 7.1666666666666667, '瓶')

    def test_tihe_unit_conversion_level3(self):
        """Conversion of the "提" unit with a three-level specification."""
        # Three-level spec 1*6*4: 1 row of 6 bundles, 4 bottles per bundle.
        converted = self._convert(
            barcode='6921168509347',
            name='农夫山泉550ml*24瓶',
            specification='1*6*4',
            quantity=2.0,
            unit='提',
            price=16.0,
        )
        # With a three-level spec the "提" unit is multiplied by the last level:
        # 2 bundles * 4 bottles each, and 16 per bundle / 4 bottles.
        self._assert_converted(converted, 8.0, 4.0, '瓶')

    def test_tihe_unit_conversion_level2(self):
        """Conversion of the "提" unit with a two-level specification."""
        # Two-level spec 1*4.
        converted = self._convert(
            barcode='6921168509347',
            name='农夫山泉550ml*4瓶',
            specification='1*4',
            quantity=5.0,
            unit='提',
            price=10.0,
        )
        # With a two-level spec the "提" unit is expected to stay unchanged.
        self._assert_converted(converted, 5.0, 10.0, '提')

    def test_barcode_mapping(self):
        """Barcode mapping combined with "件" unit conversion."""
        # This barcode is expected to be mapped to 6920584471017.
        converted = self._convert(
            barcode='6920584471055',
            name='测试映射条码商品',
            specification='1*12',
            quantity=1.0,
            unit='件',
            price=60.0,
        )
        self.assertEqual(converted['barcode'], '6920584471017')
        # The "件" conversion is applied as well: 60 per case / 12 bottles.
        self._assert_converted(converted, 12.0, 5.0, '瓶')

    def test_special_barcode_multiplier(self):
        """Multiplier handling for a special barcode."""
        # Special barcode: quantity * 10 and the unit becomes "瓶".
        converted = self._convert(
            barcode='6925019900087',
            name='特殊条码商品',
            specification='1*10',
            quantity=2.0,
            unit='箱',
            price=100.0,
        )
        # 2 boxes * multiplier 10, and 100 per box / 10.
        self._assert_converted(converted, 20.0, 5.0, '瓶')
|
||||
|
||||
|
||||
class TestProductValidator(unittest.TestCase):
    """Tests for ``ProductValidator`` field-level and product-level validation."""

    def setUp(self):
        """Create a fresh validator before every test."""
        self.validator = ProductValidator()

    def _check_validity(self, expected_valid, actual_valid):
        """Assert the validity flag with the matching truthiness assertion."""
        if expected_valid:
            self.assertTrue(actual_valid)
        else:
            self.assertFalse(actual_valid)

    def _barcode_error(self, raw, expected_valid, expected_barcode):
        """Validate ``raw``, check flag and cleaned barcode, return the error."""
        valid, cleaned, problem = self.validator.validate_barcode(raw)
        self._check_validity(expected_valid, valid)
        self.assertEqual(cleaned, expected_barcode)
        return problem

    def _quantity_error(self, raw, expected_valid, expected_quantity):
        """Validate ``raw``, check flag and parsed quantity, return the error."""
        valid, parsed, problem = self.validator.validate_quantity(raw)
        self._check_validity(expected_valid, valid)
        self.assertEqual(parsed, expected_quantity)
        return problem

    def _price_error(self, raw, expected_valid, expected_price, expected_gift):
        """Validate ``raw``, check flag, price and gift marker, return the error."""
        valid, parsed, gift, problem = self.validator.validate_price(raw)
        self._check_validity(expected_valid, valid)
        self.assertEqual(parsed, expected_price)
        if expected_gift:
            self.assertTrue(gift)
        else:
            self.assertFalse(gift)
        return problem

    def test_validate_barcode(self):
        """Barcode validation."""
        # A valid barcode.
        self.assertIsNone(
            self._barcode_error('6925303721244', True, '6925303721244'))

        # A barcode containing a non-digit character.
        self.assertIsNone(
            self._barcode_error('6925303-721244', True, '6925303721244'))

        # Correction of a barcode starting with 5.
        self.assertIsNone(
            self._barcode_error('5925303721244', True, '6925303721244'))

        # A barcode that is too short.
        self.assertIn("条码长度异常", self._barcode_error('12345', False, '12345'))

        # A warehouse marker instead of a barcode.
        self.assertEqual(
            self._barcode_error('仓库', False, '仓库'), "条码为仓库标识")

        # An empty value.
        self.assertEqual(self._barcode_error(None, False, ""), "条码为空")

    def test_validate_quantity(self):
        """Quantity validation."""
        # A valid quantity.
        self.assertIsNone(self._quantity_error(10, True, 10.0))

        # A quantity given as a string.
        self.assertIsNone(self._quantity_error("25.5", True, 25.5))

        # A quantity with a unit suffix.
        self.assertIsNone(self._quantity_error("30瓶", True, 30.0))

        # Zero quantity.
        self.assertIn("数量必须大于0", self._quantity_error(0, False, 0.0))

        # Negative quantity.
        self.assertIn("数量必须大于0", self._quantity_error(-5, False, 0.0))

        # Non-numeric input.
        self.assertIn("数量不包含数字", self._quantity_error("abc", False, 0.0))

        # An empty value.
        self.assertEqual(self._quantity_error(None, False, 0.0), "数量为空")

    def test_validate_price(self):
        """Unit price validation."""
        # A valid price.
        self.assertIsNone(self._price_error(12.5, True, 12.5, False))

        # A price given as a string.
        self.assertIsNone(self._price_error("8.0", True, 8.0, False))

        # Zero price (gift).
        self.assertIsNone(self._price_error(0, True, 0.0, True))

        # The "赠品" (gift) marker.
        self.assertIsNone(self._price_error("赠品", True, 0.0, True))

        # Negative price.
        self.assertIn("单价不能为负数", self._price_error(-5, False, 0.0, True))

        # An empty value.
        self.assertEqual(
            self._price_error(None, False, 0.0, True), "单价为空,视为赠品")

    def test_validate_product(self):
        """Whole-product validation."""
        # A valid product.
        base_product = {
            'barcode': '6954767400129',
            'name': '测试商品',
            'specification': '1*12',
            'quantity': 3.0,
            'price': 36.0,
            'unit': '件',
            'is_gift': False,
        }

        checked = self.validator.validate_product(base_product)
        self.assertEqual(checked['barcode'], '6954767400129')
        self.assertEqual(checked['quantity'], 3.0)
        self.assertEqual(checked['price'], 36.0)
        self.assertFalse(checked['is_gift'])

        # The same product with a zero price is a gift.
        zero_priced = base_product.copy()
        zero_priced['price'] = 0
        checked = self.validator.validate_product(zero_priced)
        self.assertEqual(checked['price'], 0.0)
        self.assertTrue(checked['is_gift'])

        # A product whose fields need repair.
        needs_repair = {
            'barcode': '5954767-400129',  # prefix fix and non-digit removal needed
            'name': '测试商品',
            'specification': '1*12',
            'quantity': '2件',  # the number has to be extracted
            'price': '赠品',  # gift marker
            'unit': '件',
            'is_gift': False,
        }

        checked = self.validator.validate_product(needs_repair)
        self.assertEqual(checked['barcode'], '6954767400129')  # 5 -> 6, '-' removed
        self.assertEqual(checked['quantity'], 2.0)  # number extracted
        self.assertEqual(checked['price'], 0.0)  # gift price is 0
        self.assertTrue(checked['is_gift'])  # flagged as gift
|
||||
|
||||
|
||||
# Run the test suite when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
|
||||
@@ -225,6 +225,36 @@ class ProductValidator:
|
||||
validated_product['is_gift'] = True
|
||||
if error_msg:
|
||||
logger.info(error_msg)
|
||||
|
||||
amount = product.get('amount', None)
|
||||
try:
|
||||
is_amount_gift = False
|
||||
if amount is None:
|
||||
is_amount_gift = True
|
||||
elif isinstance(amount, str):
|
||||
s = amount.strip()
|
||||
if s == '' or s.lower() == 'o' or s == '0':
|
||||
is_amount_gift = True
|
||||
else:
|
||||
amt_clean = re.sub(r'[^\d\.,]', '', s)
|
||||
if ',' in amt_clean and '.' not in amt_clean:
|
||||
amt_clean = amt_clean.replace(',', '.')
|
||||
elif ',' in amt_clean and '.' in amt_clean:
|
||||
amt_clean = amt_clean.replace(',', '')
|
||||
if amt_clean:
|
||||
try:
|
||||
is_amount_gift = float(amt_clean) == 0.0
|
||||
except ValueError:
|
||||
pass
|
||||
else:
|
||||
try:
|
||||
is_amount_gift = float(amount) == 0.0
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
if is_amount_gift:
|
||||
validated_product['is_gift'] = True
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# 验证数量
|
||||
quantity = product.get('quantity', None)
|
||||
@@ -268,4 +298,4 @@ class ProductValidator:
|
||||
logger.warning(f"数量验证失败: {error_msg}")
|
||||
validated_product['quantity'] = 0.0
|
||||
|
||||
return validated_product
|
||||
return validated_product
|
||||
|
||||
@@ -0,0 +1,9 @@
|
||||
"""
|
||||
数据处理handlers模块初始化文件
|
||||
"""
|
||||
|
||||
from .data_cleaner import DataCleaner
|
||||
from .column_mapper import ColumnMapper
|
||||
from .calculator import DataCalculator
|
||||
|
||||
__all__ = ['DataCleaner', 'ColumnMapper', 'DataCalculator']
|
||||
@@ -0,0 +1,378 @@
|
||||
"""
|
||||
数据计算处理器
|
||||
|
||||
提供各种数据计算功能,如数量计算、价格计算、汇总统计等
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from typing import Dict, Any, Optional, List, Union
|
||||
from ...core.utils.log_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class DataCalculator:
    """Rule-driven calculator for pandas DataFrames.

    Rules are queued with :meth:`add_rule` or with the chainable convenience
    methods (:meth:`multiply`, :meth:`divide`, ...) and are applied in
    insertion order by :meth:`calculate`. ``calculate`` works on a copy of its
    input; the private ``_xxx`` handlers modify the frame they are given.
    """

    # Maps a rule's ``type`` value to the name of the handler method.
    _RULE_HANDLERS = {
        'multiply': '_multiply',
        'divide': '_divide',
        'add': '_add',
        'subtract': '_subtract',
        'formula': '_formula',
        'round': '_round',
        'sum': '_sum',
        'aggregate': '_aggregate',
    }

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialise the calculator.

        Args:
            config: Calculation configuration; stored as ``self.config``
                (an empty dict when omitted).
        """
        self.config = config or {}
        self.calculation_rules = []

    def add_rule(self, rule_type: str, **kwargs):
        """Append a calculation rule.

        Args:
            rule_type: Rule type, e.g. ``'multiply'`` or ``'sum'``.
            **kwargs: Rule parameters, stored next to ``type`` in the rule dict.
        """
        rule = {'type': rule_type, **kwargs}
        self.calculation_rules.append(rule)
        logger.debug(f"添加计算规则: {rule_type}")

    def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply every queued rule, in order, to a copy of ``df``.

        A rule that raises is logged and skipped so the remaining rules
        still run.

        Args:
            df: Input data; it is not modified.

        Returns:
            The calculated data.
        """
        logger.info(f"开始数据计算,原始数据形状: {df.shape}")

        result_df = df.copy()
        total = len(self.calculation_rules)

        for position, rule in enumerate(self.calculation_rules, start=1):
            try:
                logger.debug(f"执行计算规则 {position}/{total}: {rule['type']}")
                result_df = self._apply_rule(result_df, rule)
                logger.debug(f"规则执行完成,数据形状: {result_df.shape}")
            except Exception as e:
                # Deliberate best effort: keep going with the next rule
                # instead of aborting the whole pipeline.
                logger.error(f"计算规则执行失败: {rule}, 错误: {e}")
                continue

        logger.info(f"数据计算完成,最终数据形状: {result_df.shape}")
        return result_df

    def _apply_rule(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Dispatch one rule to its handler.

        Args:
            df: Data to process.
            rule: Rule configuration; ``rule['type']`` selects the handler.

        Returns:
            The processed data, or ``df`` unchanged for an unknown rule type.
        """
        rule_type = rule.get('type')
        # Guard against unhashable values so an odd ``type`` is reported as
        # unknown rather than raising from the dict lookup.
        handler_name = (
            self._RULE_HANDLERS.get(rule_type) if isinstance(rule_type, str) else None
        )
        if handler_name is None:
            logger.warning(f"未知的计算规则类型: {rule_type}")
            return df
        return getattr(self, handler_name)(df, rule)

    def _multiply(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Write ``source_column * factor`` into ``target_column``.

        Args:
            df: Data to process (modified in place).
            rule: Uses ``source_column``, ``target_column`` and ``factor``
                (default 1).

        Returns:
            The processed data.
        """
        source_column = rule.get('source_column')
        target_column = rule.get('target_column')
        factor = rule.get('factor', 1)

        if source_column and target_column:
            if source_column in df.columns:
                df[target_column] = df[source_column] * factor
                logger.debug(f"乘法计算: {source_column} * {factor} -> {target_column}")
            else:
                logger.warning(f"源列不存在: {source_column}")

        return df

    def _divide(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Write ``source_column / divisor`` into ``target_column``.

        A zero divisor is logged as an error and the data is left unchanged.

        Args:
            df: Data to process (modified in place).
            rule: Uses ``source_column``, ``target_column`` and ``divisor``
                (default 1).

        Returns:
            The processed data.
        """
        source_column = rule.get('source_column')
        target_column = rule.get('target_column')
        divisor = rule.get('divisor', 1)

        if source_column and target_column and divisor != 0:
            if source_column in df.columns:
                df[target_column] = df[source_column] / divisor
                logger.debug(f"除法计算: {source_column} / {divisor} -> {target_column}")
            else:
                logger.warning(f"源列不存在: {source_column}")
        elif divisor == 0:
            logger.error("除数不能为0")

        return df

    def _add(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Row-wise sum of ``columns`` plus ``constant`` into ``target_column``.

        When ``columns`` is empty the constant is added to the existing
        ``target_column`` instead.

        Args:
            df: Data to process (modified in place).
            rule: Uses ``columns`` (name or list of names), ``target_column``
                and ``constant`` (default 0).

        Returns:
            The processed data.
        """
        columns = rule.get('columns', [])
        target_column = rule.get('target_column')
        constant = rule.get('constant', 0)

        if not target_column:
            return df

        if isinstance(columns, str):
            columns = [columns]

        if columns:
            # Add the listed columns together, ignoring names that are absent.
            valid_columns = [col for col in columns if col in df.columns]
            if valid_columns:
                df[target_column] = df[valid_columns].sum(axis=1) + constant
                logger.debug(f"加法计算: {valid_columns} + {constant} -> {target_column}")
            else:
                logger.warning(f"没有有效的列用于加法计算: {columns}")
        elif target_column in df.columns:
            # Constant only.
            df[target_column] = df[target_column] + constant
            logger.debug(f"加法计算: {target_column} + {constant}")
        else:
            logger.warning(f"目标列不存在: {target_column}")

        return df

    def _subtract(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Write ``minuend - subtrahend - constant`` into ``target_column``.

        When ``subtrahend`` is missing or not a column, only the constant is
        subtracted.

        Args:
            df: Data to process (modified in place).
            rule: Uses ``minuend``, ``subtrahend``, ``target_column`` and
                ``constant`` (default 0).

        Returns:
            The processed data.
        """
        minuend = rule.get('minuend')
        subtrahend = rule.get('subtrahend')
        target_column = rule.get('target_column')
        constant = rule.get('constant', 0)

        if target_column and minuend and minuend in df.columns:
            if subtrahend and subtrahend in df.columns:
                df[target_column] = df[minuend] - df[subtrahend] - constant
                logger.debug(f"减法计算: {minuend} - {subtrahend} - {constant} -> {target_column}")
            else:
                df[target_column] = df[minuend] - constant
                logger.debug(f"减法计算: {minuend} - {constant} -> {target_column}")
        else:
            logger.warning("减法计算参数不完整或列不存在")

        return df

    def _formula(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Evaluate ``formula`` with ``DataFrame.eval`` into ``target_column``.

        Evaluation errors are logged and the data is left unchanged.

        Args:
            df: Data to process (modified in place).
            rule: Uses ``formula`` and ``target_column``.

        Returns:
            The processed data.
        """
        formula = rule.get('formula')
        target_column = rule.get('target_column')

        if formula and target_column:
            try:
                # NOTE(review): the formula string is evaluated as an
                # expression; confirm that rule configs never carry
                # untrusted input.
                df[target_column] = df.eval(formula)
                logger.debug(f"公式计算: {formula} -> {target_column}")
            except Exception as e:
                logger.error(f"公式计算失败: {formula}, 错误: {e}")
        else:
            logger.warning("公式计算缺少公式或目标列")

        return df

    def _round(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Round numeric columns to ``decimals`` places.

        Args:
            df: Data to process (modified in place).
            rule: Uses ``columns`` (name or list of names; all numeric columns
                when empty) and ``decimals`` (default 0).

        Returns:
            The processed data.
        """
        columns = rule.get('columns', [])
        decimals = rule.get('decimals', 0)

        if isinstance(columns, str):
            columns = [columns]

        target_columns = columns or df.select_dtypes(include=[np.number]).columns

        for col in target_columns:
            # Non-numeric or missing columns are skipped silently.
            if col in df.columns and pd.api.types.is_numeric_dtype(df[col]):
                df[col] = df[col].round(decimals)
                logger.debug(f"四舍五入: {col} 保留 {decimals} 位小数")

        return df

    def _sum(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Sum columns row-wise, optionally totalled per group.

        Without ``group_by`` (or when it is not a column) each row's sum of
        the selected columns is written to ``target_column``. With
        ``group_by`` every row receives the total of its group, so the frame
        keeps its shape. A grouped rule without ``target_column`` is logged
        and leaves the data unchanged.

        Args:
            df: Data to process (modified in place).
            rule: Uses ``columns`` (name or list of names; all numeric columns
                when empty), ``target_column`` and ``group_by``.

        Returns:
            The processed data.
        """
        columns = rule.get('columns', [])
        target_column = rule.get('target_column')
        group_by = rule.get('group_by')

        if isinstance(columns, str):
            columns = [columns]

        if group_by and group_by in df.columns:
            if columns:
                value_columns = [col for col in columns if col in df.columns]
            else:
                # All numeric columns except the grouping key itself.
                numeric_columns = df.select_dtypes(include=[np.number]).columns
                value_columns = [col for col in numeric_columns if col != group_by]

            if not value_columns:
                logger.warning(f"没有有效的列用于分组求和: {columns}")
            elif not target_column:
                # There is nowhere to store the totals; say so rather than
                # computing and discarding them.
                logger.warning("分组求和缺少目标列,结果未写入")
            else:
                row_totals = df[value_columns].sum(axis=1)
                df[target_column] = row_totals.groupby(df[group_by]).transform('sum')
                logger.debug(f"分组求和: {value_columns} 按 {group_by} 分组 -> {target_column}")
        elif columns:
            valid_columns = [col for col in columns if col in df.columns]
            if valid_columns and target_column:
                df[target_column] = df[valid_columns].sum(axis=1)
                logger.debug(f"求和计算: {valid_columns} -> {target_column}")
        else:
            # Sum of every numeric column.
            numeric_columns = df.select_dtypes(include=[np.number]).columns
            if target_column and len(numeric_columns) > 0:
                df[target_column] = df[numeric_columns].sum(axis=1)
                logger.debug(f"求和计算: {list(numeric_columns)} -> {target_column}")

        return df

    def _aggregate(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Group by ``group_by`` and aggregate the configured columns.

        Args:
            df: Data to process.
            rule: Uses ``group_by`` and ``aggregations`` (column name mapped
                to a function name or a list of function names).

        Returns:
            The aggregated frame with the index reset, or ``df`` unchanged
            when ``group_by`` is not a column or nothing can be aggregated.
        """
        group_by = rule.get('group_by')
        aggregations = rule.get('aggregations', {})

        if group_by and group_by in df.columns:
            # Keep only existing columns whose spec is a name or list of names.
            agg_dict = {
                column: func
                for column, func in aggregations.items()
                if column in df.columns and isinstance(func, (str, list))
            }

            if agg_dict:
                result = df.groupby(group_by).agg(agg_dict)
                logger.debug(f"聚合计算: 按 {group_by} 分组, 聚合: {agg_dict}")
                return result.reset_index()

        return df

    # Chainable convenience methods: each queues one rule and returns ``self``.
    def multiply(self, source_column: str, target_column: str, factor: float):
        """Queue a multiplication rule."""
        self.add_rule('multiply', source_column=source_column,
                      target_column=target_column, factor=factor)
        return self

    def divide(self, source_column: str, target_column: str, divisor: float):
        """Queue a division rule."""
        self.add_rule('divide', source_column=source_column,
                      target_column=target_column, divisor=divisor)
        return self

    def add(self, columns: Union[str, List[str]], target_column: str, constant: float = 0):
        """Queue an addition rule."""
        self.add_rule('add', columns=columns, target_column=target_column, constant=constant)
        return self

    def subtract(self, minuend: str, target_column: str,
                 subtrahend: Optional[str] = None, constant: float = 0):
        """Queue a subtraction rule."""
        self.add_rule('subtract', minuend=minuend, target_column=target_column,
                      subtrahend=subtrahend, constant=constant)
        return self

    def formula(self, formula: str, target_column: str):
        """Queue a formula rule."""
        self.add_rule('formula', formula=formula, target_column=target_column)
        return self

    def round_columns(self, columns: Optional[Union[str, List[str]]] = None, decimals: int = 0):
        """Queue a rounding rule."""
        self.add_rule('round', columns=columns, decimals=decimals)
        return self

    def sum_columns(self, columns: Optional[Union[str, List[str]]] = None,
                    target_column: Optional[str] = None, group_by: Optional[str] = None):
        """Queue a sum rule."""
        self.add_rule('sum', columns=columns, target_column=target_column, group_by=group_by)
        return self

    def aggregate(self, group_by: str, aggregations: Dict[str, Union[str, List[str]]]):
        """Queue an aggregation rule."""
        self.add_rule('aggregate', group_by=group_by, aggregations=aggregations)
        return self
|
||||
@@ -0,0 +1,276 @@
|
||||
"""
|
||||
列映射处理器
|
||||
|
||||
提供列名映射和转换功能,支持不同供应商的列名标准化
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from typing import Dict, Any, Optional, List, Union
|
||||
from ...core.utils.log_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class ColumnMapper:
    """Normalize supplier-specific column headers to a fixed set of standard names.

    Every standard name (``barcode``, ``name``, ``quantity`` ...) owns a list of
    known header variants in ``STANDARD_COLUMNS``.  Additional variants can be
    supplied per instance through ``mapping_config`` or ``add_custom_mapping``.
    """

    # Standard column name -> header variants recognised for it.
    STANDARD_COLUMNS = {
        'barcode': ['条码', '条形码', '商品条码', '产品条码', '条码(必填)', 'barcode', 'code'],
        'name': ['商品名称', '产品名称', '名称', '商品', '产品', 'name', 'product_name'],
        'specification': ['规格', '规格型号', '型号', 'specification', 'spec', 'model'],
        'quantity': ['数量', '采购量', '订货数量', '订单量', '需求量', 'quantity', 'qty', '采购量(必填)'],
        'unit': ['单位', '计量单位', 'unit', 'units'],
        'unit_price': ['单价', '价格', '采购单价', '进货价', 'unit_price', 'price', '采购单价(必填)'],
        'total_price': ['总价', '金额', '小计', 'total_price', 'total', 'amount'],
        'category': ['类别', '分类', '商品类别', 'category', 'type'],
        'brand': ['品牌', '商标', 'brand'],
        'supplier': ['供应商', '供货商', 'supplier', 'vendor']
    }

    # Standard columns that default to 0 and are expected to be numeric.
    _NUMERIC_COLUMNS = ('quantity', 'unit_price', 'total_price')
    # Standard columns that default to an empty string.
    _TEXT_COLUMNS = ('barcode', 'name', 'specification', 'unit', 'category', 'brand', 'supplier')
    # Prefix pandas assigns to columns whose header cell was blank ("Unnamed: 0").
    # Such labels contain the substring "name" and must never be fuzzy-matched.
    _PLACEHOLDER_PREFIX = 'unnamed:'

    def __init__(self, mapping_config: Optional[Dict[str, Any]] = None):
        """Create a mapper.

        Args:
            mapping_config: Optional ``{standard_name: header or [headers]}``
                dictionary with extra header variants.
        """
        self.mapping_config = mapping_config or {}
        self.custom_mappings = {}
        self._build_reverse_mapping()

    @staticmethod
    def _as_name_list(names: Any) -> List[Any]:
        """Return *names* as a list; a single string becomes a one-item list."""
        if names is None:
            return []
        if isinstance(names, str):
            return [names]
        return list(names)

    @staticmethod
    def _normalize(label: Any) -> str:
        """Return the comparison key of a header: text form, trimmed, lower-cased."""
        return str(label).strip().lower()

    def _build_reverse_mapping(self):
        """Build ``reverse_mapping`` (lower-cased variant -> standard name).

        Custom variants from ``mapping_config`` are written after the built-in
        ones, so they win on conflict; they are also recorded in
        ``custom_mappings``.
        """
        self.reverse_mapping = {
            variation.lower(): standard_name
            for standard_name, variations in self.STANDARD_COLUMNS.items()
            for variation in variations
        }

        for standard_name, custom_names in self.mapping_config.items():
            for custom_name in self._as_name_list(custom_names):
                key = str(custom_name).lower()
                self.reverse_mapping[key] = standard_name
                self.custom_mappings[key] = standard_name

    def _candidate_names(self, target_column: str) -> List[Any]:
        """Return built-in variants followed by custom variants for *target_column*."""
        names = list(self.STANDARD_COLUMNS.get(target_column, []))
        names.extend(self._as_name_list(self.mapping_config.get(target_column)))
        return names

    def _match_column(self, columns, names: List[Any], fuzzy: bool) -> Optional[Any]:
        """Return the first column matching one of *names*, or ``None``.

        Variants are tried in order and, for each variant, columns in order.
        With ``fuzzy=False`` the normalized texts must be equal; with
        ``fuzzy=True`` either text may contain the other.  Blank labels and
        pandas ``Unnamed: N`` placeholders are never matched.
        """
        keyed_columns = []
        for column in columns:
            key = self._normalize(column)
            if key and not key.startswith(self._PLACEHOLDER_PREFIX):
                keyed_columns.append((column, key))

        for name in names:
            wanted = self._normalize(name)
            if not wanted:
                continue
            for column, key in keyed_columns:
                if fuzzy:
                    hit = wanted in key or key in wanted
                else:
                    hit = key == wanted
                if hit:
                    return column
        return None

    def map_columns(self, df: pd.DataFrame, target_columns: Optional[List[str]] = None) -> pd.DataFrame:
        """Rename recognised columns to their standard names.

        Matching runs in two passes over all targets: exact matches first,
        then fuzzy (substring) matches for the targets still unresolved.  A
        source column is assigned to at most one target.

        Args:
            df: Input data.
            target_columns: Standard names to produce; defaults to every key
                of ``STANDARD_COLUMNS``.

        Returns:
            A new frame containing exactly the target columns (missing ones
            filled with their default value).  When no column could be
            matched at all, the original ``df`` is returned unchanged.
        """
        if target_columns is None:
            target_columns = list(self.STANDARD_COLUMNS.keys())

        logger.info(f"开始列名映射,目标列: {target_columns}")
        logger.info(f"原始列名: {list(df.columns)}")

        column_mapping = {}
        used_columns = set()
        resolved_targets = set()

        # Exact matches for every target are settled before any fuzzy match,
        # so a loose substring hit can never steal a column that another
        # target names exactly.
        for fuzzy in (False, True):
            for target_col in target_columns:
                if target_col in resolved_targets:
                    continue
                free_columns = [col for col in df.columns if col not in used_columns]
                matched_column = self._match_column(
                    free_columns, self._candidate_names(target_col), fuzzy=fuzzy
                )
                if matched_column is None:
                    continue
                column_mapping[matched_column] = target_col
                used_columns.add(matched_column)
                resolved_targets.add(target_col)
                logger.debug(f"列名映射: {matched_column} -> {target_col}")

        if not column_mapping:
            logger.warning("没有找到可映射的列名")
            return df

        df_mapped = df.rename(columns=column_mapping)

        # Add the targets that had no source column.
        for target_col in target_columns:
            if target_col not in df_mapped.columns:
                df_mapped[target_col] = self._get_default_value(target_col)
                logger.debug(f"添加缺失列: {target_col}")

        # Keep only the target columns, in the requested order.
        existing_target_columns = [col for col in target_columns if col in df_mapped.columns]
        df_result = df_mapped[existing_target_columns]

        logger.info(f"列名映射完成,结果列名: {list(df_result.columns)}")
        return df_result

    def _find_matching_column(self, columns: List[str], target_column: str) -> Optional[str]:
        """Find the source column that corresponds to a standard column.

        An exact (case-insensitive, whitespace-trimmed) match on any variant
        is preferred over a fuzzy substring match.  Column labels that are
        not strings are compared by their text form.

        Args:
            columns: Source column labels.
            target_column: Standard column name to look up.

        Returns:
            The matching source label, or ``None`` when nothing matches.
        """
        names = self._candidate_names(target_column)
        exact = self._match_column(columns, names, fuzzy=False)
        if exact is not None:
            return exact
        return self._match_column(columns, names, fuzzy=True)

    def _get_default_value(self, column_name: str) -> Any:
        """Return the fill value for a missing standard column.

        Args:
            column_name: Standard column name.

        Returns:
            ``0`` for quantity/price columns, ``''`` for the text columns and
            ``None`` for any other name.
        """
        if column_name in self._NUMERIC_COLUMNS:
            return 0
        if column_name in self._TEXT_COLUMNS:
            return ''
        return None

    def add_custom_mapping(self, standard_name: str, custom_names: Union[str, List[str]]):
        """Register additional header variants for a standard column.

        New names are appended to the variants already configured for
        ``standard_name`` so that ``mapping_config`` stays consistent with
        ``reverse_mapping`` and ``custom_mappings``.

        Args:
            standard_name: Standard column name.
            custom_names: One header or a list of headers.
        """
        new_names = self._as_name_list(custom_names)
        known_names = self._as_name_list(self.mapping_config.get(standard_name))

        # dict.fromkeys de-duplicates while keeping first-seen order.
        self.mapping_config[standard_name] = list(dict.fromkeys(known_names + new_names))

        for custom_name in new_names:
            key = str(custom_name).lower()
            self.reverse_mapping[key] = standard_name
            self.custom_mappings[key] = standard_name

        logger.info(f"添加自定义映射: {standard_name} <- {new_names}")

    def detect_column_types(self, df: pd.DataFrame) -> Dict[str, str]:
        """Classify every column as boolean, numeric, datetime or text.

        Args:
            df: Data to inspect.

        Returns:
            ``{column: 'boolean' | 'numeric' | 'datetime' | 'text'}``.
        """
        column_types = {}

        for column in df.columns:
            series = df[column]
            # bool must be tested first: is_numeric_dtype is also true for bool.
            if pd.api.types.is_bool_dtype(series):
                column_types[column] = 'boolean'
            elif pd.api.types.is_numeric_dtype(series):
                column_types[column] = 'numeric'
            elif pd.api.types.is_datetime64_any_dtype(series):
                column_types[column] = 'datetime'
            else:
                column_types[column] = 'text'

        return column_types

    def suggest_column_mapping(self, df: pd.DataFrame) -> Dict[str, List[str]]:
        """Suggest standard names for each column using substring matching.

        Args:
            df: Data whose headers are inspected.

        Returns:
            ``{column: [standard names]}`` for the columns with at least one
            suggestion.  Built-in suggestions come first, in
            ``STANDARD_COLUMNS`` order, followed by custom ones; duplicates
            are removed.
        """
        suggestions = {}

        for column in df.columns:
            column_key = self._normalize(column)
            if not column_key:
                # A blank header is a substring of everything; skip it.
                continue

            hits = []
            for standard_name, variations in self.STANDARD_COLUMNS.items():
                for variation in variations:
                    variation_key = variation.lower()
                    if column_key in variation_key or variation_key in column_key:
                        hits.append(standard_name)

            for custom_name, standard_name in self.custom_mappings.items():
                if custom_name and (column_key in custom_name or custom_name in column_key):
                    hits.append(standard_name)

            unique_hits = list(dict.fromkeys(hits))
            if unique_hits:
                suggestions[column] = unique_hits

        return suggestions

    def validate_mapping(self, df: pd.DataFrame, required_columns: List[str]) -> Dict[str, Any]:
        """Check a mapped frame for missing, empty and non-numeric columns.

        Args:
            df: Frame produced by the mapping step.
            required_columns: Columns that must be present.

        Returns:
            Dict with ``valid`` (False when a required column is missing),
            ``missing_columns``, ``empty_columns`` (all values null) and
            ``warnings`` (messages for empty columns and for quantity/price
            columns that are not numeric).
        """
        missing_columns = [col for col in required_columns if col not in df.columns]
        empty_columns = [col for col in df.columns if df[col].isnull().all()]

        warnings = [f"列 '{col}' 全部为空值" for col in empty_columns]
        warnings.extend(
            f"列 '{col}' 不是数值类型"
            for col in self._NUMERIC_COLUMNS
            if col in df.columns and not pd.api.types.is_numeric_dtype(df[col])
        )

        return {
            'valid': not missing_columns,
            'missing_columns': missing_columns,
            'empty_columns': empty_columns,
            'warnings': warnings
        }
|
||||
@@ -0,0 +1,401 @@
|
||||
"""
|
||||
数据清洗处理器
|
||||
|
||||
提供各种数据清洗功能,如空值处理、重复项处理、数据类型转换等
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from typing import Dict, Any, Optional, List, Union
|
||||
from ...core.utils.log_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class DataCleaner:
    """Rule-driven DataFrame cleaner with a chainable builder API.

    Rules are queued with ``add_rule`` (or helpers such as ``fill_na``) and
    applied in insertion order by ``clean`` to a copy of the input frame.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Create a cleaner.

        Args:
            config: Optional cleaning configuration, stored as ``self.config``.
        """
        self.config = config or {}
        self.cleaning_rules = []

    @staticmethod
    def _as_list(value: Any) -> List[Any]:
        """Return *value* as a list (``None`` -> ``[]``, string -> one item)."""
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    @staticmethod
    def _is_text_column(series: pd.Series) -> bool:
        """Return True for object or string dtype columns."""
        dtype = series.dtype
        return pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)

    def _text_columns(self, df: pd.DataFrame) -> List[Any]:
        """Return the labels of all text-like columns of *df*."""
        return [col for col in df.columns if self._is_text_column(df[col])]

    def add_rule(self, rule_type: str, **kwargs):
        """Append a cleaning rule.

        Args:
            rule_type: Rule type name understood by ``_apply_rule``.
            **kwargs: Rule parameters, stored alongside the type.
        """
        rule = {'type': rule_type, **kwargs}
        self.cleaning_rules.append(rule)
        logger.debug(f"添加清洗规则: {rule_type}")

    def clean(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply every queued rule in order.

        A rule that raises is logged and skipped; the remaining rules still
        run.

        Args:
            df: Input data; it is copied, not modified.

        Returns:
            The cleaned copy.
        """
        logger.info(f"开始数据清洗,原始数据形状: {df.shape}")

        result_df = df.copy()
        total = len(self.cleaning_rules)

        for position, rule in enumerate(self.cleaning_rules, start=1):
            try:
                logger.debug(f"执行清洗规则 {position}/{total}: {rule['type']}")
                result_df = self._apply_rule(result_df, rule)
                logger.debug(f"规则执行完成,数据形状: {result_df.shape}")
            except Exception as e:
                # Best effort by design: one bad rule must not abort the run.
                logger.error(f"清洗规则执行失败: {rule}, 错误: {e}")
                continue

        logger.info(f"数据清洗完成,最终数据形状: {result_df.shape}")
        return result_df

    def _apply_rule(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Dispatch one rule to its handler.

        Args:
            df: Data to process.
            rule: Rule dict; its ``type`` selects the handler.

        Returns:
            The processed data; unknown rule types return ``df`` unchanged
            after a warning.
        """
        handlers = {
            'remove_duplicates': self._remove_duplicates,
            'fill_na': self._fill_na,
            'remove_rows': self._remove_rows,
            'convert_type': self._convert_type,
            'strip_whitespace': self._strip_whitespace,
            'normalize_text': self._normalize_text,
            'validate_data': self._validate_data,
        }
        rule_type = rule.get('type')
        handler = handlers.get(rule_type)
        if handler is None:
            logger.warning(f"未知的清洗规则类型: {rule_type}")
            return df
        return handler(df, rule)

    def _remove_duplicates(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Drop duplicate rows.

        Args:
            df: Data to process.
            rule: Uses ``subset`` (columns compared) and ``keep``
                (default ``'first'``).

        Returns:
            The de-duplicated data.
        """
        subset = rule.get('subset')
        keep = rule.get('keep', 'first')

        before_count = len(df)
        df_cleaned = df.drop_duplicates(subset=subset, keep=keep)
        after_count = len(df_cleaned)

        logger.info(f"移除重复项: {before_count - after_count} 行被移除")
        return df_cleaned

    @staticmethod
    def _fill_series(series: pd.Series, method: Optional[str], value: Any) -> pd.Series:
        """Fill the nulls of one column according to *method* or with *value*."""
        if method == 'ffill':
            return series.ffill()
        if method == 'bfill':
            return series.bfill()
        if method == 'mean':
            return series.fillna(series.mean())
        if method == 'median':
            return series.fillna(series.median())
        return series.fillna(value)

    def _fill_na(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Fill null values.

        Args:
            df: Data to process.
            rule: Uses ``columns`` (name or list; all columns when empty),
                ``value`` (default 0) and ``method`` (``'ffill'``,
                ``'bfill'``, ``'mean'`` or ``'median'``).  With no columns
                given, ``mean``/``median`` fill each numeric column with its
                own statistic.

        Returns:
            The processed data.
        """
        columns = rule.get('columns')
        value = rule.get('value', 0)
        method = rule.get('method')

        if columns:
            for col in self._as_list(columns):
                if col not in df.columns:
                    continue
                df[col] = self._fill_series(df[col], method, value)
                logger.debug(f"填充列 {col} 的空值: {method or value}")
        else:
            # DataFrame.ffill()/bfill() replace the deprecated fillna(method=...).
            if method == 'ffill':
                df = df.ffill()
            elif method == 'bfill':
                df = df.bfill()
            elif method in ('mean', 'median'):
                statistics = getattr(df, method)(numeric_only=True)
                df = df.fillna(statistics)
            else:
                df = df.fillna(value)

            logger.debug(f"填充所有列的空值: {method or value}")

        return df

    def _remove_rows(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Filter rows by a query expression or by unwanted column values.

        Args:
            df: Data to process.
            rule: Uses ``condition`` (passed to ``DataFrame.query``; the rows
                returned by the query are the ones kept), or ``columns``
                together with ``values`` (rows whose column value is in
                ``values`` are dropped).  ``condition`` takes precedence.

        Returns:
            The filtered data, or ``df`` unchanged when the expression fails
            or the rule is incomplete.
        """
        condition = rule.get('condition')
        columns = rule.get('columns')
        raw_values = rule.get('values')

        if condition:
            try:
                before_count = len(df)
                df_filtered = df.query(condition)
                after_count = len(df_filtered)
                logger.info(f"条件过滤: {condition}, 移除了 {before_count - after_count} 行")
                return df_filtered
            except Exception as e:
                logger.error(f"条件表达式执行失败: {condition}, 错误: {e}")
                return df

        # Normalize first so that a falsy scalar such as 0 or '' is still a
        # usable value to remove.
        if raw_values is None:
            values = []
        elif isinstance(raw_values, (list, tuple, set, frozenset)):
            values = list(raw_values)
        else:
            values = [raw_values]

        if columns and values:
            df_filtered = df.copy()
            for col in self._as_list(columns):
                if col in df_filtered.columns:
                    mask = ~df_filtered[col].isin(values)
                    df_filtered = df_filtered[mask]
                    logger.debug(f"列 {col} 过滤值 {values}")
            return df_filtered

        logger.warning("移除行规则缺少条件或列配置")
        return df

    def _convert_type(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Convert column dtypes.

        Args:
            df: Data to process.
            rule: Uses ``columns``, ``target_type`` (``'int'``, ``'float'``,
                ``'datetime'``, ``'string'`` or any dtype accepted by
                ``astype``; default ``'float'``) and ``errors`` (default
                ``'coerce'``).

        Returns:
            The processed data; a column whose conversion fails is logged
            and left as it was.
        """
        target_type = rule.get('target_type', 'float')
        errors = rule.get('errors', 'coerce')

        for col in self._as_list(rule.get('columns')):
            if col not in df.columns:
                continue
            try:
                if target_type == 'int':
                    # Nullable Int64 keeps values that were coerced to NaN.
                    df[col] = pd.to_numeric(df[col], errors=errors).astype('Int64')
                elif target_type == 'float':
                    df[col] = pd.to_numeric(df[col], errors=errors)
                elif target_type == 'datetime':
                    df[col] = pd.to_datetime(df[col], errors=errors)
                elif target_type == 'string':
                    df[col] = df[col].astype(str)
                else:
                    df[col] = df[col].astype(target_type)

                logger.debug(f"列 {col} 类型转换: {target_type}")
            except Exception as e:
                logger.error(f"列 {col} 类型转换失败: {e}")

        return df

    def _strip_whitespace(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Strip leading/trailing whitespace from string cells.

        Only cells that are ``str`` are touched; numbers and nulls inside a
        mixed column are preserved.

        Args:
            df: Data to process.
            rule: Uses ``columns`` (name or list; all text columns when
                empty).

        Returns:
            The processed data.
        """
        def strip_cell(cell):
            return cell.strip() if isinstance(cell, str) else cell

        columns = rule.get('columns')

        if columns:
            for col in self._as_list(columns):
                if col in df.columns and self._is_text_column(df[col]):
                    df[col] = df[col].map(strip_cell)
                    logger.debug(f"列 {col} 去除空白字符")
        else:
            text_columns = self._text_columns(df)
            for col in text_columns:
                df[col] = df[col].map(strip_cell)

            logger.debug(f"所有文本列去除空白字符: {list(text_columns)}")

        return df

    def _normalize_text(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Normalize the text of string cells.

        Case conversion is applied first (``lowercase`` wins over
        ``uppercase``), then every ``replace_map`` entry as a literal
        substring replacement.  Non-string cells are left untouched.

        Args:
            df: Data to process.
            rule: Uses ``columns`` (all text columns when empty),
                ``lowercase``, ``uppercase`` and ``replace_map``.

        Returns:
            The processed data.
        """
        lowercase = rule.get('lowercase', False)
        uppercase = rule.get('uppercase', False)
        replace_map = rule.get('replace_map', {})

        def normalize_cell(cell):
            if not isinstance(cell, str):
                return cell
            if lowercase:
                cell = cell.lower()
            elif uppercase:
                cell = cell.upper()
            for old, new in replace_map.items():
                cell = cell.replace(old, new)
            return cell

        target_columns = self._as_list(rule.get('columns')) or self._text_columns(df)

        for col in target_columns:
            if col in df.columns and self._is_text_column(df[col]):
                df[col] = df[col].map(normalize_cell)
                logger.debug(f"列 {col} 文本标准化完成")

        return df

    def _validate_data(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
        """Report nulls and out-of-range values; the data is not modified.

        Args:
            df: Data to check.
            rule: Uses ``columns``, ``min_value``, ``max_value`` and
                ``required`` (report nulls when true).

        Returns:
            ``df`` unchanged; findings are only logged.
        """
        min_value = rule.get('min_value')
        max_value = rule.get('max_value')
        required = rule.get('required', False)
        check_range = min_value is not None or max_value is not None

        validation_results = []

        for col in self._as_list(rule.get('columns')):
            if col not in df.columns:
                continue

            if required:
                null_count = df[col].isnull().sum()
                if null_count > 0:
                    validation_results.append(f"{col}: {null_count} 个空值")

            # Range checks only make sense for numeric columns.
            if check_range and pd.api.types.is_numeric_dtype(df[col]):
                invalid_mask = pd.Series(False, index=df.index)
                if min_value is not None:
                    invalid_mask |= df[col] < min_value
                if max_value is not None:
                    invalid_mask |= df[col] > max_value

                invalid_count = invalid_mask.sum()
                if invalid_count > 0:
                    validation_results.append(f"{col}: {invalid_count} 个值超出范围")

        if validation_results:
            logger.warning(f"数据验证发现问题: {', '.join(validation_results)}")
        else:
            logger.debug("数据验证通过")

        return df

    # Chainable convenience helpers: each queues one rule and returns self.
    def remove_duplicates(self, subset: Optional[List[str]] = None, keep: str = 'first'):
        """Queue a ``remove_duplicates`` rule."""
        self.add_rule('remove_duplicates', subset=subset, keep=keep)
        return self

    def fill_na(self, columns: Optional[Union[str, List[str]]] = None,
                value: Any = 0, method: Optional[str] = None):
        """Queue a ``fill_na`` rule."""
        self.add_rule('fill_na', columns=columns, value=value, method=method)
        return self

    def remove_rows(self, condition: Optional[str] = None,
                    columns: Optional[Union[str, List[str]]] = None,
                    values: Optional[Any] = None):
        """Queue a ``remove_rows`` rule."""
        self.add_rule('remove_rows', condition=condition, columns=columns, values=values)
        return self

    def convert_type(self, columns: Union[str, List[str]], target_type: str, errors: str = 'coerce'):
        """Queue a ``convert_type`` rule."""
        self.add_rule('convert_type', columns=columns, target_type=target_type, errors=errors)
        return self

    def strip_whitespace(self, columns: Optional[Union[str, List[str]]] = None):
        """Queue a ``strip_whitespace`` rule."""
        self.add_rule('strip_whitespace', columns=columns)
        return self

    def normalize_text(self, columns: Optional[Union[str, List[str]]] = None,
                       lowercase: bool = False, uppercase: bool = False,
                       replace_map: Optional[Dict[str, str]] = None):
        """Queue a ``normalize_text`` rule."""
        self.add_rule('normalize_text', columns=columns, lowercase=lowercase,
                      uppercase=uppercase, replace_map=replace_map or {})
        return self

    def validate_data(self, columns: Union[str, List[str]],
                      min_value: Optional[float] = None,
                      max_value: Optional[float] = None,
                      required: bool = False):
        """Queue a ``validate_data`` rule."""
        self.add_rule('validate_data', columns=columns, min_value=min_value,
                      max_value=max_value, required=required)
        return self
|
||||
@@ -11,7 +11,7 @@ import json
|
||||
import base64
|
||||
from datetime import datetime
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from typing import Dict, List, Optional, Tuple, Union, Any
|
||||
from typing import Dict, List, Optional, Tuple, Union, Any, Callable
|
||||
|
||||
from ...config.settings import ConfigManager
|
||||
from ..utils.log_utils import get_logger
|
||||
@@ -332,7 +332,7 @@ class OCRProcessor:
|
||||
logger.error(f"处理图片时出错: {image_path}, 错误: {e}")
|
||||
return None
|
||||
|
||||
def process_images_batch(self, batch_size: int = None, max_workers: int = None) -> Tuple[int, int]:
|
||||
def process_images_batch(self, batch_size: int = None, max_workers: int = None, progress_cb: Optional[Callable[[int], None]] = None) -> Tuple[int, int]:
|
||||
"""
|
||||
批量处理图片
|
||||
|
||||
@@ -369,6 +369,13 @@ class OCRProcessor:
|
||||
for i in range(0, total, batch_size):
|
||||
batch = unprocessed_images[i:i+batch_size]
|
||||
logger.info(f"处理批次 {i//batch_size+1}/{(total+batch_size-1)//batch_size}: {len(batch)} 个文件")
|
||||
try:
|
||||
if progress_cb:
|
||||
# 以批次为单位估算进度(0-90%),保留10%给后续阶段
|
||||
percent = int(10 + (i / max(total, 1)) * 80)
|
||||
progress_cb(min(percent, 90))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# 使用多线程处理批次
|
||||
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||
@@ -378,4 +385,9 @@ class OCRProcessor:
|
||||
success_count += sum(1 for result in results if result is not None)
|
||||
|
||||
logger.info(f"所有图片处理完成, 总计: {total}, 成功: {success_count}")
|
||||
try:
|
||||
if progress_cb:
|
||||
progress_cb(90)
|
||||
except Exception:
|
||||
pass
|
||||
return total, success_count
|
||||
|
||||
@@ -0,0 +1,9 @@
|
||||
"""
|
||||
处理器模块初始化文件
|
||||
"""
|
||||
|
||||
from .base import BaseProcessor
|
||||
from .ocr_processor import OCRProcessor
|
||||
from .tobacco_processor import TobaccoProcessor
|
||||
|
||||
__all__ = ['BaseProcessor', 'OCRProcessor', 'TobaccoProcessor']
|
||||
@@ -0,0 +1,139 @@
|
||||
"""
|
||||
基础处理器接口模块
|
||||
|
||||
定义所有处理器的基类,提供统一的处理接口
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, Any, Optional, List
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BaseProcessor(ABC):
    """Abstract base class shared by all file processors.

    Follows the strategy pattern: each concrete processor handles one kind
    of input file and implements ``can_process``, ``process`` and
    ``get_required_columns``.
    """

    def __init__(self, config: Dict[str, Any]):
        """Store the configuration and set up a per-processor logger.

        Args:
            config: Processor configuration dictionary.
        """
        self.config = config
        self.name = self.__class__.__name__
        self.description = ""
        self._setup_logging()

    def _setup_logging(self):
        """Create ``self.logger`` as a child logger named after the processor."""
        self.logger = logging.getLogger(f"{__name__}.{self.name}")

    @abstractmethod
    def can_process(self, file_path: Path) -> bool:
        """Return whether this processor can handle the file.

        Args:
            file_path: Path of the candidate file.

        Returns:
            True when the processor accepts the file.
        """
        pass

    @abstractmethod
    def process(self, input_file: Path, output_dir: Path) -> Optional[Path]:
        """Process a file and return the path of the generated output.

        Args:
            input_file: Path of the input file.
            output_dir: Directory for the output.

        Returns:
            Path of the output file, or ``None`` on failure.
        """
        pass

    @abstractmethod
    def get_required_columns(self) -> List[str]:
        """Return the column names the processor requires.

        Returns:
            List of column names.
        """
        pass

    @staticmethod
    def _normalize_extension(extension: Any) -> str:
        """Return *extension* lower-cased and with a leading dot."""
        text = str(extension).strip().lower()
        if not text or text.startswith('.'):
            return text
        return f".{text}"

    def validate_input(self, file_path: Path) -> bool:
        """Check that the input exists, is a regular file and has a supported type.

        The extension comparison is case-insensitive on both sides and
        tolerates supported extensions declared without the leading dot.

        Args:
            file_path: Path of the file; a ``str`` is accepted as well.

        Returns:
            True when the file is usable, False otherwise (the reason is
            logged).
        """
        try:
            file_path = Path(file_path)

            if not file_path.exists():
                self.logger.warning(f"文件不存在: {file_path}")
                return False

            if not file_path.is_file():
                self.logger.warning(f"不是文件: {file_path}")
                return False

            supported_extensions = self.get_supported_extensions()
            if supported_extensions:
                allowed = {self._normalize_extension(ext) for ext in supported_extensions}
                if file_path.suffix.lower() not in allowed:
                    self.logger.warning(f"不支持的文件类型: {file_path.suffix}, 支持的类型: {supported_extensions}")
                    return False

            return True

        except Exception as e:
            self.logger.error(f"验证文件时出错: {e}")
            return False

    def get_supported_extensions(self) -> List[str]:
        """Return the supported file extensions.

        Returns:
            List of extensions; an empty list means every type is accepted.
        """
        return []

    def get_output_filename(self, input_file: Path, suffix: str = "_processed") -> str:
        """Build the output file name from the input name.

        Args:
            input_file: Path of the input file (``str`` accepted).
            suffix: Text inserted between the stem and the extension.

        Returns:
            ``<stem><suffix><extension>``.
        """
        input_file = Path(input_file)
        return f"{input_file.stem}{suffix}{input_file.suffix}"

    def log_processing_start(self, input_file: Path):
        """Log the start of processing together with the processor identity."""
        self.logger.info(f"开始处理文件: {input_file}")
        self.logger.info(f"处理器: {self.name} - {self.description}")

    def log_processing_end(self, input_file: Path, output_file: Optional[Path] = None, success: bool = True):
        """Log the outcome of processing.

        Args:
            input_file: Path of the processed file.
            output_file: Path of the generated file, logged on success when given.
            success: Whether processing succeeded.
        """
        if not success:
            self.logger.error(f"处理失败: {input_file}")
            return

        self.logger.info(f"处理完成: {input_file}")
        if output_file:
            self.logger.info(f"输出文件: {output_file}")

    def __str__(self) -> str:
        """Return ``name(description)``."""
        return f"{self.name}({self.description})"

    def __repr__(self) -> str:
        """Return the fully qualified class name with name and description."""
        return f"{self.__class__.__module__}.{self.__class__.__name__}(name='{self.name}', description='{self.description}')"
|
||||
@@ -0,0 +1,192 @@
|
||||
"""
|
||||
OCR处理器
|
||||
|
||||
处理图片文件的OCR识别完整流程:图片识别 → Excel处理 → 标准采购单生成
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Any, List
|
||||
|
||||
from .base import BaseProcessor
|
||||
from ...services.ocr_service import OCRService
|
||||
from ...services.order_service import OrderService
|
||||
from ...core.utils.log_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class OCRProcessor(BaseProcessor):
    """Processor that turns an image into a purchase-order file.

    Pipeline:
    1. OCR the table in the image (``OCRService.process_image``).
    2. Process the resulting Excel file (``OrderService.process_excel``).
    3. Hand back the processed file as the purchase order.
    """

    # Single source of truth for the accepted image types.
    SUPPORTED_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp')

    def __init__(self, config: Dict[str, Any]):
        """Create the processor and the services it delegates to.

        Args:
            config: Configuration passed to the base class and both services.
        """
        super().__init__(config)
        self.description = "OCR识别完整流程(图片→识别→Excel→采购单)"

        self.ocr_service = OCRService(config)
        self.order_service = OrderService(config)

    def can_process(self, file_path: Path) -> bool:
        """Return whether the file is an existing image of a supported type.

        Args:
            file_path: Path of the candidate file.

        Returns:
            True when the file passes ``validate_input`` and has a supported
            image extension.
        """
        if not self.validate_input(file_path):
            return False

        file_path = Path(file_path)
        if file_path.suffix.lower() not in self.get_supported_extensions():
            return False

        self.logger.info(f"识别为图片文件: {file_path.name}")
        return True

    def process(self, input_file: Path, output_dir: Path) -> Optional[Path]:
        """Run the full OCR pipeline for one image.

        Args:
            input_file: Path of the input image.
            output_dir: Output directory, forwarded to the individual steps.

        Returns:
            Path of the resulting file, or ``None`` when any step fails.
        """
        self.log_processing_start(input_file)

        try:
            self.logger.info("开始OCR识别流程...")

            # Step 1: OCR
            self.logger.info("步骤1/3: OCR识别图片...")
            ocr_result = self._perform_ocr(input_file, output_dir)
            if not ocr_result:
                self.logger.error("OCR识别失败")
                self.log_processing_end(input_file, success=False)
                return None

            # Step 2: Excel processing
            self.logger.info("步骤2/3: 处理Excel文件...")
            excel_result = self._process_excel(ocr_result, output_dir)
            if not excel_result:
                self.logger.error("Excel处理失败")
                self.log_processing_end(input_file, success=False)
                return None

            # Step 3: purchase order
            self.logger.info("步骤3/3: 生成标准采购单...")
            final_result = self._generate_purchase_order(excel_result, output_dir)
            if not final_result:
                self.logger.error("生成采购单失败")
                self.log_processing_end(input_file, success=False)
                return None

            self.logger.info(f"OCR处理流程完成,输出文件: {final_result}")
            self.log_processing_end(input_file, final_result, success=True)
            return final_result

        except Exception as e:
            self.logger.error(f"OCR处理流程出错: {e}", exc_info=True)
            self.log_processing_end(input_file, success=False)
            return None

    def get_required_columns(self) -> List[str]:
        """Return an empty list: the OCR step itself does not depend on column names."""
        return []

    def get_supported_extensions(self) -> List[str]:
        """Return the supported image extensions."""
        return list(self.SUPPORTED_EXTENSIONS)

    def _perform_ocr(self, input_file: Path, output_dir: Path) -> Optional[Path]:
        """Run OCR on the image.

        Args:
            input_file: Input image.
            output_dir: Output directory (not used by this step; the OCR
                service decides where the result is written).

        Returns:
            Path returned by the OCR service when that file exists,
            otherwise ``None``.
        """
        try:
            self.logger.info(f"开始OCR识别: {input_file}")

            result_path = self.ocr_service.process_image(str(input_file))
            if not result_path:
                self.logger.error("OCR服务返回None")
                return None

            result_path = Path(result_path)
            if not result_path.exists():
                self.logger.error(f"OCR结果文件不存在: {result_path}")
                return None

            self.logger.info(f"OCR识别成功,输出文件: {result_path}")
            return result_path

        except Exception as e:
            self.logger.error(f"OCR识别失败: {e}", exc_info=True)
            return None

    def _process_excel(self, excel_file: Path, output_dir: Path) -> Optional[Path]:
        """Process the OCR-generated Excel file through the order service.

        Args:
            excel_file: Excel file produced by the OCR step.
            output_dir: Output directory (not used by this step).

        Returns:
            Path returned by the order service when that file exists,
            otherwise ``None``.
        """
        try:
            self.logger.info(f"开始处理Excel文件: {excel_file}")

            result_path = self.order_service.process_excel(str(excel_file))
            if not result_path:
                self.logger.error("Excel处理服务返回None")
                return None

            result_path = Path(result_path)
            if not result_path.exists():
                self.logger.error(f"Excel处理结果文件不存在: {result_path}")
                return None

            self.logger.info(f"Excel处理成功,输出文件: {result_path}")
            return result_path

        except Exception as e:
            self.logger.error(f"Excel处理失败: {e}", exc_info=True)
            return None

    def _generate_purchase_order(self, processed_file: Path, output_dir: Path) -> Optional[Path]:
        """Return the processed file as the purchase order.

        The purchase order is already produced by the Excel step, so this
        only confirms that the file exists.

        Args:
            processed_file: File produced by ``_process_excel``.
            output_dir: Output directory (not used by this step).

        Returns:
            ``processed_file`` when it exists, otherwise ``None``.
        """
        try:
            if processed_file and processed_file.exists():
                return processed_file
        except Exception as e:
            # Record why the check failed instead of discarding the error.
            self.logger.error(f"检查采购单文件时出错: {e}", exc_info=True)
        return None
|
||||
@@ -0,0 +1,7 @@
|
||||
"""
|
||||
供应商处理器模块初始化文件
|
||||
"""
|
||||
|
||||
from .generic_supplier_processor import GenericSupplierProcessor
|
||||
|
||||
__all__ = ['GenericSupplierProcessor']
|
||||
@@ -0,0 +1,430 @@
|
||||
"""
|
||||
通用供应商处理器
|
||||
|
||||
可配置化的供应商处理器,支持通过配置文件定义处理规则
|
||||
"""
|
||||
|
||||
import fnmatch
|
||||
import pandas as pd
|
||||
from typing import Optional, Dict, Any, List
|
||||
from pathlib import Path
|
||||
|
||||
from ..base import BaseProcessor
|
||||
from ...utils.log_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class GenericSupplierProcessor(BaseProcessor):
|
||||
"""通用供应商处理器
|
||||
|
||||
基于配置文件处理不同供应商的Excel文件,支持:
|
||||
- 文件名模式匹配
|
||||
- 内容特征识别
|
||||
- 列映射配置
|
||||
- 数据清洗规则
|
||||
- 计算处理规则
|
||||
"""
|
||||
|
||||
def __init__(self, config: Dict[str, Any], supplier_config: Dict[str, Any]):
    """Create a processor driven by one supplier's configuration.

    Args:
        config: System configuration, passed to the base class.
        supplier_config: Supplier-specific settings; every key read here is
            optional and falls back to the default shown below.
    """
    super().__init__(config)
    self.supplier_config = supplier_config
    setting = supplier_config.get

    # Identity (overrides the defaults set by the base class)
    self.name = setting('name', 'GenericSupplier')
    self.description = setting('description', '通用供应商处理器')

    # Recognition and processing rules
    self.filename_patterns = setting('filename_patterns', [])
    self.content_indicators = setting('content_indicators', [])
    self.column_mapping = setting('column_mapping', {})
    self.cleaning_rules = setting('cleaning_rules', [])
    self.calculations = setting('calculations', [])

    # Output settings
    self.output_template = setting('output_template', 'templates/银豹-采购单模板.xls')
    self.output_suffix = setting('output_suffix', '_银豹采购单')
|
||||
|
||||
def can_process(self, file_path: Path) -> bool:
    """Return whether this supplier configuration recognises the file.

    The file must pass ``validate_input``; it is then accepted when either
    the configured file-name patterns or the configured content indicators
    match.  A processor with neither kind of rule logs a warning and
    rejects the file.

    Args:
        file_path: Path of the candidate file.

    Returns:
        True when the file is recognised.
    """
    if not self.validate_input(file_path):
        return False

    has_patterns = bool(self.filename_patterns)
    has_indicators = bool(self.content_indicators)

    # File-name rules are tried before the content rules.
    if has_patterns and self._check_filename_patterns(file_path):
        return True

    if has_indicators and self._check_content_indicators(file_path):
        return True

    if not (has_patterns or has_indicators):
        self.logger.warning(f"处理器 {self.name} 没有配置识别规则")

    return False
|
||||
|
||||
def process(self, input_file: Path, output_dir: Path) -> Optional[Path]:
|
||||
"""处理文件
|
||||
|
||||
Args:
|
||||
input_file: 输入文件路径
|
||||
output_dir: 输出目录路径
|
||||
|
||||
Returns:
|
||||
输出文件路径,处理失败返回None
|
||||
"""
|
||||
self.log_processing_start(input_file)
|
||||
|
||||
try:
|
||||
# 步骤1: 读取数据
|
||||
self.logger.info("步骤1/4: 读取数据...")
|
||||
df = self._read_supplier_data(input_file)
|
||||
if df is None or df.empty:
|
||||
self.logger.error("读取数据失败或数据为空")
|
||||
self.log_processing_end(input_file, success=False)
|
||||
return None
|
||||
|
||||
# 步骤2: 应用列映射
|
||||
self.logger.info("步骤2/4: 应用列映射...")
|
||||
mapped_df = self._apply_column_mapping(df)
|
||||
if mapped_df is None:
|
||||
self.logger.error("列映射失败")
|
||||
self.log_processing_end(input_file, success=False)
|
||||
return None
|
||||
|
||||
# 步骤3: 数据清洗
|
||||
self.logger.info("步骤3/4: 数据清洗...")
|
||||
cleaned_df = self._apply_data_cleaning(mapped_df)
|
||||
if cleaned_df is None:
|
||||
self.logger.error("数据清洗失败")
|
||||
self.log_processing_end(input_file, success=False)
|
||||
return None
|
||||
|
||||
# 步骤4: 计算处理
|
||||
self.logger.info("步骤4/4: 计算处理...")
|
||||
calculated_df = self._apply_calculations(cleaned_df)
|
||||
if calculated_df is None:
|
||||
self.logger.error("计算处理失败")
|
||||
self.log_processing_end(input_file, success=False)
|
||||
return None
|
||||
|
||||
# 生成输出文件
|
||||
output_file = self._generate_output(calculated_df, input_file, output_dir)
|
||||
|
||||
if output_file and output_file.exists():
|
||||
self.logger.info(f"处理完成,输出文件: {output_file}")
|
||||
self.log_processing_end(input_file, output_file, success=True)
|
||||
return output_file
|
||||
else:
|
||||
self.logger.error("输出文件生成失败")
|
||||
self.log_processing_end(input_file, success=False)
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"处理文件时出错: {e}", exc_info=True)
|
||||
self.log_processing_end(input_file, success=False)
|
||||
return None
|
||||
|
||||
def get_required_columns(self) -> List[str]:
|
||||
"""返回需要的列名列表"""
|
||||
# 从列映射配置中提取目标列名
|
||||
return list(self.column_mapping.values()) if self.column_mapping else []
|
||||
|
||||
def _check_filename_patterns(self, file_path: Path) -> bool:
|
||||
"""检查文件名模式
|
||||
|
||||
Args:
|
||||
file_path: 文件路径
|
||||
|
||||
Returns:
|
||||
是否匹配
|
||||
"""
|
||||
try:
|
||||
filename = file_path.name
|
||||
for pattern in self.filename_patterns:
|
||||
if fnmatch.fnmatch(filename.lower(), pattern.lower()):
|
||||
self.logger.info(f"文件名匹配成功: {filename} -> {pattern}")
|
||||
return True
|
||||
return False
|
||||
except Exception as e:
|
||||
self.logger.error(f"检查文件名模式时出错: {e}")
|
||||
return False
|
||||
|
||||
def _check_content_indicators(self, file_path: Path) -> bool:
|
||||
"""检查文件内容特征
|
||||
|
||||
Args:
|
||||
file_path: 文件路径
|
||||
|
||||
Returns:
|
||||
是否匹配
|
||||
"""
|
||||
try:
|
||||
df = self._read_excel_safely(file_path, nrows=5)
|
||||
|
||||
# 检查列名中是否包含指定关键词
|
||||
columns_str = str(list(df.columns)).lower()
|
||||
|
||||
for indicator in self.content_indicators:
|
||||
if indicator.lower() in columns_str:
|
||||
self.logger.info(f"内容特征匹配成功: {indicator}")
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"检查内容特征时出错: {e}")
|
||||
return False
|
||||
|
||||
def _read_supplier_data(self, file_path: Path) -> Optional[pd.DataFrame]:
|
||||
"""读取供应商数据
|
||||
|
||||
Args:
|
||||
file_path: 文件路径
|
||||
|
||||
Returns:
|
||||
数据DataFrame或None
|
||||
"""
|
||||
try:
|
||||
df = self._read_excel_safely(file_path)
|
||||
|
||||
if df.empty:
|
||||
self.logger.warning("数据文件为空")
|
||||
return None
|
||||
|
||||
self.logger.info(f"成功读取数据,形状: {df.shape}")
|
||||
return df
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"读取数据失败: {e}")
|
||||
return None
|
||||
|
||||
def _read_excel_safely(self, file_path: Path, **kwargs) -> pd.DataFrame:
|
||||
"""根据扩展名选择合适的读取引擎并带有回退"""
|
||||
suffix = file_path.suffix.lower()
|
||||
try:
|
||||
if suffix == '.xlsx':
|
||||
return pd.read_excel(file_path, engine='openpyxl', **kwargs)
|
||||
elif suffix == '.xls':
|
||||
try:
|
||||
return pd.read_excel(file_path, engine='xlrd', **kwargs)
|
||||
except Exception as e:
|
||||
self.logger.warning(f"读取xls失败,可能缺少xlrd: {e}")
|
||||
raise
|
||||
else:
|
||||
return pd.read_excel(file_path, **kwargs)
|
||||
except Exception as e:
|
||||
self.logger.error(f"读取Excel失败: {file_path} - {e}")
|
||||
raise
|
||||
|
||||
def _apply_column_mapping(self, df: pd.DataFrame) -> Optional[pd.DataFrame]:
|
||||
"""应用列映射
|
||||
|
||||
Args:
|
||||
df: 原始数据
|
||||
|
||||
Returns:
|
||||
映射后的数据或None
|
||||
"""
|
||||
if not self.column_mapping:
|
||||
self.logger.info("没有列映射配置")
|
||||
return df
|
||||
|
||||
try:
|
||||
# 应用列重命名
|
||||
df_renamed = df.rename(columns=self.column_mapping)
|
||||
|
||||
# 检查必需的列是否存在
|
||||
required_columns = self.get_required_columns()
|
||||
missing_columns = [col for col in required_columns if col not in df_renamed.columns]
|
||||
|
||||
if missing_columns:
|
||||
self.logger.warning(f"缺少必需的列: {missing_columns}")
|
||||
# 创建缺失的列并填充默认值
|
||||
for col in missing_columns:
|
||||
df_renamed[col] = 0 if '量' in col or '价' in col else ''
|
||||
self.logger.info(f"创建缺失列: {col},默认值: {df_renamed[col].iloc[0] if len(df_renamed) > 0 else 'N/A'}")
|
||||
|
||||
self.logger.info(f"列映射完成,列名: {list(df_renamed.columns)}")
|
||||
return df_renamed
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"列映射失败: {e}")
|
||||
return None
|
||||
|
||||
def _apply_data_cleaning(self, df: pd.DataFrame) -> Optional[pd.DataFrame]:
|
||||
"""应用数据清洗规则
|
||||
|
||||
Args:
|
||||
df: 映射后的数据
|
||||
|
||||
Returns:
|
||||
清洗后的数据或None
|
||||
"""
|
||||
if not self.cleaning_rules:
|
||||
self.logger.info("没有数据清洗规则")
|
||||
return df
|
||||
|
||||
try:
|
||||
df_cleaned = df.copy()
|
||||
|
||||
for rule in self.cleaning_rules:
|
||||
rule_type = rule.get('type')
|
||||
|
||||
if rule_type == 'remove_rows':
|
||||
# 删除行
|
||||
condition = rule.get('condition')
|
||||
if condition:
|
||||
before_count = len(df_cleaned)
|
||||
df_cleaned = df_cleaned.query(condition)
|
||||
after_count = len(df_cleaned)
|
||||
self.logger.info(f"删除行规则: {condition}, 删除数量: {before_count - after_count}")
|
||||
|
||||
elif rule_type == 'fill_na':
|
||||
# 填充空值,兼容单列和多列
|
||||
columns = rule.get('columns') or [rule.get('column')] if rule.get('column') else []
|
||||
value = rule.get('value', 0)
|
||||
for col in columns:
|
||||
if col and col in df_cleaned.columns:
|
||||
na_count = df_cleaned[col].isna().sum()
|
||||
df_cleaned[col] = df_cleaned[col].fillna(value)
|
||||
self.logger.info(f"填充空值: {col} -> {value}, 填充数量: {na_count}")
|
||||
|
||||
elif rule_type == 'convert_type':
|
||||
# 类型转换,兼容单列和多列
|
||||
target_type = rule.get('target_type', 'float')
|
||||
columns = rule.get('columns') or [rule.get('column')] if rule.get('column') else []
|
||||
for col in columns:
|
||||
if col and col in df_cleaned.columns:
|
||||
try:
|
||||
if target_type == 'float':
|
||||
df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')
|
||||
elif target_type == 'int':
|
||||
df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce').astype('Int64')
|
||||
self.logger.info(f"类型转换: {col} -> {target_type}")
|
||||
except Exception as e:
|
||||
self.logger.warning(f"类型转换失败: {col} -> {target_type}: {e}")
|
||||
|
||||
else:
|
||||
self.logger.warning(f"未知的清洗规则类型: {rule_type}")
|
||||
|
||||
self.logger.info(f"数据清洗完成,数据形状: {df_cleaned.shape}")
|
||||
return df_cleaned
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"数据清洗失败: {e}")
|
||||
return None
|
||||
|
||||
def _apply_calculations(self, df: pd.DataFrame) -> Optional[pd.DataFrame]:
|
||||
"""应用计算处理
|
||||
|
||||
Args:
|
||||
df: 清洗后的数据
|
||||
|
||||
Returns:
|
||||
计算后的数据或None
|
||||
"""
|
||||
if not self.calculations:
|
||||
self.logger.info("没有计算规则")
|
||||
return df
|
||||
|
||||
try:
|
||||
df_calculated = df.copy()
|
||||
|
||||
for calculation in self.calculations:
|
||||
calc_type = calculation.get('type')
|
||||
|
||||
if calc_type == 'multiply':
|
||||
# 乘法计算
|
||||
source_column = calculation.get('source_column')
|
||||
target_column = calculation.get('target_column')
|
||||
factor = calculation.get('factor', 1)
|
||||
|
||||
if source_column and target_column:
|
||||
if source_column in df_calculated.columns:
|
||||
df_calculated[target_column] = df_calculated[source_column] * factor
|
||||
self.logger.info(f"乘法计算: {source_column} * {factor} -> {target_column}")
|
||||
else:
|
||||
self.logger.warning(f"源列不存在: {source_column}")
|
||||
|
||||
elif calc_type == 'divide':
|
||||
# 除法计算
|
||||
source_column = calculation.get('source_column')
|
||||
target_column = calculation.get('target_column')
|
||||
divisor = calculation.get('divisor', 1)
|
||||
|
||||
if source_column and target_column and divisor != 0:
|
||||
if source_column in df_calculated.columns:
|
||||
df_calculated[target_column] = df_calculated[source_column] / divisor
|
||||
self.logger.info(f"除法计算: {source_column} / {divisor} -> {target_column}")
|
||||
else:
|
||||
self.logger.warning(f"源列不存在: {source_column}")
|
||||
|
||||
elif calc_type == 'formula':
|
||||
# 公式计算
|
||||
formula = calculation.get('formula')
|
||||
target_column = calculation.get('target_column')
|
||||
|
||||
if formula and target_column:
|
||||
try:
|
||||
df_calculated[target_column] = df_calculated.eval(formula)
|
||||
self.logger.info(f"公式计算: {formula} -> {target_column}")
|
||||
except Exception as e:
|
||||
self.logger.error(f"公式计算失败: {formula}: {e}")
|
||||
|
||||
else:
|
||||
self.logger.warning(f"未知的计算类型: {calc_type}")
|
||||
|
||||
self.logger.info(f"计算处理完成,数据形状: {df_calculated.shape}")
|
||||
return df_calculated
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"计算处理失败: {e}")
|
||||
return None
|
||||
|
||||
def _generate_output(self, df: pd.DataFrame, input_file: Path, output_dir: Path) -> Optional[Path]:
|
||||
"""生成输出文件
|
||||
|
||||
Args:
|
||||
df: 最终数据
|
||||
input_file: 输入文件路径
|
||||
output_dir: 输出目录
|
||||
|
||||
Returns:
|
||||
输出文件路径或None
|
||||
"""
|
||||
try:
|
||||
# 生成输出文件名
|
||||
timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
|
||||
output_filename = f"{input_file.stem}{self.output_suffix}_{timestamp}.xls"
|
||||
output_file = output_dir / output_filename
|
||||
|
||||
# 这里应该使用实际的模板生成逻辑
|
||||
# 暂时直接保存为Excel文件
|
||||
df.to_excel(output_file, index=False)
|
||||
|
||||
self.logger.info(f"输出文件生成成功: {output_file}")
|
||||
return output_file
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"生成输出文件失败: {e}")
|
||||
return None
|
||||
@@ -0,0 +1,362 @@
|
||||
"""
|
||||
烟草订单处理器
|
||||
|
||||
处理烟草公司特定格式的订单明细文件,生成银豹采购单
|
||||
"""
|
||||
|
||||
import os
|
||||
import datetime
|
||||
import pandas as pd
|
||||
import xlrd
|
||||
import xlwt
|
||||
from xlutils.copy import copy
|
||||
from openpyxl import load_workbook
|
||||
from typing import Optional, Dict, Any, List, Tuple
|
||||
from pathlib import Path
|
||||
|
||||
from .base import BaseProcessor
|
||||
from ...core.utils.log_utils import get_logger
|
||||
from ...core.utils.dialog_utils import show_custom_dialog
|
||||
|
||||
# Module-level logger obtained from the project's get_logger helper.
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class TobaccoProcessor(BaseProcessor):
    """Processor for tobacco-company order-detail workbooks.

    Extracts the ordered items from the workbook and writes them into the
    Pospal (银豹) purchase-order template.
    """

    def __init__(self, config: Dict[str, Any]):
        """Initialise the tobacco order processor.

        Args:
            config: Configuration object, forwarded to ``BaseProcessor``. The
                template path is read from section ``Paths``, option
                ``template_file``.
        """
        super().__init__(config)
        self.description = "处理烟草公司订单明细文件"

        default_template = 'templates/银豹-采购单模板.xls'
        try:
            # ConfigParser-style access: get(section, option, fallback=...)
            self.template_file = config.get('Paths', 'template_file', fallback=default_template)
        except TypeError:
            # A plain dict rejects the ``fallback`` keyword; read it as a
            # nested mapping instead so the annotated type also works.
            paths = config.get('Paths') or {}
            self.template_file = (
                paths.get('template_file', default_template)
                if isinstance(paths, dict) else default_template
            )

        # Output directory; parents=True because ``data`` may not exist yet.
        self.result_dir = Path("data/result")
        self.result_dir.mkdir(parents=True, exist_ok=True)

        # Default output file name
        self.default_output_name = "银豹采购单_烟草公司.xls"

    def can_process(self, file_path: Path) -> bool:
        """Tell whether ``file_path`` looks like a tobacco order file.

        The file is accepted when its name contains a tobacco keyword
        (case-insensitive) or its header holds all characteristic columns.

        Args:
            file_path: Path of the candidate file.

        Returns:
            True when the file can be processed, False otherwise.
        """
        if not self.validate_input(file_path):
            return False

        filename = file_path.name
        tobacco_keywords = ['烟草', '卷烟', '订单明细', 'tobacco', '烟']

        # A name match decides the result on its own, so test it before
        # paying for a workbook read.
        lowered = filename.lower()
        if any(keyword in lowered for keyword in tobacco_keywords):
            self.logger.info(f"识别为烟草订单文件: {filename}")
            return True

        try:
            df = self._read_excel_safely(file_path, nrows=5)
            required_columns = ['商品', '盒码', '订单量']
            if all(col in df.columns for col in required_columns):
                self.logger.info(f"识别为烟草订单文件: {filename}")
                return True
            return False
        except Exception as e:
            # Unreadable content and no name match: not a tobacco order.
            self.logger.warning(f"检查文件内容时出错: {e}")
            return False

    def process(self, input_file: Path, output_dir: Path) -> Optional[Path]:
        """Convert a tobacco order workbook into a Pospal purchase order.

        Args:
            input_file: Path of the order-detail workbook.
            output_dir: Directory that receives the generated file.

        Returns:
            Path of the generated file, or None when processing fails.
        """
        self.log_processing_start(input_file)

        try:
            # Order metadata (time and total amount)
            order_info = self._read_order_info(input_file)
            if not order_info:
                self.logger.error(f"读取订单信息失败: {input_file}")
                self.log_processing_end(input_file, success=False)
                return None

            order_time, total_amount = order_info
            self.logger.info(f"订单信息 - 时间: {order_time}, 总金额: {total_amount}")

            # Order lines
            order_data = self._read_order_data(input_file)
            if order_data is None or order_data.empty:
                self.logger.error(f"读取订单数据失败或数据为空: {input_file}")
                self.log_processing_end(input_file, success=False)
                return None

            self.logger.info(f"成功读取订单数据,共{len(order_data)}条记录")

            # Timestamped output path
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            output_filename = f"银豹采购单_烟草公司_{timestamp}.xls"
            output_file = output_dir / output_filename
            output_file.parent.mkdir(parents=True, exist_ok=True)

            if not self._generate_pospal_order(order_data, order_time, output_file):
                self.logger.error("生成银豹采购单失败")
                self.log_processing_end(input_file, success=False)
                return None

            self.logger.info(f"采购单生成成功: {output_file}")
            self.log_processing_end(input_file, output_file, success=True)

            # Show the result dialog
            self._show_processing_result(output_file, order_time, len(order_data), total_amount)
            return output_file

        except Exception as e:
            self.logger.error(f"处理烟草订单时发生错误: {e}", exc_info=True)
            self.log_processing_end(input_file, success=False)
            return None

    def get_required_columns(self) -> List[str]:
        """Return the column names of the order-detail table."""
        return ['商品', '盒码', '条码', '建议零售价', '批发价', '需求量', '订单量', '金额']

    def get_supported_extensions(self) -> List[str]:
        """Return the supported file extensions."""
        return ['.xlsx', '.xls']

    def _read_order_info(self, file_path: Path) -> Optional[Tuple[str, float]]:
        """Read the order time (cell H1) and total amount (cell H3).

        ``.xls`` files are read with xlrd, everything else with openpyxl, so
        both advertised extensions work. Empty cells fall back to ``"(空)"``
        and ``0.0``.

        Args:
            file_path: Path of the workbook.

        Returns:
            ``(order_time, total_amount)``, or None when reading fails.
        """
        try:
            if file_path.suffix.lower() == '.xls':
                raw_time, raw_amount = self._read_info_cells_xls(file_path)
            else:
                raw_time, raw_amount = self._read_info_cells_xlsx(file_path)

            order_time = raw_time or "(空)"
            total_amount = raw_amount or 0.0

            self.logger.info(f"成功读取订单信息: 时间={order_time}, 总金额={total_amount}")
            return (order_time, total_amount)

        except Exception as e:
            self.logger.error(f"读取订单信息出错: {e}")
            return None

    @staticmethod
    def _read_info_cells_xlsx(file_path: Path) -> Tuple[Any, Any]:
        """Return the values of H1 and H3 from the active sheet (openpyxl)."""
        wb_info = load_workbook(file_path, data_only=True)
        try:
            ws_info = wb_info.active
            return ws_info["H1"].value, ws_info["H3"].value
        finally:
            # Release the file handle even when a cell lookup fails.
            wb_info.close()

    @staticmethod
    def _read_info_cells_xls(file_path: Path) -> Tuple[Any, Any]:
        """Return the values of H1 and H3 from the first sheet (xlrd).

        The first sheet is used because that is what ``pd.read_excel`` reads
        by default for the order lines.
        """
        book = xlrd.open_workbook(str(file_path))
        try:
            sheet = book.sheet_by_index(0)

            def cell(row: int, col: int) -> Any:
                # Cells outside the used range do not exist in xlrd.
                if row < sheet.nrows and col < sheet.ncols:
                    return sheet.cell_value(row, col)
                return None

            # Column H is index 7; rows 1 and 3 are indexes 0 and 2.
            return cell(0, 7), cell(2, 7)
        finally:
            book.release_resources()

    def _read_order_data(self, file_path: Path) -> Optional[pd.DataFrame]:
        """Read the order lines and derive purchase quantity and unit price.

        The first three rows are skipped and the table is read without a
        header. Rows are kept only when their order quantity is a non-zero
        number, so blank or text rows cannot break the later integer
        conversion.

        Args:
            file_path: Path of the workbook.

        Returns:
            The filtered lines with ``采购量`` and ``采购单价`` added, or None
            when nothing is left or reading fails.
        """
        columns = ['商品', '盒码', '条码', '建议零售价', '批发价', '需求量', '订单量', '金额']

        try:
            df_old = self._read_excel_safely(file_path, header=None, skiprows=3, names=columns)

            # Coerce first: a plain ``!= 0`` comparison also keeps NaN and
            # text cells.
            quantity = pd.to_numeric(df_old['订单量'], errors='coerce')
            df_filtered = df_old[quantity.notna() & (quantity != 0)].copy()

            if df_filtered.empty:
                self.logger.warning("没有订单量不为0的记录")
                return None

            df_filtered['订单量'] = quantity.loc[df_filtered.index]
            df_filtered['金额'] = pd.to_numeric(df_filtered['金额'], errors='coerce')

            # Purchase quantity is the order quantity times 10; the unit
            # price is the line amount spread over that quantity.
            df_filtered['采购量'] = df_filtered['订单量'] * 10
            df_filtered['采购单价'] = df_filtered['金额'] / df_filtered['采购量']
            df_filtered = df_filtered.reset_index(drop=True)

            self.logger.info(f"成功处理订单数据,有效记录数: {len(df_filtered)}")
            return df_filtered

        except Exception as e:
            self.logger.error(f"读取订单数据失败: {e}")
            return None

    def _read_excel_safely(self, file_path: Path, **kwargs) -> pd.DataFrame:
        """Read an Excel file with the engine that fits its extension.

        ``.xlsx`` uses openpyxl, ``.xls`` uses xlrd, and any other extension
        lets pandas choose. Extra keyword arguments go to ``pd.read_excel``.
        """
        suffix = file_path.suffix.lower()
        if suffix == '.xlsx':
            return pd.read_excel(file_path, engine='openpyxl', **kwargs)
        if suffix == '.xls':
            try:
                return pd.read_excel(file_path, engine='xlrd', **kwargs)
            except Exception as e:
                self.logger.error(f"读取xls失败,可能缺少xlrd: {e}")
                raise
        return pd.read_excel(file_path, **kwargs)

    def _generate_pospal_order(self, order_data: pd.DataFrame, order_time: str, output_file: Path) -> bool:
        """Fill the purchase-order template with the order lines and save it.

        Args:
            order_data: Order lines with ``盒码``, ``采购量`` and ``采购单价``.
            order_time: Order time (currently not written to the sheet).
            output_file: Destination path of the generated workbook.

        Returns:
            True when the workbook was written, False otherwise.
        """
        try:
            template_path = Path(self.template_file)
            if not template_path.exists():
                self.logger.error(f"采购单模板文件不存在: {template_path}")
                return False

            self.logger.info(f"使用模板文件: {template_path}")

            # Copy the template so its formatting is preserved in the output.
            template_rd = xlrd.open_workbook(str(template_path), formatting_info=True)
            template_wb = copy(template_rd)
            template_ws = template_wb.get_sheet(0)

            # Locate the target columns by their header text.
            header_row = template_rd.sheet_by_index(0).row_values(0)
            try:
                barcode_col = header_row.index("条码(必填)")
                amount_col = header_row.index("采购量(必填)")
                gift_col = header_row.index("赠送量")
                price_col = header_row.index("采购单价(必填)")
            except ValueError as e:
                self.logger.error(f"模板列查找失败: {e}")
                return False

            self.logger.info(f"模板列索引 - 条码:{barcode_col}, 采购量:{amount_col}, 赠送量:{gift_col}, 单价:{price_col}")

            # Row 0 is the header. Count rows positionally rather than by
            # index label, so a frame without a fresh index still fills
            # consecutive rows.
            for sheet_row, (_, row) in enumerate(order_data.iterrows(), start=1):
                template_ws.write(sheet_row, barcode_col, row['盒码'])  # item barcode
                template_ws.write(sheet_row, amount_col, int(row['采购量']))  # purchase quantity
                template_ws.write(sheet_row, gift_col, "")  # gift quantity left empty
                template_ws.write(sheet_row, price_col, round(row['采购单价'], 2))  # unit price, 2 decimals

            output_file.parent.mkdir(parents=True, exist_ok=True)
            template_wb.save(str(output_file))

            self.logger.info(f"采购单生成成功: {output_file}")
            return True

        except Exception as e:
            self.logger.error(f"生成银豹采购单失败: {e}", exc_info=True)
            return False

    def _show_processing_result(self, output_file: Path, order_time: str, total_count: int, total_amount: float):
        """Show a dialog summarising the processing result.

        Errors raised while building or showing the dialog are logged and
        swallowed, so a dialog failure does not fail the conversion.

        Args:
            output_file: Path of the generated file.
            order_time: Order time.
            total_count: Number of processed items.
            total_amount: Total amount; a string with thousands separators is
                accepted.
        """
        try:
            additional_info = {
                "订单来源": "烟草公司",
                "处理时间": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            }

            # Format the amount; fall back to the raw value when not numeric.
            try:
                if isinstance(total_amount, str):
                    total_amount = float(total_amount.replace(',', ''))
                amount_display = f"¥{total_amount:.2f}"
            except (ValueError, TypeError):
                amount_display = f"¥{total_amount}"

            show_custom_dialog(
                title="烟草订单处理结果",
                message="烟草订单处理完成",
                result_file=str(output_file),
                time_info=order_time,
                count_info=f"{total_count}个商品",
                amount_info=amount_display,
                additional_info=additional_info
            )

            self.logger.info(f"显示处理结果 - 文件:{output_file}, 时间:{order_time}, 数量:{total_count}, 金额:{total_amount}")

        except Exception as e:
            self.logger.error(f"显示处理结果时出错: {e}")

    def get_latest_tobacco_order(self) -> Optional[Path]:
        """Return the newest order-detail file (kept for the legacy interface).

        Looks for ``订单明细*.xlsx`` in ``data/output`` and returns the file
        with the latest ``st_ctime``. A warning is logged when that file was
        not created today.

        Returns:
            Path of the newest file, or None when there is none or on error.
        """
        try:
            result_dir = Path("data/output")
            if not result_dir.exists():
                return None

            today_start = datetime.datetime.combine(
                datetime.date.today(), datetime.time.min
            ).timestamp()

            # Stat each file once; newest first.
            stamped = sorted(
                ((path.stat().st_ctime, path) for path in result_dir.glob("订单明细*.xlsx")),
                key=lambda item: item[0],
                reverse=True,
            )

            if not stamped:
                self.logger.warning("未找到今天创建的烟草订单明细文件")
                return None

            newest_ctime, latest_file = stamped[0]
            if newest_ctime < today_start:
                # Nothing from today: fall back to the newest older file.
                self.logger.warning("未找到今天创建的烟草订单明细文件")
                return latest_file

            self.logger.info(f"找到最新烟草订单明细文件: {latest_file}")
            return latest_file

        except Exception as e:
            self.logger.error(f"获取最新烟草订单文件时出错: {e}")
            return None
|
||||
@@ -7,6 +7,7 @@
|
||||
import os
|
||||
import sys
|
||||
import logging
|
||||
from logging.handlers import RotatingFileHandler
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict
|
||||
@@ -58,7 +59,8 @@ def setup_logger(name: str,
|
||||
|
||||
# 创建文件处理器
|
||||
try:
|
||||
file_handler = logging.FileHandler(log_file, encoding='utf-8')
|
||||
# 使用滚动日志,限制单个日志大小与备份数量
|
||||
file_handler = RotatingFileHandler(log_file, maxBytes=5 * 1024 * 1024, backupCount=3, encoding='utf-8')
|
||||
file_handler.setFormatter(formatter)
|
||||
file_handler.setLevel(level)
|
||||
logger.addHandler(file_handler)
|
||||
@@ -175,4 +177,4 @@ def cleanup_active_marker(name: str) -> None:
|
||||
if os.path.exists(active_marker):
|
||||
os.remove(active_marker)
|
||||
except Exception as e:
|
||||
print(f"无法清理日志活跃标记: {e}")
|
||||
print(f"无法清理日志活跃标记: {e}")
|
||||
|
||||
Reference in New Issue
Block a user