添加数量为空时通过金额和单价计算数量的功能,增强规格解析能力
This commit is contained in:
+107
-78
@@ -215,88 +215,84 @@ class ExcelProcessor:
|
||||
|
||||
def extract_product_info(self, df: pd.DataFrame) -> List[Dict]:
|
||||
"""
|
||||
从处理后的数据框中提取商品信息
|
||||
支持处理不同格式的Excel文件
|
||||
从数据帧中提取商品信息
|
||||
|
||||
Args:
|
||||
df: 数据框
|
||||
df: 数据帧
|
||||
|
||||
Returns:
|
||||
商品信息列表,每个商品为一个字典
|
||||
商品信息列表
|
||||
"""
|
||||
products = []
|
||||
|
||||
# 检测表头位置和数据格式
|
||||
# 检测列映射
|
||||
column_mapping = self._detect_column_mapping(df)
|
||||
logger.info(f"列名映射结果: {column_mapping}")
|
||||
logger.info(f"检测到列映射: {column_mapping}")
|
||||
|
||||
# 检查是否有规格列
|
||||
has_specification_column = '规格' in df.columns
|
||||
logger.info(f"是否存在规格列: {has_specification_column}")
|
||||
|
||||
# 处理每一行数据
|
||||
# 处理每一行
|
||||
for idx, row in df.iterrows():
|
||||
try:
|
||||
# 跳过无效行:名称为空、包含小计/合计/总计/空行等
|
||||
name_val = str(row[column_mapping['name']]) if column_mapping.get('name') and not pd.isna(row[column_mapping['name']]) else ''
|
||||
if not name_val or any(key in name_val for key in ["小计", "合计", "总计"]):
|
||||
continue
|
||||
# 条码处理 - 确保条码总是字符串格式且不带小数点
|
||||
barcode_raw = row[column_mapping['barcode']] if column_mapping.get('barcode') else ''
|
||||
if pd.isna(barcode_raw) or barcode_raw == '' or str(barcode_raw).strip() in ['nan', 'None']:
|
||||
continue
|
||||
# 跳过条码长度异常、数量为0、单价为0且名称疑似无效的行
|
||||
if (len(str(barcode_raw)) < 7) or (column_mapping.get('quantity') and (pd.isna(row[column_mapping['quantity']]) or str(row[column_mapping['quantity']]).strip() in ['nan', 'None', '0', '0.0'])):
|
||||
continue
|
||||
|
||||
# 使用format_barcode函数处理条码,确保无小数点
|
||||
barcode = format_barcode(barcode_raw)
|
||||
|
||||
# 处理数量字段,先提取数字部分再转换为浮点数
|
||||
quantity_value = 0
|
||||
quantity_str = ""
|
||||
if column_mapping.get('quantity') and not pd.isna(row[column_mapping['quantity']]):
|
||||
quantity_str = str(row[column_mapping['quantity']])
|
||||
# 使用提取数字的函数
|
||||
quantity_num = extract_number(quantity_str)
|
||||
if quantity_num is not None:
|
||||
quantity_value = quantity_num
|
||||
|
||||
# 基础信息
|
||||
# 初始化商品信息
|
||||
product = {
|
||||
'barcode': barcode,
|
||||
'name': str(row[column_mapping['name']]) if column_mapping.get('name') else '',
|
||||
'quantity': quantity_value,
|
||||
'price': 0,
|
||||
'unit': str(row[column_mapping['unit']]) if column_mapping.get('unit') and not pd.isna(row[column_mapping['unit']]) else '',
|
||||
'specification': '',
|
||||
'package_quantity': None
|
||||
'barcode': '', # 条码
|
||||
'name': '', # 商品名称
|
||||
'specification': '', # 规格
|
||||
'quantity': 0, # 数量
|
||||
'unit': '', # 单位
|
||||
'price': 0, # 单价
|
||||
'amount': 0, # 金额
|
||||
'is_gift': False # 是否为赠品
|
||||
}
|
||||
|
||||
# 处理价格字段 - 清理可能的换行符和空格
|
||||
if column_mapping.get('price') and not pd.isna(row[column_mapping['price']]):
|
||||
price_str = str(row[column_mapping['price']])
|
||||
# 清理换行符、空格并替换逗号
|
||||
price_str = price_str.replace('\n', '').replace(' ', '').replace(',', '.')
|
||||
try:
|
||||
product['price'] = float(price_str)
|
||||
except ValueError:
|
||||
logger.warning(f"价格转换失败,原始值: '{price_str}',使用默认值0")
|
||||
# 提取条码
|
||||
if '条码' in df.columns and not pd.isna(row['条码']):
|
||||
product['barcode'] = str(row['条码']).strip()
|
||||
elif column_mapping.get('barcode') and not pd.isna(row[column_mapping['barcode']]):
|
||||
product['barcode'] = str(row[column_mapping['barcode']]).strip()
|
||||
|
||||
# 清理单位
|
||||
if product['unit'] == 'nan' or product['unit'] == 'None':
|
||||
product['unit'] = ''
|
||||
# 跳过空条码行
|
||||
if not product['barcode']:
|
||||
continue
|
||||
|
||||
# 提取商品名称
|
||||
if '商品名称' in df.columns and not pd.isna(row['商品名称']):
|
||||
product['name'] = str(row['商品名称']).strip()
|
||||
elif '名称' in df.columns and not pd.isna(row['名称']):
|
||||
product['name'] = str(row['名称']).strip()
|
||||
elif column_mapping.get('name') and not pd.isna(row[column_mapping['name']]):
|
||||
product['name'] = str(row[column_mapping['name']]).strip()
|
||||
|
||||
# 提取单位
|
||||
if '单位' in df.columns and not pd.isna(row['单位']):
|
||||
product['unit'] = str(row['单位']).strip()
|
||||
elif column_mapping.get('unit') and not pd.isna(row[column_mapping['unit']]):
|
||||
product['unit'] = str(row[column_mapping['unit']]).strip()
|
||||
|
||||
# 打印每行提取出的信息
|
||||
logger.info(f"第{idx+1}行: 提取商品信息 条码={product['barcode']}, 名称={product['name']}, 规格={product['specification']}, 数量={product['quantity']}, 单位={product['unit']}, 单价={product['price']}")
|
||||
# 提取单价
|
||||
if '单价' in df.columns and not pd.isna(row['单价']):
|
||||
product['price'] = row['单价']
|
||||
elif column_mapping.get('price') and not pd.isna(row[column_mapping['price']]):
|
||||
product['price'] = row[column_mapping['price']]
|
||||
|
||||
# 从数量字段中提取单位(如果单位字段为空)
|
||||
if not product['unit'] and quantity_str:
|
||||
num, unit = self.unit_converter.extract_unit_from_quantity(quantity_str)
|
||||
# 提取金额
|
||||
if '金额' in df.columns and not pd.isna(row['金额']):
|
||||
product['amount'] = row['金额']
|
||||
elif '小计' in df.columns and not pd.isna(row['小计']):
|
||||
product['amount'] = row['小计']
|
||||
elif column_mapping.get('amount') and not pd.isna(row[column_mapping['amount']]):
|
||||
product['amount'] = row[column_mapping['amount']]
|
||||
|
||||
# 提取数量
|
||||
if '数量' in df.columns and not pd.isna(row['数量']):
|
||||
product['quantity'] = row['数量']
|
||||
elif column_mapping.get('quantity') and not pd.isna(row[column_mapping['quantity']]):
|
||||
product['quantity'] = row[column_mapping['quantity']]
|
||||
|
||||
# 处理可能的复合数量字段,例如"2箱"、"3件"
|
||||
if isinstance(product['quantity'], str) and product['quantity']:
|
||||
num, unit = self.unit_converter.extract_unit_from_quantity(product['quantity'])
|
||||
if unit:
|
||||
product['unit'] = unit
|
||||
logger.info(f"从数量提取单位: {quantity_str} -> {unit}")
|
||||
# 如果数量被提取出来,更新数量
|
||||
if num is not None:
|
||||
product['quantity'] = num
|
||||
|
||||
@@ -665,7 +661,7 @@ class ExcelProcessor:
|
||||
|
||||
def _detect_column_mapping(self, df: pd.DataFrame) -> Dict[str, str]:
|
||||
"""
|
||||
检测和映射Excel表头列名
|
||||
自动检测列名映射
|
||||
|
||||
Args:
|
||||
df: 数据框
|
||||
@@ -687,7 +683,8 @@ class ExcelProcessor:
|
||||
'specification': ['规格', '规格型号', '型号', '商品规格', '产品规格', '包装规格','规 格'],
|
||||
'quantity': ['数量', '采购数量', '购买数量', '采购数量', '订单数量', '数量(必填)', '入库数', '入库数量','数 量'],
|
||||
'unit': ['单位', '采购单位', '计量单位', '单位(必填)', '单位名称', '计价单位','单 位'],
|
||||
'price': ['单价', '价格', '采购单价', '销售价', '进货价', '单价(必填)', '采购价', '参考价', '入库单价','单 价']
|
||||
'price': ['单价', '价格', '采购单价', '销售价', '进货价', '单价(必填)', '采购价', '参考价', '入库单价','单 价'],
|
||||
'amount': ['金额', '小计', '总价', '合计金额', '小计金额', '金额(元)', '金额合计', '合计', '总额']
|
||||
}
|
||||
|
||||
# 映射列名到标准名称
|
||||
@@ -719,7 +716,7 @@ class ExcelProcessor:
|
||||
if target in mapped_columns:
|
||||
break
|
||||
|
||||
return mapped_columns
|
||||
return mapped_columns
|
||||
|
||||
def infer_specification_from_name(self, product_name: str) -> Tuple[Optional[str], Optional[int]]:
|
||||
"""
|
||||
@@ -819,7 +816,7 @@ class ExcelProcessor:
|
||||
def parse_specification(self, spec_str: str) -> Optional[int]:
|
||||
"""
|
||||
解析规格字符串,提取包装数量
|
||||
支持格式:1*15, 1x15, 1*5*10
|
||||
支持格式:1*15, 1x15, 1*5*10, 5kg*6, IL*12等
|
||||
|
||||
Args:
|
||||
spec_str: 规格字符串
|
||||
@@ -834,34 +831,66 @@ class ExcelProcessor:
|
||||
# 清理规格字符串
|
||||
spec_str = clean_string(spec_str)
|
||||
|
||||
# 匹配重量/容量格式,如"450g*15"、"450ml*15"
|
||||
match = re.search(r'\d+(?:g|ml|毫升|克)[*xX×](\d+)', spec_str)
|
||||
# 处理可能的OCR误识别,如"IL"应为"1L","6oo"应为"600"
|
||||
spec_str = re.sub(r'(\b|^)[iIlL](\d+)', r'1\2', spec_str) # 将"IL"替换为"1L"
|
||||
spec_str = re.sub(r'(\d+)[oO0]{2,}', lambda m: m.group(1) + '00', spec_str) # 将"6oo"替换为"600"
|
||||
spec_str = spec_str.replace('×', '*').replace('x', '*').replace('X', '*') # 统一乘号
|
||||
|
||||
logger.debug(f"清理后的规格字符串: {spec_str}")
|
||||
|
||||
# 匹配带单位的格式,如"5kg*6"、"450g*15"、"450ml*15"
|
||||
weight_pattern = r'(\d+(?:\.\d+)?)\s*(?:kg|KG|千克|公斤)[*×](\d+)'
|
||||
match = re.search(weight_pattern, spec_str)
|
||||
if match:
|
||||
return int(match.group(2))
|
||||
|
||||
# 匹配克、毫升等单位格式
|
||||
match = re.search(r'\d+(?:\.\d+)?(?:g|G|ml|ML|mL|毫升|克)[*×](\d+)', spec_str)
|
||||
if match:
|
||||
# 返回后面的数量
|
||||
return int(match.group(1))
|
||||
|
||||
# 匹配1*5*10 格式的三级规格
|
||||
match = re.search(r'(\d+)[\*xX×](\d+)[\*xX×](\d+)', spec_str)
|
||||
match = re.search(r'(\d+(?:\.\d+)?)[*×](\d+(?:\.\d+)?)[*×](\d+(?:\.\d+)?)', spec_str)
|
||||
if match:
|
||||
# 取最后一个数字作为袋数量
|
||||
return int(match.group(3))
|
||||
return int(float(match.group(3)))
|
||||
|
||||
# 匹配1*15, 1x15 格式
|
||||
match = re.search(r'(\d+)[\*xX×](\d+)', spec_str)
|
||||
match = re.search(r'(\d+(?:\.\d+)?)[*×](\d+(?:\.\d+)?)', spec_str)
|
||||
if match:
|
||||
# 取第二个数字作为包装数量
|
||||
return int(match.group(2))
|
||||
return int(float(match.group(2)))
|
||||
|
||||
# 匹配24瓶/件等格式
|
||||
match = re.search(r'(\d+)[瓶个支袋][//](件|箱)', spec_str)
|
||||
match = re.search(r'(\d+(?:\.\d+)?)[瓶个支袋][//](件|箱)', spec_str)
|
||||
if match:
|
||||
return int(match.group(1))
|
||||
return int(float(match.group(1)))
|
||||
|
||||
# 匹配4L格式
|
||||
match = re.search(r'(\d+(?:\.\d+)?)\s*[Ll升][*×]?(\d+)?', spec_str)
|
||||
match = re.search(r'(\d+(?:\.\d+)?)\s*[Ll升][*×]?(\d+(?:\.\d+)?)?', spec_str)
|
||||
if match:
|
||||
# 如果有第二个数字,返回它;否则返回1
|
||||
return int(match.group(2)) if match.group(2) else 1
|
||||
return int(float(match.group(2))) if match.group(2) else 1
|
||||
|
||||
# 匹配单独的数字+单位格式,如"12瓶装"
|
||||
match = re.search(r'(\d+(?:\.\d+)?)[瓶个支袋包盒罐箱](?:装|\/箱)?', spec_str)
|
||||
if match:
|
||||
return int(float(match.group(1)))
|
||||
|
||||
# 尝试直接匹配任何数字
|
||||
numbers = re.findall(r'\d+(?:\.\d+)?', spec_str)
|
||||
if numbers and len(numbers) > 0:
|
||||
# 如果只有一个数字,通常是包装数量
|
||||
if len(numbers) == 1:
|
||||
return int(float(numbers[0]))
|
||||
|
||||
# 如果有多个数字,尝试识别可能的包装数量(典型数值如6/12/24/30)
|
||||
for num in numbers:
|
||||
if float(num) in [6.0, 12.0, 24.0, 30.0]:
|
||||
return int(float(num))
|
||||
|
||||
# 如果没有典型数值,选择最后一个数字(通常是包装数量)
|
||||
return int(float(numbers[-1]))
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"解析规格'{spec_str}'时出错: {e}")
|
||||
|
||||
@@ -142,8 +142,17 @@ class ProductValidator:
|
||||
if price_str in ["赠品", "gift", "赠送", "0", ""]:
|
||||
return True, 0.0, True, None
|
||||
|
||||
# 去除空白和非数字字符(保留小数点)
|
||||
price_clean = re.sub(r'[^\d\.]', '', price_str)
|
||||
# 去除空白和非数字字符(保留小数点和逗号)
|
||||
price_clean = re.sub(r'[^\d\.,]', '', price_str)
|
||||
|
||||
# 处理小数点和逗号
|
||||
if ',' in price_clean and '.' not in price_clean:
|
||||
# 如果只有逗号没有小数点,将逗号视为小数点
|
||||
price_clean = price_clean.replace(',', '.')
|
||||
elif ',' in price_clean and '.' in price_clean:
|
||||
# 如果既有逗号又有小数点,移除逗号(认为逗号是千位分隔符)
|
||||
price_clean = price_clean.replace(',', '')
|
||||
|
||||
if not price_clean:
|
||||
return False, 0.0, True, "单价不包含数字,视为赠品"
|
||||
|
||||
@@ -192,15 +201,6 @@ class ProductValidator:
|
||||
# 即使验证失败,但如果有修复后的条码仍然使用它
|
||||
validated_product['barcode'] = fixed_barcode
|
||||
|
||||
# 验证数量
|
||||
quantity = product.get('quantity', 0)
|
||||
is_valid, fixed_quantity, error_msg = self.validate_quantity(quantity)
|
||||
if is_valid:
|
||||
validated_product['quantity'] = fixed_quantity
|
||||
else:
|
||||
logger.warning(f"数量验证失败: {error_msg}")
|
||||
validated_product['quantity'] = 0.0
|
||||
|
||||
# 验证单价
|
||||
price = product.get('price', 0)
|
||||
is_valid, fixed_price, is_gift, error_msg = self.validate_price(price)
|
||||
@@ -212,4 +212,46 @@ class ProductValidator:
|
||||
if error_msg:
|
||||
logger.info(error_msg)
|
||||
|
||||
# 验证数量
|
||||
quantity = product.get('quantity', None)
|
||||
is_valid, fixed_quantity, error_msg = self.validate_quantity(quantity)
|
||||
|
||||
# 检查数量是否为空,但单价和金额存在的情况
|
||||
if not is_valid and error_msg == "数量为空":
|
||||
# 获取金额
|
||||
amount = product.get('amount', None)
|
||||
|
||||
# 如果单价有效且金额存在,则可以计算数量
|
||||
if fixed_price > 0 and amount is not None:
|
||||
try:
|
||||
# 确保金额是数字
|
||||
if isinstance(amount, str):
|
||||
# 移除货币符号和非数字字符,保留数字、小数点和逗号
|
||||
amount_str = re.sub(r'[^\d\.,]', '', amount.strip())
|
||||
# 替换逗号为小数点(如果逗号作为小数分隔符)
|
||||
if ',' in amount_str and '.' not in amount_str:
|
||||
amount_str = amount_str.replace(',', '.')
|
||||
# 处理既有逗号又有小数点的情况(通常逗号是千位分隔符)
|
||||
elif ',' in amount_str and '.' in amount_str:
|
||||
amount_str = amount_str.replace(',', '')
|
||||
amount = float(amount_str)
|
||||
else:
|
||||
amount = float(amount)
|
||||
|
||||
# 计算数量 = 金额 / 单价
|
||||
if amount > 0:
|
||||
calculated_quantity = amount / fixed_price
|
||||
logger.info(f"数量为空,通过金额({amount})和单价({fixed_price})计算得出数量: {calculated_quantity}")
|
||||
validated_product['quantity'] = calculated_quantity
|
||||
is_valid = True
|
||||
except (ValueError, TypeError, ZeroDivisionError) as e:
|
||||
logger.warning(f"通过金额和单价计算数量失败: {e}")
|
||||
|
||||
# 如果数量验证有效或通过金额计算成功
|
||||
if is_valid:
|
||||
validated_product['quantity'] = fixed_quantity if is_valid and fixed_quantity > 0 else validated_product.get('quantity', 0)
|
||||
else:
|
||||
logger.warning(f"数量验证失败: {error_msg}")
|
||||
validated_product['quantity'] = 0.0
|
||||
|
||||
return validated_product
|
||||
Reference in New Issue
Block a user