feat: 益选 OCR 订单处理系统初始提交
- 智能供应商识别(蓉城易购/烟草/杨碧月/通用) - 百度 OCR 表格识别集成 - 规则引擎(列映射/数据清洗/单位转换/规格推断) - 条码映射管理与云端同步(Gitea REST API) - 云端同步支持:条码映射、供应商配置、商品资料、采购模板 - 拖拽一键处理(图片→OCR→Excel→合并) - 191 个单元测试 - 移除无用的模板管理功能 - 清理 IDE 产物目录 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,9 @@
|
||||
"""
|
||||
数据处理handlers模块初始化文件
|
||||
"""
|
||||
|
||||
from .data_cleaner import DataCleaner
|
||||
from .column_mapper import ColumnMapper
|
||||
from .calculator import DataCalculator
|
||||
|
||||
__all__ = ['DataCleaner', 'ColumnMapper', 'DataCalculator']
|
||||
@@ -0,0 +1,378 @@
|
||||
"""
|
||||
数据计算处理器
|
||||
|
||||
提供各种数据计算功能,如数量计算、价格计算、汇总统计等
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from typing import Dict, Any, Optional, List, Union
|
||||
from ...core.utils.log_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class DataCalculator:
|
||||
"""数据计算处理器
|
||||
|
||||
提供标准化的数据计算功能,支持各种业务计算规则
|
||||
"""
|
||||
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||
"""初始化数据计算器
|
||||
|
||||
Args:
|
||||
config: 计算配置
|
||||
"""
|
||||
self.config = config or {}
|
||||
self.calculation_rules = []
|
||||
|
||||
def add_rule(self, rule_type: str, **kwargs):
|
||||
"""添加计算规则
|
||||
|
||||
Args:
|
||||
rule_type: 规则类型
|
||||
**kwargs: 规则参数
|
||||
"""
|
||||
rule = {'type': rule_type, **kwargs}
|
||||
self.calculation_rules.append(rule)
|
||||
logger.debug(f"添加计算规则: {rule_type}")
|
||||
|
||||
def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""执行数据计算
|
||||
|
||||
Args:
|
||||
df: 输入数据
|
||||
|
||||
Returns:
|
||||
计算后的数据
|
||||
"""
|
||||
logger.info(f"开始数据计算,原始数据形状: {df.shape}")
|
||||
|
||||
result_df = df.copy()
|
||||
|
||||
for i, rule in enumerate(self.calculation_rules):
|
||||
try:
|
||||
logger.debug(f"执行计算规则 {i+1}/{len(self.calculation_rules)}: {rule['type']}")
|
||||
result_df = self._apply_rule(result_df, rule)
|
||||
logger.debug(f"规则执行完成,数据形状: {result_df.shape}")
|
||||
except Exception as e:
|
||||
logger.error(f"计算规则执行失败: {rule}, 错误: {e}")
|
||||
# 继续执行下一个规则,而不是中断整个流程
|
||||
continue
|
||||
|
||||
logger.info(f"数据计算完成,最终数据形状: {result_df.shape}")
|
||||
return result_df
|
||||
|
||||
def _apply_rule(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""应用单个计算规则
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
rule_type = rule.get('type')
|
||||
|
||||
if rule_type == 'multiply':
|
||||
return self._multiply(df, rule)
|
||||
elif rule_type == 'divide':
|
||||
return self._divide(df, rule)
|
||||
elif rule_type == 'add':
|
||||
return self._add(df, rule)
|
||||
elif rule_type == 'subtract':
|
||||
return self._subtract(df, rule)
|
||||
elif rule_type == 'formula':
|
||||
return self._formula(df, rule)
|
||||
elif rule_type == 'round':
|
||||
return self._round(df, rule)
|
||||
elif rule_type == 'sum':
|
||||
return self._sum(df, rule)
|
||||
elif rule_type == 'aggregate':
|
||||
return self._aggregate(df, rule)
|
||||
else:
|
||||
logger.warning(f"未知的计算规则类型: {rule_type}")
|
||||
return df
|
||||
|
||||
def _multiply(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""乘法计算
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
source_column = rule.get('source_column')
|
||||
target_column = rule.get('target_column')
|
||||
factor = rule.get('factor', 1)
|
||||
|
||||
if source_column and target_column:
|
||||
if source_column in df.columns:
|
||||
df[target_column] = df[source_column] * factor
|
||||
logger.debug(f"乘法计算: {source_column} * {factor} -> {target_column}")
|
||||
else:
|
||||
logger.warning(f"源列不存在: {source_column}")
|
||||
|
||||
return df
|
||||
|
||||
def _divide(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""除法计算
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
source_column = rule.get('source_column')
|
||||
target_column = rule.get('target_column')
|
||||
divisor = rule.get('divisor', 1)
|
||||
|
||||
if source_column and target_column and divisor != 0:
|
||||
if source_column in df.columns:
|
||||
df[target_column] = df[source_column] / divisor
|
||||
logger.debug(f"除法计算: {source_column} / {divisor} -> {target_column}")
|
||||
else:
|
||||
logger.warning(f"源列不存在: {source_column}")
|
||||
elif divisor == 0:
|
||||
logger.error("除数不能为0")
|
||||
|
||||
return df
|
||||
|
||||
def _add(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""加法计算
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
columns = rule.get('columns', [])
|
||||
target_column = rule.get('target_column')
|
||||
constant = rule.get('constant', 0)
|
||||
|
||||
if target_column:
|
||||
if isinstance(columns, str):
|
||||
columns = [columns]
|
||||
|
||||
if columns:
|
||||
# 列相加
|
||||
valid_columns = [col for col in columns if col in df.columns]
|
||||
if valid_columns:
|
||||
df[target_column] = df[valid_columns].sum(axis=1) + constant
|
||||
logger.debug(f"加法计算: {valid_columns} + {constant} -> {target_column}")
|
||||
else:
|
||||
logger.warning(f"没有有效的列用于加法计算: {columns}")
|
||||
else:
|
||||
# 只加常数
|
||||
if target_column in df.columns:
|
||||
df[target_column] = df[target_column] + constant
|
||||
logger.debug(f"加法计算: {target_column} + {constant}")
|
||||
else:
|
||||
logger.warning(f"目标列不存在: {target_column}")
|
||||
|
||||
return df
|
||||
|
||||
def _subtract(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""减法计算
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
minuend = rule.get('minuend') # 被减数列
|
||||
subtrahend = rule.get('subtrahend') # 减数列
|
||||
target_column = rule.get('target_column')
|
||||
constant = rule.get('constant', 0)
|
||||
|
||||
if target_column and minuend and minuend in df.columns:
|
||||
if subtrahend and subtrahend in df.columns:
|
||||
df[target_column] = df[minuend] - df[subtrahend] - constant
|
||||
logger.debug(f"减法计算: {minuend} - {subtrahend} - {constant} -> {target_column}")
|
||||
else:
|
||||
df[target_column] = df[minuend] - constant
|
||||
logger.debug(f"减法计算: {minuend} - {constant} -> {target_column}")
|
||||
else:
|
||||
logger.warning(f"减法计算参数不完整或列不存在")
|
||||
|
||||
return df
|
||||
|
||||
def _formula(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""公式计算
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
formula = rule.get('formula')
|
||||
target_column = rule.get('target_column')
|
||||
|
||||
if formula and target_column:
|
||||
try:
|
||||
df[target_column] = df.eval(formula)
|
||||
logger.debug(f"公式计算: {formula} -> {target_column}")
|
||||
except Exception as e:
|
||||
logger.error(f"公式计算失败: {formula}, 错误: {e}")
|
||||
else:
|
||||
logger.warning("公式计算缺少公式或目标列")
|
||||
|
||||
return df
|
||||
|
||||
def _round(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""四舍五入
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
columns = rule.get('columns', [])
|
||||
decimals = rule.get('decimals', 0)
|
||||
|
||||
if isinstance(columns, str):
|
||||
columns = [columns]
|
||||
|
||||
target_columns = columns or df.select_dtypes(include=[np.number]).columns
|
||||
|
||||
for col in target_columns:
|
||||
if col in df.columns and pd.api.types.is_numeric_dtype(df[col]):
|
||||
df[col] = df[col].round(decimals)
|
||||
logger.debug(f"四舍五入: {col} 保留 {decimals} 位小数")
|
||||
|
||||
return df
|
||||
|
||||
def _sum(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""求和计算
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
columns = rule.get('columns', [])
|
||||
target_column = rule.get('target_column')
|
||||
group_by = rule.get('group_by')
|
||||
|
||||
if isinstance(columns, str):
|
||||
columns = [columns]
|
||||
|
||||
if group_by and group_by in df.columns:
|
||||
# 分组求和
|
||||
if columns:
|
||||
for col in columns:
|
||||
if col in df.columns:
|
||||
sum_result = df.groupby(group_by)[col].sum()
|
||||
logger.debug(f"分组求和: {col} 按 {group_by} 分组")
|
||||
else:
|
||||
# 所有数值列分组求和
|
||||
numeric_columns = df.select_dtypes(include=[np.number]).columns
|
||||
sum_result = df.groupby(group_by)[numeric_columns].sum()
|
||||
logger.debug(f"分组求和: 所有数值列 按 {group_by} 分组")
|
||||
else:
|
||||
# 总体求和
|
||||
if columns:
|
||||
valid_columns = [col for col in columns if col in df.columns]
|
||||
if valid_columns and target_column:
|
||||
df[target_column] = df[valid_columns].sum(axis=1)
|
||||
logger.debug(f"求和计算: {valid_columns} -> {target_column}")
|
||||
else:
|
||||
# 所有数值列求和
|
||||
numeric_columns = df.select_dtypes(include=[np.number]).columns
|
||||
if target_column and len(numeric_columns) > 0:
|
||||
df[target_column] = df[numeric_columns].sum(axis=1)
|
||||
logger.debug(f"求和计算: {list(numeric_columns)} -> {target_column}")
|
||||
|
||||
return df
|
||||
|
||||
def _aggregate(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""聚合计算
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
group_by = rule.get('group_by')
|
||||
aggregations = rule.get('aggregations', {})
|
||||
|
||||
if group_by and group_by in df.columns:
|
||||
# 构建聚合函数字典
|
||||
agg_dict = {}
|
||||
for column, func in aggregations.items():
|
||||
if column in df.columns:
|
||||
if isinstance(func, str):
|
||||
agg_dict[column] = func
|
||||
elif isinstance(func, list):
|
||||
agg_dict[column] = func
|
||||
|
||||
if agg_dict:
|
||||
result = df.groupby(group_by).agg(agg_dict)
|
||||
logger.debug(f"聚合计算: 按 {group_by} 分组, 聚合: {agg_dict}")
|
||||
return result.reset_index()
|
||||
|
||||
return df
|
||||
|
||||
# 便捷方法
|
||||
def multiply(self, source_column: str, target_column: str, factor: float):
|
||||
"""乘法计算"""
|
||||
self.add_rule('multiply', source_column=source_column,
|
||||
target_column=target_column, factor=factor)
|
||||
return self
|
||||
|
||||
def divide(self, source_column: str, target_column: str, divisor: float):
|
||||
"""除法计算"""
|
||||
self.add_rule('divide', source_column=source_column,
|
||||
target_column=target_column, divisor=divisor)
|
||||
return self
|
||||
|
||||
def add(self, columns: Union[str, List[str]], target_column: str, constant: float = 0):
|
||||
"""加法计算"""
|
||||
self.add_rule('add', columns=columns, target_column=target_column, constant=constant)
|
||||
return self
|
||||
|
||||
def subtract(self, minuend: str, target_column: str,
|
||||
subtrahend: Optional[str] = None, constant: float = 0):
|
||||
"""减法计算"""
|
||||
self.add_rule('subtract', minuend=minuend, target_column=target_column,
|
||||
subtrahend=subtrahend, constant=constant)
|
||||
return self
|
||||
|
||||
def formula(self, formula: str, target_column: str):
|
||||
"""公式计算"""
|
||||
self.add_rule('formula', formula=formula, target_column=target_column)
|
||||
return self
|
||||
|
||||
def round_columns(self, columns: Optional[Union[str, List[str]]] = None, decimals: int = 0):
|
||||
"""四舍五入"""
|
||||
self.add_rule('round', columns=columns, decimals=decimals)
|
||||
return self
|
||||
|
||||
def sum_columns(self, columns: Optional[Union[str, List[str]]] = None,
|
||||
target_column: Optional[str] = None, group_by: Optional[str] = None):
|
||||
"""求和计算"""
|
||||
self.add_rule('sum', columns=columns, target_column=target_column, group_by=group_by)
|
||||
return self
|
||||
|
||||
def aggregate(self, group_by: str, aggregations: Dict[str, Union[str, List[str]]]):
|
||||
"""聚合计算"""
|
||||
self.add_rule('aggregate', group_by=group_by, aggregations=aggregations)
|
||||
return self
|
||||
@@ -0,0 +1,382 @@
|
||||
"""
|
||||
列映射处理器
|
||||
|
||||
提供列名映射和转换功能,支持不同供应商的列名标准化
|
||||
"""
|
||||
|
||||
import re
|
||||
import pandas as pd
|
||||
from typing import Dict, Any, Optional, List, Union
|
||||
from ...core.utils.log_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class ColumnMapper:
|
||||
"""列映射处理器
|
||||
|
||||
提供列名标准化功能,将不同供应商的列名映射到标准列名
|
||||
"""
|
||||
|
||||
# 标准列名定义(所有列名别名的唯一来源)
|
||||
STANDARD_COLUMNS = {
|
||||
'barcode': [
|
||||
'条码', '条形码', '商品条码', '商品条形码', '产品条码', '商品编码',
|
||||
'商品编号', '条码(必填)', '电脑条码', '条码ID',
|
||||
'barcode', 'Barcode', 'BarCode', 'code', '编码',
|
||||
],
|
||||
'name': [
|
||||
'商品名称', '产品名称', '名称', '商品', '产品', '商品名', '品名',
|
||||
'品项名', '商品或服务名称', '品项', '名 称',
|
||||
'name', 'product_name',
|
||||
],
|
||||
'specification': [
|
||||
'规格', '规格型号', '型号', '商品规格', '产品规格', '包装规格', '规 格',
|
||||
'specification', 'spec', 'model',
|
||||
],
|
||||
'quantity': [
|
||||
'数量', '采购量', '订货数量', '订单量', '需求量', '采购数量', '购买数量',
|
||||
'订单数量', '数量(必填)', '采购量(必填)', '入库数', '入库数量', '数 量',
|
||||
'quantity', 'qty',
|
||||
],
|
||||
'unit': [
|
||||
'单位', '计量单位', '采购单位', '单位(必填)', '单位名称', '计价单位', '单 位',
|
||||
'unit', 'units',
|
||||
],
|
||||
'unit_price': [
|
||||
'单价', '价格', '采购单价', '进货价', '销售价', '采购价', '参考价',
|
||||
'入库单价', '单价(必填)', '采购单价(必填)', '价格(必填)', '单 价',
|
||||
'unit_price', 'price',
|
||||
],
|
||||
'total_price': [
|
||||
'总价', '金额', '小计', '合计金额', '小计金额', '金额(元)',
|
||||
'金额合计', '合计', '总额',
|
||||
'total_price', 'total', 'amount',
|
||||
],
|
||||
'gift_quantity': [
|
||||
'赠送量', '赠品数量', '赠送数量', '赠品',
|
||||
],
|
||||
'category': ['类别', '分类', '商品类别', 'category', 'type'],
|
||||
'brand': ['品牌', '商标', 'brand'],
|
||||
'supplier': ['供应商', '供货商', 'supplier', 'vendor'],
|
||||
}
|
||||
|
||||
def __init__(self, mapping_config: Optional[Dict[str, Any]] = None):
|
||||
"""初始化列映射器
|
||||
|
||||
Args:
|
||||
mapping_config: 映射配置
|
||||
"""
|
||||
self.mapping_config = mapping_config or {}
|
||||
self.custom_mappings = {}
|
||||
self._build_reverse_mapping()
|
||||
|
||||
def _build_reverse_mapping(self):
|
||||
"""构建反向映射表"""
|
||||
self.reverse_mapping = {}
|
||||
|
||||
# 添加标准列的反向映射
|
||||
for standard_name, variations in self.STANDARD_COLUMNS.items():
|
||||
for variation in variations:
|
||||
self.reverse_mapping[variation.lower()] = standard_name
|
||||
|
||||
# 添加自定义映射
|
||||
for standard_name, custom_names in self.mapping_config.items():
|
||||
if isinstance(custom_names, str):
|
||||
custom_names = [custom_names]
|
||||
|
||||
for custom_name in custom_names:
|
||||
self.reverse_mapping[custom_name.lower()] = standard_name
|
||||
self.custom_mappings[custom_name.lower()] = standard_name
|
||||
|
||||
def map_columns(self, df: pd.DataFrame, target_columns: Optional[List[str]] = None) -> pd.DataFrame:
|
||||
"""映射列名
|
||||
|
||||
Args:
|
||||
df: 输入数据
|
||||
target_columns: 目标列名列表,如果为None则使用所有标准列
|
||||
|
||||
Returns:
|
||||
列名映射后的数据
|
||||
"""
|
||||
if target_columns is None:
|
||||
target_columns = list(self.STANDARD_COLUMNS.keys())
|
||||
|
||||
logger.info(f"开始列名映射,目标列: {target_columns}")
|
||||
logger.info(f"原始列名: {list(df.columns)}")
|
||||
|
||||
# 创建列名映射
|
||||
column_mapping = {}
|
||||
used_columns = set()
|
||||
|
||||
for target_col in target_columns:
|
||||
# 查找匹配的原始列名
|
||||
matched_column = self._find_matching_column(df.columns, target_col)
|
||||
if matched_column:
|
||||
column_mapping[matched_column] = target_col
|
||||
used_columns.add(matched_column)
|
||||
logger.debug(f"列名映射: {matched_column} -> {target_col}")
|
||||
|
||||
# 重命名列
|
||||
if column_mapping:
|
||||
df_mapped = df.rename(columns=column_mapping)
|
||||
|
||||
# 添加缺失的目标列
|
||||
for target_col in target_columns:
|
||||
if target_col not in df_mapped.columns:
|
||||
df_mapped[target_col] = self._get_default_value(target_col)
|
||||
logger.debug(f"添加缺失列: {target_col}")
|
||||
|
||||
# 只保留目标列
|
||||
existing_target_columns = [col for col in target_columns if col in df_mapped.columns]
|
||||
df_result = df_mapped[existing_target_columns]
|
||||
|
||||
logger.info(f"列名映射完成,结果列名: {list(df_result.columns)}")
|
||||
return df_result
|
||||
else:
|
||||
logger.warning("没有找到可映射的列名")
|
||||
return df
|
||||
|
||||
def _find_matching_column(self, columns: List[str], target_column: str) -> Optional[str]:
|
||||
"""查找匹配的列名
|
||||
|
||||
Args:
|
||||
columns: 原始列名列表
|
||||
target_column: 目标标准列名
|
||||
|
||||
Returns:
|
||||
匹配的原始列名或None
|
||||
"""
|
||||
# 获取目标列的所有可能变体
|
||||
possible_names = []
|
||||
|
||||
# 标准列名变体
|
||||
if target_column in self.STANDARD_COLUMNS:
|
||||
possible_names.extend(self.STANDARD_COLUMNS[target_column])
|
||||
|
||||
# 自定义映射
|
||||
for standard_name, custom_names in self.mapping_config.items():
|
||||
if standard_name == target_column:
|
||||
if isinstance(custom_names, str):
|
||||
possible_names.append(custom_names)
|
||||
else:
|
||||
possible_names.extend(custom_names)
|
||||
|
||||
# 查找匹配
|
||||
for possible_name in possible_names:
|
||||
# 精确匹配(忽略大小写)
|
||||
for column in columns:
|
||||
if column.lower() == possible_name.lower():
|
||||
return column
|
||||
|
||||
# 模糊匹配
|
||||
for column in columns:
|
||||
if possible_name.lower() in column.lower() or column.lower() in possible_name.lower():
|
||||
return column
|
||||
|
||||
return None
|
||||
|
||||
def _get_default_value(self, column_name: str) -> Any:
|
||||
"""获取列的默认值
|
||||
|
||||
Args:
|
||||
column_name: 列名
|
||||
|
||||
Returns:
|
||||
默认值
|
||||
"""
|
||||
# 根据列名类型返回合适的默认值
|
||||
if column_name in ['quantity', 'unit_price', 'total_price']:
|
||||
return 0
|
||||
elif column_name in ['barcode', 'name', 'specification', 'unit', 'category', 'brand', 'supplier']:
|
||||
return ''
|
||||
else:
|
||||
return None
|
||||
|
||||
def add_custom_mapping(self, standard_name: str, custom_names: Union[str, List[str]]):
|
||||
"""添加自定义列名映射
|
||||
|
||||
Args:
|
||||
standard_name: 标准列名
|
||||
custom_names: 自定义列名或列名列表
|
||||
"""
|
||||
if isinstance(custom_names, str):
|
||||
custom_names = [custom_names]
|
||||
|
||||
# 更新配置
|
||||
self.mapping_config[standard_name] = custom_names
|
||||
|
||||
# 更新反向映射
|
||||
for custom_name in custom_names:
|
||||
self.reverse_mapping[custom_name.lower()] = standard_name
|
||||
self.custom_mappings[custom_name.lower()] = standard_name
|
||||
|
||||
logger.info(f"添加自定义映射: {standard_name} <- {custom_names}")
|
||||
|
||||
def detect_column_types(self, df: pd.DataFrame) -> Dict[str, str]:
|
||||
"""检测列的数据类型
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
|
||||
Returns:
|
||||
列类型字典
|
||||
"""
|
||||
column_types = {}
|
||||
|
||||
for column in df.columns:
|
||||
if pd.api.types.is_numeric_dtype(df[column]):
|
||||
column_types[column] = 'numeric'
|
||||
elif pd.api.types.is_datetime64_any_dtype(df[column]):
|
||||
column_types[column] = 'datetime'
|
||||
elif pd.api.types.is_bool_dtype(df[column]):
|
||||
column_types[column] = 'boolean'
|
||||
else:
|
||||
column_types[column] = 'text'
|
||||
|
||||
return column_types
|
||||
|
||||
def suggest_column_mapping(self, df: pd.DataFrame) -> Dict[str, List[str]]:
|
||||
"""建议列名映射
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
|
||||
Returns:
|
||||
建议的映射关系
|
||||
"""
|
||||
suggestions = {}
|
||||
|
||||
for column in df.columns:
|
||||
column_lower = column.lower()
|
||||
suggestions[column] = []
|
||||
|
||||
# 检查标准列名
|
||||
for standard_name, variations in self.STANDARD_COLUMNS.items():
|
||||
for variation in variations:
|
||||
if column_lower in variation.lower() or variation.lower() in column_lower:
|
||||
suggestions[column].append(standard_name)
|
||||
|
||||
# 检查自定义映射
|
||||
for custom_name, standard_name in self.custom_mappings.items():
|
||||
if column_lower in custom_name or custom_name in column_lower:
|
||||
suggestions[column].append(standard_name)
|
||||
|
||||
# 去重
|
||||
suggestions[column] = list(set(suggestions[column]))
|
||||
|
||||
# 只返回有建议的列
|
||||
return {k: v for k, v in suggestions.items() if v}
|
||||
|
||||
def validate_mapping(self, df: pd.DataFrame, required_columns: List[str]) -> Dict[str, Any]:
|
||||
"""验证列映射结果
|
||||
|
||||
Args:
|
||||
df: 映射后的数据
|
||||
required_columns: 必需的列名列表
|
||||
|
||||
Returns:
|
||||
验证结果
|
||||
"""
|
||||
result = {
|
||||
'valid': True,
|
||||
'missing_columns': [],
|
||||
'empty_columns': [],
|
||||
'warnings': []
|
||||
}
|
||||
|
||||
# 检查缺失列
|
||||
for col in required_columns:
|
||||
if col not in df.columns:
|
||||
result['missing_columns'].append(col)
|
||||
result['valid'] = False
|
||||
|
||||
# 检查空列
|
||||
for col in df.columns:
|
||||
if df[col].isnull().all():
|
||||
result['empty_columns'].append(col)
|
||||
result['warnings'].append(f"列 '{col}' 全部为空值")
|
||||
|
||||
# 检查数值列
|
||||
numeric_columns = ['quantity', 'unit_price', 'total_price']
|
||||
for col in numeric_columns:
|
||||
if col in df.columns and not pd.api.types.is_numeric_dtype(df[col]):
|
||||
result['warnings'].append(f"列 '{col}' 不是数值类型")
|
||||
|
||||
return result
|
||||
|
||||
@classmethod
|
||||
def find_column(cls, columns: List[str], standard_name: str) -> Optional[str]:
|
||||
"""在列名列表中查找匹配标准列名的列
|
||||
|
||||
匹配策略: 精确匹配 → 忽略空白匹配 → 子串匹配
|
||||
|
||||
Args:
|
||||
columns: 实际列名列表
|
||||
standard_name: 标准列名 (STANDARD_COLUMNS 的键)
|
||||
|
||||
Returns:
|
||||
匹配到的实际列名,未找到返回 None
|
||||
"""
|
||||
candidates = cls.STANDARD_COLUMNS.get(standard_name, [])
|
||||
if not candidates:
|
||||
return None
|
||||
|
||||
columns_str = [str(c) for c in columns]
|
||||
|
||||
# 精确匹配
|
||||
for col in columns_str:
|
||||
col_clean = col.strip()
|
||||
for candidate in candidates:
|
||||
if col_clean == candidate:
|
||||
return col
|
||||
|
||||
# 忽略空白匹配
|
||||
for col in columns_str:
|
||||
col_clean = re.sub(r'\s+', '', col.strip())
|
||||
for candidate in candidates:
|
||||
if col_clean == re.sub(r'\s+', '', candidate):
|
||||
return col
|
||||
|
||||
# 子串匹配 (候选名包含在列名中)
|
||||
for col in columns_str:
|
||||
col_lower = col.strip().lower()
|
||||
for candidate in candidates:
|
||||
if candidate.lower() in col_lower:
|
||||
return col
|
||||
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def detect_header_row(df: pd.DataFrame, max_rows: int = 10, min_matches: int = 3) -> int:
|
||||
"""检测表头所在行
|
||||
|
||||
扫描前 max_rows 行,返回包含最多关键词匹配的行索引。
|
||||
|
||||
Args:
|
||||
df: 数据框
|
||||
max_rows: 最大扫描行数
|
||||
min_matches: 最少关键词匹配数
|
||||
|
||||
Returns:
|
||||
表头行索引,未找到返回 -1
|
||||
"""
|
||||
header_keywords = [
|
||||
'条码', '条形码', '商品条码', '商品名称', '名称', '规格',
|
||||
'单价', '数量', '金额', '单位', '必填', '编码',
|
||||
]
|
||||
|
||||
best_row = -1
|
||||
best_matches = 0
|
||||
|
||||
for row_idx in range(min(max_rows, len(df))):
|
||||
row_values = df.iloc[row_idx].astype(str)
|
||||
matches = sum(
|
||||
1 for kw in header_keywords
|
||||
if any(kw in str(val) for val in row_values.values)
|
||||
)
|
||||
if matches >= min_matches and matches > best_matches:
|
||||
best_matches = matches
|
||||
best_row = row_idx
|
||||
|
||||
return best_row
|
||||
@@ -0,0 +1,401 @@
|
||||
"""
|
||||
数据清洗处理器
|
||||
|
||||
提供各种数据清洗功能,如空值处理、重复项处理、数据类型转换等
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from typing import Dict, Any, Optional, List, Union
|
||||
from ...core.utils.log_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class DataCleaner:
|
||||
"""数据清洗处理器
|
||||
|
||||
提供标准化的数据清洗功能,支持链式调用和规则配置
|
||||
"""
|
||||
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||
"""初始化数据清洗器
|
||||
|
||||
Args:
|
||||
config: 清洗配置
|
||||
"""
|
||||
self.config = config or {}
|
||||
self.cleaning_rules = []
|
||||
|
||||
def add_rule(self, rule_type: str, **kwargs):
|
||||
"""添加清洗规则
|
||||
|
||||
Args:
|
||||
rule_type: 规则类型
|
||||
**kwargs: 规则参数
|
||||
"""
|
||||
rule = {'type': rule_type, **kwargs}
|
||||
self.cleaning_rules.append(rule)
|
||||
logger.debug(f"添加清洗规则: {rule_type}")
|
||||
|
||||
def clean(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""执行数据清洗
|
||||
|
||||
Args:
|
||||
df: 输入数据
|
||||
|
||||
Returns:
|
||||
清洗后的数据
|
||||
"""
|
||||
logger.info(f"开始数据清洗,原始数据形状: {df.shape}")
|
||||
|
||||
result_df = df.copy()
|
||||
|
||||
for i, rule in enumerate(self.cleaning_rules):
|
||||
try:
|
||||
logger.debug(f"执行清洗规则 {i+1}/{len(self.cleaning_rules)}: {rule['type']}")
|
||||
result_df = self._apply_rule(result_df, rule)
|
||||
logger.debug(f"规则执行完成,数据形状: {result_df.shape}")
|
||||
except Exception as e:
|
||||
logger.error(f"清洗规则执行失败: {rule}, 错误: {e}")
|
||||
# 继续执行下一个规则,而不是中断整个流程
|
||||
continue
|
||||
|
||||
logger.info(f"数据清洗完成,最终数据形状: {result_df.shape}")
|
||||
return result_df
|
||||
|
||||
def _apply_rule(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""应用单个清洗规则
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
rule_type = rule.get('type')
|
||||
|
||||
if rule_type == 'remove_duplicates':
|
||||
return self._remove_duplicates(df, rule)
|
||||
elif rule_type == 'fill_na':
|
||||
return self._fill_na(df, rule)
|
||||
elif rule_type == 'remove_rows':
|
||||
return self._remove_rows(df, rule)
|
||||
elif rule_type == 'convert_type':
|
||||
return self._convert_type(df, rule)
|
||||
elif rule_type == 'strip_whitespace':
|
||||
return self._strip_whitespace(df, rule)
|
||||
elif rule_type == 'normalize_text':
|
||||
return self._normalize_text(df, rule)
|
||||
elif rule_type == 'validate_data':
|
||||
return self._validate_data(df, rule)
|
||||
else:
|
||||
logger.warning(f"未知的清洗规则类型: {rule_type}")
|
||||
return df
|
||||
|
||||
def _remove_duplicates(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""移除重复项
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
subset = rule.get('subset') # 用于判断重复的列
|
||||
keep = rule.get('keep', 'first') # 保留哪个重复项
|
||||
|
||||
before_count = len(df)
|
||||
df_cleaned = df.drop_duplicates(subset=subset, keep=keep)
|
||||
after_count = len(df_cleaned)
|
||||
|
||||
logger.info(f"移除重复项: {before_count - after_count} 行被移除")
|
||||
return df_cleaned
|
||||
|
||||
def _fill_na(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""填充空值
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
columns = rule.get('columns') # 要处理的列
|
||||
value = rule.get('value', 0) # 填充值
|
||||
method = rule.get('method') # 填充方法('ffill', 'bfill', 'mean', 'median')
|
||||
|
||||
if columns:
|
||||
# 处理指定列
|
||||
if isinstance(columns, str):
|
||||
columns = [columns]
|
||||
|
||||
for col in columns:
|
||||
if col in df.columns:
|
||||
if method == 'ffill':
|
||||
df[col] = df[col].fillna(method='ffill')
|
||||
elif method == 'bfill':
|
||||
df[col] = df[col].fillna(method='bfill')
|
||||
elif method == 'mean':
|
||||
df[col] = df[col].fillna(df[col].mean())
|
||||
elif method == 'median':
|
||||
df[col] = df[col].fillna(df[col].median())
|
||||
else:
|
||||
df[col] = df[col].fillna(value)
|
||||
|
||||
logger.debug(f"填充列 {col} 的空值: {method or value}")
|
||||
else:
|
||||
# 处理所有列
|
||||
if method == 'ffill':
|
||||
df = df.fillna(method='ffill')
|
||||
elif method == 'bfill':
|
||||
df = df.fillna(method='bfill')
|
||||
else:
|
||||
df = df.fillna(value)
|
||||
|
||||
logger.debug(f"填充所有列的空值: {method or value}")
|
||||
|
||||
return df
|
||||
|
||||
def _remove_rows(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""移除行
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
condition = rule.get('condition') # 条件表达式
|
||||
columns = rule.get('columns') # 要检查的列
|
||||
values = rule.get('values') # 要移除的值
|
||||
|
||||
if condition:
|
||||
# 使用条件表达式
|
||||
try:
|
||||
before_count = len(df)
|
||||
df_filtered = df.query(condition)
|
||||
after_count = len(df_filtered)
|
||||
logger.info(f"条件过滤: {condition}, 移除了 {before_count - after_count} 行")
|
||||
return df_filtered
|
||||
except Exception as e:
|
||||
logger.error(f"条件表达式执行失败: {condition}, 错误: {e}")
|
||||
return df
|
||||
|
||||
if columns and values:
|
||||
# 基于列值过滤
|
||||
if isinstance(columns, str):
|
||||
columns = [columns]
|
||||
if not isinstance(values, list):
|
||||
values = [values]
|
||||
|
||||
df_filtered = df.copy()
|
||||
for col in columns:
|
||||
if col in df_filtered.columns:
|
||||
mask = ~df_filtered[col].isin(values)
|
||||
df_filtered = df_filtered[mask]
|
||||
logger.debug(f"列 {col} 过滤值 {values}")
|
||||
|
||||
return df_filtered
|
||||
|
||||
logger.warning("移除行规则缺少条件或列配置")
|
||||
return df
|
||||
|
||||
def _convert_type(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""类型转换
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
columns = rule.get('columns')
|
||||
target_type = rule.get('target_type', 'float')
|
||||
errors = rule.get('errors', 'coerce') # 错误处理方式
|
||||
|
||||
if isinstance(columns, str):
|
||||
columns = [columns]
|
||||
|
||||
for col in columns:
|
||||
if col in df.columns:
|
||||
try:
|
||||
if target_type == 'int':
|
||||
df[col] = pd.to_numeric(df[col], errors=errors).astype('Int64')
|
||||
elif target_type == 'float':
|
||||
df[col] = pd.to_numeric(df[col], errors=errors)
|
||||
elif target_type == 'datetime':
|
||||
df[col] = pd.to_datetime(df[col], errors=errors)
|
||||
elif target_type == 'string':
|
||||
df[col] = df[col].astype(str)
|
||||
else:
|
||||
df[col] = df[col].astype(target_type)
|
||||
|
||||
logger.debug(f"列 {col} 类型转换: {target_type}")
|
||||
except Exception as e:
|
||||
logger.error(f"列 {col} 类型转换失败: {e}")
|
||||
|
||||
return df
|
||||
|
||||
def _strip_whitespace(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""去除空白字符
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
columns = rule.get('columns')
|
||||
|
||||
if columns:
|
||||
if isinstance(columns, str):
|
||||
columns = [columns]
|
||||
|
||||
for col in columns:
|
||||
if col in df.columns and df[col].dtype == 'object':
|
||||
df[col] = df[col].str.strip()
|
||||
logger.debug(f"列 {col} 去除空白字符")
|
||||
else:
|
||||
# 处理所有文本列
|
||||
text_columns = df.select_dtypes(include=['object']).columns
|
||||
for col in text_columns:
|
||||
df[col] = df[col].str.strip()
|
||||
|
||||
logger.debug(f"所有文本列去除空白字符: {list(text_columns)}")
|
||||
|
||||
return df
|
||||
|
||||
def _normalize_text(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""文本标准化
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
columns = rule.get('columns')
|
||||
lowercase = rule.get('lowercase', False)
|
||||
uppercase = rule.get('uppercase', False)
|
||||
replace_map = rule.get('replace_map', {}) # 替换映射
|
||||
|
||||
if isinstance(columns, str):
|
||||
columns = [columns]
|
||||
|
||||
target_columns = columns or df.select_dtypes(include=['object']).columns
|
||||
|
||||
for col in target_columns:
|
||||
if col in df.columns and df[col].dtype == 'object':
|
||||
if lowercase:
|
||||
df[col] = df[col].str.lower()
|
||||
elif uppercase:
|
||||
df[col] = df[col].str.upper()
|
||||
|
||||
# 应用替换映射
|
||||
for old, new in replace_map.items():
|
||||
df[col] = df[col].str.replace(old, new)
|
||||
|
||||
logger.debug(f"列 {col} 文本标准化完成")
|
||||
|
||||
return df
|
||||
|
||||
def _validate_data(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""数据验证
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
columns = rule.get('columns')
|
||||
min_value = rule.get('min_value')
|
||||
max_value = rule.get('max_value')
|
||||
required = rule.get('required', False)
|
||||
|
||||
if isinstance(columns, str):
|
||||
columns = [columns]
|
||||
|
||||
validation_results = []
|
||||
|
||||
for col in columns:
|
||||
if col in df.columns:
|
||||
# 检查必需值
|
||||
if required:
|
||||
null_count = df[col].isnull().sum()
|
||||
if null_count > 0:
|
||||
validation_results.append(f"{col}: {null_count} 个空值")
|
||||
|
||||
# 检查数值范围
|
||||
if min_value is not None or max_value is not None:
|
||||
if pd.api.types.is_numeric_dtype(df[col]):
|
||||
invalid_mask = pd.Series(False, index=df.index)
|
||||
if min_value is not None:
|
||||
invalid_mask |= df[col] < min_value
|
||||
if max_value is not None:
|
||||
invalid_mask |= df[col] > max_value
|
||||
|
||||
invalid_count = invalid_mask.sum()
|
||||
if invalid_count > 0:
|
||||
validation_results.append(f"{col}: {invalid_count} 个值超出范围")
|
||||
|
||||
if validation_results:
|
||||
logger.warning(f"数据验证发现问题: {', '.join(validation_results)}")
|
||||
else:
|
||||
logger.debug("数据验证通过")
|
||||
|
||||
return df
|
||||
|
||||
# 便捷方法
|
||||
def remove_duplicates(self, subset: Optional[List[str]] = None, keep: str = 'first'):
|
||||
"""移除重复项"""
|
||||
self.add_rule('remove_duplicates', subset=subset, keep=keep)
|
||||
return self
|
||||
|
||||
def fill_na(self, columns: Optional[Union[str, List[str]]] = None,
|
||||
value: Any = 0, method: Optional[str] = None):
|
||||
"""填充空值"""
|
||||
self.add_rule('fill_na', columns=columns, value=value, method=method)
|
||||
return self
|
||||
|
||||
def remove_rows(self, condition: Optional[str] = None,
|
||||
columns: Optional[Union[str, List[str]]] = None,
|
||||
values: Optional[Any] = None):
|
||||
"""移除行"""
|
||||
self.add_rule('remove_rows', condition=condition, columns=columns, values=values)
|
||||
return self
|
||||
|
||||
def convert_type(self, columns: Union[str, List[str]], target_type: str, errors: str = 'coerce'):
|
||||
"""类型转换"""
|
||||
self.add_rule('convert_type', columns=columns, target_type=target_type, errors=errors)
|
||||
return self
|
||||
|
||||
def strip_whitespace(self, columns: Optional[Union[str, List[str]]] = None):
|
||||
"""去除空白字符"""
|
||||
self.add_rule('strip_whitespace', columns=columns)
|
||||
return self
|
||||
|
||||
def normalize_text(self, columns: Optional[Union[str, List[str]]] = None,
|
||||
lowercase: bool = False, uppercase: bool = False,
|
||||
replace_map: Optional[Dict[str, str]] = None):
|
||||
"""文本标准化"""
|
||||
self.add_rule('normalize_text', columns=columns, lowercase=lowercase,
|
||||
uppercase=uppercase, replace_map=replace_map or {})
|
||||
return self
|
||||
|
||||
def validate_data(self, columns: Union[str, List[str]],
|
||||
min_value: Optional[float] = None,
|
||||
max_value: Optional[float] = None,
|
||||
required: bool = False):
|
||||
"""数据验证"""
|
||||
self.add_rule('validate_data', columns=columns, min_value=min_value,
|
||||
max_value=max_value, required=required)
|
||||
return self
|
||||
@@ -0,0 +1,150 @@
|
||||
import re
|
||||
import pandas as pd
|
||||
from typing import List, Dict, Any, Optional
|
||||
|
||||
def _split_quantity_unit(df: pd.DataFrame, source: str, dictionary: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
|
||||
if source in df.columns:
|
||||
vals = df[source].astype(str).fillna("")
|
||||
nums = []
|
||||
units = []
|
||||
default_unit = (dictionary or {}).get("default_unit", "")
|
||||
unit_synonyms = (dictionary or {}).get("unit_synonyms", {})
|
||||
for v in vals:
|
||||
m = re.search(r"(\d+(?:\.\d+)?)(箱|件|提|盒|瓶)", v)
|
||||
if m:
|
||||
nums.append(float(m.group(1)))
|
||||
u = unit_synonyms.get(m.group(2), m.group(2))
|
||||
units.append(u)
|
||||
else:
|
||||
try:
|
||||
nums.append(float(v))
|
||||
units.append(unit_synonyms.get(default_unit, default_unit))
|
||||
except Exception:
|
||||
nums.append(0.0)
|
||||
units.append(unit_synonyms.get(default_unit, default_unit))
|
||||
df["quantity"] = nums
|
||||
df["unit"] = units
|
||||
return df
|
||||
|
||||
def _extract_spec_from_name(df: pd.DataFrame, source: str, dictionary: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
|
||||
if source in df.columns:
|
||||
names = df[source].astype(str).fillna("")
|
||||
specs = []
|
||||
packs = []
|
||||
ignore_words = (dictionary or {}).get("ignore_words", [])
|
||||
name_patterns = (dictionary or {}).get("name_patterns", [])
|
||||
for s in names:
|
||||
if ignore_words:
|
||||
for w in ignore_words:
|
||||
s = s.replace(w, "")
|
||||
matched = False
|
||||
for pat in name_patterns:
|
||||
try:
|
||||
m = re.search(pat, s)
|
||||
if m and len(m.groups()) >= 2:
|
||||
try:
|
||||
qty = int(m.group(len(m.groups())))
|
||||
except Exception:
|
||||
qty = None
|
||||
specs.append(s)
|
||||
packs.append(qty)
|
||||
matched = True
|
||||
break
|
||||
except Exception:
|
||||
pass
|
||||
if matched:
|
||||
continue
|
||||
m = re.search(r"(\d+(?:\.\d+)?)(ml|l|升|毫升)[*×xX](\d+)", s, re.IGNORECASE)
|
||||
if m:
|
||||
specs.append(f"{m.group(1)}{m.group(2)}*{m.group(3)}")
|
||||
packs.append(int(m.group(3)))
|
||||
continue
|
||||
m2 = re.search(r"(\d+)[*×xX](\d+)", s)
|
||||
if m2:
|
||||
specs.append(f"1*{m2.group(2)}")
|
||||
packs.append(int(m2.group(2)))
|
||||
continue
|
||||
m3 = re.search(r"(\d{2,3})\D*(\d{1,3})\D*", s)
|
||||
if m3:
|
||||
specs.append(f"1*{m3.group(2)}")
|
||||
packs.append(int(m3.group(2)))
|
||||
continue
|
||||
specs.append("")
|
||||
packs.append(None)
|
||||
df["specification"] = df.get("specification", pd.Series(specs))
|
||||
df["package_quantity"] = packs
|
||||
return df
|
||||
|
||||
def _normalize_unit(df: pd.DataFrame, target: str, unit_map: Dict[str, str], dictionary: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
|
||||
if target in df.columns:
|
||||
df[target] = df[target].astype(str)
|
||||
df[target] = df[target].apply(lambda u: unit_map.get(u, u))
|
||||
pack_multipliers = (dictionary or {}).get("pack_multipliers", {})
|
||||
default_pq = (dictionary or {}).get("default_package_quantity", 1)
|
||||
try:
|
||||
if "quantity" in df.columns:
|
||||
def convert_qty(row):
|
||||
u = row.get(target)
|
||||
q = row.get("quantity")
|
||||
pq = row.get("package_quantity")
|
||||
if u in ("件", "箱", "提", "盒"):
|
||||
mult = pq or pack_multipliers.get(u, default_pq)
|
||||
if pd.notna(q) and pd.notna(mult) and float(mult) > 0:
|
||||
return float(q) * float(mult)
|
||||
return q
|
||||
df["quantity"] = df.apply(convert_qty, axis=1)
|
||||
df[target] = df[target].apply(lambda u: "瓶" if u in ("件","箱","提","盒") else u)
|
||||
except Exception:
|
||||
pass
|
||||
return df
|
||||
|
||||
def _compute_quantity_from_total(df: pd.DataFrame) -> pd.DataFrame:
|
||||
if "quantity" in df.columns and "unit_price" in df.columns:
|
||||
qty = df["quantity"].fillna(0)
|
||||
up = pd.to_numeric(df.get("unit_price", 0), errors="coerce").fillna(0)
|
||||
tp = pd.to_numeric(df.get("total_price", 0), errors="coerce").fillna(0)
|
||||
need = (qty <= 0) & (up > 0) & (tp > 0)
|
||||
df.loc[need, "quantity"] = (tp[need] / up[need]).round(6)
|
||||
return df
|
||||
|
||||
def _fill_missing(df: pd.DataFrame, fills: Dict[str, Any]) -> pd.DataFrame:
|
||||
for k, v in fills.items():
|
||||
if k in df.columns:
|
||||
df[k] = df[k].fillna(v)
|
||||
else:
|
||||
df[k] = v
|
||||
return df
|
||||
|
||||
def _mark_gift(df: pd.DataFrame) -> pd.DataFrame:
|
||||
df["is_gift"] = False
|
||||
tp = df.get("total_price")
|
||||
up = df.get("unit_price")
|
||||
flags = pd.Series([False]*len(df))
|
||||
if tp is not None:
|
||||
tpn = pd.to_numeric(tp, errors="coerce").fillna(0)
|
||||
flags = flags | (tpn == 0)
|
||||
if up is not None:
|
||||
upn = pd.to_numeric(up, errors="coerce").fillna(0)
|
||||
flags = flags | (upn == 0)
|
||||
if "name" in df.columns:
|
||||
flags = flags | df["name"].astype(str).str.contains(r"赠品|^o$|^O$", regex=True)
|
||||
df.loc[flags, "is_gift"] = True
|
||||
return df
|
||||
|
||||
def apply_rules(df: pd.DataFrame, rules: List[Dict[str, Any]], dictionary: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
|
||||
out = df.copy()
|
||||
for r in rules or []:
|
||||
t = r.get("type")
|
||||
if t == "split_quantity_unit":
|
||||
out = _split_quantity_unit(out, r.get("source", "quantity"), dictionary)
|
||||
elif t == "extract_spec_from_name":
|
||||
out = _extract_spec_from_name(out, r.get("source", "name"), dictionary)
|
||||
elif t == "normalize_unit":
|
||||
out = _normalize_unit(out, r.get("target", "unit"), r.get("map", {}), dictionary)
|
||||
elif t == "compute_quantity_from_total":
|
||||
out = _compute_quantity_from_total(out)
|
||||
elif t == "fill_missing":
|
||||
out = _fill_missing(out, r.get("fills", {}))
|
||||
elif t == "mark_gift":
|
||||
out = _mark_gift(out)
|
||||
return out
|
||||
Reference in New Issue
Block a user