feat: 益选 OCR 订单处理系统初始提交

- 智能供应商识别(蓉城易购/烟草/杨碧月/通用)
- 百度 OCR 表格识别集成
- 规则引擎(列映射/数据清洗/单位转换/规格推断)
- 条码映射管理与云端同步(Gitea REST API)
- 云端同步支持:条码映射、供应商配置、商品资料、采购模板
- 拖拽一键处理(图片→OCR→Excel→合并)
- 191 个单元测试
- 移除无用的模板管理功能
- 清理 IDE 产物目录

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-04 19:51:13 +08:00
commit e4d62df7e3
78 changed files with 15257 additions and 0 deletions
+9
View File
@@ -0,0 +1,9 @@
"""
数据处理handlers模块初始化文件
"""
from .data_cleaner import DataCleaner
from .column_mapper import ColumnMapper
from .calculator import DataCalculator
__all__ = ['DataCleaner', 'ColumnMapper', 'DataCalculator']
+378
View File
@@ -0,0 +1,378 @@
"""
数据计算处理器
提供各种数据计算功能,如数量计算、价格计算、汇总统计等
"""
import pandas as pd
import numpy as np
from typing import Dict, Any, Optional, List, Union
from ...core.utils.log_utils import get_logger
logger = get_logger(__name__)
class DataCalculator:
"""数据计算处理器
提供标准化的数据计算功能,支持各种业务计算规则
"""
def __init__(self, config: Optional[Dict[str, Any]] = None):
"""初始化数据计算器
Args:
config: 计算配置
"""
self.config = config or {}
self.calculation_rules = []
def add_rule(self, rule_type: str, **kwargs):
"""添加计算规则
Args:
rule_type: 规则类型
**kwargs: 规则参数
"""
rule = {'type': rule_type, **kwargs}
self.calculation_rules.append(rule)
logger.debug(f"添加计算规则: {rule_type}")
def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
"""执行数据计算
Args:
df: 输入数据
Returns:
计算后的数据
"""
logger.info(f"开始数据计算,原始数据形状: {df.shape}")
result_df = df.copy()
for i, rule in enumerate(self.calculation_rules):
try:
logger.debug(f"执行计算规则 {i+1}/{len(self.calculation_rules)}: {rule['type']}")
result_df = self._apply_rule(result_df, rule)
logger.debug(f"规则执行完成,数据形状: {result_df.shape}")
except Exception as e:
logger.error(f"计算规则执行失败: {rule}, 错误: {e}")
# 继续执行下一个规则,而不是中断整个流程
continue
logger.info(f"数据计算完成,最终数据形状: {result_df.shape}")
return result_df
def _apply_rule(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
"""应用单个计算规则
Args:
df: 数据
rule: 规则配置
Returns:
处理后的数据
"""
rule_type = rule.get('type')
if rule_type == 'multiply':
return self._multiply(df, rule)
elif rule_type == 'divide':
return self._divide(df, rule)
elif rule_type == 'add':
return self._add(df, rule)
elif rule_type == 'subtract':
return self._subtract(df, rule)
elif rule_type == 'formula':
return self._formula(df, rule)
elif rule_type == 'round':
return self._round(df, rule)
elif rule_type == 'sum':
return self._sum(df, rule)
elif rule_type == 'aggregate':
return self._aggregate(df, rule)
else:
logger.warning(f"未知的计算规则类型: {rule_type}")
return df
def _multiply(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
"""乘法计算
Args:
df: 数据
rule: 规则配置
Returns:
处理后的数据
"""
source_column = rule.get('source_column')
target_column = rule.get('target_column')
factor = rule.get('factor', 1)
if source_column and target_column:
if source_column in df.columns:
df[target_column] = df[source_column] * factor
logger.debug(f"乘法计算: {source_column} * {factor} -> {target_column}")
else:
logger.warning(f"源列不存在: {source_column}")
return df
def _divide(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
"""除法计算
Args:
df: 数据
rule: 规则配置
Returns:
处理后的数据
"""
source_column = rule.get('source_column')
target_column = rule.get('target_column')
divisor = rule.get('divisor', 1)
if source_column and target_column and divisor != 0:
if source_column in df.columns:
df[target_column] = df[source_column] / divisor
logger.debug(f"除法计算: {source_column} / {divisor} -> {target_column}")
else:
logger.warning(f"源列不存在: {source_column}")
elif divisor == 0:
logger.error("除数不能为0")
return df
def _add(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
"""加法计算
Args:
df: 数据
rule: 规则配置
Returns:
处理后的数据
"""
columns = rule.get('columns', [])
target_column = rule.get('target_column')
constant = rule.get('constant', 0)
if target_column:
if isinstance(columns, str):
columns = [columns]
if columns:
# 列相加
valid_columns = [col for col in columns if col in df.columns]
if valid_columns:
df[target_column] = df[valid_columns].sum(axis=1) + constant
logger.debug(f"加法计算: {valid_columns} + {constant} -> {target_column}")
else:
logger.warning(f"没有有效的列用于加法计算: {columns}")
else:
# 只加常数
if target_column in df.columns:
df[target_column] = df[target_column] + constant
logger.debug(f"加法计算: {target_column} + {constant}")
else:
logger.warning(f"目标列不存在: {target_column}")
return df
def _subtract(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
"""减法计算
Args:
df: 数据
rule: 规则配置
Returns:
处理后的数据
"""
minuend = rule.get('minuend') # 被减数列
subtrahend = rule.get('subtrahend') # 减数列
target_column = rule.get('target_column')
constant = rule.get('constant', 0)
if target_column and minuend and minuend in df.columns:
if subtrahend and subtrahend in df.columns:
df[target_column] = df[minuend] - df[subtrahend] - constant
logger.debug(f"减法计算: {minuend} - {subtrahend} - {constant} -> {target_column}")
else:
df[target_column] = df[minuend] - constant
logger.debug(f"减法计算: {minuend} - {constant} -> {target_column}")
else:
logger.warning(f"减法计算参数不完整或列不存在")
return df
def _formula(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
"""公式计算
Args:
df: 数据
rule: 规则配置
Returns:
处理后的数据
"""
formula = rule.get('formula')
target_column = rule.get('target_column')
if formula and target_column:
try:
df[target_column] = df.eval(formula)
logger.debug(f"公式计算: {formula} -> {target_column}")
except Exception as e:
logger.error(f"公式计算失败: {formula}, 错误: {e}")
else:
logger.warning("公式计算缺少公式或目标列")
return df
def _round(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
"""四舍五入
Args:
df: 数据
rule: 规则配置
Returns:
处理后的数据
"""
columns = rule.get('columns', [])
decimals = rule.get('decimals', 0)
if isinstance(columns, str):
columns = [columns]
target_columns = columns or df.select_dtypes(include=[np.number]).columns
for col in target_columns:
if col in df.columns and pd.api.types.is_numeric_dtype(df[col]):
df[col] = df[col].round(decimals)
logger.debug(f"四舍五入: {col} 保留 {decimals} 位小数")
return df
def _sum(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
"""求和计算
Args:
df: 数据
rule: 规则配置
Returns:
处理后的数据
"""
columns = rule.get('columns', [])
target_column = rule.get('target_column')
group_by = rule.get('group_by')
if isinstance(columns, str):
columns = [columns]
if group_by and group_by in df.columns:
# 分组求和
if columns:
for col in columns:
if col in df.columns:
sum_result = df.groupby(group_by)[col].sum()
logger.debug(f"分组求和: {col}{group_by} 分组")
else:
# 所有数值列分组求和
numeric_columns = df.select_dtypes(include=[np.number]).columns
sum_result = df.groupby(group_by)[numeric_columns].sum()
logger.debug(f"分组求和: 所有数值列 按 {group_by} 分组")
else:
# 总体求和
if columns:
valid_columns = [col for col in columns if col in df.columns]
if valid_columns and target_column:
df[target_column] = df[valid_columns].sum(axis=1)
logger.debug(f"求和计算: {valid_columns} -> {target_column}")
else:
# 所有数值列求和
numeric_columns = df.select_dtypes(include=[np.number]).columns
if target_column and len(numeric_columns) > 0:
df[target_column] = df[numeric_columns].sum(axis=1)
logger.debug(f"求和计算: {list(numeric_columns)} -> {target_column}")
return df
def _aggregate(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
"""聚合计算
Args:
df: 数据
rule: 规则配置
Returns:
处理后的数据
"""
group_by = rule.get('group_by')
aggregations = rule.get('aggregations', {})
if group_by and group_by in df.columns:
# 构建聚合函数字典
agg_dict = {}
for column, func in aggregations.items():
if column in df.columns:
if isinstance(func, str):
agg_dict[column] = func
elif isinstance(func, list):
agg_dict[column] = func
if agg_dict:
result = df.groupby(group_by).agg(agg_dict)
logger.debug(f"聚合计算: 按 {group_by} 分组, 聚合: {agg_dict}")
return result.reset_index()
return df
# 便捷方法
def multiply(self, source_column: str, target_column: str, factor: float):
"""乘法计算"""
self.add_rule('multiply', source_column=source_column,
target_column=target_column, factor=factor)
return self
def divide(self, source_column: str, target_column: str, divisor: float):
"""除法计算"""
self.add_rule('divide', source_column=source_column,
target_column=target_column, divisor=divisor)
return self
def add(self, columns: Union[str, List[str]], target_column: str, constant: float = 0):
"""加法计算"""
self.add_rule('add', columns=columns, target_column=target_column, constant=constant)
return self
def subtract(self, minuend: str, target_column: str,
subtrahend: Optional[str] = None, constant: float = 0):
"""减法计算"""
self.add_rule('subtract', minuend=minuend, target_column=target_column,
subtrahend=subtrahend, constant=constant)
return self
def formula(self, formula: str, target_column: str):
"""公式计算"""
self.add_rule('formula', formula=formula, target_column=target_column)
return self
def round_columns(self, columns: Optional[Union[str, List[str]]] = None, decimals: int = 0):
"""四舍五入"""
self.add_rule('round', columns=columns, decimals=decimals)
return self
def sum_columns(self, columns: Optional[Union[str, List[str]]] = None,
target_column: Optional[str] = None, group_by: Optional[str] = None):
"""求和计算"""
self.add_rule('sum', columns=columns, target_column=target_column, group_by=group_by)
return self
def aggregate(self, group_by: str, aggregations: Dict[str, Union[str, List[str]]]):
"""聚合计算"""
self.add_rule('aggregate', group_by=group_by, aggregations=aggregations)
return self
+382
View File
@@ -0,0 +1,382 @@
"""
列映射处理器
提供列名映射和转换功能,支持不同供应商的列名标准化
"""
import re
import pandas as pd
from typing import Dict, Any, Optional, List, Union
from ...core.utils.log_utils import get_logger
logger = get_logger(__name__)
class ColumnMapper:
"""列映射处理器
提供列名标准化功能,将不同供应商的列名映射到标准列名
"""
# 标准列名定义(所有列名别名的唯一来源)
STANDARD_COLUMNS = {
'barcode': [
'条码', '条形码', '商品条码', '商品条形码', '产品条码', '商品编码',
'商品编号', '条码(必填)', '电脑条码', '条码ID',
'barcode', 'Barcode', 'BarCode', 'code', '编码',
],
'name': [
'商品名称', '产品名称', '名称', '商品', '产品', '商品名', '品名',
'品项名', '商品或服务名称', '品项', '名 称',
'name', 'product_name',
],
'specification': [
'规格', '规格型号', '型号', '商品规格', '产品规格', '包装规格', '规 格',
'specification', 'spec', 'model',
],
'quantity': [
'数量', '采购量', '订货数量', '订单量', '需求量', '采购数量', '购买数量',
'订单数量', '数量(必填)', '采购量(必填)', '入库数', '入库数量', '数 量',
'quantity', 'qty',
],
'unit': [
'单位', '计量单位', '采购单位', '单位(必填)', '单位名称', '计价单位', '单 位',
'unit', 'units',
],
'unit_price': [
'单价', '价格', '采购单价', '进货价', '销售价', '采购价', '参考价',
'入库单价', '单价(必填)', '采购单价(必填)', '价格(必填)', '单 价',
'unit_price', 'price',
],
'total_price': [
'总价', '金额', '小计', '合计金额', '小计金额', '金额(元)',
'金额合计', '合计', '总额',
'total_price', 'total', 'amount',
],
'gift_quantity': [
'赠送量', '赠品数量', '赠送数量', '赠品',
],
'category': ['类别', '分类', '商品类别', 'category', 'type'],
'brand': ['品牌', '商标', 'brand'],
'supplier': ['供应商', '供货商', 'supplier', 'vendor'],
}
def __init__(self, mapping_config: Optional[Dict[str, Any]] = None):
"""初始化列映射器
Args:
mapping_config: 映射配置
"""
self.mapping_config = mapping_config or {}
self.custom_mappings = {}
self._build_reverse_mapping()
def _build_reverse_mapping(self):
"""构建反向映射表"""
self.reverse_mapping = {}
# 添加标准列的反向映射
for standard_name, variations in self.STANDARD_COLUMNS.items():
for variation in variations:
self.reverse_mapping[variation.lower()] = standard_name
# 添加自定义映射
for standard_name, custom_names in self.mapping_config.items():
if isinstance(custom_names, str):
custom_names = [custom_names]
for custom_name in custom_names:
self.reverse_mapping[custom_name.lower()] = standard_name
self.custom_mappings[custom_name.lower()] = standard_name
def map_columns(self, df: pd.DataFrame, target_columns: Optional[List[str]] = None) -> pd.DataFrame:
"""映射列名
Args:
df: 输入数据
target_columns: 目标列名列表,如果为None则使用所有标准列
Returns:
列名映射后的数据
"""
if target_columns is None:
target_columns = list(self.STANDARD_COLUMNS.keys())
logger.info(f"开始列名映射,目标列: {target_columns}")
logger.info(f"原始列名: {list(df.columns)}")
# 创建列名映射
column_mapping = {}
used_columns = set()
for target_col in target_columns:
# 查找匹配的原始列名
matched_column = self._find_matching_column(df.columns, target_col)
if matched_column:
column_mapping[matched_column] = target_col
used_columns.add(matched_column)
logger.debug(f"列名映射: {matched_column} -> {target_col}")
# 重命名列
if column_mapping:
df_mapped = df.rename(columns=column_mapping)
# 添加缺失的目标列
for target_col in target_columns:
if target_col not in df_mapped.columns:
df_mapped[target_col] = self._get_default_value(target_col)
logger.debug(f"添加缺失列: {target_col}")
# 只保留目标列
existing_target_columns = [col for col in target_columns if col in df_mapped.columns]
df_result = df_mapped[existing_target_columns]
logger.info(f"列名映射完成,结果列名: {list(df_result.columns)}")
return df_result
else:
logger.warning("没有找到可映射的列名")
return df
def _find_matching_column(self, columns: List[str], target_column: str) -> Optional[str]:
"""查找匹配的列名
Args:
columns: 原始列名列表
target_column: 目标标准列名
Returns:
匹配的原始列名或None
"""
# 获取目标列的所有可能变体
possible_names = []
# 标准列名变体
if target_column in self.STANDARD_COLUMNS:
possible_names.extend(self.STANDARD_COLUMNS[target_column])
# 自定义映射
for standard_name, custom_names in self.mapping_config.items():
if standard_name == target_column:
if isinstance(custom_names, str):
possible_names.append(custom_names)
else:
possible_names.extend(custom_names)
# 查找匹配
for possible_name in possible_names:
# 精确匹配(忽略大小写)
for column in columns:
if column.lower() == possible_name.lower():
return column
# 模糊匹配
for column in columns:
if possible_name.lower() in column.lower() or column.lower() in possible_name.lower():
return column
return None
def _get_default_value(self, column_name: str) -> Any:
"""获取列的默认值
Args:
column_name: 列名
Returns:
默认值
"""
# 根据列名类型返回合适的默认值
if column_name in ['quantity', 'unit_price', 'total_price']:
return 0
elif column_name in ['barcode', 'name', 'specification', 'unit', 'category', 'brand', 'supplier']:
return ''
else:
return None
def add_custom_mapping(self, standard_name: str, custom_names: Union[str, List[str]]):
"""添加自定义列名映射
Args:
standard_name: 标准列名
custom_names: 自定义列名或列名列表
"""
if isinstance(custom_names, str):
custom_names = [custom_names]
# 更新配置
self.mapping_config[standard_name] = custom_names
# 更新反向映射
for custom_name in custom_names:
self.reverse_mapping[custom_name.lower()] = standard_name
self.custom_mappings[custom_name.lower()] = standard_name
logger.info(f"添加自定义映射: {standard_name} <- {custom_names}")
def detect_column_types(self, df: pd.DataFrame) -> Dict[str, str]:
"""检测列的数据类型
Args:
df: 数据
Returns:
列类型字典
"""
column_types = {}
for column in df.columns:
if pd.api.types.is_numeric_dtype(df[column]):
column_types[column] = 'numeric'
elif pd.api.types.is_datetime64_any_dtype(df[column]):
column_types[column] = 'datetime'
elif pd.api.types.is_bool_dtype(df[column]):
column_types[column] = 'boolean'
else:
column_types[column] = 'text'
return column_types
def suggest_column_mapping(self, df: pd.DataFrame) -> Dict[str, List[str]]:
"""建议列名映射
Args:
df: 数据
Returns:
建议的映射关系
"""
suggestions = {}
for column in df.columns:
column_lower = column.lower()
suggestions[column] = []
# 检查标准列名
for standard_name, variations in self.STANDARD_COLUMNS.items():
for variation in variations:
if column_lower in variation.lower() or variation.lower() in column_lower:
suggestions[column].append(standard_name)
# 检查自定义映射
for custom_name, standard_name in self.custom_mappings.items():
if column_lower in custom_name or custom_name in column_lower:
suggestions[column].append(standard_name)
# 去重
suggestions[column] = list(set(suggestions[column]))
# 只返回有建议的列
return {k: v for k, v in suggestions.items() if v}
def validate_mapping(self, df: pd.DataFrame, required_columns: List[str]) -> Dict[str, Any]:
"""验证列映射结果
Args:
df: 映射后的数据
required_columns: 必需的列名列表
Returns:
验证结果
"""
result = {
'valid': True,
'missing_columns': [],
'empty_columns': [],
'warnings': []
}
# 检查缺失列
for col in required_columns:
if col not in df.columns:
result['missing_columns'].append(col)
result['valid'] = False
# 检查空列
for col in df.columns:
if df[col].isnull().all():
result['empty_columns'].append(col)
result['warnings'].append(f"'{col}' 全部为空值")
# 检查数值列
numeric_columns = ['quantity', 'unit_price', 'total_price']
for col in numeric_columns:
if col in df.columns and not pd.api.types.is_numeric_dtype(df[col]):
result['warnings'].append(f"'{col}' 不是数值类型")
return result
@classmethod
def find_column(cls, columns: List[str], standard_name: str) -> Optional[str]:
"""在列名列表中查找匹配标准列名的列
匹配策略: 精确匹配 → 忽略空白匹配 → 子串匹配
Args:
columns: 实际列名列表
standard_name: 标准列名 (STANDARD_COLUMNS 的键)
Returns:
匹配到的实际列名,未找到返回 None
"""
candidates = cls.STANDARD_COLUMNS.get(standard_name, [])
if not candidates:
return None
columns_str = [str(c) for c in columns]
# 精确匹配
for col in columns_str:
col_clean = col.strip()
for candidate in candidates:
if col_clean == candidate:
return col
# 忽略空白匹配
for col in columns_str:
col_clean = re.sub(r'\s+', '', col.strip())
for candidate in candidates:
if col_clean == re.sub(r'\s+', '', candidate):
return col
# 子串匹配 (候选名包含在列名中)
for col in columns_str:
col_lower = col.strip().lower()
for candidate in candidates:
if candidate.lower() in col_lower:
return col
return None
@staticmethod
def detect_header_row(df: pd.DataFrame, max_rows: int = 10, min_matches: int = 3) -> int:
"""检测表头所在行
扫描前 max_rows 行,返回包含最多关键词匹配的行索引。
Args:
df: 数据框
max_rows: 最大扫描行数
min_matches: 最少关键词匹配数
Returns:
表头行索引,未找到返回 -1
"""
header_keywords = [
'条码', '条形码', '商品条码', '商品名称', '名称', '规格',
'单价', '数量', '金额', '单位', '必填', '编码',
]
best_row = -1
best_matches = 0
for row_idx in range(min(max_rows, len(df))):
row_values = df.iloc[row_idx].astype(str)
matches = sum(
1 for kw in header_keywords
if any(kw in str(val) for val in row_values.values)
)
if matches >= min_matches and matches > best_matches:
best_matches = matches
best_row = row_idx
return best_row
+401
View File
@@ -0,0 +1,401 @@
"""
数据清洗处理器
提供各种数据清洗功能,如空值处理、重复项处理、数据类型转换等
"""
import pandas as pd
from typing import Dict, Any, Optional, List, Union
from ...core.utils.log_utils import get_logger
logger = get_logger(__name__)
class DataCleaner:
"""数据清洗处理器
提供标准化的数据清洗功能,支持链式调用和规则配置
"""
def __init__(self, config: Optional[Dict[str, Any]] = None):
"""初始化数据清洗器
Args:
config: 清洗配置
"""
self.config = config or {}
self.cleaning_rules = []
def add_rule(self, rule_type: str, **kwargs):
"""添加清洗规则
Args:
rule_type: 规则类型
**kwargs: 规则参数
"""
rule = {'type': rule_type, **kwargs}
self.cleaning_rules.append(rule)
logger.debug(f"添加清洗规则: {rule_type}")
def clean(self, df: pd.DataFrame) -> pd.DataFrame:
"""执行数据清洗
Args:
df: 输入数据
Returns:
清洗后的数据
"""
logger.info(f"开始数据清洗,原始数据形状: {df.shape}")
result_df = df.copy()
for i, rule in enumerate(self.cleaning_rules):
try:
logger.debug(f"执行清洗规则 {i+1}/{len(self.cleaning_rules)}: {rule['type']}")
result_df = self._apply_rule(result_df, rule)
logger.debug(f"规则执行完成,数据形状: {result_df.shape}")
except Exception as e:
logger.error(f"清洗规则执行失败: {rule}, 错误: {e}")
# 继续执行下一个规则,而不是中断整个流程
continue
logger.info(f"数据清洗完成,最终数据形状: {result_df.shape}")
return result_df
def _apply_rule(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
"""应用单个清洗规则
Args:
df: 数据
rule: 规则配置
Returns:
处理后的数据
"""
rule_type = rule.get('type')
if rule_type == 'remove_duplicates':
return self._remove_duplicates(df, rule)
elif rule_type == 'fill_na':
return self._fill_na(df, rule)
elif rule_type == 'remove_rows':
return self._remove_rows(df, rule)
elif rule_type == 'convert_type':
return self._convert_type(df, rule)
elif rule_type == 'strip_whitespace':
return self._strip_whitespace(df, rule)
elif rule_type == 'normalize_text':
return self._normalize_text(df, rule)
elif rule_type == 'validate_data':
return self._validate_data(df, rule)
else:
logger.warning(f"未知的清洗规则类型: {rule_type}")
return df
def _remove_duplicates(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
"""移除重复项
Args:
df: 数据
rule: 规则配置
Returns:
处理后的数据
"""
subset = rule.get('subset') # 用于判断重复的列
keep = rule.get('keep', 'first') # 保留哪个重复项
before_count = len(df)
df_cleaned = df.drop_duplicates(subset=subset, keep=keep)
after_count = len(df_cleaned)
logger.info(f"移除重复项: {before_count - after_count} 行被移除")
return df_cleaned
def _fill_na(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
"""填充空值
Args:
df: 数据
rule: 规则配置
Returns:
处理后的数据
"""
columns = rule.get('columns') # 要处理的列
value = rule.get('value', 0) # 填充值
method = rule.get('method') # 填充方法('ffill', 'bfill', 'mean', 'median'
if columns:
# 处理指定列
if isinstance(columns, str):
columns = [columns]
for col in columns:
if col in df.columns:
if method == 'ffill':
df[col] = df[col].fillna(method='ffill')
elif method == 'bfill':
df[col] = df[col].fillna(method='bfill')
elif method == 'mean':
df[col] = df[col].fillna(df[col].mean())
elif method == 'median':
df[col] = df[col].fillna(df[col].median())
else:
df[col] = df[col].fillna(value)
logger.debug(f"填充列 {col} 的空值: {method or value}")
else:
# 处理所有列
if method == 'ffill':
df = df.fillna(method='ffill')
elif method == 'bfill':
df = df.fillna(method='bfill')
else:
df = df.fillna(value)
logger.debug(f"填充所有列的空值: {method or value}")
return df
def _remove_rows(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
"""移除行
Args:
df: 数据
rule: 规则配置
Returns:
处理后的数据
"""
condition = rule.get('condition') # 条件表达式
columns = rule.get('columns') # 要检查的列
values = rule.get('values') # 要移除的值
if condition:
# 使用条件表达式
try:
before_count = len(df)
df_filtered = df.query(condition)
after_count = len(df_filtered)
logger.info(f"条件过滤: {condition}, 移除了 {before_count - after_count}")
return df_filtered
except Exception as e:
logger.error(f"条件表达式执行失败: {condition}, 错误: {e}")
return df
if columns and values:
# 基于列值过滤
if isinstance(columns, str):
columns = [columns]
if not isinstance(values, list):
values = [values]
df_filtered = df.copy()
for col in columns:
if col in df_filtered.columns:
mask = ~df_filtered[col].isin(values)
df_filtered = df_filtered[mask]
logger.debug(f"{col} 过滤值 {values}")
return df_filtered
logger.warning("移除行规则缺少条件或列配置")
return df
def _convert_type(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
"""类型转换
Args:
df: 数据
rule: 规则配置
Returns:
处理后的数据
"""
columns = rule.get('columns')
target_type = rule.get('target_type', 'float')
errors = rule.get('errors', 'coerce') # 错误处理方式
if isinstance(columns, str):
columns = [columns]
for col in columns:
if col in df.columns:
try:
if target_type == 'int':
df[col] = pd.to_numeric(df[col], errors=errors).astype('Int64')
elif target_type == 'float':
df[col] = pd.to_numeric(df[col], errors=errors)
elif target_type == 'datetime':
df[col] = pd.to_datetime(df[col], errors=errors)
elif target_type == 'string':
df[col] = df[col].astype(str)
else:
df[col] = df[col].astype(target_type)
logger.debug(f"{col} 类型转换: {target_type}")
except Exception as e:
logger.error(f"{col} 类型转换失败: {e}")
return df
def _strip_whitespace(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
"""去除空白字符
Args:
df: 数据
rule: 规则配置
Returns:
处理后的数据
"""
columns = rule.get('columns')
if columns:
if isinstance(columns, str):
columns = [columns]
for col in columns:
if col in df.columns and df[col].dtype == 'object':
df[col] = df[col].str.strip()
logger.debug(f"{col} 去除空白字符")
else:
# 处理所有文本列
text_columns = df.select_dtypes(include=['object']).columns
for col in text_columns:
df[col] = df[col].str.strip()
logger.debug(f"所有文本列去除空白字符: {list(text_columns)}")
return df
def _normalize_text(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
"""文本标准化
Args:
df: 数据
rule: 规则配置
Returns:
处理后的数据
"""
columns = rule.get('columns')
lowercase = rule.get('lowercase', False)
uppercase = rule.get('uppercase', False)
replace_map = rule.get('replace_map', {}) # 替换映射
if isinstance(columns, str):
columns = [columns]
target_columns = columns or df.select_dtypes(include=['object']).columns
for col in target_columns:
if col in df.columns and df[col].dtype == 'object':
if lowercase:
df[col] = df[col].str.lower()
elif uppercase:
df[col] = df[col].str.upper()
# 应用替换映射
for old, new in replace_map.items():
df[col] = df[col].str.replace(old, new)
logger.debug(f"{col} 文本标准化完成")
return df
def _validate_data(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
"""数据验证
Args:
df: 数据
rule: 规则配置
Returns:
处理后的数据
"""
columns = rule.get('columns')
min_value = rule.get('min_value')
max_value = rule.get('max_value')
required = rule.get('required', False)
if isinstance(columns, str):
columns = [columns]
validation_results = []
for col in columns:
if col in df.columns:
# 检查必需值
if required:
null_count = df[col].isnull().sum()
if null_count > 0:
validation_results.append(f"{col}: {null_count} 个空值")
# 检查数值范围
if min_value is not None or max_value is not None:
if pd.api.types.is_numeric_dtype(df[col]):
invalid_mask = pd.Series(False, index=df.index)
if min_value is not None:
invalid_mask |= df[col] < min_value
if max_value is not None:
invalid_mask |= df[col] > max_value
invalid_count = invalid_mask.sum()
if invalid_count > 0:
validation_results.append(f"{col}: {invalid_count} 个值超出范围")
if validation_results:
logger.warning(f"数据验证发现问题: {', '.join(validation_results)}")
else:
logger.debug("数据验证通过")
return df
# 便捷方法
def remove_duplicates(self, subset: Optional[List[str]] = None, keep: str = 'first'):
"""移除重复项"""
self.add_rule('remove_duplicates', subset=subset, keep=keep)
return self
def fill_na(self, columns: Optional[Union[str, List[str]]] = None,
value: Any = 0, method: Optional[str] = None):
"""填充空值"""
self.add_rule('fill_na', columns=columns, value=value, method=method)
return self
def remove_rows(self, condition: Optional[str] = None,
columns: Optional[Union[str, List[str]]] = None,
values: Optional[Any] = None):
"""移除行"""
self.add_rule('remove_rows', condition=condition, columns=columns, values=values)
return self
def convert_type(self, columns: Union[str, List[str]], target_type: str, errors: str = 'coerce'):
"""类型转换"""
self.add_rule('convert_type', columns=columns, target_type=target_type, errors=errors)
return self
def strip_whitespace(self, columns: Optional[Union[str, List[str]]] = None):
"""去除空白字符"""
self.add_rule('strip_whitespace', columns=columns)
return self
def normalize_text(self, columns: Optional[Union[str, List[str]]] = None,
lowercase: bool = False, uppercase: bool = False,
replace_map: Optional[Dict[str, str]] = None):
"""文本标准化"""
self.add_rule('normalize_text', columns=columns, lowercase=lowercase,
uppercase=uppercase, replace_map=replace_map or {})
return self
def validate_data(self, columns: Union[str, List[str]],
min_value: Optional[float] = None,
max_value: Optional[float] = None,
required: bool = False):
"""数据验证"""
self.add_rule('validate_data', columns=columns, min_value=min_value,
max_value=max_value, required=required)
return self
+150
View File
@@ -0,0 +1,150 @@
import re
import pandas as pd
from typing import List, Dict, Any, Optional
def _split_quantity_unit(df: pd.DataFrame, source: str, dictionary: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
if source in df.columns:
vals = df[source].astype(str).fillna("")
nums = []
units = []
default_unit = (dictionary or {}).get("default_unit", "")
unit_synonyms = (dictionary or {}).get("unit_synonyms", {})
for v in vals:
m = re.search(r"(\d+(?:\.\d+)?)(箱|件|提|盒|瓶)", v)
if m:
nums.append(float(m.group(1)))
u = unit_synonyms.get(m.group(2), m.group(2))
units.append(u)
else:
try:
nums.append(float(v))
units.append(unit_synonyms.get(default_unit, default_unit))
except Exception:
nums.append(0.0)
units.append(unit_synonyms.get(default_unit, default_unit))
df["quantity"] = nums
df["unit"] = units
return df
def _extract_spec_from_name(df: pd.DataFrame, source: str, dictionary: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
if source in df.columns:
names = df[source].astype(str).fillna("")
specs = []
packs = []
ignore_words = (dictionary or {}).get("ignore_words", [])
name_patterns = (dictionary or {}).get("name_patterns", [])
for s in names:
if ignore_words:
for w in ignore_words:
s = s.replace(w, "")
matched = False
for pat in name_patterns:
try:
m = re.search(pat, s)
if m and len(m.groups()) >= 2:
try:
qty = int(m.group(len(m.groups())))
except Exception:
qty = None
specs.append(s)
packs.append(qty)
matched = True
break
except Exception:
pass
if matched:
continue
m = re.search(r"(\d+(?:\.\d+)?)(ml|l|升|毫升)[*×xX](\d+)", s, re.IGNORECASE)
if m:
specs.append(f"{m.group(1)}{m.group(2)}*{m.group(3)}")
packs.append(int(m.group(3)))
continue
m2 = re.search(r"(\d+)[*×xX](\d+)", s)
if m2:
specs.append(f"1*{m2.group(2)}")
packs.append(int(m2.group(2)))
continue
m3 = re.search(r"(\d{2,3})\D*(\d{1,3})\D*", s)
if m3:
specs.append(f"1*{m3.group(2)}")
packs.append(int(m3.group(2)))
continue
specs.append("")
packs.append(None)
df["specification"] = df.get("specification", pd.Series(specs))
df["package_quantity"] = packs
return df
def _normalize_unit(df: pd.DataFrame, target: str, unit_map: Dict[str, str], dictionary: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
if target in df.columns:
df[target] = df[target].astype(str)
df[target] = df[target].apply(lambda u: unit_map.get(u, u))
pack_multipliers = (dictionary or {}).get("pack_multipliers", {})
default_pq = (dictionary or {}).get("default_package_quantity", 1)
try:
if "quantity" in df.columns:
def convert_qty(row):
u = row.get(target)
q = row.get("quantity")
pq = row.get("package_quantity")
if u in ("", "", "", ""):
mult = pq or pack_multipliers.get(u, default_pq)
if pd.notna(q) and pd.notna(mult) and float(mult) > 0:
return float(q) * float(mult)
return q
df["quantity"] = df.apply(convert_qty, axis=1)
df[target] = df[target].apply(lambda u: "" if u in ("","","","") else u)
except Exception:
pass
return df
def _compute_quantity_from_total(df: pd.DataFrame) -> pd.DataFrame:
if "quantity" in df.columns and "unit_price" in df.columns:
qty = df["quantity"].fillna(0)
up = pd.to_numeric(df.get("unit_price", 0), errors="coerce").fillna(0)
tp = pd.to_numeric(df.get("total_price", 0), errors="coerce").fillna(0)
need = (qty <= 0) & (up > 0) & (tp > 0)
df.loc[need, "quantity"] = (tp[need] / up[need]).round(6)
return df
def _fill_missing(df: pd.DataFrame, fills: Dict[str, Any]) -> pd.DataFrame:
for k, v in fills.items():
if k in df.columns:
df[k] = df[k].fillna(v)
else:
df[k] = v
return df
def _mark_gift(df: pd.DataFrame) -> pd.DataFrame:
df["is_gift"] = False
tp = df.get("total_price")
up = df.get("unit_price")
flags = pd.Series([False]*len(df))
if tp is not None:
tpn = pd.to_numeric(tp, errors="coerce").fillna(0)
flags = flags | (tpn == 0)
if up is not None:
upn = pd.to_numeric(up, errors="coerce").fillna(0)
flags = flags | (upn == 0)
if "name" in df.columns:
flags = flags | df["name"].astype(str).str.contains(r"赠品|^o$|^O$", regex=True)
df.loc[flags, "is_gift"] = True
return df
def apply_rules(df: pd.DataFrame, rules: List[Dict[str, Any]], dictionary: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
out = df.copy()
for r in rules or []:
t = r.get("type")
if t == "split_quantity_unit":
out = _split_quantity_unit(out, r.get("source", "quantity"), dictionary)
elif t == "extract_spec_from_name":
out = _extract_spec_from_name(out, r.get("source", "name"), dictionary)
elif t == "normalize_unit":
out = _normalize_unit(out, r.get("target", "unit"), r.get("map", {}), dictionary)
elif t == "compute_quantity_from_total":
out = _compute_quantity_from_total(out)
elif t == "fill_missing":
out = _fill_missing(out, r.get("fills", {}))
elif t == "mark_gift":
out = _mark_gift(out)
return out