新版本
This commit is contained in:
@@ -0,0 +1,9 @@
|
||||
"""
|
||||
数据处理handlers模块初始化文件
|
||||
"""
|
||||
|
||||
from .data_cleaner import DataCleaner
|
||||
from .column_mapper import ColumnMapper
|
||||
from .calculator import DataCalculator
|
||||
|
||||
__all__ = ['DataCleaner', 'ColumnMapper', 'DataCalculator']
|
||||
@@ -0,0 +1,378 @@
|
||||
"""
|
||||
数据计算处理器
|
||||
|
||||
提供各种数据计算功能,如数量计算、价格计算、汇总统计等
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from typing import Dict, Any, Optional, List, Union
|
||||
from ...core.utils.log_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class DataCalculator:
|
||||
"""数据计算处理器
|
||||
|
||||
提供标准化的数据计算功能,支持各种业务计算规则
|
||||
"""
|
||||
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||
"""初始化数据计算器
|
||||
|
||||
Args:
|
||||
config: 计算配置
|
||||
"""
|
||||
self.config = config or {}
|
||||
self.calculation_rules = []
|
||||
|
||||
def add_rule(self, rule_type: str, **kwargs):
|
||||
"""添加计算规则
|
||||
|
||||
Args:
|
||||
rule_type: 规则类型
|
||||
**kwargs: 规则参数
|
||||
"""
|
||||
rule = {'type': rule_type, **kwargs}
|
||||
self.calculation_rules.append(rule)
|
||||
logger.debug(f"添加计算规则: {rule_type}")
|
||||
|
||||
def calculate(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""执行数据计算
|
||||
|
||||
Args:
|
||||
df: 输入数据
|
||||
|
||||
Returns:
|
||||
计算后的数据
|
||||
"""
|
||||
logger.info(f"开始数据计算,原始数据形状: {df.shape}")
|
||||
|
||||
result_df = df.copy()
|
||||
|
||||
for i, rule in enumerate(self.calculation_rules):
|
||||
try:
|
||||
logger.debug(f"执行计算规则 {i+1}/{len(self.calculation_rules)}: {rule['type']}")
|
||||
result_df = self._apply_rule(result_df, rule)
|
||||
logger.debug(f"规则执行完成,数据形状: {result_df.shape}")
|
||||
except Exception as e:
|
||||
logger.error(f"计算规则执行失败: {rule}, 错误: {e}")
|
||||
# 继续执行下一个规则,而不是中断整个流程
|
||||
continue
|
||||
|
||||
logger.info(f"数据计算完成,最终数据形状: {result_df.shape}")
|
||||
return result_df
|
||||
|
||||
def _apply_rule(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""应用单个计算规则
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
rule_type = rule.get('type')
|
||||
|
||||
if rule_type == 'multiply':
|
||||
return self._multiply(df, rule)
|
||||
elif rule_type == 'divide':
|
||||
return self._divide(df, rule)
|
||||
elif rule_type == 'add':
|
||||
return self._add(df, rule)
|
||||
elif rule_type == 'subtract':
|
||||
return self._subtract(df, rule)
|
||||
elif rule_type == 'formula':
|
||||
return self._formula(df, rule)
|
||||
elif rule_type == 'round':
|
||||
return self._round(df, rule)
|
||||
elif rule_type == 'sum':
|
||||
return self._sum(df, rule)
|
||||
elif rule_type == 'aggregate':
|
||||
return self._aggregate(df, rule)
|
||||
else:
|
||||
logger.warning(f"未知的计算规则类型: {rule_type}")
|
||||
return df
|
||||
|
||||
def _multiply(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""乘法计算
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
source_column = rule.get('source_column')
|
||||
target_column = rule.get('target_column')
|
||||
factor = rule.get('factor', 1)
|
||||
|
||||
if source_column and target_column:
|
||||
if source_column in df.columns:
|
||||
df[target_column] = df[source_column] * factor
|
||||
logger.debug(f"乘法计算: {source_column} * {factor} -> {target_column}")
|
||||
else:
|
||||
logger.warning(f"源列不存在: {source_column}")
|
||||
|
||||
return df
|
||||
|
||||
def _divide(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""除法计算
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
source_column = rule.get('source_column')
|
||||
target_column = rule.get('target_column')
|
||||
divisor = rule.get('divisor', 1)
|
||||
|
||||
if source_column and target_column and divisor != 0:
|
||||
if source_column in df.columns:
|
||||
df[target_column] = df[source_column] / divisor
|
||||
logger.debug(f"除法计算: {source_column} / {divisor} -> {target_column}")
|
||||
else:
|
||||
logger.warning(f"源列不存在: {source_column}")
|
||||
elif divisor == 0:
|
||||
logger.error("除数不能为0")
|
||||
|
||||
return df
|
||||
|
||||
def _add(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""加法计算
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
columns = rule.get('columns', [])
|
||||
target_column = rule.get('target_column')
|
||||
constant = rule.get('constant', 0)
|
||||
|
||||
if target_column:
|
||||
if isinstance(columns, str):
|
||||
columns = [columns]
|
||||
|
||||
if columns:
|
||||
# 列相加
|
||||
valid_columns = [col for col in columns if col in df.columns]
|
||||
if valid_columns:
|
||||
df[target_column] = df[valid_columns].sum(axis=1) + constant
|
||||
logger.debug(f"加法计算: {valid_columns} + {constant} -> {target_column}")
|
||||
else:
|
||||
logger.warning(f"没有有效的列用于加法计算: {columns}")
|
||||
else:
|
||||
# 只加常数
|
||||
if target_column in df.columns:
|
||||
df[target_column] = df[target_column] + constant
|
||||
logger.debug(f"加法计算: {target_column} + {constant}")
|
||||
else:
|
||||
logger.warning(f"目标列不存在: {target_column}")
|
||||
|
||||
return df
|
||||
|
||||
def _subtract(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""减法计算
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
minuend = rule.get('minuend') # 被减数列
|
||||
subtrahend = rule.get('subtrahend') # 减数列
|
||||
target_column = rule.get('target_column')
|
||||
constant = rule.get('constant', 0)
|
||||
|
||||
if target_column and minuend and minuend in df.columns:
|
||||
if subtrahend and subtrahend in df.columns:
|
||||
df[target_column] = df[minuend] - df[subtrahend] - constant
|
||||
logger.debug(f"减法计算: {minuend} - {subtrahend} - {constant} -> {target_column}")
|
||||
else:
|
||||
df[target_column] = df[minuend] - constant
|
||||
logger.debug(f"减法计算: {minuend} - {constant} -> {target_column}")
|
||||
else:
|
||||
logger.warning(f"减法计算参数不完整或列不存在")
|
||||
|
||||
return df
|
||||
|
||||
def _formula(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""公式计算
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
formula = rule.get('formula')
|
||||
target_column = rule.get('target_column')
|
||||
|
||||
if formula and target_column:
|
||||
try:
|
||||
df[target_column] = df.eval(formula)
|
||||
logger.debug(f"公式计算: {formula} -> {target_column}")
|
||||
except Exception as e:
|
||||
logger.error(f"公式计算失败: {formula}, 错误: {e}")
|
||||
else:
|
||||
logger.warning("公式计算缺少公式或目标列")
|
||||
|
||||
return df
|
||||
|
||||
def _round(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""四舍五入
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
columns = rule.get('columns', [])
|
||||
decimals = rule.get('decimals', 0)
|
||||
|
||||
if isinstance(columns, str):
|
||||
columns = [columns]
|
||||
|
||||
target_columns = columns or df.select_dtypes(include=[np.number]).columns
|
||||
|
||||
for col in target_columns:
|
||||
if col in df.columns and pd.api.types.is_numeric_dtype(df[col]):
|
||||
df[col] = df[col].round(decimals)
|
||||
logger.debug(f"四舍五入: {col} 保留 {decimals} 位小数")
|
||||
|
||||
return df
|
||||
|
||||
def _sum(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""求和计算
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
columns = rule.get('columns', [])
|
||||
target_column = rule.get('target_column')
|
||||
group_by = rule.get('group_by')
|
||||
|
||||
if isinstance(columns, str):
|
||||
columns = [columns]
|
||||
|
||||
if group_by and group_by in df.columns:
|
||||
# 分组求和
|
||||
if columns:
|
||||
for col in columns:
|
||||
if col in df.columns:
|
||||
sum_result = df.groupby(group_by)[col].sum()
|
||||
logger.debug(f"分组求和: {col} 按 {group_by} 分组")
|
||||
else:
|
||||
# 所有数值列分组求和
|
||||
numeric_columns = df.select_dtypes(include=[np.number]).columns
|
||||
sum_result = df.groupby(group_by)[numeric_columns].sum()
|
||||
logger.debug(f"分组求和: 所有数值列 按 {group_by} 分组")
|
||||
else:
|
||||
# 总体求和
|
||||
if columns:
|
||||
valid_columns = [col for col in columns if col in df.columns]
|
||||
if valid_columns and target_column:
|
||||
df[target_column] = df[valid_columns].sum(axis=1)
|
||||
logger.debug(f"求和计算: {valid_columns} -> {target_column}")
|
||||
else:
|
||||
# 所有数值列求和
|
||||
numeric_columns = df.select_dtypes(include=[np.number]).columns
|
||||
if target_column and len(numeric_columns) > 0:
|
||||
df[target_column] = df[numeric_columns].sum(axis=1)
|
||||
logger.debug(f"求和计算: {list(numeric_columns)} -> {target_column}")
|
||||
|
||||
return df
|
||||
|
||||
def _aggregate(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""聚合计算
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
group_by = rule.get('group_by')
|
||||
aggregations = rule.get('aggregations', {})
|
||||
|
||||
if group_by and group_by in df.columns:
|
||||
# 构建聚合函数字典
|
||||
agg_dict = {}
|
||||
for column, func in aggregations.items():
|
||||
if column in df.columns:
|
||||
if isinstance(func, str):
|
||||
agg_dict[column] = func
|
||||
elif isinstance(func, list):
|
||||
agg_dict[column] = func
|
||||
|
||||
if agg_dict:
|
||||
result = df.groupby(group_by).agg(agg_dict)
|
||||
logger.debug(f"聚合计算: 按 {group_by} 分组, 聚合: {agg_dict}")
|
||||
return result.reset_index()
|
||||
|
||||
return df
|
||||
|
||||
# 便捷方法
|
||||
def multiply(self, source_column: str, target_column: str, factor: float):
|
||||
"""乘法计算"""
|
||||
self.add_rule('multiply', source_column=source_column,
|
||||
target_column=target_column, factor=factor)
|
||||
return self
|
||||
|
||||
def divide(self, source_column: str, target_column: str, divisor: float):
|
||||
"""除法计算"""
|
||||
self.add_rule('divide', source_column=source_column,
|
||||
target_column=target_column, divisor=divisor)
|
||||
return self
|
||||
|
||||
def add(self, columns: Union[str, List[str]], target_column: str, constant: float = 0):
|
||||
"""加法计算"""
|
||||
self.add_rule('add', columns=columns, target_column=target_column, constant=constant)
|
||||
return self
|
||||
|
||||
def subtract(self, minuend: str, target_column: str,
|
||||
subtrahend: Optional[str] = None, constant: float = 0):
|
||||
"""减法计算"""
|
||||
self.add_rule('subtract', minuend=minuend, target_column=target_column,
|
||||
subtrahend=subtrahend, constant=constant)
|
||||
return self
|
||||
|
||||
def formula(self, formula: str, target_column: str):
|
||||
"""公式计算"""
|
||||
self.add_rule('formula', formula=formula, target_column=target_column)
|
||||
return self
|
||||
|
||||
def round_columns(self, columns: Optional[Union[str, List[str]]] = None, decimals: int = 0):
|
||||
"""四舍五入"""
|
||||
self.add_rule('round', columns=columns, decimals=decimals)
|
||||
return self
|
||||
|
||||
def sum_columns(self, columns: Optional[Union[str, List[str]]] = None,
|
||||
target_column: Optional[str] = None, group_by: Optional[str] = None):
|
||||
"""求和计算"""
|
||||
self.add_rule('sum', columns=columns, target_column=target_column, group_by=group_by)
|
||||
return self
|
||||
|
||||
def aggregate(self, group_by: str, aggregations: Dict[str, Union[str, List[str]]]):
|
||||
"""聚合计算"""
|
||||
self.add_rule('aggregate', group_by=group_by, aggregations=aggregations)
|
||||
return self
|
||||
@@ -0,0 +1,276 @@
|
||||
"""
|
||||
列映射处理器
|
||||
|
||||
提供列名映射和转换功能,支持不同供应商的列名标准化
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from typing import Dict, Any, Optional, List, Union
|
||||
from ...core.utils.log_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class ColumnMapper:
|
||||
"""列映射处理器
|
||||
|
||||
提供列名标准化功能,将不同供应商的列名映射到标准列名
|
||||
"""
|
||||
|
||||
# 标准列名定义
|
||||
STANDARD_COLUMNS = {
|
||||
'barcode': ['条码', '条形码', '商品条码', '产品条码', '条码(必填)', 'barcode', 'code'],
|
||||
'name': ['商品名称', '产品名称', '名称', '商品', '产品', 'name', 'product_name'],
|
||||
'specification': ['规格', '规格型号', '型号', 'specification', 'spec', 'model'],
|
||||
'quantity': ['数量', '采购量', '订货数量', '订单量', '需求量', 'quantity', 'qty', '采购量(必填)'],
|
||||
'unit': ['单位', '计量单位', 'unit', 'units'],
|
||||
'unit_price': ['单价', '价格', '采购单价', '进货价', 'unit_price', 'price', '采购单价(必填)'],
|
||||
'total_price': ['总价', '金额', '小计', 'total_price', 'total', 'amount'],
|
||||
'category': ['类别', '分类', '商品类别', 'category', 'type'],
|
||||
'brand': ['品牌', '商标', 'brand'],
|
||||
'supplier': ['供应商', '供货商', 'supplier', 'vendor']
|
||||
}
|
||||
|
||||
def __init__(self, mapping_config: Optional[Dict[str, Any]] = None):
|
||||
"""初始化列映射器
|
||||
|
||||
Args:
|
||||
mapping_config: 映射配置
|
||||
"""
|
||||
self.mapping_config = mapping_config or {}
|
||||
self.custom_mappings = {}
|
||||
self._build_reverse_mapping()
|
||||
|
||||
def _build_reverse_mapping(self):
|
||||
"""构建反向映射表"""
|
||||
self.reverse_mapping = {}
|
||||
|
||||
# 添加标准列的反向映射
|
||||
for standard_name, variations in self.STANDARD_COLUMNS.items():
|
||||
for variation in variations:
|
||||
self.reverse_mapping[variation.lower()] = standard_name
|
||||
|
||||
# 添加自定义映射
|
||||
for standard_name, custom_names in self.mapping_config.items():
|
||||
if isinstance(custom_names, str):
|
||||
custom_names = [custom_names]
|
||||
|
||||
for custom_name in custom_names:
|
||||
self.reverse_mapping[custom_name.lower()] = standard_name
|
||||
self.custom_mappings[custom_name.lower()] = standard_name
|
||||
|
||||
def map_columns(self, df: pd.DataFrame, target_columns: Optional[List[str]] = None) -> pd.DataFrame:
|
||||
"""映射列名
|
||||
|
||||
Args:
|
||||
df: 输入数据
|
||||
target_columns: 目标列名列表,如果为None则使用所有标准列
|
||||
|
||||
Returns:
|
||||
列名映射后的数据
|
||||
"""
|
||||
if target_columns is None:
|
||||
target_columns = list(self.STANDARD_COLUMNS.keys())
|
||||
|
||||
logger.info(f"开始列名映射,目标列: {target_columns}")
|
||||
logger.info(f"原始列名: {list(df.columns)}")
|
||||
|
||||
# 创建列名映射
|
||||
column_mapping = {}
|
||||
used_columns = set()
|
||||
|
||||
for target_col in target_columns:
|
||||
# 查找匹配的原始列名
|
||||
matched_column = self._find_matching_column(df.columns, target_col)
|
||||
if matched_column:
|
||||
column_mapping[matched_column] = target_col
|
||||
used_columns.add(matched_column)
|
||||
logger.debug(f"列名映射: {matched_column} -> {target_col}")
|
||||
|
||||
# 重命名列
|
||||
if column_mapping:
|
||||
df_mapped = df.rename(columns=column_mapping)
|
||||
|
||||
# 添加缺失的目标列
|
||||
for target_col in target_columns:
|
||||
if target_col not in df_mapped.columns:
|
||||
df_mapped[target_col] = self._get_default_value(target_col)
|
||||
logger.debug(f"添加缺失列: {target_col}")
|
||||
|
||||
# 只保留目标列
|
||||
existing_target_columns = [col for col in target_columns if col in df_mapped.columns]
|
||||
df_result = df_mapped[existing_target_columns]
|
||||
|
||||
logger.info(f"列名映射完成,结果列名: {list(df_result.columns)}")
|
||||
return df_result
|
||||
else:
|
||||
logger.warning("没有找到可映射的列名")
|
||||
return df
|
||||
|
||||
def _find_matching_column(self, columns: List[str], target_column: str) -> Optional[str]:
|
||||
"""查找匹配的列名
|
||||
|
||||
Args:
|
||||
columns: 原始列名列表
|
||||
target_column: 目标标准列名
|
||||
|
||||
Returns:
|
||||
匹配的原始列名或None
|
||||
"""
|
||||
# 获取目标列的所有可能变体
|
||||
possible_names = []
|
||||
|
||||
# 标准列名变体
|
||||
if target_column in self.STANDARD_COLUMNS:
|
||||
possible_names.extend(self.STANDARD_COLUMNS[target_column])
|
||||
|
||||
# 自定义映射
|
||||
for standard_name, custom_names in self.mapping_config.items():
|
||||
if standard_name == target_column:
|
||||
if isinstance(custom_names, str):
|
||||
possible_names.append(custom_names)
|
||||
else:
|
||||
possible_names.extend(custom_names)
|
||||
|
||||
# 查找匹配
|
||||
for possible_name in possible_names:
|
||||
# 精确匹配(忽略大小写)
|
||||
for column in columns:
|
||||
if column.lower() == possible_name.lower():
|
||||
return column
|
||||
|
||||
# 模糊匹配
|
||||
for column in columns:
|
||||
if possible_name.lower() in column.lower() or column.lower() in possible_name.lower():
|
||||
return column
|
||||
|
||||
return None
|
||||
|
||||
def _get_default_value(self, column_name: str) -> Any:
|
||||
"""获取列的默认值
|
||||
|
||||
Args:
|
||||
column_name: 列名
|
||||
|
||||
Returns:
|
||||
默认值
|
||||
"""
|
||||
# 根据列名类型返回合适的默认值
|
||||
if column_name in ['quantity', 'unit_price', 'total_price']:
|
||||
return 0
|
||||
elif column_name in ['barcode', 'name', 'specification', 'unit', 'category', 'brand', 'supplier']:
|
||||
return ''
|
||||
else:
|
||||
return None
|
||||
|
||||
def add_custom_mapping(self, standard_name: str, custom_names: Union[str, List[str]]):
|
||||
"""添加自定义列名映射
|
||||
|
||||
Args:
|
||||
standard_name: 标准列名
|
||||
custom_names: 自定义列名或列名列表
|
||||
"""
|
||||
if isinstance(custom_names, str):
|
||||
custom_names = [custom_names]
|
||||
|
||||
# 更新配置
|
||||
self.mapping_config[standard_name] = custom_names
|
||||
|
||||
# 更新反向映射
|
||||
for custom_name in custom_names:
|
||||
self.reverse_mapping[custom_name.lower()] = standard_name
|
||||
self.custom_mappings[custom_name.lower()] = standard_name
|
||||
|
||||
logger.info(f"添加自定义映射: {standard_name} <- {custom_names}")
|
||||
|
||||
def detect_column_types(self, df: pd.DataFrame) -> Dict[str, str]:
|
||||
"""检测列的数据类型
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
|
||||
Returns:
|
||||
列类型字典
|
||||
"""
|
||||
column_types = {}
|
||||
|
||||
for column in df.columns:
|
||||
if pd.api.types.is_numeric_dtype(df[column]):
|
||||
column_types[column] = 'numeric'
|
||||
elif pd.api.types.is_datetime64_any_dtype(df[column]):
|
||||
column_types[column] = 'datetime'
|
||||
elif pd.api.types.is_bool_dtype(df[column]):
|
||||
column_types[column] = 'boolean'
|
||||
else:
|
||||
column_types[column] = 'text'
|
||||
|
||||
return column_types
|
||||
|
||||
def suggest_column_mapping(self, df: pd.DataFrame) -> Dict[str, List[str]]:
|
||||
"""建议列名映射
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
|
||||
Returns:
|
||||
建议的映射关系
|
||||
"""
|
||||
suggestions = {}
|
||||
|
||||
for column in df.columns:
|
||||
column_lower = column.lower()
|
||||
suggestions[column] = []
|
||||
|
||||
# 检查标准列名
|
||||
for standard_name, variations in self.STANDARD_COLUMNS.items():
|
||||
for variation in variations:
|
||||
if column_lower in variation.lower() or variation.lower() in column_lower:
|
||||
suggestions[column].append(standard_name)
|
||||
|
||||
# 检查自定义映射
|
||||
for custom_name, standard_name in self.custom_mappings.items():
|
||||
if column_lower in custom_name or custom_name in column_lower:
|
||||
suggestions[column].append(standard_name)
|
||||
|
||||
# 去重
|
||||
suggestions[column] = list(set(suggestions[column]))
|
||||
|
||||
# 只返回有建议的列
|
||||
return {k: v for k, v in suggestions.items() if v}
|
||||
|
||||
def validate_mapping(self, df: pd.DataFrame, required_columns: List[str]) -> Dict[str, Any]:
|
||||
"""验证列映射结果
|
||||
|
||||
Args:
|
||||
df: 映射后的数据
|
||||
required_columns: 必需的列名列表
|
||||
|
||||
Returns:
|
||||
验证结果
|
||||
"""
|
||||
result = {
|
||||
'valid': True,
|
||||
'missing_columns': [],
|
||||
'empty_columns': [],
|
||||
'warnings': []
|
||||
}
|
||||
|
||||
# 检查缺失列
|
||||
for col in required_columns:
|
||||
if col not in df.columns:
|
||||
result['missing_columns'].append(col)
|
||||
result['valid'] = False
|
||||
|
||||
# 检查空列
|
||||
for col in df.columns:
|
||||
if df[col].isnull().all():
|
||||
result['empty_columns'].append(col)
|
||||
result['warnings'].append(f"列 '{col}' 全部为空值")
|
||||
|
||||
# 检查数值列
|
||||
numeric_columns = ['quantity', 'unit_price', 'total_price']
|
||||
for col in numeric_columns:
|
||||
if col in df.columns and not pd.api.types.is_numeric_dtype(df[col]):
|
||||
result['warnings'].append(f"列 '{col}' 不是数值类型")
|
||||
|
||||
return result
|
||||
@@ -0,0 +1,401 @@
|
||||
"""
|
||||
数据清洗处理器
|
||||
|
||||
提供各种数据清洗功能,如空值处理、重复项处理、数据类型转换等
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from typing import Dict, Any, Optional, List, Union
|
||||
from ...core.utils.log_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class DataCleaner:
|
||||
"""数据清洗处理器
|
||||
|
||||
提供标准化的数据清洗功能,支持链式调用和规则配置
|
||||
"""
|
||||
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||
"""初始化数据清洗器
|
||||
|
||||
Args:
|
||||
config: 清洗配置
|
||||
"""
|
||||
self.config = config or {}
|
||||
self.cleaning_rules = []
|
||||
|
||||
def add_rule(self, rule_type: str, **kwargs):
|
||||
"""添加清洗规则
|
||||
|
||||
Args:
|
||||
rule_type: 规则类型
|
||||
**kwargs: 规则参数
|
||||
"""
|
||||
rule = {'type': rule_type, **kwargs}
|
||||
self.cleaning_rules.append(rule)
|
||||
logger.debug(f"添加清洗规则: {rule_type}")
|
||||
|
||||
def clean(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""执行数据清洗
|
||||
|
||||
Args:
|
||||
df: 输入数据
|
||||
|
||||
Returns:
|
||||
清洗后的数据
|
||||
"""
|
||||
logger.info(f"开始数据清洗,原始数据形状: {df.shape}")
|
||||
|
||||
result_df = df.copy()
|
||||
|
||||
for i, rule in enumerate(self.cleaning_rules):
|
||||
try:
|
||||
logger.debug(f"执行清洗规则 {i+1}/{len(self.cleaning_rules)}: {rule['type']}")
|
||||
result_df = self._apply_rule(result_df, rule)
|
||||
logger.debug(f"规则执行完成,数据形状: {result_df.shape}")
|
||||
except Exception as e:
|
||||
logger.error(f"清洗规则执行失败: {rule}, 错误: {e}")
|
||||
# 继续执行下一个规则,而不是中断整个流程
|
||||
continue
|
||||
|
||||
logger.info(f"数据清洗完成,最终数据形状: {result_df.shape}")
|
||||
return result_df
|
||||
|
||||
def _apply_rule(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""应用单个清洗规则
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
rule_type = rule.get('type')
|
||||
|
||||
if rule_type == 'remove_duplicates':
|
||||
return self._remove_duplicates(df, rule)
|
||||
elif rule_type == 'fill_na':
|
||||
return self._fill_na(df, rule)
|
||||
elif rule_type == 'remove_rows':
|
||||
return self._remove_rows(df, rule)
|
||||
elif rule_type == 'convert_type':
|
||||
return self._convert_type(df, rule)
|
||||
elif rule_type == 'strip_whitespace':
|
||||
return self._strip_whitespace(df, rule)
|
||||
elif rule_type == 'normalize_text':
|
||||
return self._normalize_text(df, rule)
|
||||
elif rule_type == 'validate_data':
|
||||
return self._validate_data(df, rule)
|
||||
else:
|
||||
logger.warning(f"未知的清洗规则类型: {rule_type}")
|
||||
return df
|
||||
|
||||
def _remove_duplicates(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""移除重复项
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
subset = rule.get('subset') # 用于判断重复的列
|
||||
keep = rule.get('keep', 'first') # 保留哪个重复项
|
||||
|
||||
before_count = len(df)
|
||||
df_cleaned = df.drop_duplicates(subset=subset, keep=keep)
|
||||
after_count = len(df_cleaned)
|
||||
|
||||
logger.info(f"移除重复项: {before_count - after_count} 行被移除")
|
||||
return df_cleaned
|
||||
|
||||
def _fill_na(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""填充空值
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
columns = rule.get('columns') # 要处理的列
|
||||
value = rule.get('value', 0) # 填充值
|
||||
method = rule.get('method') # 填充方法('ffill', 'bfill', 'mean', 'median')
|
||||
|
||||
if columns:
|
||||
# 处理指定列
|
||||
if isinstance(columns, str):
|
||||
columns = [columns]
|
||||
|
||||
for col in columns:
|
||||
if col in df.columns:
|
||||
if method == 'ffill':
|
||||
df[col] = df[col].fillna(method='ffill')
|
||||
elif method == 'bfill':
|
||||
df[col] = df[col].fillna(method='bfill')
|
||||
elif method == 'mean':
|
||||
df[col] = df[col].fillna(df[col].mean())
|
||||
elif method == 'median':
|
||||
df[col] = df[col].fillna(df[col].median())
|
||||
else:
|
||||
df[col] = df[col].fillna(value)
|
||||
|
||||
logger.debug(f"填充列 {col} 的空值: {method or value}")
|
||||
else:
|
||||
# 处理所有列
|
||||
if method == 'ffill':
|
||||
df = df.fillna(method='ffill')
|
||||
elif method == 'bfill':
|
||||
df = df.fillna(method='bfill')
|
||||
else:
|
||||
df = df.fillna(value)
|
||||
|
||||
logger.debug(f"填充所有列的空值: {method or value}")
|
||||
|
||||
return df
|
||||
|
||||
def _remove_rows(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""移除行
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
condition = rule.get('condition') # 条件表达式
|
||||
columns = rule.get('columns') # 要检查的列
|
||||
values = rule.get('values') # 要移除的值
|
||||
|
||||
if condition:
|
||||
# 使用条件表达式
|
||||
try:
|
||||
before_count = len(df)
|
||||
df_filtered = df.query(condition)
|
||||
after_count = len(df_filtered)
|
||||
logger.info(f"条件过滤: {condition}, 移除了 {before_count - after_count} 行")
|
||||
return df_filtered
|
||||
except Exception as e:
|
||||
logger.error(f"条件表达式执行失败: {condition}, 错误: {e}")
|
||||
return df
|
||||
|
||||
if columns and values:
|
||||
# 基于列值过滤
|
||||
if isinstance(columns, str):
|
||||
columns = [columns]
|
||||
if not isinstance(values, list):
|
||||
values = [values]
|
||||
|
||||
df_filtered = df.copy()
|
||||
for col in columns:
|
||||
if col in df_filtered.columns:
|
||||
mask = ~df_filtered[col].isin(values)
|
||||
df_filtered = df_filtered[mask]
|
||||
logger.debug(f"列 {col} 过滤值 {values}")
|
||||
|
||||
return df_filtered
|
||||
|
||||
logger.warning("移除行规则缺少条件或列配置")
|
||||
return df
|
||||
|
||||
def _convert_type(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""类型转换
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
columns = rule.get('columns')
|
||||
target_type = rule.get('target_type', 'float')
|
||||
errors = rule.get('errors', 'coerce') # 错误处理方式
|
||||
|
||||
if isinstance(columns, str):
|
||||
columns = [columns]
|
||||
|
||||
for col in columns:
|
||||
if col in df.columns:
|
||||
try:
|
||||
if target_type == 'int':
|
||||
df[col] = pd.to_numeric(df[col], errors=errors).astype('Int64')
|
||||
elif target_type == 'float':
|
||||
df[col] = pd.to_numeric(df[col], errors=errors)
|
||||
elif target_type == 'datetime':
|
||||
df[col] = pd.to_datetime(df[col], errors=errors)
|
||||
elif target_type == 'string':
|
||||
df[col] = df[col].astype(str)
|
||||
else:
|
||||
df[col] = df[col].astype(target_type)
|
||||
|
||||
logger.debug(f"列 {col} 类型转换: {target_type}")
|
||||
except Exception as e:
|
||||
logger.error(f"列 {col} 类型转换失败: {e}")
|
||||
|
||||
return df
|
||||
|
||||
def _strip_whitespace(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""去除空白字符
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
columns = rule.get('columns')
|
||||
|
||||
if columns:
|
||||
if isinstance(columns, str):
|
||||
columns = [columns]
|
||||
|
||||
for col in columns:
|
||||
if col in df.columns and df[col].dtype == 'object':
|
||||
df[col] = df[col].str.strip()
|
||||
logger.debug(f"列 {col} 去除空白字符")
|
||||
else:
|
||||
# 处理所有文本列
|
||||
text_columns = df.select_dtypes(include=['object']).columns
|
||||
for col in text_columns:
|
||||
df[col] = df[col].str.strip()
|
||||
|
||||
logger.debug(f"所有文本列去除空白字符: {list(text_columns)}")
|
||||
|
||||
return df
|
||||
|
||||
def _normalize_text(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""文本标准化
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
columns = rule.get('columns')
|
||||
lowercase = rule.get('lowercase', False)
|
||||
uppercase = rule.get('uppercase', False)
|
||||
replace_map = rule.get('replace_map', {}) # 替换映射
|
||||
|
||||
if isinstance(columns, str):
|
||||
columns = [columns]
|
||||
|
||||
target_columns = columns or df.select_dtypes(include=['object']).columns
|
||||
|
||||
for col in target_columns:
|
||||
if col in df.columns and df[col].dtype == 'object':
|
||||
if lowercase:
|
||||
df[col] = df[col].str.lower()
|
||||
elif uppercase:
|
||||
df[col] = df[col].str.upper()
|
||||
|
||||
# 应用替换映射
|
||||
for old, new in replace_map.items():
|
||||
df[col] = df[col].str.replace(old, new)
|
||||
|
||||
logger.debug(f"列 {col} 文本标准化完成")
|
||||
|
||||
return df
|
||||
|
||||
def _validate_data(self, df: pd.DataFrame, rule: Dict[str, Any]) -> pd.DataFrame:
|
||||
"""数据验证
|
||||
|
||||
Args:
|
||||
df: 数据
|
||||
rule: 规则配置
|
||||
|
||||
Returns:
|
||||
处理后的数据
|
||||
"""
|
||||
columns = rule.get('columns')
|
||||
min_value = rule.get('min_value')
|
||||
max_value = rule.get('max_value')
|
||||
required = rule.get('required', False)
|
||||
|
||||
if isinstance(columns, str):
|
||||
columns = [columns]
|
||||
|
||||
validation_results = []
|
||||
|
||||
for col in columns:
|
||||
if col in df.columns:
|
||||
# 检查必需值
|
||||
if required:
|
||||
null_count = df[col].isnull().sum()
|
||||
if null_count > 0:
|
||||
validation_results.append(f"{col}: {null_count} 个空值")
|
||||
|
||||
# 检查数值范围
|
||||
if min_value is not None or max_value is not None:
|
||||
if pd.api.types.is_numeric_dtype(df[col]):
|
||||
invalid_mask = pd.Series(False, index=df.index)
|
||||
if min_value is not None:
|
||||
invalid_mask |= df[col] < min_value
|
||||
if max_value is not None:
|
||||
invalid_mask |= df[col] > max_value
|
||||
|
||||
invalid_count = invalid_mask.sum()
|
||||
if invalid_count > 0:
|
||||
validation_results.append(f"{col}: {invalid_count} 个值超出范围")
|
||||
|
||||
if validation_results:
|
||||
logger.warning(f"数据验证发现问题: {', '.join(validation_results)}")
|
||||
else:
|
||||
logger.debug("数据验证通过")
|
||||
|
||||
return df
|
||||
|
||||
# 便捷方法
|
||||
def remove_duplicates(self, subset: Optional[List[str]] = None, keep: str = 'first'):
|
||||
"""移除重复项"""
|
||||
self.add_rule('remove_duplicates', subset=subset, keep=keep)
|
||||
return self
|
||||
|
||||
def fill_na(self, columns: Optional[Union[str, List[str]]] = None,
|
||||
value: Any = 0, method: Optional[str] = None):
|
||||
"""填充空值"""
|
||||
self.add_rule('fill_na', columns=columns, value=value, method=method)
|
||||
return self
|
||||
|
||||
def remove_rows(self, condition: Optional[str] = None,
|
||||
columns: Optional[Union[str, List[str]]] = None,
|
||||
values: Optional[Any] = None):
|
||||
"""移除行"""
|
||||
self.add_rule('remove_rows', condition=condition, columns=columns, values=values)
|
||||
return self
|
||||
|
||||
def convert_type(self, columns: Union[str, List[str]], target_type: str, errors: str = 'coerce'):
|
||||
"""类型转换"""
|
||||
self.add_rule('convert_type', columns=columns, target_type=target_type, errors=errors)
|
||||
return self
|
||||
|
||||
def strip_whitespace(self, columns: Optional[Union[str, List[str]]] = None):
|
||||
"""去除空白字符"""
|
||||
self.add_rule('strip_whitespace', columns=columns)
|
||||
return self
|
||||
|
||||
def normalize_text(self, columns: Optional[Union[str, List[str]]] = None,
|
||||
lowercase: bool = False, uppercase: bool = False,
|
||||
replace_map: Optional[Dict[str, str]] = None):
|
||||
"""文本标准化"""
|
||||
self.add_rule('normalize_text', columns=columns, lowercase=lowercase,
|
||||
uppercase=uppercase, replace_map=replace_map or {})
|
||||
return self
|
||||
|
||||
def validate_data(self, columns: Union[str, List[str]],
|
||||
min_value: Optional[float] = None,
|
||||
max_value: Optional[float] = None,
|
||||
required: bool = False):
|
||||
"""数据验证"""
|
||||
self.add_rule('validate_data', columns=columns, min_value=min_value,
|
||||
max_value=max_value, required=required)
|
||||
return self
|
||||
Reference in New Issue
Block a user