增强版v2-初始化仓库,验证好了ocr部分,先备份一次
This commit is contained in:
@@ -0,0 +1,5 @@
|
||||
"""
|
||||
OCR订单处理系统 - Excel处理模块
|
||||
----------------------------
|
||||
提供Excel文件处理、数据提取和转换功能。
|
||||
"""
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,213 @@
|
||||
"""
|
||||
单位转换处理模块
|
||||
-------------
|
||||
提供规格和单位的处理和转换功能。
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Dict, List, Optional, Tuple, Any
|
||||
|
||||
from ..utils.log_utils import get_logger
|
||||
from ..utils.string_utils import (
|
||||
clean_string,
|
||||
extract_number,
|
||||
extract_unit,
|
||||
extract_number_and_unit,
|
||||
parse_specification
|
||||
)
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
class UnitConverter:
|
||||
"""
|
||||
单位转换器:处理商品规格和单位转换
|
||||
"""
|
||||
|
||||
def __init__(self):
    """Set up the special-barcode table and the unit lists used by the converter."""
    # Per-barcode overrides keyed by barcode string. More entries can be
    # added here or registered at runtime via add_special_barcode().
    self.special_barcodes = {
        '6925019900087': dict(
            multiplier=10,  # quantity is multiplied by 10
            target_unit='瓶',  # unit written to the converted product
            description='特殊处理:数量*10,单位转换为瓶',
        ),
    }

    # Recognised unit names; each character of the string is one unit.
    self.valid_units = list('件箱包提盒瓶个支袋副桶罐Ll升')

    # Units that process_unit_conversion() expands using the package size.
    self.special_units = list('件箱提盒')

    logger.info("单位转换器初始化完成")
|
||||
|
||||
def add_special_barcode(self, barcode: str, multiplier: int, target_unit: str, description: str = "") -> None:
    """Register (or overwrite) the special handling rule for one barcode.

    Args:
        barcode: Barcode the rule applies to; used as the dictionary key.
        multiplier: Factor the quantity is multiplied by (and the price
            divided by) when the rule is applied.
        target_unit: Unit written to the converted product.
        description: Human-readable description of the rule. When empty, a
            default description is generated from ``multiplier`` and
            ``target_unit``.
    """
    # Resolve the description once so the stored value and the log line agree;
    # previously the log printed the (possibly empty) raw argument.
    effective_description = description or f'特殊处理:数量*{multiplier},单位转换为{target_unit}'

    self.special_barcodes[barcode] = {
        'multiplier': multiplier,
        'target_unit': target_unit,
        'description': effective_description,
    }
    logger.info(f"添加特殊条码配置: {barcode}, {effective_description}")
|
||||
|
||||
def infer_specification_from_name(self, product_name: str) -> Optional[str]:
    """Guess a specification string (e.g. ``1*12``) from a product name.

    Args:
        product_name: Product name to inspect.

    Returns:
        The inferred specification, or None when the name is empty, not a
        string, matches no rule, or an error occurs while matching.
    """
    if not (product_name and isinstance(product_name, str)):
        return None

    # Ordered (regex, output template) rules; the first regex that matches
    # wins and its captured groups are substituted into the template.
    rules = (
        # 1. "XX入纸箱"
        (r'(\d+)入纸箱', "1*{0}"),
        # 2. "绿茶1*15-纸箱装"
        (r'(\d+)[*×xX](\d+)[-\s]?纸箱', "{0}*{1}"),
        # 3. "12.9L桶装水" (volume with no multiplication sign after it)
        (r'([\d\.]+)[Ll升](?!.*[*×xX])', "{0}L*1"),
        # 4. "商品12入纸箱" (number in the middle of the name)
        (r'\D(\d+)入\w*箱', "1*{0}"),
        # 5. "商品15纸箱" (number in the middle of the name)
        (r'\D(\d+)\w*箱', "1*{0}"),
        # 6. "商品1*30"
        (r'(\d+)[*×xX](\d+)', "{0}*{1}"),
    )

    try:
        # Normalise the name before matching.
        name = clean_string(product_name)

        for pattern, template in rules:
            found = re.search(pattern, name)
            if found:
                return template.format(*found.groups())

        logger.debug(f"无法从商品名称推断规格: {name}")
        return None

    except Exception as e:
        logger.error(f"从商品名称推断规格时出错: {e}")
        return None
|
||||
|
||||
def extract_unit_from_quantity(self, quantity_str: str) -> Tuple[Optional[float], Optional[str]]:
    """Split a quantity string into its numeric part and its unit.

    Args:
        quantity_str: Raw quantity text.

    Returns:
        Whatever ``extract_number_and_unit`` returns for the cleaned text
        (annotated as a ``(number, unit)`` tuple), or ``(None, None)`` when
        the input is empty, not a string, or extraction raises.
    """
    nothing = (None, None)

    if not (quantity_str and isinstance(quantity_str, str)):
        return nothing

    try:
        # Clean the text first, then let the shared helper do the parsing.
        quantity_str = clean_string(quantity_str)
        parsed = extract_number_and_unit(quantity_str)
    except Exception as e:
        logger.error(f"从数量字符串提取单位时出错: {quantity_str}, 错误: {e}")
        return nothing

    return parsed
|
||||
|
||||
def process_unit_conversion(self, product: Dict[str, Any]) -> Dict[str, Any]:
    """Convert a product's quantity, unit and price according to its unit and specification.

    Args:
        product: Product dictionary; the keys read are ``barcode``, ``unit``,
            ``specification``, ``quantity`` and ``price``.

    Returns:
        A shallow copy of ``product`` with ``quantity``, ``unit`` and
        ``price`` rewritten when a rule applies. If an error occurs, the copy
        is returned in whatever state it had reached.
    """
    # Work on a copy so the caller's dictionary is never modified.
    converted = product.copy()

    try:
        barcode = product.get('barcode', '')
        unit = product.get('unit', '')
        specification = product.get('specification', '')
        quantity = product.get('quantity', 0)
        price = product.get('price', 0)

        # Without a barcode or with a zero quantity there is nothing to convert.
        if not barcode or quantity == 0:
            return converted

        # 1. Barcode-specific rules take precedence over everything else.
        if barcode in self.special_barcodes:
            rule = self.special_barcodes[barcode]
            logger.info(f"应用特殊条码配置: {barcode}, {rule['description']}")

            converted['quantity'] = quantity * rule['multiplier']
            converted['unit'] = rule['target_unit']

            # Only rescale the unit price when one is present.
            if price != 0:
                converted['price'] = price / rule['multiplier']

            return converted

        # 2. Package size parsed from the specification, if there is one.
        package_quantity = parse_specification(specification) if specification else None

        # 3. Only the special units with a known package size are converted.
        if not (unit and unit in self.special_units and package_quantity):
            # 4. Everything else is returned unchanged.
            return converted

        # Three-level specification, e.g. "1*5*12".
        is_three_level = bool(re.search(r'\d+[\*xX×]\d+[\*xX×]\d+', str(specification)))

        # "提" / "盒" with a two-level specification are kept as they are.
        if unit in ['提', '盒'] and not is_three_level:
            logger.info(f"二级规格的提/盒单位,保持原状: {unit}, 规格={specification}")
            return converted

        # Standard rule: quantity x package size, price / package size.
        logger.info(f"标准单位转换: {unit}->瓶, 规格={specification}, 包装数量={package_quantity}")
        converted['quantity'] = quantity * package_quantity
        converted['unit'] = '瓶'

        if price != 0:
            converted['price'] = price / package_quantity

        return converted

    except Exception as e:
        logger.error(f"单位转换处理出错: {e}")
        # On error, hand back the copy as it stands.
        return converted
|
||||
@@ -0,0 +1,375 @@
|
||||
"""
|
||||
订单合并模块
|
||||
----------
|
||||
提供采购单合并功能,将多个采购单合并为一个。
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import xlrd
|
||||
import xlwt
|
||||
from xlutils.copy import copy as xlcopy
|
||||
from typing import Dict, List, Optional, Tuple, Union, Any
|
||||
from datetime import datetime
|
||||
|
||||
from ...config.settings import ConfigManager
|
||||
from ..utils.log_utils import get_logger
|
||||
from ..utils.file_utils import (
|
||||
ensure_dir,
|
||||
get_file_extension,
|
||||
get_files_by_extensions,
|
||||
load_json,
|
||||
save_json
|
||||
)
|
||||
from ..utils.string_utils import (
|
||||
clean_string,
|
||||
clean_barcode,
|
||||
format_barcode
|
||||
)
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
class PurchaseOrderMerger:
|
||||
"""
|
||||
采购单合并器:将多个采购单Excel文件合并成一个文件
|
||||
"""
|
||||
|
||||
def __init__(self, config: Optional[ConfigManager] = None):
    """Initialise the merger from configuration.

    Args:
        config: Configuration manager; a new ``ConfigManager`` is created
            when this is None (or otherwise falsy).

    Raises:
        FileNotFoundError: If the purchase-order template file does not exist.
    """
    logger.info("初始化PurchaseOrderMerger")
    self.config = config if config else ConfigManager()

    # Output directory (requested with create=True).
    self.output_dir = self.config.get_path('Paths', 'output_folder', 'data/output', create=True)

    # Template location = configured folder + configured file name.
    self.template_path = os.path.join(
        self.config.get('Paths', 'template_folder', 'templates'),
        self.config.get('Templates', 'purchase_order', '银豹-采购单模板.xls'),
    )

    # Fail fast when the template is missing.
    if not os.path.exists(self.template_path):
        logger.error(f"模板文件不存在: {self.template_path}")
        raise FileNotFoundError(f"模板文件不存在: {self.template_path}")

    # JSON cache recording which source files were merged into which output.
    self.cache_file = os.path.join(self.output_dir, "merged_files.json")
    self.merged_files = self._load_merged_files()

    logger.info(f"初始化完成,模板文件: {self.template_path}")
|
||||
|
||||
def _load_merged_files(self) -> Dict[str, str]:
    """Load the merge-record cache from ``self.cache_file``.

    Returns:
        The result of ``load_json`` for the cache file, called with ``{}``
        as its second argument.
    """
    records = load_json(self.cache_file, {})
    return records
|
||||
|
||||
def _save_merged_files(self) -> None:
    """Write the in-memory merge records to ``self.cache_file``."""
    records, target = self.merged_files, self.cache_file
    save_json(records, target)
|
||||
|
||||
def get_purchase_orders(self) -> List[str]:
    """List the purchase-order Excel files in the output directory.

    Returns:
        Paths of ``.xls`` / ``.xlsx`` files whose base name starts with
        ``采购单_``, newest modification time first; an empty list when
        none are found.
    """
    logger.info(f"搜索目录 {self.output_dir} 中的采购单Excel文件")

    # Collect every Excel file, then keep only purchase orders.
    purchase_orders = []
    for candidate in get_files_by_extensions(self.output_dir, ['.xls', '.xlsx']):
        if os.path.basename(candidate).startswith('采购单_'):
            purchase_orders.append(candidate)

    if not purchase_orders:
        logger.warning(f"未在 {self.output_dir} 目录下找到采购单Excel文件")
        return []

    # Most recently modified first.
    purchase_orders.sort(key=os.path.getmtime, reverse=True)

    logger.info(f"找到 {len(purchase_orders)} 个采购单Excel文件")
    return purchase_orders
|
||||
|
||||
def read_purchase_order(self, file_path: str) -> Optional[pd.DataFrame]:
    """Read a purchase-order Excel file and rename its columns to canonical names.

    The sheet is read with ``pd.read_excel``. If the row at positional index
    3 contains at least three known header keywords, that row is used as the
    header and the data starts on the following row. Recognised columns are
    then renamed to ``条码``, ``采购量``, ``采购单价`` and ``赠送量``.

    Args:
        file_path: Path of the purchase-order file.

    Returns:
        The data frame with canonical column names; the frame unchanged when
        no known column is recognised at all; None when some columns are
        recognised but the barcode column is not, or when reading fails.
    """

    def normalize(label: Any) -> str:
        """Drop whitespace and any bracketed part (ASCII or full-width brackets)."""
        text = re.sub(r'\s+', '', str(label))
        # The previous pattern r'(.*?)' was an empty-matching capture group and
        # removed nothing; match real bracket characters instead.
        return re.sub(r'[((].*?[))]', '', text)

    try:
        df = pd.read_excel(file_path)
        logger.info(f"成功读取采购单文件: {file_path}")

        # Column names are logged to help debugging.
        logger.debug(f"Excel文件的列名: {df.columns.tolist()}")

        # Detect a header that sits inside the data (row at index 3).
        if len(df) > 3:
            header_row = df.iloc[3]
            header_cells = [str(val) for val in header_row.astype(str).values]
            header_keywords = ['行号', '条形码', '条码', '商品名称', '规格', '单价', '数量', '金额', '单位']
            matches = sum(
                1 for keyword in header_keywords
                if any(keyword in cell for cell in header_cells)
            )

            # Three or more keyword hits: treat that row as the header.
            if matches >= 3:
                logger.info(f"检测到特殊表头结构,使用第3行作为列名")
                data_rows = df.iloc[4:].reset_index(drop=True)
                # Empty header cells get a positional placeholder name.
                data_rows.columns = [
                    f"Col_{i}" if pd.isna(col) or str(col) in ('nan', 'None') else str(col)
                    for i, col in enumerate(header_row)
                ]
                df = data_rows
                logger.debug(f"重新构建的数据帧列名: {df.columns.tolist()}")

        # Canonical column name -> header spellings accepted for it.
        column_mapping = {
            '条码': ['条码', '条形码', '商品条码', 'barcode', '商品条形码', '商品编码', '商品编号', '条码(必填)'],
            '采购量': ['数量', '采购数量', '购买数量', '订单数量', '采购量(必填)'],
            '采购单价': ['单价', '价格', '采购单价', '销售价', '采购单价(必填)'],
            '赠送量': ['赠送量', '赠品数量', '赠送数量', '赠品'],
        }

        # For each canonical name, take the first column (in sheet order)
        # whose normalised header equals one of the normalised spellings.
        mapped_columns = {}
        for target_col, possible_names in column_mapping.items():
            candidates = {normalize(name) for name in possible_names}
            for col in df.columns:
                if normalize(col) in candidates:
                    mapped_columns[target_col] = col
                    break

        if mapped_columns:
            # Without a barcode column the file cannot be processed.
            if '条码' not in mapped_columns:
                logger.error(f"未找到条码列: {file_path}")
                return None

            # DataFrame.rename expects {current name: new name}; mapped_columns
            # is {canonical: current}, so it must be inverted before renaming.
            df = df.rename(columns={actual: target for target, actual in mapped_columns.items()})
            logger.info(f"列名映射结果: {mapped_columns}")

        return df

    except Exception as e:
        logger.error(f"读取采购单文件失败: {file_path}, 错误: {str(e)}")
        return None
|
||||
|
||||
def merge_purchase_orders(self, file_paths: List[str]) -> Optional[pd.DataFrame]:
    """Read several purchase orders and combine them into one frame.

    Rows sharing the same barcode and unit price (rounded to 4 decimals) are
    collapsed into one row whose quantities are summed.

    Args:
        file_paths: Paths of the purchase-order files to merge.

    Returns:
        A frame with columns ``条码``, ``采购单价``, ``采购量``, ``赠送量`` and
        ``采购金额`` sorted by barcode, or None when there is nothing to merge.
    """
    if not file_paths:
        logger.warning("没有需要合并的采购单文件")
        return None

    # Read every file, keeping only those that could be loaded.
    frames = [
        frame for frame in map(self.read_purchase_order, file_paths)
        if frame is not None
    ]
    if not frames:
        logger.warning("没有成功读取的采购单文件")
        return None

    logger.info(f"开始合并 {len(frames)} 个采购单文件")

    required_columns = ('条码', '采购量', '采购单价')
    numeric_columns = ('采购量', '采购单价', '赠送量')

    # Bring every frame to the same four-column layout.
    cleaned = []
    for index, frame in enumerate(frames):
        missing_columns = [name for name in required_columns if name not in frame.columns]
        if missing_columns:
            logger.warning(f"数据帧 {index} 缺少必要的列: {missing_columns}")
            continue

        # The gift-quantity column is optional; add it as missing values.
        if '赠送量' not in frame.columns:
            frame['赠送量'] = pd.NA

        subset = frame[['条码', '采购量', '采购单价', '赠送量']].copy()

        # Format barcodes and coerce the numeric columns (bad values -> NaN).
        subset['条码'] = subset['条码'].apply(lambda x: format_barcode(x) if pd.notna(x) else x)
        for name in numeric_columns:
            subset[name] = pd.to_numeric(subset[name], errors='coerce')

        # Rows without a barcode or a quantity are dropped.
        cleaned.append(subset.dropna(subset=['条码', '采购量']))

    if not cleaned:
        logger.warning("没有有效的数据帧用于合并")
        return None

    merged_df = pd.concat(cleaned, ignore_index=True)

    # Round prices to 4 decimals so floating-point noise does not split groups.
    merged_df['采购单价'] = merged_df['采购单价'].round(4)

    def total_gift(values):
        """Sum the present gift quantities; all-missing groups stay missing."""
        present = values.dropna()
        return sum(present) if len(present) > 0 else pd.NA

    # One row per (barcode, unit price).
    grouped = merged_df.groupby(['条码', '采购单价'], as_index=False).agg({
        '采购量': 'sum',
        '赠送量': total_gift,
    })

    # Line amount = quantity x unit price.
    grouped['采购金额'] = grouped['采购量'] * grouped['采购单价']

    # Ascending by barcode.
    result = grouped.sort_values('条码').reset_index(drop=True)

    logger.info(f"合并完成,共 {len(result)} 条商品记录")
    return result
|
||||
|
||||
def create_merged_purchase_order(self, df: pd.DataFrame) -> Optional[str]:
    """Write the merged data into a copy of the template and save it.

    Args:
        df: Merged frame; the columns read are ``条码``, ``采购单价``,
            ``采购量``, ``采购金额`` and ``赠送量``.

    Returns:
        Path of the saved ``合并采购单_<timestamp>.xls`` file, or None when
        anything fails.
    """
    try:
        # Open the template (keeping its formatting) and make a writable copy.
        template_workbook = xlrd.open_workbook(self.template_path, formatting_info=True)
        template_sheet = template_workbook.sheet_by_index(0)  # not used below
        output_workbook = xlcopy(template_workbook)
        output_sheet = output_workbook.get_sheet(0)

        # Data rows start at zero-based row index 4.
        first_data_row = 4

        for offset, (_, record) in enumerate(df.iterrows()):
            gift = record['赠送量']
            # Values for columns 0..9, in column order.
            cells = (
                offset + 1,            # running number
                record['条码'],         # product code (barcode)
                "",                    # name: not available in a merged order
                "",                    # specification: not available
                "",                    # unit: not available
                record['采购单价'],      # unit price
                record['采购量'],        # purchase quantity
                record['采购金额'],      # purchase amount
                0,                     # tax rate
                gift if pd.notna(gift) else "",  # gift quantity, blank if missing
            )
            for column, value in enumerate(cells):
                output_sheet.write(first_data_row + offset, column, value)

        # Timestamped output file name inside the output directory.
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
        output_file = os.path.join(self.output_dir, f"合并采购单_{timestamp}.xls")

        output_workbook.save(output_file)
        logger.info(f"合并采购单已保存到: {output_file}")
        return output_file

    except Exception as e:
        logger.error(f"创建合并采购单时出错: {e}")
        return None
|
||||
|
||||
def process(self, file_paths: Optional[List[str]] = None) -> Optional[str]:
    """Run the whole merge: collect files, merge them, write the result.

    Args:
        file_paths: Files to merge; when None, the files returned by
            ``get_purchase_orders()`` are used.

    Returns:
        Path of the merged file, or None when any step fails.
    """
    # Fall back to auto-discovery when the caller gave no list.
    if file_paths is None:
        file_paths = self.get_purchase_orders()

    if not file_paths:
        logger.warning("没有找到可合并的采购单文件")
        return None

    merged_df = self.merge_purchase_orders(file_paths)
    if merged_df is None:
        logger.error("合并采购单失败")
        return None

    output_file = self.create_merged_purchase_order(merged_df)
    if output_file is None:
        logger.error("创建合并采购单文件失败")
        return None

    # Remember which output each source file ended up in, then persist.
    self.merged_files.update({source: output_file for source in file_paths})
    self._save_merged_files()

    return output_file
|
||||
@@ -0,0 +1,393 @@
|
||||
"""
|
||||
Excel处理核心模块
|
||||
--------------
|
||||
提供Excel文件处理功能,包括表格解析、数据提取和处理。
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import xlrd
|
||||
import xlwt
|
||||
from xlutils.copy import copy as xlcopy
|
||||
from typing import Dict, List, Optional, Tuple, Union, Any
|
||||
from datetime import datetime
|
||||
|
||||
from ...config.settings import ConfigManager
|
||||
from ..utils.log_utils import get_logger
|
||||
from ..utils.file_utils import (
|
||||
ensure_dir,
|
||||
get_file_extension,
|
||||
get_latest_file,
|
||||
load_json,
|
||||
save_json
|
||||
)
|
||||
from ..utils.string_utils import (
|
||||
clean_string,
|
||||
clean_barcode,
|
||||
extract_number,
|
||||
format_barcode
|
||||
)
|
||||
from .converter import UnitConverter
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
class ExcelProcessor:
|
||||
"""
|
||||
Excel处理器:处理OCR识别后的Excel文件,
|
||||
提取条码、单价和数量,并按照采购单模板的格式填充
|
||||
"""
|
||||
|
||||
def __init__(self, config: Optional[ConfigManager] = None):
    """Initialise the Excel processor from configuration.

    Args:
        config: Configuration manager; a new ``ConfigManager`` is created
            when this is None (or otherwise falsy).

    Raises:
        FileNotFoundError: If the purchase-order template file does not exist.
    """
    logger.info("初始化ExcelProcessor")
    self.config = config if config else ConfigManager()

    # Working directories (both requested with create=True).
    self.output_dir = self.config.get_path('Paths', 'output_folder', 'data/output', create=True)
    self.temp_dir = self.config.get_path('Paths', 'temp_folder', 'data/temp', create=True)

    # Template location = configured folder + configured file name.
    self.template_path = os.path.join(
        self.config.get('Paths', 'template_folder', 'templates'),
        self.config.get('Templates', 'purchase_order', '银豹-采购单模板.xls'),
    )

    # Fail fast when the template is missing.
    if not os.path.exists(self.template_path):
        logger.error(f"模板文件不存在: {self.template_path}")
        raise FileNotFoundError(f"模板文件不存在: {self.template_path}")

    # JSON cache recording which input files produced which output.
    self.cache_file = os.path.join(self.output_dir, "processed_files.json")
    self.processed_files = self._load_processed_files()

    # Helper that applies the unit/specification conversion rules.
    self.unit_converter = UnitConverter()

    logger.info(f"初始化完成,模板文件: {self.template_path}")
|
||||
|
||||
def _load_processed_files(self) -> Dict[str, str]:
    """Load the processed-file cache from ``self.cache_file``.

    Returns:
        The result of ``load_json`` for the cache file, called with ``{}``
        as its second argument.
    """
    records = load_json(self.cache_file, {})
    return records
|
||||
|
||||
def _save_processed_files(self) -> None:
    """Write the in-memory processed-file records to ``self.cache_file``."""
    records, target = self.processed_files, self.cache_file
    save_json(records, target)
|
||||
|
||||
def get_latest_excel(self) -> Optional[str]:
    """Return the newest Excel file in the output directory, unless it is a purchase order.

    Returns:
        Path of the file returned by ``get_latest_file``; None when no file
        is found or when that file's name starts with ``采购单_``.
    """
    logger.info(f"搜索目录 {self.output_dir} 中的Excel文件")

    # Any file name, Excel extensions only.
    latest_file = get_latest_file(
        self.output_dir,
        pattern="",
        extensions=['.xlsx', '.xls'],
    )

    if not latest_file:
        logger.warning(f"未在 {self.output_dir} 目录下找到未处理的Excel文件")
        return None

    # Generated purchase orders (prefix "采购单_") are not processed again.
    if os.path.basename(latest_file).startswith('采购单_'):
        logger.warning(f"找到的最新文件是采购单,不作处理: {latest_file}")
        return None

    logger.info(f"找到最新的Excel文件: {latest_file}")
    return latest_file
|
||||
|
||||
def validate_barcode(self, barcode: Any) -> bool:
    """Check whether a value looks like a usable barcode.

    The warehouse labels ``仓库`` / ``仓库全名`` are rejected up front so they
    are not mistaken for barcodes.

    Args:
        barcode: Raw barcode value.

    Returns:
        True when the cleaned barcode has 8 to 13 characters and consists of
        digits only, otherwise False.
    """
    # Warehouse labels are never barcodes.
    if isinstance(barcode, str) and barcode.strip() in ("仓库", "仓库全名"):
        logger.warning(f"条码为仓库标识: {barcode}")
        return False

    barcode_clean = clean_barcode(barcode)

    # A leading '5' (but not '53') on a code longer than 8 characters is
    # rewritten to '6'. The corrected value is only used for the checks
    # below; it is not returned to the caller.
    needs_prefix_fix = (
        len(barcode_clean) > 8
        and barcode_clean.startswith('5')
        and not barcode_clean.startswith('53')
    )
    if needs_prefix_fix:
        barcode_clean = '6' + barcode_clean[1:]
        logger.info(f"修正条码前缀 5->6: {barcode} -> {barcode_clean}")

    # Length check.
    length = len(barcode_clean)
    if not 8 <= length <= 13:
        logger.warning(f"条码长度异常: {barcode_clean}, 长度={length}")
        return False

    # Digits only.
    if not barcode_clean.isdigit():
        logger.warning(f"条码包含非数字字符: {barcode_clean}")
        return False

    # One explicitly allowed code gets its own (info-level) log line.
    if barcode_clean == "5321545613":
        logger.info(f"特殊条码验证通过: {barcode_clean}")
    else:
        logger.debug(f"条码验证通过: {barcode_clean}")
    return True
|
||||
|
||||
def extract_barcode(self, df: pd.DataFrame) -> List[str]:
    """Find the columns of a data frame whose header names a barcode.

    Args:
        df: Data frame to inspect.

    Returns:
        The original column labels, in frame order, whose stripped string
        form is one of the known barcode header names.
    """
    # Known barcode header spellings (a set: only membership matters).
    known_headers = {
        '条码', '条形码', '商品条码', '商品条形码',
        '商品编码', '商品编号', '条码(必填)',
        'barcode', 'Barcode', '编码',
    }

    return [label for label in df.columns if str(label).strip() in known_headers]
|
||||
|
||||
def extract_product_info(self, df: pd.DataFrame) -> List[Dict]:
    """Extract one product dictionary per valid row of the data frame.

    Args:
        df: Data frame read from the OCR result sheet.

    Returns:
        A list of product dictionaries with keys ``barcode``, ``name``,
        ``specification``, ``quantity``, ``unit`` and ``price`` (after unit
        conversion); an empty list when no barcode column is found.
    """
    barcode_cols = self.extract_barcode(df)

    # Nothing can be extracted without a barcode column.
    if not barcode_cols:
        logger.error("未找到条码列,无法处理")
        return []

    # Logical field -> header spellings accepted for it.
    column_mapping = {
        'name': ['商品名称', '名称', '品名', '商品', '商品名', '商品或服务名称', '品项名'],
        'specification': ['规格', '规格型号', '型号', '商品规格'],
        'quantity': ['数量', '采购数量', '购买数量', '采购数量', '订单数量', '数量(必填)'],
        'unit': ['单位', '采购单位', '计量单位', '单位(必填)'],
        'price': ['单价', '价格', '采购单价', '销售价', '进货价', '单价(必填)']
    }

    # The first barcode column found is used; for every other field take the
    # first frame column whose stripped header is an accepted spelling.
    mapped_columns = {'barcode': barcode_cols[0]}
    for target, possible_names in column_mapping.items():
        for col in df.columns:
            if str(col).strip() in possible_names:
                mapped_columns[target] = col
                break

    logger.info(f"列名映射结果: {mapped_columns}")

    def cell(row, field, default=''):
        """Return the row's value for a logical field, or ``default`` when the
        field has no column or the cell is empty (NaN/None/NA)."""
        column = mapped_columns.get(field)
        if column is None:
            return default
        value = row.get(column, default)
        # Empty Excel cells arrive as NaN, which is truthy and stringifies to
        # 'nan'; map them to the default so the emptiness checks below work.
        return default if pd.isna(value) else value

    products = []

    for _, row in df.iterrows():
        barcode = row.get(mapped_columns['barcode'])

        # Skip empty rows and rows whose barcode does not validate.
        if pd.isna(barcode) or not self.validate_barcode(barcode):
            continue

        product = {
            'barcode': format_barcode(barcode),
            'name': cell(row, 'name'),
            'specification': cell(row, 'specification'),
            'quantity': extract_number(str(cell(row, 'quantity', 0))) or 0,
            'unit': str(cell(row, 'unit')),
            'price': extract_number(str(cell(row, 'price', 0))) or 0
        }

        # No name but a barcode: fall back to a name built from the barcode.
        if not product['name'] and product['barcode']:
            product['name'] = f"商品 ({product['barcode']})"

        # No specification: try to infer one from the name.
        if not product['specification'] and product['name']:
            inferred_spec = self.unit_converter.infer_specification_from_name(product['name'])
            if inferred_spec:
                product['specification'] = inferred_spec
                logger.info(f"从商品名称推断规格: {product['name']} -> {inferred_spec}")

        # No unit: the quantity cell may carry it (e.g. a number followed by
        # a unit). mapped_columns is keyed by logical field names, so the
        # check must be on 'quantity' (the old check on '数量' never matched).
        if not product['unit'] and 'quantity' in mapped_columns:
            quantity_str = str(cell(row, 'quantity'))
            num, unit = self.unit_converter.extract_unit_from_quantity(quantity_str)
            if unit:
                product['unit'] = unit
                logger.info(f"从数量提取单位: {quantity_str} -> {unit}")

                # Use the number parsed alongside the unit, when there is one.
                if num is not None:
                    product['quantity'] = num

        # Apply the unit conversion rules.
        product = self.unit_converter.process_unit_conversion(product)

        products.append(product)

    logger.info(f"提取到 {len(products)} 个商品信息")
    return products
|
||||
|
||||
def fill_template(self, products: List[Dict], output_file_path: str) -> bool:
    """Write the products into a copy of the template and save it.

    Args:
        products: Product dictionaries; the keys read are ``barcode``,
            ``name``, ``specification``, ``unit``, ``price`` and ``quantity``.
        output_file_path: Where the filled workbook is saved.

    Returns:
        True when the file was saved, False when anything failed.
    """
    try:
        # Open the template (keeping its formatting) and make a writable copy.
        template_workbook = xlrd.open_workbook(self.template_path, formatting_info=True)
        template_sheet = template_workbook.sheet_by_index(0)  # not used below
        output_workbook = xlcopy(template_workbook)
        output_sheet = output_workbook.get_sheet(0)

        # Data rows start at zero-based row index 1.
        first_data_row = 1

        for offset, item in enumerate(products):
            target_row = first_data_row + offset

            output_sheet.write(target_row, 0, offset + 1)            # running number
            # Text and price/quantity fields occupy columns 1..6 in this order.
            for column, key in enumerate(
                ('barcode', 'name', 'specification', 'unit', 'price', 'quantity'),
                start=1,
            ):
                output_sheet.write(target_row, column, item[key])
            # Purchase amount = unit price x quantity.
            output_sheet.write(target_row, 7, item['price'] * item['quantity'])
            output_sheet.write(target_row, 8, 0)                     # tax rate
            output_sheet.write(target_row, 9, 0)                     # gift quantity, always 0

        output_workbook.save(output_file_path)
        logger.info(f"采购单已保存到: {output_file_path}")
        return True

    except Exception as e:
        logger.error(f"填充模板时出错: {e}")
        return False
|
||||
|
||||
def process_specific_file(self, file_path: str) -> Optional[str]:
    """Turn one Excel file into a purchase order based on the template.

    Args:
        file_path: Path of the Excel file to process.

    Returns:
        Path of the generated ``采购单_<name>.xls`` file, or None when the
        input is missing, yields no products, or processing fails.
    """
    logger.info(f"开始处理Excel文件: {file_path}")

    if not os.path.exists(file_path):
        logger.error(f"文件不存在: {file_path}")
        return None

    try:
        df = pd.read_excel(file_path)
        logger.info(f"成功读取Excel文件: {file_path}, 共 {len(df)} 行")

        products = self.extract_product_info(df)
        if not products:
            logger.warning("未提取到有效商品信息")
            return None

        # Output name = "采购单_" + input base name without extension + ".xls".
        file_name, _ext = os.path.splitext(os.path.basename(file_path))
        output_file = os.path.join(self.output_dir, f"采购单_{file_name}.xls")

        if not self.fill_template(products, output_file):
            return None

        # Record the input -> output pair and persist the cache.
        self.processed_files[file_path] = output_file
        self._save_processed_files()
        return output_file

    except Exception as e:
        logger.error(f"处理Excel文件时出错: {file_path}, 错误: {e}")
        return None
|
||||
|
||||
def process_latest_file(self) -> Optional[str]:
    """Process the file returned by ``get_latest_excel()``.

    Returns:
        Path of the generated purchase order, or None when no file is
        available or processing fails.
    """
    latest_file = self.get_latest_excel()

    if latest_file:
        return self.process_specific_file(latest_file)

    logger.warning("未找到可处理的Excel文件")
    return None
|
||||
Reference in New Issue
Block a user