增强版v2-初始化仓库,验证好了ocr部分,先备份一次
This commit is contained in:
@@ -0,0 +1,8 @@
|
||||
"""
|
||||
OCR订单处理系统
|
||||
---------------
|
||||
用于自动识别和处理Excel格式的订单文件的系统。
|
||||
支持多种格式的订单处理,包括普通订单和赠品订单的处理。
|
||||
"""
|
||||
|
||||
__version__ = '2.0.0'
|
||||
Binary file not shown.
@@ -0,0 +1,5 @@
|
||||
"""
|
||||
OCR订单处理系统 - 命令行接口
|
||||
-------------------------
|
||||
提供命令行工具,便于用户使用系统功能。
|
||||
"""
|
||||
@@ -0,0 +1,138 @@
|
||||
"""
|
||||
Excel处理命令行工具
|
||||
---------------
|
||||
提供Excel处理相关的命令行接口。
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
from typing import List, Optional
|
||||
|
||||
from ..config.settings import ConfigManager
|
||||
from ..core.utils.log_utils import get_logger, close_logger
|
||||
from ..services.order_service import OrderService
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
def create_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the Excel processing command line tool.

    The parser accepts a global ``--config`` option and two sub-commands:
    ``process`` (with an optional ``--input`` path) and ``list``.  The name
    of the selected sub-command is stored in the ``command`` attribute of
    the parsed namespace.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    root = argparse.ArgumentParser(description='Excel处理工具')

    # Option shared by every sub-command.
    root.add_argument('--config', type=str, help='配置文件路径')

    commands = root.add_subparsers(dest='command', help='子命令')

    # "process": handle one Excel file; --input is optional.
    process_cmd = commands.add_parser('process', help='处理Excel文件')
    process_cmd.add_argument(
        '--input',
        type=str,
        help='输入Excel文件路径,如果不指定则处理最新的文件',
    )

    # "list": takes no options of its own.
    commands.add_parser('list', help='获取最新的Excel文件')

    return root
|
||||
|
||||
def process_excel(order_service: OrderService, input_file: Optional[str] = None) -> bool:
    """Process one Excel file through the order service.

    Args:
        order_service: Service object providing ``process_excel`` and
            ``get_latest_excel``.
        input_file: Path of the Excel file to process.  When falsy, the
            file returned by ``order_service.get_latest_excel()`` is used.

    Returns:
        True when the service returned a truthy result (logged as the
        output file), False when the input is missing, no file could be
        found, or processing returned a falsy result.
    """
    if input_file:
        # An explicit path must exist before it is handed to the service.
        if not os.path.exists(input_file):
            logger.error(f"输入文件不存在: {input_file}")
            return False
        target = input_file
    else:
        target = order_service.get_latest_excel()
        if not target:
            logger.warning("未找到可处理的Excel文件")
            return False
        logger.info(f"处理最新的Excel文件: {target}")

    output = order_service.process_excel(target)
    if not output:
        logger.error("处理失败")
        return False

    logger.info(f"处理成功,输出文件: {output}")
    return True
|
||||
|
||||
def list_latest_excel(order_service: OrderService) -> bool:
    """Log the Excel file reported by ``order_service.get_latest_excel()``.

    Args:
        order_service: Service object providing ``get_latest_excel``.

    Returns:
        True when a file was reported, False otherwise.
    """
    newest = order_service.get_latest_excel()

    if not newest:
        logger.info("未找到Excel文件")
        return False

    logger.info(f"最新的Excel文件: {newest}")
    return True
|
||||
|
||||
def main(args: Optional[List[str]] = None) -> int:
    """Entry point of the Excel processing command line tool.

    Args:
        args: Command line arguments; when None, argparse falls back to
            ``sys.argv``.

    Returns:
        Process exit status: 0 when the selected command reported success,
        1 otherwise (no command given, command failed, or an exception
        was raised while running it).
    """
    parser = create_parser()
    options = parser.parse_args(args)

    # Without a sub-command there is nothing to do: show usage and fail.
    if options.command is None:
        parser.print_help()
        return 1

    try:
        if options.config:
            config = ConfigManager(options.config)
        else:
            config = ConfigManager()

        order_service = OrderService(config)

        command = options.command
        if command == 'process':
            succeeded = process_excel(order_service, options.input)
        elif command == 'list':
            succeeded = list_latest_excel(order_service)
        else:
            parser.print_help()
            return 1

        if succeeded:
            return 0
        return 1

    except Exception as e:
        # Top-level boundary: report the failure and turn it into an exit code.
        logger.error(f"执行过程中发生错误: {e}")
        return 1

    finally:
        # Runs on every path that entered the try block.
        close_logger(__name__)


if __name__ == '__main__':
    sys.exit(main())
|
||||
@@ -0,0 +1,147 @@
|
||||
"""
|
||||
订单合并命令行工具
|
||||
--------------
|
||||
提供订单合并相关的命令行接口。
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
from typing import List, Optional
|
||||
|
||||
from ..config.settings import ConfigManager
|
||||
from ..core.utils.log_utils import get_logger, close_logger
|
||||
from ..services.order_service import OrderService
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
def create_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the purchase-order merge tool.

    The parser accepts a global ``--config`` option and two sub-commands:
    ``merge`` (with an optional ``--input`` string) and ``list``.  The name
    of the selected sub-command is stored in the ``command`` attribute of
    the parsed namespace.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    root = argparse.ArgumentParser(description='订单合并工具')

    # Option shared by every sub-command.
    root.add_argument('--config', type=str, help='配置文件路径')

    commands = root.add_subparsers(dest='command', help='子命令')

    # "merge": --input is an optional comma-separated list of paths.
    merge_cmd = commands.add_parser('merge', help='合并采购单')
    merge_cmd.add_argument(
        '--input',
        type=str,
        help='输入采购单文件路径列表,以逗号分隔,如果不指定则合并所有采购单',
    )

    # "list": takes no options of its own.
    commands.add_parser('list', help='列出采购单文件')

    return root
|
||||
|
||||
def merge_orders(order_service: OrderService, input_files: Optional[str] = None) -> bool:
    """Merge purchase-order files through the order service.

    Args:
        order_service: Service object providing ``merge_orders`` and
            ``get_purchase_orders``.
        input_files: Comma-separated list of file paths.  Whitespace around
            each entry is stripped and empty entries (for example from a
            trailing comma) are ignored.  When falsy, the service is asked
            to merge the purchase orders it finds itself.

    Returns:
        True when the service returned a truthy result (logged as the
        output file), False when an input file is missing, no file is
        available, or the merge returned a falsy result.
    """
    if input_files:
        # Drop empty segments so "a.xls,b.xls," is not reported as a
        # missing file with an empty name.
        file_paths = [part.strip() for part in input_files.split(',') if part.strip()]
        if not file_paths:
            logger.error(f"未指定有效的输入文件: {input_files}")
            return False

        # Refuse to start the merge if any listed file is missing.
        for path in file_paths:
            if not os.path.exists(path):
                logger.error(f"输入文件不存在: {path}")
                return False

        result = order_service.merge_orders(file_paths)
    else:
        # The discovered list is only used for the emptiness check and the
        # log line; the service is then called without arguments.
        file_paths = order_service.get_purchase_orders()
        if not file_paths:
            logger.warning("未找到采购单文件")
            return False

        logger.info(f"合并 {len(file_paths)} 个采购单文件")
        result = order_service.merge_orders()

    if not result:
        logger.error("合并失败")
        return False

    logger.info(f"合并成功,输出文件: {result}")
    return True
|
||||
|
||||
def list_purchase_orders(order_service: OrderService) -> bool:
    """Log every file returned by ``order_service.get_purchase_orders()``.

    Args:
        order_service: Service object providing ``get_purchase_orders``.

    Returns:
        True when at least one file was returned, False otherwise.
    """
    files = order_service.get_purchase_orders()

    if files:
        # Header line with the count, then one log line per file.
        logger.info(f"采购单文件 ({len(files)}):")
        for entry in files:
            logger.info(f" {entry}")
        return True

    logger.info("未找到采购单文件")
    return False
|
||||
|
||||
def main(args: Optional[List[str]] = None) -> int:
    """Entry point of the purchase-order merge command line tool.

    Args:
        args: Command line arguments; when None, argparse falls back to
            ``sys.argv``.

    Returns:
        Process exit status: 0 when the selected command reported success,
        1 otherwise (no command given, command failed, or an exception
        was raised while running it).
    """
    parser = create_parser()
    options = parser.parse_args(args)

    # Without a sub-command there is nothing to do: show usage and fail.
    if options.command is None:
        parser.print_help()
        return 1

    try:
        if options.config:
            config = ConfigManager(options.config)
        else:
            config = ConfigManager()

        order_service = OrderService(config)

        command = options.command
        if command == 'merge':
            succeeded = merge_orders(order_service, options.input)
        elif command == 'list':
            succeeded = list_purchase_orders(order_service)
        else:
            parser.print_help()
            return 1

        if succeeded:
            return 0
        return 1

    except Exception as e:
        # Top-level boundary: report the failure and turn it into an exit code.
        logger.error(f"执行过程中发生错误: {e}")
        return 1

    finally:
        # Runs on every path that entered the try block.
        close_logger(__name__)


if __name__ == '__main__':
    sys.exit(main())
|
||||
@@ -0,0 +1,164 @@
|
||||
"""
|
||||
OCR命令行工具
|
||||
----------
|
||||
提供OCR识别相关的命令行接口。
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
from typing import List, Optional
|
||||
|
||||
from ..config.settings import ConfigManager
|
||||
from ..core.utils.log_utils import get_logger, close_logger
|
||||
from ..services.ocr_service import OCRService
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
def create_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the OCR command line tool.

    The parser accepts a global ``--config`` option and three sub-commands:
    ``process`` (requires ``--input``), ``batch`` (optional integer
    ``--batch-size`` and ``--max-workers``) and ``list``.  The name of the
    selected sub-command is stored in the ``command`` attribute of the
    parsed namespace.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    root = argparse.ArgumentParser(description='OCR识别工具')

    # Option shared by every sub-command.
    root.add_argument('--config', type=str, help='配置文件路径')

    commands = root.add_subparsers(dest='command', help='子命令')

    # "process": a single file, the input path is mandatory.
    single_cmd = commands.add_parser('process', help='处理单个文件')
    single_cmd.add_argument('--input', type=str, required=True, help='输入图片文件路径')

    # "batch": both tuning options are optional and default to None.
    batch_cmd = commands.add_parser('batch', help='批量处理文件')
    batch_cmd.add_argument('--batch-size', type=int, help='批处理大小')
    batch_cmd.add_argument('--max-workers', type=int, help='最大线程数')

    # "list": takes no options of its own.
    commands.add_parser('list', help='列出未处理的文件')

    return root
|
||||
|
||||
def process_file(ocr_service: OCRService, input_file: str) -> bool:
    """Run OCR processing on a single file.

    Args:
        ocr_service: Service object providing ``validate_image`` and
            ``process_image``.
        input_file: Path of the file to process.

    Returns:
        True when ``process_image`` returned a truthy result (logged as the
        output file); False when the file does not exist, fails
        ``validate_image``, or processing returned a falsy result.
    """
    # Guard 1: the path must exist on disk.
    if not os.path.exists(input_file):
        logger.error(f"输入文件不存在: {input_file}")
        return False

    # Guard 2: the service must accept the file.
    accepted = ocr_service.validate_image(input_file)
    if not accepted:
        logger.error(f"输入文件无效: {input_file}")
        return False

    output = ocr_service.process_image(input_file)
    if not output:
        logger.error("处理失败")
        return False

    logger.info(f"处理成功,输出文件: {output}")
    return True
|
||||
|
||||
def process_batch(ocr_service: OCRService, batch_size: Optional[int] = None, max_workers: Optional[int] = None) -> bool:
    """Run batch OCR processing through the service.

    Args:
        ocr_service: Service object providing ``process_images_batch``,
            which is expected to return a ``(total, success)`` pair.
        batch_size: Forwarded unchanged to the service (may be None).
        max_workers: Forwarded unchanged to the service (may be None).

    Returns:
        False when the reported total is 0; otherwise True if the reported
        success count is greater than zero.
    """
    total_count, success_count = ocr_service.process_images_batch(batch_size, max_workers)

    if total_count == 0:
        logger.warning("没有找到需要处理的文件")
        return False

    logger.info(f"批量处理完成,总计: {total_count},成功: {success_count}")

    # One successful file is enough for the batch to count as a success.
    any_succeeded = success_count > 0
    return any_succeeded
|
||||
|
||||
def list_unprocessed(ocr_service: OCRService) -> bool:
    """Log every file returned by ``ocr_service.get_unprocessed_images()``.

    Args:
        ocr_service: Service object providing ``get_unprocessed_images``.

    Returns:
        True when at least one file was returned, False otherwise.
    """
    pending = ocr_service.get_unprocessed_images()

    if pending:
        # Header line with the count, then one log line per file.
        logger.info(f"未处理的文件 ({len(pending)}):")
        for entry in pending:
            logger.info(f" {entry}")
        return True

    logger.info("没有未处理的文件")
    return False
|
||||
|
||||
def main(args: Optional[List[str]] = None) -> int:
    """Entry point of the OCR command line tool.

    Args:
        args: Command line arguments; when None, argparse falls back to
            ``sys.argv``.

    Returns:
        Process exit status: 0 when the selected command reported success,
        1 otherwise (no command given, command failed, or an exception
        was raised while running it).
    """
    parser = create_parser()
    options = parser.parse_args(args)

    # Without a sub-command there is nothing to do: show usage and fail.
    if options.command is None:
        parser.print_help()
        return 1

    try:
        if options.config:
            config = ConfigManager(options.config)
        else:
            config = ConfigManager()

        ocr_service = OCRService(config)

        # Each handler is a thunk so that sub-command specific attributes
        # (e.g. ``options.input``) are only read for the chosen command.
        handlers = {
            'process': lambda: process_file(ocr_service, options.input),
            'batch': lambda: process_batch(
                ocr_service, options.batch_size, options.max_workers
            ),
            'list': lambda: list_unprocessed(ocr_service),
        }

        handler = handlers.get(options.command)
        if handler is None:
            parser.print_help()
            return 1

        return 0 if handler() else 1

    except Exception as e:
        # Top-level boundary: report the failure and turn it into an exit code.
        logger.error(f"执行过程中发生错误: {e}")
        return 1

    finally:
        # Runs on every path that entered the try block.
        close_logger(__name__)


if __name__ == '__main__':
    sys.exit(main())
|
||||
@@ -0,0 +1,5 @@
|
||||
"""
|
||||
OCR订单处理系统 - 配置模块
|
||||
------------------------
|
||||
负责管理系统配置,包括API密钥、路径和处理选项。
|
||||
"""
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,37 @@
|
||||
"""
|
||||
默认配置
|
||||
-------
|
||||
包含系统的默认配置值。
|
||||
"""
|
||||
|
||||
# Default configuration, organised as section name -> {option -> value}.
# Every value is stored as a string.
DEFAULT_CONFIG = {
    # Settings for the OCR web API.
    'API': {
        'api_key': '',  # will be read from the configuration file
        'secret_key': '',  # will be read from the configuration file
        'timeout': '30',
        'max_retries': '3',
        'retry_delay': '2',
        'api_url': 'https://aip.baidubce.com/rest/2.0/ocr/v1/table',
    },
    # Folder and record-file locations (relative paths).
    'Paths': {
        'input_folder': 'data/input',
        'output_folder': 'data/output',
        'temp_folder': 'data/temp',
        'template_folder': 'templates',
        'processed_record': 'data/processed_files.json',
    },
    # Batch-processing tuning.
    'Performance': {
        'max_workers': '4',
        'batch_size': '5',
        'skip_existing': 'true',
    },
    # Accepted file types and size limit.
    'File': {
        'allowed_extensions': '.jpg,.jpeg,.png,.bmp',
        'excel_extension': '.xlsx',
        'max_file_size_mb': '4',
    },
    # Template file names.
    'Templates': {
        'purchase_order': '银豹-采购单模板.xls',
    },
}
|
||||
@@ -0,0 +1,128 @@
|
||||
"""
|
||||
配置管理模块
|
||||
-----------
|
||||
提供统一的配置加载、访问和保存功能。
|
||||
"""
|
||||
|
||||
import os
|
||||
import configparser
|
||||
import logging
|
||||
from typing import Dict, List, Optional, Any
|
||||
|
||||
from .defaults import DEFAULT_CONFIG
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class ConfigManager:
    """Load, query and persist the application's INI configuration.

    The class is a process-wide singleton: the first instantiation creates
    and initialises the instance, every later call returns that same
    object.  The ``config_file`` argument is therefore only honoured by the
    first call; a later call with a different path logs a warning.
    """

    _instance = None

    def __new__(cls, config_file=None):
        """Return the shared instance, creating it on first use."""
        if cls._instance is None:
            instance = super(ConfigManager, cls).__new__(cls)
            instance._init(config_file)
            # Publish only after successful initialisation so a failed
            # start-up does not leave a half-built singleton behind.
            cls._instance = instance
        elif config_file and config_file != cls._instance.config_file:
            logger.warning(
                f"配置管理器已初始化,忽略新的配置文件路径: {config_file} "
                f"(当前: {cls._instance.config_file})"
            )
        return cls._instance

    def _init(self, config_file):
        """Set up the parser and load the configuration file.

        Args:
            config_file: Path of the INI file; ``'config.ini'`` when falsy.
        """
        self.config_file = config_file or 'config.ini'
        self.config = configparser.ConfigParser()
        self.load_config()

    def load_config(self) -> None:
        """Load the configuration file, creating it with defaults if absent.

        If the file cannot be read or parsed, the defaults are applied in
        memory only (nothing is written to disk).
        """
        if not os.path.exists(self.config_file):
            self.create_default_config()

        try:
            loaded = self.config.read(self.config_file, encoding='utf-8')
        except Exception as e:
            logger.error(f"加载配置文件时出错: {e}")
            logger.info("使用默认配置")
            self.create_default_config(save=False)
            return

        if loaded:
            logger.info(f"已加载配置文件: {self.config_file}")
        else:
            # ConfigParser.read() silently skips files it cannot open and
            # returns the list of files actually parsed; an empty list
            # means nothing was loaded.
            logger.warning(f"无法读取配置文件,使用默认配置: {self.config_file}")
            self.create_default_config(save=False)

    def create_default_config(self, save: bool = True) -> None:
        """Populate the parser with ``DEFAULT_CONFIG``.

        Existing options with the same name are overwritten.

        Args:
            save: When True, also write the configuration to disk.
        """
        for section, options in DEFAULT_CONFIG.items():
            if not self.config.has_section(section):
                self.config.add_section(section)

            for option, value in options.items():
                self.config.set(section, option, value)

        if save:
            self.save_config()
            logger.info(f"已创建默认配置文件: {self.config_file}")

    def save_config(self) -> None:
        """Write the current configuration to ``self.config_file``.

        The parent directory is created when missing.  Failures are logged
        and not raised.
        """
        try:
            parent = os.path.dirname(self.config_file)
            if parent:
                os.makedirs(parent, exist_ok=True)

            with open(self.config_file, 'w', encoding='utf-8') as f:
                self.config.write(f)
            logger.info(f"配置已保存到: {self.config_file}")
        except Exception as e:
            logger.error(f"保存配置文件时出错: {e}")

    def get(self, section: str, option: str, fallback: Any = None) -> Any:
        """Return an option as a string, or ``fallback`` when it is missing."""
        return self.config.get(section, option, fallback=fallback)

    def getint(self, section: str, option: str, fallback: int = 0) -> int:
        """Return an option converted to int, or ``fallback`` when missing."""
        return self.config.getint(section, option, fallback=fallback)

    def getfloat(self, section: str, option: str, fallback: float = 0.0) -> float:
        """Return an option converted to float, or ``fallback`` when missing."""
        return self.config.getfloat(section, option, fallback=fallback)

    def getboolean(self, section: str, option: str, fallback: bool = False) -> bool:
        """Return an option converted to bool, or ``fallback`` when missing."""
        return self.config.getboolean(section, option, fallback=fallback)

    def get_list(self, section: str, option: str, fallback: str = "", delimiter: str = ",") -> List[str]:
        """Return a delimited option as a list of stripped, non-empty items.

        Args:
            section: Section name.
            option: Option name.
            fallback: Raw string used when the option is missing.
            delimiter: Separator between items.

        Returns:
            The items in order; an empty list when the value is empty or
            None.
        """
        value = self.get(section, option, fallback)
        if not value:
            return []
        return [item.strip() for item in value.split(delimiter) if item.strip()]

    def update(self, section: str, option: str, value: Any) -> None:
        """Set an option in memory, creating the section when needed.

        The value is stored as ``str(value)``; nothing is written to disk.
        """
        if not self.config.has_section(section):
            self.config.add_section(section)

        self.config.set(section, option, str(value))
        logger.debug(f"更新配置: [{section}] {option} = {value}")

    def get_path(self, section: str, option: str, fallback: str = "", create: bool = False) -> str:
        """Return a path option as an absolute path.

        Relative paths are resolved against the current working directory.

        Args:
            section: Section name.
            option: Option name.
            fallback: Path used when the option is missing.
            create: When True and the path does not exist, create the
                directory.  A path whose last component contains a dot is
                treated as a file path and only its parent directory is
                created.  Creation failures are logged, not raised.

        Returns:
            The absolute path.
        """
        path = self.get(section, option, fallback)

        if not os.path.isabs(path):
            path = os.path.abspath(path)

        if create and not os.path.exists(path):
            try:
                if '.' in os.path.basename(path):
                    # Looks like a file: make sure its folder exists.
                    directory = os.path.dirname(path)
                    if directory and not os.path.exists(directory):
                        os.makedirs(directory, exist_ok=True)
                        logger.info(f"已创建目录: {directory}")
                else:
                    os.makedirs(path, exist_ok=True)
                    logger.info(f"已创建目录: {path}")
            except Exception as e:
                logger.error(f"创建目录失败: {path}, 错误: {e}")

        return path
|
||||
@@ -0,0 +1,5 @@
|
||||
"""
|
||||
OCR订单处理系统 - Excel处理模块
|
||||
----------------------------
|
||||
提供Excel文件处理、数据提取和转换功能。
|
||||
"""
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,213 @@
|
||||
"""
|
||||
单位转换处理模块
|
||||
-------------
|
||||
提供规格和单位的处理和转换功能。
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Dict, List, Optional, Tuple, Any
|
||||
|
||||
from ..utils.log_utils import get_logger
|
||||
from ..utils.string_utils import (
|
||||
clean_string,
|
||||
extract_number,
|
||||
extract_unit,
|
||||
extract_number_and_unit,
|
||||
parse_specification
|
||||
)
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
class UnitConverter:
    """Handle product specifications and quantity/unit conversion."""

    # Ordered (pattern, template) rules used to infer a specification from a
    # product name.  The first matching rule wins; the template is filled
    # with the pattern's captured groups.
    _NAME_SPEC_RULES = (
        # "<n>入纸箱"
        (re.compile(r'(\d+)入纸箱'), "1*{0}"),
        # "<a>*<b>" followed by an optional dash/space and "纸箱"
        (re.compile(r'(\d+)[*×xX](\d+)[-\s]?纸箱'), "{0}*{1}"),
        # "<volume>L" with no multiplication sign later in the name
        (re.compile(r'([\d\.]+)[Ll升](?!.*[*×xX])'), "{0}L*1"),
        # "<n>入...箱" where the number is preceded by a non-digit
        (re.compile(r'\D(\d+)入\w*箱'), "1*{0}"),
        # "<n>...箱" where the number is preceded by a non-digit
        (re.compile(r'\D(\d+)\w*箱'), "1*{0}"),
        # bare "<a>*<b>"
        (re.compile(r'(\d+)[*×xX](\d+)'), "{0}*{1}"),
    )

    def __init__(self):
        """Initialise the barcode overrides and the unit lists."""
        # Barcodes with a fixed multiplier and target unit that bypass the
        # specification-based conversion.
        self.special_barcodes = {
            '6925019900087': {
                'multiplier': 10,
                'target_unit': '瓶',
                'description': '特殊处理:数量*10,单位转换为瓶'
            }
            # More barcode overrides can be added here.
        }

        # Recognised units.
        self.valid_units = ['件', '箱', '包', '提', '盒', '瓶', '个', '支', '袋', '副', '桶', '罐', 'L', 'l', '升']

        # Units that trigger a conversion in process_unit_conversion.
        self.special_units = ['件', '箱', '提', '盒']

        logger.info("单位转换器初始化完成")

    def add_special_barcode(self, barcode: str, multiplier: int, target_unit: str, description: str = "") -> None:
        """Register (or replace) a barcode-specific conversion rule.

        Args:
            barcode: Barcode the rule applies to.
            multiplier: Factor the quantity is multiplied by (the price is
                divided by it).
            target_unit: Unit assigned after conversion.
            description: Free-text description; a default one is generated
                when empty.
        """
        effective_description = description or f'特殊处理:数量*{multiplier},单位转换为{target_unit}'
        self.special_barcodes[barcode] = {
            'multiplier': multiplier,
            'target_unit': target_unit,
            'description': effective_description
        }
        # Log the description actually stored, not the possibly empty input.
        logger.info(f"添加特殊条码配置: {barcode}, {effective_description}")

    def infer_specification_from_name(self, product_name: str) -> Optional[str]:
        """Infer a specification string from a product name.

        The cleaned name is tested against ``_NAME_SPEC_RULES`` in order.

        Args:
            product_name: Product name.

        Returns:
            The inferred specification (e.g. ``"1*15"`` or ``"12.9L*1"``),
            or None when the name is empty, not a string, matches no rule,
            or an error occurs.
        """
        if not product_name or not isinstance(product_name, str):
            return None

        try:
            name = clean_string(product_name)

            for pattern, template in self._NAME_SPEC_RULES:
                match = pattern.search(name)
                if match:
                    return template.format(*match.groups())

            logger.debug(f"无法从商品名称推断规格: {name}")
            return None

        except Exception as e:
            logger.error(f"从商品名称推断规格时出错: {e}")
            return None

    def extract_unit_from_quantity(self, quantity_str: str) -> Tuple[Optional[float], Optional[str]]:
        """Split a quantity string into its number and unit.

        Args:
            quantity_str: Quantity string.

        Returns:
            The result of ``extract_number_and_unit`` on the cleaned string,
            or ``(None, None)`` when the input is empty, not a string, or
            an error occurs.
        """
        if not quantity_str or not isinstance(quantity_str, str):
            return None, None

        try:
            quantity_str = clean_string(quantity_str)
            return extract_number_and_unit(quantity_str)

        except Exception as e:
            logger.error(f"从数量字符串提取单位时出错: {quantity_str}, 错误: {e}")
            return None, None

    @staticmethod
    def _apply_conversion(result: Dict[str, Any], quantity: Any, price: Any, factor: Any, target_unit: str) -> Dict[str, Any]:
        """Scale quantity up and price down by ``factor`` and set the unit.

        Both new values are computed before ``result`` is touched, so a
        failing computation leaves ``result`` unmodified.
        """
        new_quantity = quantity * factor
        # A price of 0 or None means "no price": leave it as it is.
        has_price = price is not None and price != 0
        new_price = price / factor if has_price else None

        result['quantity'] = new_quantity
        result['unit'] = target_unit
        if has_price:
            result['price'] = new_price
        return result

    def process_unit_conversion(self, product: Dict[str, Any]) -> Dict[str, Any]:
        """Convert a product's quantity, price and unit.

        Rules, applied in order:

        1. No barcode or a quantity of 0: returned unchanged.
        2. Barcode listed in ``special_barcodes``: its multiplier and
           target unit are applied.
        3. Unit in ``special_units`` and a truthy package quantity from
           ``parse_specification``: quantity is multiplied and price
           divided by the package quantity and the unit becomes ``'瓶'``.
           Exception: units ``'提'`` and ``'盒'`` are left unchanged unless
           the specification has three levels (``a*b*c``).
        4. Otherwise: returned unchanged.

        Args:
            product: Product dict read via the keys ``barcode``, ``unit``,
                ``specification``, ``quantity`` and ``price``.

        Returns:
            A shallow copy of ``product`` with the conversion applied.  If
            an error occurs, an unmodified copy of the original is returned.
        """
        # Work on a copy so the caller's dict is never modified.
        result = product.copy()

        try:
            barcode = product.get('barcode', '')
            unit = product.get('unit', '')
            specification = product.get('specification', '')
            quantity = product.get('quantity', 0)
            price = product.get('price', 0)

            # Nothing to convert without a barcode or a quantity.
            if not barcode or quantity == 0:
                return result

            # 1. Barcode-specific override.
            if barcode in self.special_barcodes:
                special_config = self.special_barcodes[barcode]
                logger.info(f"应用特殊条码配置: {barcode}, {special_config['description']}")
                return self._apply_conversion(
                    result,
                    quantity,
                    price,
                    special_config['multiplier'],
                    special_config['target_unit'],
                )

            # 2. Package quantity from the specification.
            package_quantity = parse_specification(specification) if specification else None

            # 3. Specification-based conversion.
            if unit and unit in self.special_units and package_quantity:
                is_three_level = bool(re.search(r'\d+[\*xX×]\d+[\*xX×]\d+', str(specification)))

                if unit in ['提', '盒'] and not is_three_level:
                    # Two-level specification: keep quantity and unit.
                    logger.info(f"二级规格的提/盒单位,保持原状: {unit}, 规格={specification}")
                    return result

                logger.info(f"标准单位转换: {unit}->瓶, 规格={specification}, 包装数量={package_quantity}")
                return self._apply_conversion(result, quantity, price, package_quantity, '瓶')

            # 4. No rule applied.
            return result

        except Exception as e:
            logger.error(f"单位转换处理出错: {e}")
            # Return pristine data rather than a half-converted product.
            return product.copy()
|
||||
@@ -0,0 +1,375 @@
|
||||
"""
|
||||
订单合并模块
|
||||
----------
|
||||
提供采购单合并功能,将多个采购单合并为一个。
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import xlrd
|
||||
import xlwt
|
||||
from xlutils.copy import copy as xlcopy
|
||||
from typing import Dict, List, Optional, Tuple, Union, Any
|
||||
from datetime import datetime
|
||||
|
||||
from ...config.settings import ConfigManager
|
||||
from ..utils.log_utils import get_logger
|
||||
from ..utils.file_utils import (
|
||||
ensure_dir,
|
||||
get_file_extension,
|
||||
get_files_by_extensions,
|
||||
load_json,
|
||||
save_json
|
||||
)
|
||||
from ..utils.string_utils import (
|
||||
clean_string,
|
||||
clean_barcode,
|
||||
format_barcode
|
||||
)
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
class PurchaseOrderMerger:
|
||||
"""
|
||||
采购单合并器:将多个采购单Excel文件合并成一个文件
|
||||
"""
|
||||
|
||||
def __init__(self, config: Optional[ConfigManager] = None):
    """Set up the merger's output directory, template path and cache.

    Args:
        config: Configuration manager; a ``ConfigManager()`` is created
            when this is falsy.

    Raises:
        FileNotFoundError: If the template file does not exist.
    """
    logger.info("初始化PurchaseOrderMerger")
    self.config = config or ConfigManager()

    # Output directory (created on demand by get_path).
    self.output_dir = self.config.get_path(
        'Paths', 'output_folder', 'data/output', create=True
    )

    # Template location: <template_folder>/<template file name>.
    self.template_path = os.path.join(
        self.config.get('Paths', 'template_folder', 'templates'),
        self.config.get('Templates', 'purchase_order', '银豹-采购单模板.xls'),
    )

    # The merger cannot work without its template: fail early.
    if not os.path.exists(self.template_path):
        message = f"模板文件不存在: {self.template_path}"
        logger.error(message)
        raise FileNotFoundError(message)

    # JSON file in the output directory holding the merged-file record.
    self.cache_file = os.path.join(self.output_dir, "merged_files.json")
    self.merged_files = self._load_merged_files()

    logger.info(f"初始化完成,模板文件: {self.template_path}")
|
||||
|
||||
def _load_merged_files(self) -> Dict[str, str]:
    """Load the merged-file record from ``self.cache_file``.

    Returns:
        The value returned by ``load_json``, called with an empty dict as
        its second argument.
    """
    merged_record = load_json(self.cache_file, {})
    return merged_record
|
||||
|
||||
def _save_merged_files(self) -> None:
    """Write ``self.merged_files`` to ``self.cache_file`` via ``save_json``."""
    record, destination = self.merged_files, self.cache_file
    save_json(record, destination)
|
||||
|
||||
def get_purchase_orders(self) -> List[str]:
    """Collect the purchase-order Excel files in the output directory.

    A file qualifies when it has an ``.xls`` / ``.xlsx`` extension and its
    base name starts with ``采购单_``.

    Returns:
        The matching paths sorted by modification time, newest first; an
        empty list when there is none.
    """
    logger.info(f"搜索目录 {self.output_dir} 中的采购单Excel文件")

    excel_files = get_files_by_extensions(self.output_dir, ['.xls', '.xlsx'])

    # Keep only files following the purchase-order naming convention.
    orders = []
    for path in excel_files:
        if os.path.basename(path).startswith('采购单_'):
            orders.append(path)

    if not orders:
        logger.warning(f"未在 {self.output_dir} 目录下找到采购单Excel文件")
        return []

    # Newest first.
    orders.sort(key=os.path.getmtime, reverse=True)

    logger.info(f"找到 {len(orders)} 个采购单Excel文件")
    return orders
|
||||
|
||||
def read_purchase_order(self, file_path: str) -> Optional[pd.DataFrame]:
    """Read a purchase-order Excel file and normalise its column names.

    Steps:

    1. Read the file with ``pd.read_excel``.
    2. If the data row at position 3 contains at least three known header
       keywords, use it as the header and keep only the rows after it.
    3. Rename recognised header aliases to the canonical column names
       ``条码``, ``采购量``, ``采购单价`` and ``赠送量``.  Whitespace and
       parenthesised suffixes such as ``(必填)`` are ignored when comparing
       headers.

    Args:
        file_path: Path of the purchase-order file.

    Returns:
        The data frame, or None when reading fails or when some columns
        were recognised but no barcode column is among them.
    """
    # Canonical column name -> accepted header spellings.
    column_aliases = {
        '条码': ['条码', '条形码', '商品条码', 'barcode', '商品条形码', '商品编码', '商品编号', '条码(必填)'],
        '采购量': ['数量', '采购数量', '购买数量', '订单数量', '采购量(必填)'],
        '采购单价': ['单价', '价格', '采购单价', '销售价', '采购单价(必填)'],
        '赠送量': ['赠送量', '赠品数量', '赠送数量', '赠品']
    }
    header_keywords = ['行号', '条形码', '条码', '商品名称', '规格', '单价', '数量', '金额', '单位']

    def normalize(label: Any) -> str:
        """Strip whitespace and parenthesised parts (ASCII or full-width)."""
        text = re.sub(r'\s+', '', str(label))
        return re.sub(r'[(\uFF08].*?[)\uFF09]', '', text)

    try:
        df = pd.read_excel(file_path)
        logger.info(f"成功读取采购单文件: {file_path}")
        logger.debug(f"Excel文件的列名: {df.columns.tolist()}")

        # Detect a header that sits inside the data (at row position 3).
        if len(df) > 3:
            header_row = df.iloc[3]
            cells = [str(value) for value in header_row.astype(str).values]
            matches = sum(
                1 for keyword in header_keywords
                if any(keyword in cell for cell in cells)
            )

            if matches >= 3:
                logger.info("检测到特殊表头结构,使用第3行作为列名")
                # Blank header cells get a positional placeholder name.
                new_columns = [
                    f"Col_{i}" if str(col) in ('nan', 'None') or pd.isna(col) else str(col)
                    for i, col in enumerate(header_row)
                ]
                df = df.iloc[4:].reset_index(drop=True)
                df.columns = new_columns
                logger.debug(f"重新构建的数据帧列名: {df.columns.tolist()}")

        # For each canonical name, pick the first column matching any alias.
        mapped_columns = {}
        for target_col, possible_names in column_aliases.items():
            wanted = {normalize(name) for name in possible_names}
            for col in df.columns:
                if normalize(col) in wanted:
                    mapped_columns[target_col] = col
                    break

        if mapped_columns:
            # Without a barcode column the file cannot be processed.
            if '条码' not in mapped_columns:
                logger.error(f"未找到条码列: {file_path}")
                return None

            # DataFrame.rename expects {old_name: new_name}.  Skip columns
            # already named correctly, and never create a duplicate of a
            # canonical column that already exists.
            rename_map = {
                actual: target
                for target, actual in mapped_columns.items()
                if actual != target and target not in df.columns
            }
            df = df.rename(columns=rename_map)
            logger.info(f"列名映射结果: {mapped_columns}")

        return df

    except Exception as e:
        logger.error(f"读取采购单文件失败: {file_path}, 错误: {str(e)}")
        return None
|
||||
|
||||
def merge_purchase_orders(self, file_paths: List[str]) -> Optional[pd.DataFrame]:
    """Merge several purchase-order files into one data frame.

    Each file is read with ``read_purchase_order``, reduced to the columns
    ``条码``, ``采购量``, ``采购单价`` and ``赠送量``, and cleaned.  Rows with
    the same barcode and the same unit price (rounded to 4 decimals) are
    combined by summing their quantities.

    Args:
        file_paths: Paths of the purchase-order files.

    Returns:
        A frame with the columns ``条码``, ``采购单价``, ``采购量``, ``赠送量``
        and ``采购金额``, sorted by barcode; None when there is no input,
        no file could be read, or no file has the required columns.
    """
    if not file_paths:
        logger.warning("没有需要合并的采购单文件")
        return None

    # Read every file; unreadable ones are skipped.
    dfs = []
    for file_path in file_paths:
        df = self.read_purchase_order(file_path)
        if df is not None:
            dfs.append(df)

    if not dfs:
        logger.warning("没有成功读取的采购单文件")
        return None

    logger.info(f"开始合并 {len(dfs)} 个采购单文件")

    required_columns = ['条码', '采购量', '采购单价']
    numeric_columns = ['采购量', '采购单价', '赠送量']

    # Bring every frame to the same four-column structure.
    processed_dfs = []
    for i, df in enumerate(dfs):
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            logger.warning(f"数据帧 {i} 缺少必要的列: {missing_columns}")
            continue

        # The gift-quantity column is optional; add it on a new frame so
        # the frame returned by read_purchase_order is left untouched.
        if '赠送量' not in df.columns:
            df = df.assign(**{'赠送量': pd.NA})

        selected_df = df[['条码', '采购量', '采购单价', '赠送量']].copy()

        selected_df['条码'] = selected_df['条码'].apply(
            lambda x: format_barcode(x) if pd.notna(x) else x
        )
        for col in numeric_columns:
            selected_df[col] = pd.to_numeric(selected_df[col], errors='coerce')

        # Rows without a barcode or a quantity are not order lines.
        valid_df = selected_df.dropna(subset=['条码', '采购量'])

        # Rows without a unit price cannot be grouped (groupby discards
        # NaN keys); drop them explicitly and say so instead of losing
        # them silently.
        unpriced = int(valid_df['采购单价'].isna().sum())
        if unpriced:
            logger.warning(f"数据帧 {i} 中有 {unpriced} 行缺少采购单价,已跳过")
            valid_df = valid_df.dropna(subset=['采购单价'])

        processed_dfs.append(valid_df)

    if not processed_dfs:
        logger.warning("没有有效的数据帧用于合并")
        return None

    merged_df = pd.concat(processed_dfs, ignore_index=True)

    # Round the price so float noise does not split identical prices into
    # separate groups.
    merged_df['采购单价'] = merged_df['采购单价'].round(4)

    def sum_gifts(values):
        """Sum the present gift quantities; pd.NA when none is present."""
        present = values.dropna()
        return sum(present) if len(present) > 0 else pd.NA

    grouped = merged_df.groupby(['条码', '采购单价'], as_index=False).agg({
        '采购量': 'sum',
        '赠送量': sum_gifts
    })

    grouped['采购金额'] = grouped['采购量'] * grouped['采购单价']

    result = grouped.sort_values('条码').reset_index(drop=True)

    logger.info(f"合并完成,共 {len(result)} 条商品记录")
    return result
|
||||
|
||||
def create_merged_purchase_order(self, df: pd.DataFrame) -> Optional[str]:
    """Write the merged data into a copy of the template workbook.

    One sheet row is written per data-frame row, starting at row index 4
    of the first sheet.  The result is saved in the output directory as
    ``合并采购单_<timestamp>.xls``.

    Args:
        df: Merged data frame with the columns ``条码``, ``采购单价``,
            ``采购量``, ``采购金额`` and ``赠送量``.

    Returns:
        Path of the saved file, or None when any step fails (the error is
        logged).
    """
    try:
        template_workbook = xlrd.open_workbook(self.template_path, formatting_info=True)
        # Result unused; the lookup is kept from the original code.
        template_workbook.sheet_by_index(0)

        # xlrd workbooks are read-only: write into an xlutils copy.
        output_workbook = xlcopy(template_workbook)
        sheet = output_workbook.get_sheet(0)

        first_data_row = 4  # 0-based index, i.e. the fifth sheet row

        for offset, (_, record) in enumerate(df.iterrows()):
            gift_quantity = record['赠送量']

            # Cell values in column order 0..9.
            cells = (
                offset + 1,            # running number
                record['条码'],         # product code (barcode)
                "",                    # product name: not available after merging
                "",                    # specification: not available after merging
                "",                    # unit: not available after merging
                record['采购单价'],      # unit price
                record['采购量'],        # purchase quantity
                record['采购金额'],      # purchase amount
                0,                     # tax rate
                gift_quantity if pd.notna(gift_quantity) else "",  # gift quantity
            )

            sheet_row = first_data_row + offset
            for column, value in enumerate(cells):
                sheet.write(sheet_row, column, value)

        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
        output_file = os.path.join(self.output_dir, f"合并采购单_{timestamp}.xls")

        output_workbook.save(output_file)
        logger.info(f"合并采购单已保存到: {output_file}")
        return output_file

    except Exception as e:
        logger.error(f"创建合并采购单时出错: {e}")
        return None
|
||||
|
||||
def process(self, file_paths: Optional[List[str]] = None) -> Optional[str]:
    """
    Merge purchase orders into one file and record which inputs went into it.

    Args:
        file_paths: Purchase-order files to merge. When None, the list is
            taken from ``self.get_purchase_orders()``.

    Returns:
        Path of the merged file, or None when there is nothing to merge or
        either the merge or the file creation fails.
    """
    paths = self.get_purchase_orders() if file_paths is None else file_paths

    # Nothing to do without input files.
    if not paths:
        logger.warning("没有找到可合并的采购单文件")
        return None

    merged_df = self.merge_purchase_orders(paths)
    if merged_df is None:
        logger.error("合并采购单失败")
        return None

    output_file = self.create_merged_purchase_order(merged_df)
    if output_file is None:
        logger.error("创建合并采购单文件失败")
        return None

    # Remember, for every input, the merged file it ended up in.
    self.merged_files.update({path: output_file for path in paths})
    self._save_merged_files()

    return output_file
|
||||
@@ -0,0 +1,393 @@
|
||||
"""
|
||||
Excel处理核心模块
|
||||
--------------
|
||||
提供Excel文件处理功能,包括表格解析、数据提取和处理。
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import xlrd
|
||||
import xlwt
|
||||
from xlutils.copy import copy as xlcopy
|
||||
from typing import Dict, List, Optional, Tuple, Union, Any
|
||||
from datetime import datetime
|
||||
|
||||
from ...config.settings import ConfigManager
|
||||
from ..utils.log_utils import get_logger
|
||||
from ..utils.file_utils import (
|
||||
ensure_dir,
|
||||
get_file_extension,
|
||||
get_latest_file,
|
||||
load_json,
|
||||
save_json
|
||||
)
|
||||
from ..utils.string_utils import (
|
||||
clean_string,
|
||||
clean_barcode,
|
||||
extract_number,
|
||||
format_barcode
|
||||
)
|
||||
from .converter import UnitConverter
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
class ExcelProcessor:
|
||||
"""
|
||||
Excel处理器:处理OCR识别后的Excel文件,
|
||||
提取条码、单价和数量,并按照采购单模板的格式填充
|
||||
"""
|
||||
|
||||
def __init__(self, config: Optional[ConfigManager] = None):
    """
    Resolve directories and the template path, load the processed-file
    cache and create the unit converter.

    Args:
        config: Configuration manager; a new ``ConfigManager`` is created
            when none is supplied.

    Raises:
        FileNotFoundError: If the resolved template file does not exist.
    """
    logger.info("初始化ExcelProcessor")
    self.config = config or ConfigManager()
    settings = self.config

    # Working directories (requested with create=True).
    self.output_dir = settings.get_path('Paths', 'output_folder', 'data/output', create=True)
    self.temp_dir = settings.get_path('Paths', 'temp_folder', 'data/temp', create=True)

    # Template location = configured folder + configured file name.
    self.template_path = os.path.join(
        settings.get('Paths', 'template_folder', 'templates'),
        settings.get('Templates', 'purchase_order', '银豹-采购单模板.xls'),
    )

    # Fail early: nothing can be produced without the template.
    if not os.path.exists(self.template_path):
        logger.error(f"模板文件不存在: {self.template_path}")
        raise FileNotFoundError(f"模板文件不存在: {self.template_path}")

    # Cache mapping each processed input file to its output file.
    self.cache_file = os.path.join(self.output_dir, "processed_files.json")
    self.processed_files = self._load_processed_files()

    self.unit_converter = UnitConverter()

    logger.info(f"初始化完成,模板文件: {self.template_path}")
|
||||
|
||||
def _load_processed_files(self) -> Dict[str, str]:
    """
    Read the processed-file cache stored at ``self.cache_file``.

    Returns:
        Whatever ``load_json`` yields for the cache file; an empty dict is
        passed to it as the fallback value.
    """
    fallback: Dict[str, str] = {}
    return load_json(self.cache_file, fallback)
|
||||
|
||||
def _save_processed_files(self) -> None:
    """Write ``self.processed_files`` to ``self.cache_file`` via ``save_json``."""
    records = self.processed_files
    save_json(records, self.cache_file)
|
||||
|
||||
def get_latest_excel(self) -> Optional[str]:
    """
    Return the newest Excel file in the output directory, unless that file
    is itself a generated purchase order.

    NOTE(review): when the newest file's name starts with '采购单_' this
    returns None; it does not fall back to an older, non-purchase-order
    file. Confirm that this is the intended behaviour.

    Returns:
        Path of the newest ``.xlsx``/``.xls`` file, or None when no file is
        found or the newest one is a purchase order.
    """
    logger.info(f"搜索目录 {self.output_dir} 中的Excel文件")

    # Any file name is accepted; only the extension is restricted.
    candidate = get_latest_file(
        self.output_dir,
        pattern="",
        extensions=['.xlsx', '.xls'],
    )

    if not candidate:
        logger.warning(f"未在 {self.output_dir} 目录下找到未处理的Excel文件")
        return None

    # Generated purchase orders are outputs, not inputs.
    if os.path.basename(candidate).startswith('采购单_'):
        logger.warning(f"找到的最新文件是采购单,不作处理: {candidate}")
        return None

    logger.info(f"找到最新的Excel文件: {candidate}")
    return candidate
|
||||
|
||||
def validate_barcode(self, barcode: Any) -> bool:
    """
    Decide whether a cell value looks like a usable barcode.

    The value is rejected when it is one of the warehouse labels
    ("仓库", "仓库全名"), when its cleaned form is shorter than 8 or longer
    than 13 characters, or when the cleaned form contains non-digits.

    NOTE(review): the 5->6 prefix correction below only changes the local
    copy used for the checks and the log line; the corrected value is not
    returned to the caller.

    Args:
        barcode: Raw barcode value.

    Returns:
        True if the barcode passes all checks, otherwise False.
    """
    # Warehouse header cells are sometimes picked up in the barcode column.
    if isinstance(barcode, str) and barcode.strip() in ("仓库", "仓库全名"):
        logger.warning(f"条码为仓库标识: {barcode}")
        return False

    code = clean_barcode(barcode)

    # Long codes starting with '5' (but not '53') are treated as misreads of '6'.
    misread_prefix = len(code) > 8 and code.startswith('5') and not code.startswith('53')
    if misread_prefix:
        code = '6' + code[1:]
        logger.info(f"修正条码前缀 5->6: {barcode} -> {code}")

    if not 8 <= len(code) <= 13:
        logger.warning(f"条码长度异常: {code}, 长度={len(code)}")
        return False

    if not code.isdigit():
        logger.warning(f"条码包含非数字字符: {code}")
        return False

    # One known special barcode is logged at info level; both paths accept.
    if code == "5321545613":
        logger.info(f"特殊条码验证通过: {code}")
    else:
        logger.debug(f"条码验证通过: {code}")
    return True
|
||||
|
||||
def extract_barcode(self, df: pd.DataFrame) -> List[str]:
    """
    Pick out the columns of ``df`` whose header is a known barcode name.

    Each header is converted to ``str`` and stripped before comparison;
    the original, unmodified column labels are returned in frame order.

    Args:
        df: Data frame to inspect.

    Returns:
        Labels of the columns that may hold barcodes (possibly empty).
    """
    known_headers = {
        '条码', '条形码', '商品条码', '商品条形码',
        '商品编码', '商品编号', '条码(必填)',
        'barcode', 'Barcode', '编码',
    }
    return [label for label in df.columns if str(label).strip() in known_headers]
|
||||
|
||||
def extract_product_info(self, df: pd.DataFrame) -> List[Dict]:
    """
    Build one product dict per data row that carries a valid barcode.

    Columns are located by header: the first barcode column reported by
    ``extract_barcode`` is used, and for each of name / specification /
    quantity / unit / price the first frame column whose stripped header
    matches a known alias is used. Rows with a missing or invalid barcode
    are skipped. Missing (NaN) cells are treated as empty so that the
    fallbacks below (placeholder name, inferred specification, unit taken
    from the quantity text) actually trigger for blank cells.

    Args:
        df: Data frame read from the OCR-produced Excel file.

    Returns:
        List of product dicts with the keys 'barcode', 'name',
        'specification', 'quantity', 'unit' and 'price', each passed
        through ``self.unit_converter.process_unit_conversion``. Empty
        when no barcode column is found.
    """
    barcode_cols = self.extract_barcode(df)

    # Without a barcode column no row can be identified.
    if not barcode_cols:
        logger.error("未找到条码列,无法处理")
        return []

    # Known header aliases for each standard field.
    column_mapping = {
        'name': ['商品名称', '名称', '品名', '商品', '商品名', '商品或服务名称', '品项名'],
        'specification': ['规格', '规格型号', '型号', '商品规格'],
        'quantity': ['数量', '采购数量', '购买数量', '采购数量', '订单数量', '数量(必填)'],
        'unit': ['单位', '采购单位', '计量单位', '单位(必填)'],
        'price': ['单价', '价格', '采购单价', '销售价', '进货价', '单价(必填)']
    }

    # Map each standard field to the first matching frame column.
    mapped_columns = {'barcode': barcode_cols[0]}
    for target, possible_names in column_mapping.items():
        for col in df.columns:
            if str(col).strip() in possible_names:
                mapped_columns[target] = col
                break

    logger.info(f"列名映射结果: {mapped_columns}")

    def cell(row, target, default=''):
        """Return the row's value for a mapped field; default if unmapped or NaN."""
        if target not in mapped_columns:
            return default
        value = row.get(mapped_columns[target], default)
        return default if pd.isna(value) else value

    products = []

    for _, row in df.iterrows():
        barcode = row.get(mapped_columns['barcode'])

        # Skip blank rows and rows whose barcode fails validation.
        if pd.isna(barcode) or not self.validate_barcode(barcode):
            continue

        product = {
            'barcode': format_barcode(barcode),
            'name': cell(row, 'name'),
            'specification': cell(row, 'specification'),
            'quantity': extract_number(str(cell(row, 'quantity', 0))) or 0,
            'unit': str(cell(row, 'unit')),
            'price': extract_number(str(cell(row, 'price', 0))) or 0
        }

        # No name but a barcode: use a placeholder name built from the barcode.
        if not product['name'] and product['barcode']:
            product['name'] = f"商品 ({product['barcode']})"

        # No specification: try to infer one from the product name.
        if not product['specification'] and product['name']:
            inferred_spec = self.unit_converter.infer_specification_from_name(product['name'])
            if inferred_spec:
                product['specification'] = inferred_spec
                logger.info(f"从商品名称推断规格: {product['name']} -> {inferred_spec}")

        # No unit: the quantity cell may carry it. The mapping key is
        # 'quantity'; the previous check for '数量' could never match.
        if not product['unit'] and 'quantity' in mapped_columns:
            quantity_str = str(cell(row, 'quantity'))
            if quantity_str:
                num, unit = self.unit_converter.extract_unit_from_quantity(quantity_str)
                if unit:
                    product['unit'] = unit
                    logger.info(f"从数量提取单位: {quantity_str} -> {unit}")
                    # Prefer the number parsed together with the unit.
                    if num is not None:
                        product['quantity'] = num

        # Apply the unit conversion rules.
        product = self.unit_converter.process_unit_conversion(product)

        products.append(product)

    logger.info(f"提取到 {len(products)} 个商品信息")
    return products
|
||||
|
||||
def fill_template(self, products: List[Dict], output_file_path: str) -> bool:
    """
    Fill a copy of the purchase-order template with the given products.

    One sheet row is written per product, starting at row index 1. The
    amount cell is ``price * quantity``; the tax-rate and gift-quantity
    cells are always written as 0.

    Args:
        products: Product dicts with the keys 'barcode', 'name',
            'specification', 'unit', 'price' and 'quantity'.
        output_file_path: Where to save the filled workbook.

    Returns:
        True if the workbook was saved, False if any step raised.
    """
    try:
        # Open the template and turn it into a writable workbook.
        template_workbook = xlrd.open_workbook(self.template_path, formatting_info=True)
        output_workbook = xlcopy(template_workbook)
        output_sheet = output_workbook.get_sheet(0)

        start_row = 1  # data starts on the second sheet row (0-based index)

        for i, product in enumerate(products):
            # Cell values in column order 0..9.
            cells = (
                i + 1,                                   # sequence number
                product['barcode'],                      # product code (barcode)
                product['name'],
                product['specification'],
                product['unit'],
                product['price'],                        # unit price
                product['quantity'],                     # purchase quantity
                product['price'] * product['quantity'],  # purchase amount
                0,                                       # tax rate
                0,                                       # gift quantity (default 0)
            )
            for col, value in enumerate(cells):
                output_sheet.write(start_row + i, col, value)

        output_workbook.save(output_file_path)
        logger.info(f"采购单已保存到: {output_file_path}")
        return True

    except Exception as e:
        # Best-effort: any failure is logged and reported to the caller as False.
        logger.error(f"填充模板时出错: {e}")
        return False
|
||||
|
||||
def process_specific_file(self, file_path: str) -> Optional[str]:
    """
    Convert one OCR-produced Excel file into a purchase-order workbook.

    The output is written to ``self.output_dir`` as
    ``采购单_<input stem>.xls`` and, on success, the input/output pair is
    recorded in the processed-file cache.

    Args:
        file_path: Path of the Excel file to process.

    Returns:
        Path of the generated purchase order, or None when the input is
        missing, yields no products, cannot be written, or any step raises.
    """
    logger.info(f"开始处理Excel文件: {file_path}")

    if not os.path.exists(file_path):
        logger.error(f"文件不存在: {file_path}")
        return None

    try:
        df = pd.read_excel(file_path)
        logger.info(f"成功读取Excel文件: {file_path}, 共 {len(df)} 行")

        products = self.extract_product_info(df)
        if not products:
            logger.warning("未提取到有效商品信息")
            return None

        # The output name is derived from the input's base name.
        stem, _ext = os.path.splitext(os.path.basename(file_path))
        output_file = os.path.join(self.output_dir, f"采购单_{stem}.xls")

        if not self.fill_template(products, output_file):
            return None

        # Record the successful conversion.
        self.processed_files[file_path] = output_file
        self._save_processed_files()
        return output_file

    except Exception as e:
        logger.error(f"处理Excel文件时出错: {file_path}, 错误: {e}")
        return None
|
||||
|
||||
def process_latest_file(self) -> Optional[str]:
    """
    Process the file reported by ``get_latest_excel``.

    Returns:
        The result of ``process_specific_file`` for that file, or None
        when no file is available.
    """
    latest_file = self.get_latest_excel()
    if latest_file:
        return self.process_specific_file(latest_file)

    logger.warning("未找到可处理的Excel文件")
    return None
|
||||
@@ -0,0 +1,5 @@
|
||||
"""
|
||||
OCR订单处理系统 - OCR核心模块
|
||||
---------------------------
|
||||
提供OCR识别相关功能,包括图片预处理、文字识别和表格识别。
|
||||
"""
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,344 @@
|
||||
"""
|
||||
百度OCR客户端模块
|
||||
---------------
|
||||
提供百度OCR API的访问和调用功能。
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
import base64
|
||||
import requests
|
||||
import logging
|
||||
from typing import Dict, Optional, Any, Union
|
||||
|
||||
from ...config.settings import ConfigManager
|
||||
from ..utils.log_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
class TokenManager:
    """
    Obtains and caches a Baidu API access token, refreshing it on expiry.
    """

    def __init__(self, api_key: str, secret_key: str, max_retries: int = 3, retry_delay: int = 2):
        """
        Store the credentials and retry settings; no request is made here.

        Args:
            api_key: Baidu API key (sent as ``client_id``).
            secret_key: Baidu secret key (sent as ``client_secret``).
            max_retries: Maximum number of token requests per refresh.
            retry_delay: Base delay between retries, in seconds.
        """
        self.api_key = api_key
        self.secret_key = secret_key
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        self.access_token = None
        self.token_expiry = 0  # epoch seconds; 0 until a token has been fetched

    def get_token(self) -> Optional[str]:
        """
        Return the cached token, fetching a new one if it is missing or
        about to expire.

        Returns:
            The access token, or None if it could not be obtained.
        """
        if self.is_token_valid():
            return self.access_token

        return self.refresh_token()

    def is_token_valid(self) -> bool:
        """
        Check whether a token is cached and still valid for over a minute.

        Returns:
            True if the cached token can be used, otherwise False.
        """
        return (
            self.access_token is not None and
            self.token_expiry > time.time() + 60  # refresh one minute early
        )

    def _backoff_delay(self, attempt: int) -> int:
        """Return the exponential-backoff wait after the given 0-based attempt."""
        return self.retry_delay * (2 ** attempt)

    def refresh_token(self) -> Optional[str]:
        """
        Request a new access token, retrying up to ``max_retries`` times.

        On success the token and its expiry time are cached on the
        instance. Failed attempts (non-200 status, response without
        ``access_token``, or any exception) are logged and retried after
        an exponentially growing delay.

        Returns:
            The new access token, or None if every attempt failed.
        """
        url = "https://aip.baidubce.com/oauth/2.0/token"
        params = {
            "grant_type": "client_credentials",
            "client_id": self.api_key,
            "client_secret": self.secret_key
        }

        for attempt in range(self.max_retries):
            try:
                response = requests.post(url, params=params, timeout=10)
                if response.status_code == 200:
                    result = response.json()
                    if "access_token" in result:
                        self.access_token = result["access_token"]
                        # Default lifetime is 30 days; expire an hour early for safety.
                        self.token_expiry = time.time() + result.get("expires_in", 2592000) - 3600
                        logger.info("成功获取访问令牌")
                        return self.access_token

                logger.warning(f"获取访问令牌失败 (尝试 {attempt+1}/{self.max_retries}): {response.text}")

            except Exception as e:
                logger.warning(f"获取访问令牌时发生错误 (尝试 {attempt+1}/{self.max_retries}): {e}")

            # Wait before the next attempt, but not after the last one.
            if attempt < self.max_retries - 1:
                time.sleep(self._backoff_delay(attempt))

        logger.error("无法获取访问令牌")
        return None
|
||||
|
||||
class BaiduOCRClient:
|
||||
"""
|
||||
百度OCR API客户端
|
||||
"""
|
||||
|
||||
def __init__(self, config: Optional[ConfigManager] = None):
    """
    Read the API settings from the configuration and create the token manager.

    A warning is logged when the API key or secret key is not set; the
    client is still constructed in that case.

    Args:
        config: Configuration manager; a new ``ConfigManager`` is created
            when none is supplied.
    """
    self.config = config or ConfigManager()
    settings = self.config

    # Credentials and request behaviour, all from the 'API' section.
    self.api_key = settings.get('API', 'api_key')
    self.secret_key = settings.get('API', 'secret_key')
    self.timeout = settings.getint('API', 'timeout', 30)
    self.max_retries = settings.getint('API', 'max_retries', 3)
    self.retry_delay = settings.getint('API', 'retry_delay', 2)
    self.api_url = settings.get('API', 'api_url', 'https://aip.baidubce.com/rest/2.0/ocr/v1/table')

    # The token manager shares the client's retry settings.
    self.token_manager = TokenManager(
        api_key=self.api_key,
        secret_key=self.secret_key,
        max_retries=self.max_retries,
        retry_delay=self.retry_delay,
    )

    if not (self.api_key and self.secret_key):
        logger.warning("API密钥未设置,请在配置文件中设置API密钥")
|
||||
|
||||
def read_image(self, image_path: str) -> Optional[bytes]:
    """
    Load an image file as raw bytes.

    Args:
        image_path: Path of the image file.

    Returns:
        The file's content, or None if opening or reading it failed (the
        error is logged).
    """
    try:
        with open(image_path, 'rb') as image_file:
            content = image_file.read()
    except Exception as exc:
        logger.error(f"读取图片文件失败: {image_path}, 错误: {exc}")
        return None
    return content
|
||||
|
||||
def recognize_table(self, image_data: Union[str, bytes]) -> Optional[Dict]:
    """
    Send an image to the table-recognition API and return the parsed reply.

    The request is retried up to ``self.max_retries`` times on non-200
    responses and exceptions, with exponential backoff. When the API
    reports an authorization error (error code 110 or 111) the access
    token is refreshed and the remaining attempts use the new token. Any
    other API error ends the call immediately.

    Args:
        image_data: Either a file path (``str``) or the image bytes.

    Returns:
        The decoded JSON response as returned by the API (no particular
        structure is enforced), or None on failure.
    """
    access_token = self.token_manager.get_token()
    if not access_token:
        logger.error("无法获取访问令牌,无法进行表格识别")
        return None

    # A string is treated as a path and loaded from disk.
    if isinstance(image_data, str):
        image_data = self.read_image(image_data)
        if image_data is None:
            return None

    url = f"{self.api_url}?access_token={access_token}"
    image_base64 = base64.b64encode(image_data).decode('utf-8')

    # return_excel is included for parity with the v1 implementation.
    payload = {
        'image': image_base64,
        'is_sync': 'true',  # synchronous request
        'request_type': 'excel',  # output as Excel
        'return_excel': 'true'  # return the Excel data directly
    }

    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'application/json'
    }

    for attempt in range(self.max_retries):
        is_last_attempt = attempt >= self.max_retries - 1
        try:
            response = requests.post(
                url,
                data=payload,
                headers=headers,
                timeout=self.timeout
            )

            if response.status_code == 200:
                result = response.json()
                logger.debug(f"百度OCR API返回结果: {result}")

                if 'error_code' not in result:
                    # Hand the whole reply back; callers probe its structure.
                    return result

                error_msg = result.get('error_msg', '未知错误')
                logger.error(f"百度OCR API错误: {error_msg}")

                # Only authorization errors are worth another attempt.
                if result.get('error_code') not in [110, 111]:
                    return None

                logger.info("尝试刷新访问令牌...")
                access_token = self.token_manager.refresh_token()
                if not access_token or is_last_attempt:
                    return None

                # Retry straight away with the fresh token.
                url = f"{self.api_url}?access_token={access_token}"
                continue
            else:
                logger.warning(f"表格识别请求失败 (尝试 {attempt+1}/{self.max_retries}): {response.text}")

        except Exception as e:
            logger.warning(f"表格识别时发生错误 (尝试 {attempt+1}/{self.max_retries}): {e}")

        # Wait before the next attempt, but not after the last one.
        if not is_last_attempt:
            wait_time = self.retry_delay * (2 ** attempt)  # exponential backoff
            logger.info(f"将在 {wait_time} 秒后重试...")
            time.sleep(wait_time)

    logger.error("表格识别失败")
    return None
|
||||
|
||||
def get_excel_result(self, request_id_or_result: Union[str, Dict]) -> Optional[bytes]:
    """
    Obtain the Excel file for a recognition request.

    When a result dict is passed, Excel data embedded under
    ``result.result_data`` is decoded and returned without any network
    access. Otherwise a request id is taken from ``result.request_id`` or,
    when there is no ``result`` key, from the top-level ``request_id``.
    The Excel file is then fetched from the result endpoint, retrying up
    to ``self.max_retries`` times. An access token is only requested once
    a usable request id is known.

    Args:
        request_id_or_result: A request id string, or the full recognition
            result dict.

    Returns:
        The Excel file content, or None if it could not be obtained.
    """
    request_id = request_id_or_result
    if isinstance(request_id_or_result, dict):
        if 'result' in request_id_or_result:
            inner = request_id_or_result['result']

            # Some API versions return the Excel data synchronously.
            if 'result_data' in inner:
                excel_content = inner['result_data']
                if excel_content:
                    try:
                        return base64.b64decode(excel_content)
                    except Exception as e:
                        # Fall through and try the request id instead.
                        logger.error(f"解析Excel数据失败: {e}")

            if 'request_id' in inner:
                request_id = inner['request_id']
                logger.debug(f"从result子对象中提取request_id: {request_id}")
            elif 'tables_result' in inner and len(inner['tables_result']) > 0:
                # Table content was returned inline; there is nothing to fetch.
                logger.info("检测到API直接返回了表格内容,但没有request_id")
                return None
        elif 'request_id' in request_id_or_result:
            # Some versions carry the request id at the top level.
            request_id = request_id_or_result['request_id']
            logger.debug(f"从顶层对象中提取request_id: {request_id}")

    # Without a string request id the result endpoint cannot be queried.
    if not isinstance(request_id, str):
        logger.error(f"无法从结果中提取有效的request_id: {request_id_or_result}")
        return None

    access_token = self.token_manager.get_token()
    if not access_token:
        logger.error("无法获取访问令牌,无法获取Excel结果")
        return None

    url = f"https://aip.baidubce.com/rest/2.0/solution/v1/form_ocr/get_request_result?access_token={access_token}"

    payload = {
        'request_id': request_id,
        'result_type': 'excel'
    }

    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'application/json'
    }

    for attempt in range(self.max_retries):
        try:
            response = requests.post(
                url,
                data=payload,
                headers=headers,
                timeout=self.timeout
            )

            if response.status_code == 200:
                try:
                    result = response.json()
                    logger.debug(f"获取Excel结果返回: {result}")

                    # ret_code 3: still being processed; wait and poll again.
                    if result.get('result', {}).get('ret_code') == 3:
                        logger.info(f"Excel结果正在处理中,等待后重试 (尝试 {attempt+1}/{self.max_retries})")
                        time.sleep(2)
                        continue

                    # Any API error or non-zero ret_code is final.
                    if 'error_code' in result or result.get('result', {}).get('ret_code') != 0:
                        error_msg = result.get('error_msg') or result.get('result', {}).get('ret_msg', '未知错误')
                        logger.error(f"获取Excel结果失败: {error_msg}")
                        return None

                    excel_content = result.get('result', {}).get('result_data')
                    if excel_content:
                        return base64.b64decode(excel_content)
                    else:
                        logger.error("Excel结果为空")
                        return None

                except Exception as e:
                    logger.error(f"解析Excel结果时出错: {e}")
                    return None

            else:
                logger.warning(f"获取Excel结果请求失败 (尝试 {attempt+1}/{self.max_retries}): {response.text}")

        except Exception as e:
            logger.warning(f"获取Excel结果时发生错误 (尝试 {attempt+1}/{self.max_retries}): {e}")

        # Wait before the next attempt, but not after the last one.
        if attempt < self.max_retries - 1:
            time.sleep(self.retry_delay * (attempt + 1))

    logger.error("获取Excel结果失败")
    return None
|
||||
@@ -0,0 +1,334 @@
|
||||
"""
|
||||
表格OCR处理模块
|
||||
-------------
|
||||
处理图片并提取表格内容,保存为Excel文件。
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import json
|
||||
import base64
|
||||
from datetime import datetime
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from typing import Dict, List, Optional, Tuple, Union, Any
|
||||
|
||||
from ...config.settings import ConfigManager
|
||||
from ..utils.log_utils import get_logger
|
||||
from ..utils.file_utils import (
|
||||
ensure_dir,
|
||||
get_file_extension,
|
||||
get_files_by_extensions,
|
||||
generate_timestamp_filename,
|
||||
is_file_size_valid,
|
||||
load_json,
|
||||
save_json
|
||||
)
|
||||
from .baidu_ocr import BaiduOCRClient
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
class ProcessedRecordManager:
    """
    Tracks which input files have been processed and where their output is.

    The record is a dict mapping input file path to output file path,
    persisted as JSON. ``mark_as_processed`` is serialized with a lock
    because it is invoked from several worker threads during batch
    processing.
    """

    def __init__(self, record_file: str):
        """
        Load the existing record from ``record_file``.

        Args:
            record_file: Path of the JSON record file.
        """
        # Local import: the module's import block does not include threading.
        import threading

        self.record_file = record_file
        # Guards the dict update plus the save in mark_as_processed.
        self._lock = threading.Lock()
        self.processed_files = self._load_record()

    def _load_record(self) -> Dict[str, str]:
        """
        Read the record file.

        Returns:
            Whatever ``load_json`` yields for the record file; an empty
            dict is passed to it as the fallback value.
        """
        return load_json(self.record_file, {})

    def save_record(self) -> None:
        """Write the current record to ``self.record_file`` via ``save_json``."""
        save_json(self.processed_files, self.record_file)

    def is_processed(self, image_file: str) -> bool:
        """
        Check whether a file is present in the record.

        Args:
            image_file: Input file path.

        Returns:
            True if the file has been recorded as processed.
        """
        return image_file in self.processed_files

    def mark_as_processed(self, image_file: str, output_file: str) -> None:
        """
        Record a processed file and persist the record immediately.

        Args:
            image_file: Input file path.
            output_file: Output file path produced for it.
        """
        # Update and save as one unit so concurrent callers cannot
        # interleave writes to the record file.
        with self._lock:
            self.processed_files[image_file] = output_file
            self.save_record()

    def get_output_file(self, image_file: str) -> Optional[str]:
        """
        Look up the output recorded for a file.

        Args:
            image_file: Input file path.

        Returns:
            The recorded output path, or None if the file is not recorded.
        """
        return self.processed_files.get(image_file)

    def get_unprocessed_files(self, files: List[str]) -> List[str]:
        """
        Filter a list down to the files that are not yet recorded.

        Args:
            files: Candidate file paths.

        Returns:
            The unrecorded files, in their original order.
        """
        return [file for file in files if not self.is_processed(file)]
|
||||
|
||||
class OCRProcessor:
|
||||
"""
|
||||
OCR处理器,用于表格识别与处理
|
||||
"""
|
||||
|
||||
def __init__(self, config: Optional[ConfigManager] = None):
    """
    Create the OCR client, resolve the working directories and read the
    file and performance settings.

    Args:
        config: Configuration manager; a new ``ConfigManager`` is created
            when none is supplied.
    """
    self.config = config or ConfigManager()
    settings = self.config

    self.ocr_client = BaiduOCRClient(settings)

    # Working directories.
    self.input_folder = settings.get_path('Paths', 'input_folder', 'data/input', create=True)
    self.output_folder = settings.get_path('Paths', 'output_folder', 'data/output', create=True)
    self.temp_folder = settings.get_path('Paths', 'temp_folder', 'data/temp', create=True)

    # Create any directory that is still missing.
    for folder in (self.input_folder, self.output_folder, self.temp_folder):
        if os.path.exists(folder):
            continue
        os.makedirs(folder, exist_ok=True)
        logger.info(f"创建目录: {folder}")

    # Log the absolute paths actually in use.
    logger.info(f"使用输入目录: {os.path.abspath(self.input_folder)}")
    logger.info(f"使用输出目录: {os.path.abspath(self.output_folder)}")
    logger.info(f"使用临时目录: {os.path.abspath(self.temp_folder)}")

    # File constraints.
    self.allowed_extensions = settings.get_list('File', 'allowed_extensions', '.jpg,.jpeg,.png,.bmp')
    self.max_file_size_mb = settings.getfloat('File', 'max_file_size_mb', 4.0)
    self.excel_extension = settings.get('File', 'excel_extension', '.xlsx')

    # Batch and threading settings.
    self.max_workers = settings.getint('Performance', 'max_workers', 4)
    self.batch_size = settings.getint('Performance', 'batch_size', 5)
    self.skip_existing = settings.getboolean('Performance', 'skip_existing', True)

    # Record of images that have already been processed.
    self.record_manager = ProcessedRecordManager(
        settings.get('Paths', 'processed_record', 'data/processed_files.json')
    )

    logger.info(f"OCR处理器初始化完成,输入目录: {self.input_folder}, 输出目录: {self.output_folder}")
|
||||
|
||||
def get_unprocessed_images(self) -> List[str]:
    """
    List the images in the input folder that still need processing.

    Returns:
        All images with an allowed extension when ``skip_existing`` is
        off; otherwise only those not yet in the processing record.
    """
    image_files = get_files_by_extensions(self.input_folder, self.allowed_extensions)

    if not self.skip_existing:
        logger.info(f"找到 {len(image_files)} 个图片文件(不跳过已处理的文件)")
        return image_files

    # Drop everything the record manager already knows about.
    pending = self.record_manager.get_unprocessed_files(image_files)
    logger.info(f"找到 {len(image_files)} 个图片文件,其中 {len(pending)} 个未处理")
    return pending
|
||||
|
||||
def validate_image(self, image_path: str) -> bool:
    """
    Check that an image exists, has an allowed extension and is not too large.

    Each failed check is logged as a warning.

    Args:
        image_path: Path of the image file.

    Returns:
        True only if all three checks pass.
    """
    if not os.path.exists(image_path):
        logger.warning(f"图片文件不存在: {image_path}")
        return False

    extension = get_file_extension(image_path)
    if extension not in self.allowed_extensions:
        logger.warning(f"不支持的文件类型: {extension}, 文件: {image_path}")
        return False

    size_ok = is_file_size_valid(image_path, self.max_file_size_mb)
    if not size_ok:
        logger.warning(f"文件大小超过限制 ({self.max_file_size_mb}MB): {image_path}")
        return False

    return True
|
||||
|
||||
def process_image(self, image_path: str) -> Optional[str]:
    """
    Run table OCR on one image and save the result as an Excel file.

    The output is written to ``self.output_folder`` under the image's base
    name with ``self.excel_extension``. With ``skip_existing`` enabled, an
    image that is already recorded, or whose output file already exists,
    is not sent to the OCR service again.

    Args:
        image_path: Path of the image file.

    Returns:
        Path of the Excel file, or None if validation, recognition,
        decoding or saving failed.
    """
    if not self.validate_image(image_path):
        return None

    # Already recorded: hand back the recorded output.
    if self.skip_existing and self.record_manager.is_processed(image_path):
        output_file = self.record_manager.get_output_file(image_path)
        logger.info(f"图片已处理,跳过: {image_path}, 输出文件: {output_file}")
        return output_file

    logger.info(f"开始处理图片: {image_path}")

    try:
        stem = os.path.splitext(os.path.basename(image_path))[0]
        output_file = os.path.join(self.output_folder, f"{stem}{self.excel_extension}")

        # Output exists but is not recorded yet: record it and skip the OCR call.
        if self.skip_existing and os.path.exists(output_file):
            logger.info(f"已存在对应的Excel文件,跳过处理: {os.path.basename(image_path)} -> {os.path.basename(output_file)}")
            self.record_manager.mark_as_processed(image_path, output_file)
            return output_file

        ocr_result = self.ocr_client.recognize_table(image_path)
        if not ocr_result:
            logger.error(f"OCR识别失败: {image_path}")
            return None

        # Look for base64 Excel data in the places the API may put it.
        excel_base64 = None
        if 'excel_file' in ocr_result:
            excel_base64 = ocr_result['excel_file']
            logger.debug("从excel_file字段获取Excel数据")
        elif 'result' in ocr_result:
            inner = ocr_result['result']
            if 'result_data' in inner:
                excel_base64 = inner['result_data']
                logger.debug("从result.result_data字段获取Excel数据")
            elif 'excel_file' in inner:
                excel_base64 = inner['excel_file']
                logger.debug("从result.excel_file字段获取Excel数据")
            elif 'tables_result' in inner and inner['tables_result']:
                # Take the first table that carries an Excel payload.
                for table in inner['tables_result']:
                    if 'excel_file' in table:
                        excel_base64 = table['excel_file']
                        logger.debug("从tables_result中获取Excel数据")
                        break

        if excel_base64:
            # Decode the inline payload and write it out.
            try:
                excel_data = base64.b64decode(excel_base64)
                os.makedirs(os.path.dirname(output_file), exist_ok=True)
                with open(output_file, 'wb') as f:
                    f.write(excel_data)
            except Exception as e:
                logger.error(f"解码或保存Excel数据时出错: {e}")
                return None
        else:
            # Nothing inline: ask the client to fetch the Excel result.
            logger.info("无法从直接返回中获取Excel数据,尝试通过API获取...")
            excel_data = self.ocr_client.get_excel_result(ocr_result)
            if not excel_data:
                logger.error(f"获取Excel结果失败: {image_path}")
                return None

            os.makedirs(os.path.dirname(output_file), exist_ok=True)
            with open(output_file, 'wb') as f:
                f.write(excel_data)

        logger.info(f"图片处理成功: {image_path}, 输出文件: {output_file}")

        self.record_manager.mark_as_processed(image_path, output_file)

        return output_file

    except Exception as e:
        logger.error(f"处理图片时出错: {image_path}, 错误: {e}")
        return None
|
||||
|
||||
def process_images_batch(self, batch_size: int = None, max_workers: int = None) -> Tuple[int, int]:
    """
    Run OCR over every unprocessed image, one batch at a time.

    Args:
        batch_size: Number of images per batch; a falsy value (None or 0)
            falls back to ``self.batch_size``.
        max_workers: Thread-pool size used for each batch; a falsy value
            falls back to ``self.max_workers``.

    Returns:
        A ``(total, succeeded)`` tuple. ``succeeded`` counts the images for
        which ``process_image`` returned something other than None.
    """
    chunk_size = batch_size or self.batch_size
    workers = max_workers or self.max_workers

    pending = self.get_unprocessed_images()
    if not pending:
        logger.warning("没有需要处理的图片")
        return 0, 0

    total = len(pending)
    starts = range(0, total, chunk_size)
    batch_count = len(starts)
    success = 0

    for batch_no, start in enumerate(starts, start=1):
        batch = pending[start:start + chunk_size]
        logger.info(f"处理批次 {batch_no}/{batch_count}, 大小: {len(batch)}")

        # A fresh pool per batch; leaving the ``with`` block waits for all workers.
        with ThreadPoolExecutor(max_workers=workers) as executor:
            results = list(executor.map(self.process_image, batch))

        batch_ok = len([outcome for outcome in results if outcome is not None])
        success += batch_ok
        logger.info(f"批次处理完成, 成功: {batch_ok}/{len(batch)}")

    logger.info(f"所有图片处理完成, 总计: {total}, 成功: {success}")
    return total, success
|
||||
@@ -0,0 +1,5 @@
|
||||
"""
|
||||
OCR订单处理系统 - 工具模块
|
||||
------------------------
|
||||
提供系统通用工具和辅助函数。
|
||||
"""
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,251 @@
|
||||
"""
|
||||
文件操作工具模块
|
||||
--------------
|
||||
提供文件处理、查找和管理功能。
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Union, Any
|
||||
|
||||
from .log_utils import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
def ensure_dir(directory: str) -> bool:
    """
    Make sure a directory exists, creating it (and any parents) if needed.

    Args:
        directory: Directory path. An empty string is treated as the current
            working directory, which is what ``os.path.dirname`` yields for
            a bare file name.

    Returns:
        True if the directory already exists or was created; False if the
        creation failed (the failure is logged, never raised).
    """
    # os.makedirs("") raises FileNotFoundError, yet there is nothing to
    # create for the current directory, so report success directly.
    if directory == "":
        return True

    try:
        os.makedirs(directory, exist_ok=True)
    except Exception as e:
        logger.error(f"创建目录失败: {directory}, 错误: {e}")
        return False
    return True
|
||||
|
||||
def get_file_extension(file_path: str) -> str:
    """
    Return the lower-cased extension of a file path.

    Args:
        file_path: Path (or bare name) of the file.

    Returns:
        The extension including its leading dot (e.g. ``.jpg``), or an empty
        string when the name has no extension.
    """
    _, extension = os.path.splitext(file_path)
    return extension.lower()
|
||||
|
||||
def is_valid_extension(file_path: str, allowed_extensions: List[str]) -> bool:
    """
    Check whether a file's extension is in the allowed list.

    The comparison is case-insensitive on both sides: the file's extension
    is lower-cased by ``get_file_extension`` and each allowed entry is
    lower-cased here, so ``['.JPG']`` accepts ``photo.jpg``.

    Args:
        file_path: Path (or bare name) of the file.
        allowed_extensions: Allowed extensions including the leading dot
            (e.g. ``['.jpg', '.png']``).

    Returns:
        True if the file's extension matches one of the allowed extensions.
    """
    ext = get_file_extension(file_path)
    return ext in {allowed.lower() for allowed in allowed_extensions}
|
||||
|
||||
def get_files_by_extensions(directory: str, extensions: List[str], exclude_patterns: List[str] = None) -> List[str]:
    """
    List the regular files directly inside a directory that have one of the
    given extensions.

    Args:
        directory: Directory to scan (not recursive).
        extensions: Accepted extensions (e.g. ``['.jpg', '.png']``).
        exclude_patterns: Substrings that disqualify a file name; defaults
            to ``['~$', '.tmp']`` when None.

    Returns:
        Paths (directory joined with file name) of the matching files, in
        the order ``os.listdir`` returned them.
    """
    patterns = ['~$', '.tmp'] if exclude_patterns is None else exclude_patterns

    matched = []
    for entry in os.listdir(directory):
        full_path = os.path.join(directory, entry)

        # Skip sub-directories and anything else that is not a regular file.
        if not os.path.isfile(full_path):
            continue

        if not is_valid_extension(full_path, extensions):
            continue

        # Exclusion is a plain substring test against the file name only.
        if any(pattern in entry for pattern in patterns):
            continue

        matched.append(full_path)

    return matched
|
||||
|
||||
def get_latest_file(directory: str, pattern: str = "", extensions: List[str] = None) -> Optional[str]:
    """
    Find the most recently modified file directly inside a directory.

    Args:
        directory: Directory to scan (not recursive).
        pattern: If non-empty, only file names containing this substring
            are considered.
        extensions: If non-empty, only files with one of these extensions
            are considered.

    Returns:
        Path of the file with the greatest modification time, or None when
        the directory does not exist or nothing matches (a warning is
        logged in both cases).
    """
    if not os.path.exists(directory):
        logger.warning(f"目录不存在: {directory}")
        return None

    candidates = []
    for entry in os.listdir(directory):
        if pattern and pattern not in entry:
            continue
        if extensions and not is_valid_extension(entry, extensions):
            continue

        full_path = os.path.join(directory, entry)
        if os.path.isfile(full_path):
            candidates.append((full_path, os.path.getmtime(full_path)))

    if not candidates:
        logger.warning(f"未在目录 {directory} 中找到符合条件的文件")
        return None

    # On equal timestamps the earliest-listed file wins.
    newest_path, _ = max(candidates, key=lambda item: item[1])
    return newest_path
|
||||
|
||||
def generate_timestamp_filename(original_path: str) -> str:
    """
    Build a sibling path whose file name is the current local timestamp.

    Args:
        original_path: Path of the existing file; its directory and
            extension are reused.

    Returns:
        ``<directory>/<YYYYmmddHHMMSS><extension>`` for the current time.
    """
    directory, _ = os.path.split(original_path)
    _, extension = os.path.splitext(original_path)
    stamp = datetime.now().strftime("%Y%m%d%H%M%S")
    return os.path.join(directory, stamp + extension)
|
||||
|
||||
def rename_file(source_path: str, target_path: str) -> bool:
    """
    Rename (move) a file, creating the target directory first.

    Args:
        source_path: Current path of the file.
        target_path: Desired path of the file.

    Returns:
        True on success; False if anything raised (the error is logged).
    """
    try:
        # The result of ensure_dir is deliberately not checked: if the
        # directory could not be created, os.rename fails and is reported below.
        ensure_dir(os.path.dirname(target_path))

        os.rename(source_path, target_path)

        old_name = os.path.basename(source_path)
        new_name = os.path.basename(target_path)
        logger.info(f"文件已重命名: {old_name} -> {new_name}")
        return True
    except Exception as e:
        logger.error(f"重命名文件失败: {e}")
        return False
|
||||
|
||||
def load_json(file_path: str, default: Any = None) -> Any:
    """
    Read and parse a UTF-8 JSON file.

    Args:
        file_path: Path of the JSON file.
        default: Value returned when the file is missing or cannot be
            read or parsed.

    Returns:
        The parsed JSON content, or ``default``. A read or parse failure is
        logged; a missing file is not.
    """
    if os.path.exists(file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                return json.load(handle)
        except Exception as e:
            logger.error(f"加载JSON文件失败: {file_path}, 错误: {e}")

    return default
|
||||
|
||||
def save_json(data: Any, file_path: str, ensure_ascii: bool = False, indent: int = 2) -> bool:
    """
    Serialize data to a UTF-8 JSON file, creating its directory first.

    Args:
        data: JSON-serializable object to write.
        file_path: Destination path.
        ensure_ascii: Passed to ``json.dump``; False keeps non-ASCII
            characters unescaped.
        indent: Indentation width passed to ``json.dump``.

    Returns:
        True on success; False if anything raised (the error is logged).
    """
    try:
        parent = os.path.dirname(file_path)
        ensure_dir(parent)

        with open(file_path, 'w', encoding='utf-8') as handle:
            json.dump(data, handle, ensure_ascii=ensure_ascii, indent=indent)
    except Exception as e:
        logger.error(f"保存JSON文件失败: {file_path}, 错误: {e}")
        return False

    logger.debug(f"JSON数据已保存到: {file_path}")
    return True
|
||||
|
||||
def get_file_size(file_path: str) -> int:
    """
    Return the size of a file in bytes.

    Args:
        file_path: Path of the file.

    Returns:
        The size in bytes, or 0 if the size could not be read (the error
        is logged).
    """
    try:
        size = os.stat(file_path).st_size
    except Exception as e:
        logger.error(f"获取文件大小失败: {file_path}, 错误: {e}")
        return 0
    return size
|
||||
|
||||
def is_file_size_valid(file_path: str, max_size_mb: float) -> bool:
    """
    Check that a file is no larger than a limit given in megabytes.

    Args:
        file_path: Path of the file.
        max_size_mb: Maximum allowed size in MB (1 MB = 1024 * 1024 bytes).

    Returns:
        True if the size reported by ``get_file_size`` is within the limit.
        NOTE(review): ``get_file_size`` reports 0 on failure, so an
        unreadable file passes this check.
    """
    actual_bytes = get_file_size(file_path)
    limit_bytes = max_size_mb * 1024 * 1024
    return actual_bytes <= limit_bytes
|
||||
@@ -0,0 +1,129 @@
|
||||
"""
|
||||
日志工具模块
|
||||
----------
|
||||
提供统一的日志配置和管理功能。
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict
|
||||
|
||||
# 日志处理器字典,用于跟踪已创建的处理器
|
||||
_handlers: Dict[str, logging.Handler] = {}
|
||||
|
||||
def setup_logger(name: str,
                 log_file: Optional[str] = None,
                 level=logging.INFO,
                 console_output: bool = True,
                 file_output: bool = True,
                 log_format: str = '%(asctime)s - %(name)s - %(levelname)s - %(message)s') -> logging.Logger:
    """
    Configure and return the logger with the given name.

    A logger that already has handlers is returned unchanged, so repeated
    calls do not stack duplicate handlers.

    Args:
        name: Logger name.
        log_file: Log file path; when None, ``logs/<name>.log`` under the
            current working directory is used and ``logs`` is created.
        level: Level applied to the logger and to each handler.
        console_output: Attach a stdout handler.
        file_output: Attach a UTF-8 file handler.
        log_format: Format string shared by all handlers.

    Returns:
        The configured logger.
    """
    log = logging.getLogger(name)
    if log.handlers:
        return log

    log.setLevel(level)
    formatter = logging.Formatter(log_format)

    if file_output:
        if log_file is None:
            default_dir = os.path.abspath('logs')
            os.makedirs(default_dir, exist_ok=True)
            log_file = os.path.join(default_dir, f"{name}.log")

        try:
            to_file = logging.FileHandler(log_file, encoding='utf-8')
            to_file.setFormatter(formatter)
            to_file.setLevel(level)
            log.addHandler(to_file)
            _handlers[f"{name}_file"] = to_file

            # Write a "<name>.active" marker next to the log file so that
            # log clean-up tooling does not delete a log that is in use.
            marker_path = os.path.join(os.path.dirname(log_file), f"{name}.active")
            with open(marker_path, 'w', encoding='utf-8') as marker:
                marker.write(f"Active since: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        except Exception as e:
            # Best effort: a failing file handler must not prevent console logging.
            print(f"无法创建日志文件处理器: {e}")

    if console_output:
        to_console = logging.StreamHandler(sys.stdout)
        to_console.setFormatter(formatter)
        to_console.setLevel(level)
        log.addHandler(to_console)
        _handlers[f"{name}_console"] = to_console

    return log
|
||||
|
||||
def get_logger(name: str) -> logging.Logger:
    """
    Return the named logger, configuring it with defaults on first use.

    Args:
        name: Logger name.

    Returns:
        The existing logger if it already has handlers; otherwise the
        result of ``setup_logger(name)``.
    """
    existing = logging.getLogger(name)
    return existing if existing.handlers else setup_logger(name)
|
||||
|
||||
def close_logger(name: str) -> None:
    """
    Close and detach every handler of the named logger.

    Args:
        name: Logger name.
    """
    target = logging.getLogger(name)

    # Iterate over a snapshot because removeHandler mutates target.handlers.
    for handler in list(target.handlers):
        handler.close()
        target.removeHandler(handler)

    # Drop the handler registry entries for this logger.
    for suffix in ("file", "console"):
        _handlers.pop(f"{name}_{suffix}", None)
|
||||
|
||||
def cleanup_active_marker(name: str) -> None:
    """
    Delete the ``logs/<name>.active`` marker file if it exists.

    The marker is looked up in the ``logs`` directory under the current
    working directory. Any failure is printed and otherwise ignored.

    Args:
        name: Logger name.
    """
    try:
        marker_path = os.path.join(os.path.abspath('logs'), f"{name}.active")
        if os.path.exists(marker_path):
            os.remove(marker_path)
    except Exception as e:
        print(f"无法清理日志活跃标记: {e}")
|
||||
@@ -0,0 +1,207 @@
|
||||
"""
|
||||
字符串处理工具模块
|
||||
---------------
|
||||
提供字符串处理、正则表达式匹配等功能。
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Dict, List, Optional, Tuple, Any, Match, Pattern
|
||||
|
||||
def clean_string(text: str) -> str:
    """
    Trim a string and collapse each run of whitespace into a single space.

    Args:
        text: Source string.

    Returns:
        The cleaned string, or an empty string if ``text`` is not a str.
    """
    if isinstance(text, str):
        return re.sub(r'\s+', ' ', text.strip())
    return ""
|
||||
|
||||
def remove_non_digits(text: str) -> str:
    """
    Strip every non-digit character from a string.

    Args:
        text: Source string.

    Returns:
        The digits of ``text`` in their original order, or an empty string
        if ``text`` is not a str.
    """
    return re.sub(r'\D', '', text) if isinstance(text, str) else ""
|
||||
|
||||
def extract_number(text: str) -> Optional[float]:
    """
    Extract the first number found in a string.

    Args:
        text: Source string.

    Returns:
        The first integer or decimal (optionally negative) as a float, or
        None if ``text`` is not a str or contains no number.
    """
    if not isinstance(text, str):
        return None

    # Optional minus sign, integer part, optional fractional part.
    found = re.search(r'-?\d+(\.\d+)?', text)
    return float(found.group()) if found else None
|
||||
|
||||
def extract_unit(text: str, units: List[str] = None) -> Optional[str]:
    """
    Extract a unit from a string such as ``"12.5kg"``.

    Args:
        text: Source string.
        units: Candidate units. When non-empty, the first candidate (in
            list order) that occurs anywhere in ``text`` is returned. When
            None or empty, the unit is auto-detected.

    Returns:
        The unit, or None if ``text`` is not a str or no unit is found.
    """
    if not isinstance(text, str):
        return None

    if units:
        return next((unit for unit in units if unit in text), None)

    # Auto-detect: the run of non-digit, non-space characters that follows
    # a number. The number may have a fractional part, and the negative
    # lookahead stops the regex from backtracking into it and reporting
    # "." as the unit of a value like "12.5kg" or "12.5".
    found = re.search(r'\d+(?:\.\d+)?\s*((?!\.\d)[^\d\s]+)', text)
    return found.group(1) if found else None
|
||||
|
||||
def extract_number_and_unit(text: str) -> Tuple[Optional[float], Optional[str]]:
    """
    Extract the first number in a string together with the unit after it.

    Args:
        text: Source string.

    Returns:
        A ``(number, unit)`` tuple. ``unit`` is None when no unit follows
        the number; both are None when ``text`` is not a str or contains
        no number.
    """
    if not isinstance(text, str):
        return None, None

    # Group 1: the number. Group 2 (optional): non-digit, non-space characters after it.
    found = re.search(r'(-?\d+(?:\.\d+)?)\s*([^\d\s]+)?', text)
    if found is None:
        return None, None

    raw_number, raw_unit = found.groups()
    return float(raw_number), (raw_unit or None)
|
||||
|
||||
def parse_specification(spec_str: str) -> Optional[int]:
    """
    Parse a specification string and extract the packaging quantity.

    Recognized forms, tried in this order:
        * ``1*5*10`` (three levels) -> the last number
        * ``1*15`` / ``1x15`` -> the second number
        * ``24瓶/件`` (count, 瓶/个/支/袋, slash, 件/箱) -> the count
        * ``4L`` / ``4L*6`` -> the number after the separator, else 1

    ``*``, ``x``, ``X`` and ``×`` are accepted as separators in every form,
    and both the ASCII and the full-width slash are accepted.

    Args:
        spec_str: Specification string.

    Returns:
        The packaging quantity, or None if the input is empty, not a str,
        or matches none of the recognized forms.
    """
    if not spec_str or not isinstance(spec_str, str):
        return None

    spec = clean_string(spec_str)

    try:
        # Three-level spec such as 1*5*10: the last number is the quantity.
        found = re.search(r'(\d+)[\*xX×](\d+)[\*xX×](\d+)', spec)
        if found:
            return int(found.group(3))

        # Two-level spec such as 1*15 or 1x15: the second number is the quantity.
        found = re.search(r'(\d+)[\*xX×](\d+)', spec)
        if found:
            return int(found.group(2))

        # "24瓶/件" style: the leading count is the quantity.
        found = re.search(r'(\d+)[瓶个支袋][/／](件|箱)', spec)
        if found:
            return int(found.group(1))

        # Volume spec such as 4L or 4L*6: the trailing number if present, else 1.
        found = re.search(r'(\d+(?:\.\d+)?)\s*[Ll升][\*xX×]?(\d+)?', spec)
        if found:
            return int(found.group(2)) if found.group(2) else 1
    except ValueError:
        # int() rejects absurdly long digit runs; treat them as unparseable.
        return None

    return None
|
||||
|
||||
def clean_barcode(barcode: Any) -> str:
    """
    Normalize a barcode to a digits-only string.

    Args:
        barcode: Barcode as a string, int or float; any other value is
            converted with ``str``.

    Returns:
        The barcode with a trailing ``.0`` / ``.00`` suffix and every
        non-digit character removed.
    """
    # Numbers are rendered without a fractional part first, so a float
    # such as 6901234567890.0 does not leak ".0" or an exponent.
    if isinstance(barcode, (int, float)):
        text = f"{barcode:.0f}"
    else:
        text = str(barcode)

    without_fraction = re.sub(r'\.0+$', '', text)
    return re.sub(r'\D', '', without_fraction)
|
||||
|
||||
def is_scientific_notation(value: str) -> bool:
    """
    Tell whether a value's string form is in scientific notation.

    Args:
        value: Value to inspect; it is converted with ``str`` first.

    Returns:
        True for strings such as ``6.9E+12`` or ``-1e5``: optional sign,
        digits, optional fraction, ``e``/``E``, optional sign, digits.
    """
    pattern = r'^-?\d+(\.\d+)?[eE][+-]?\d+$'
    return re.match(pattern, str(value)) is not None
|
||||
|
||||
def format_barcode(barcode: Any) -> str:
    """
    Format a barcode as a plain integer string, undoing scientific notation.

    Args:
        barcode: Barcode value: an int, a float, or a string that may be
            in scientific notation (e.g. ``"6.90123E+12"``).

    Returns:
        The integer digits as a string for ints, finite floats and
        scientific-notation strings; ``str(barcode)`` for anything else,
        including NaN and infinity.
    """
    # Ints are formatted directly: a round-trip through float would lose
    # digits above 2**53. int() keeps bools rendering as "1" / "0".
    if isinstance(barcode, int):
        return str(int(barcode))

    if isinstance(barcode, float) or is_scientific_notation(str(barcode)):
        try:
            return f"{int(float(barcode))}"
        except (ValueError, TypeError, OverflowError):
            # NaN raises ValueError and infinity raises OverflowError;
            # both fall through to the plain string form.
            pass

    return str(barcode)
|
||||
@@ -0,0 +1,5 @@
|
||||
"""
|
||||
OCR订单处理系统 - 服务模块
|
||||
-----------------------
|
||||
提供业务逻辑服务,协调各个核心组件完成业务功能。
|
||||
"""
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,88 @@
|
||||
"""
|
||||
OCR服务模块
|
||||
---------
|
||||
提供OCR识别服务,协调OCR流程。
|
||||
"""
|
||||
|
||||
from typing import Dict, List, Optional, Tuple, Union, Any
|
||||
|
||||
from ..config.settings import ConfigManager
|
||||
from ..core.utils.log_utils import get_logger
|
||||
from ..core.ocr.table_ocr import OCRProcessor
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
class OCRService:
    """
    OCR recognition service: a thin coordination layer over ``OCRProcessor``.
    """

    def __init__(self, config: Optional[ConfigManager] = None):
        """
        Create the service and its OCR processor.

        Args:
            config: Configuration manager; a new ``ConfigManager`` is
                created when none (or a falsy one) is given.
        """
        logger.info("初始化OCRService")
        self.config = config if config else ConfigManager()

        # The processor shares this service's configuration.
        self.ocr_processor = OCRProcessor(self.config)

        logger.info("OCRService初始化完成")

    def get_unprocessed_images(self) -> List[str]:
        """
        List the images that still need processing.

        Returns:
            Whatever ``OCRProcessor.get_unprocessed_images`` returns.
        """
        processor = self.ocr_processor
        return processor.get_unprocessed_images()

    def process_image(self, image_path: str) -> Optional[str]:
        """
        Process a single image and log the outcome.

        Args:
            image_path: Path of the image.

        Returns:
            The processor's result (the output Excel path), or a falsy
            value such as None when processing failed.
        """
        logger.info(f"OCRService开始处理图片: {image_path}")
        output_path = self.ocr_processor.process_image(image_path)

        if not output_path:
            logger.error(f"OCRService处理图片失败: {image_path}")
        else:
            logger.info(f"OCRService处理图片成功: {image_path} -> {output_path}")

        return output_path

    def process_images_batch(self, batch_size: int = None, max_workers: int = None) -> Tuple[int, int]:
        """
        Process all pending images in batches.

        Args:
            batch_size: Batch size, forwarded to the processor.
            max_workers: Maximum worker threads, forwarded to the processor.

        Returns:
            A ``(total, succeeded)`` tuple from the processor.
        """
        logger.info(f"OCRService开始批量处理图片, batch_size={batch_size}, max_workers={max_workers}")
        processor = self.ocr_processor
        return processor.process_images_batch(batch_size, max_workers)

    def validate_image(self, image_path: str) -> bool:
        """
        Check whether an image is valid for processing.

        Args:
            image_path: Path of the image.

        Returns:
            The processor's verdict.
        """
        processor = self.ocr_processor
        return processor.validate_image(image_path)
|
||||
@@ -0,0 +1,87 @@
|
||||
"""
|
||||
订单服务模块
|
||||
---------
|
||||
提供订单处理服务,协调Excel处理和订单合并流程。
|
||||
"""
|
||||
|
||||
from typing import Dict, List, Optional, Tuple, Union, Any
|
||||
|
||||
from ..config.settings import ConfigManager
|
||||
from ..core.utils.log_utils import get_logger
|
||||
from ..core.excel.processor import ExcelProcessor
|
||||
from ..core.excel.merger import PurchaseOrderMerger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
class OrderService:
    """
    Order service: coordinates Excel processing and purchase-order merging.
    """

    def __init__(self, config: Optional[ConfigManager] = None):
        """
        Create the service with its Excel processor and order merger.

        Args:
            config: Configuration manager; a new ``ConfigManager`` is
                created when none (or a falsy one) is given.
        """
        logger.info("初始化OrderService")
        self.config = config if config else ConfigManager()

        # Both collaborators share this service's configuration.
        self.excel_processor = ExcelProcessor(self.config)
        self.order_merger = PurchaseOrderMerger(self.config)

        logger.info("OrderService初始化完成")

    def get_latest_excel(self) -> Optional[str]:
        """
        Find the most recent Excel file.

        Returns:
            Whatever ``ExcelProcessor.get_latest_excel`` returns.
        """
        processor = self.excel_processor
        return processor.get_latest_excel()

    def process_excel(self, file_path: Optional[str] = None) -> Optional[str]:
        """
        Process an Excel file into a purchase order.

        Args:
            file_path: Excel file to process; when None or empty, the
                processor's latest file is used.

        Returns:
            The result of the processor call that was made.
        """
        if not file_path:
            logger.info("OrderService开始处理最新Excel文件")
            return self.excel_processor.process_latest_file()

        logger.info(f"OrderService开始处理指定Excel文件: {file_path}")
        return self.excel_processor.process_specific_file(file_path)

    def get_purchase_orders(self) -> List[str]:
        """
        List the purchase-order files.

        Returns:
            Whatever ``PurchaseOrderMerger.get_purchase_orders`` returns.
        """
        merger = self.order_merger
        return merger.get_purchase_orders()

    def merge_orders(self, file_paths: Optional[List[str]] = None) -> Optional[str]:
        """
        Merge purchase orders.

        Args:
            file_paths: Purchase-order files to merge. The value is passed
                to the merger unchanged; None or empty is logged as
                "merge all".

        Returns:
            The result of ``PurchaseOrderMerger.process``.
        """
        if not file_paths:
            message = "OrderService开始合并所有采购单"
        else:
            message = f"OrderService开始合并指定采购单: {file_paths}"
        logger.info(message)

        return self.order_merger.process(file_paths)
|
||||
Reference in New Issue
Block a user