"""Processing endpoints: OCR, Excel conversion, merge, and full pipeline.""" import os import sys import traceback from pathlib import Path from typing import Optional, List from fastapi import APIRouter, HTTPException, Depends, Request from pydantic import BaseModel from ..auth.dependencies import get_current_user from ..services.service_wrapper import ServiceWrapper router = APIRouter(prefix="/api/processing", tags=["processing"]) _wrapper = ServiceWrapper(max_workers=3) _project_root = Path(__file__).resolve().parent.parent.parent.parent _input_dir = _project_root / "data" / "input" _output_dir = _project_root / "data" / "output" _result_dir = _project_root / "data" / "result" class PipelineRequest(BaseModel): files: Optional[List[str]] = None # specific files, or None = all in input/ supplier: Optional[str] = None # force supplier type class TaskResponse(BaseModel): task_id: str status: str message: str def _get_task_manager(request: Request): return request.state.task_manager def _list_input_files(filter_ext: Optional[List[str]] = None) -> List[Path]: if not _input_dir.is_dir(): return [] files = [] for f in sorted(_input_dir.iterdir()): if f.is_file(): if filter_ext is None or f.suffix.lower() in filter_ext: files.append(f) return files @router.post("/ocr-batch", response_model=TaskResponse) async def ocr_batch( request: Request, current_user: dict = Depends(get_current_user), ): """Run OCR on all images in input/.""" tm = _get_task_manager(request) task = tm.create_task("批量OCR识别") image_exts = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif'} files = _list_input_files(filter_ext=list(image_exts)) if not files: raise HTTPException(400, "input/ 目录中没有图片文件") async def _run(): try: from app.services.ocr_service import OCRService svc = OCRService() total = len(files) for i, f in enumerate(files): tm.update_progress(task.id, int((i / total) * 100), f"正在识别: {f.name}") tm.add_log(task.id, f"[OCR] 处理 {f.name}") try: svc.process_single(str(f), str(_output_dir)) tm.add_log(task.id, f"[OCR] 完成: {f.name}") except Exception as e: tm.add_log(task.id, f"[OCR] 失败: {f.name} - {e}") result_files = [f.name for f in _output_dir.iterdir() if f.is_file()] tm.set_completed(task.id, result_files=result_files, message=f"OCR完成,共处理 {total} 个文件") except Exception as e: tm.set_failed(task.id, str(e)) import asyncio asyncio.create_task(_run()) return TaskResponse(task_id=task.id, status="accepted", message="OCR任务已创建") @router.post("/excel", response_model=TaskResponse) async def process_excel( request: Request, body: PipelineRequest = PipelineRequest(), current_user: dict = Depends(get_current_user), ): """Convert OCR output Excel files to standardized format.""" tm = _get_task_manager(request) task = tm.create_task("Excel标准化处理") excel_exts = {'.xls', '.xlsx'} if body.files: files = [_output_dir / f for f in body.files if (_output_dir / f).is_file()] else: files = _list_input_files(filter_ext=list(excel_exts)) if not files: files = _list_input_files_from(_output_dir, filter_ext=list(excel_exts)) if not files: raise HTTPException(400, "没有找到Excel文件") async def _run(): try: from app.services.order_service import OrderService svc = OrderService() total = len(files) for i, f in enumerate(files): tm.update_progress(task.id, int((i / total) * 100), f"正在处理: {f.name}") tm.add_log(task.id, f"[Excel] 处理 {f.name}") try: svc.process_excel(str(f), str(_result_dir)) tm.add_log(task.id, f"[Excel] 完成: {f.name}") except Exception as e: tm.add_log(task.id, f"[Excel] 失败: {f.name} - {e}") result_files = [f.name for f in _result_dir.iterdir() if f.is_file()] tm.set_completed(task.id, result_files=result_files, message=f"Excel处理完成,共 {total} 个文件") except Exception as e: tm.set_failed(task.id, str(e)) import asyncio asyncio.create_task(_run()) return TaskResponse(task_id=task.id, status="accepted", message="Excel处理任务已创建") @router.post("/merge", response_model=TaskResponse) async def merge_orders( request: Request, current_user: dict = Depends(get_current_user), ): """Merge all processed Excel files into a single purchase order.""" tm = _get_task_manager(request) task = tm.create_task("合并采购单") async def _run(): try: from app.services.order_service import OrderService svc = OrderService() tm.update_progress(task.id, 20, "正在合并采购单...") tm.add_log(task.id, "[合并] 开始合并") result = svc.merge_orders(str(_result_dir)) tm.add_log(task.id, f"[合并] 完成: {result}") tm.set_completed(task.id, result_files=[result] if result else [], message="合并完成") except Exception as e: tm.set_failed(task.id, str(e)) import asyncio asyncio.create_task(_run()) return TaskResponse(task_id=task.id, status="accepted", message="合并任务已创建") @router.post("/pipeline", response_model=TaskResponse) async def full_pipeline( request: Request, body: PipelineRequest = PipelineRequest(), current_user: dict = Depends(get_current_user), ): """Run the full pipeline: OCR → Excel → Merge.""" tm = _get_task_manager(request) task = tm.create_task("一键全流程处理") async def _run(): try: # Step 1: OCR tm.update_progress(task.id, 0, "步骤 1/3: OCR识别") tm.add_log(task.id, "[Pipeline] 开始OCR识别") from app.services.ocr_service import OCRService ocr_svc = OCRService() image_exts = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif'} images = _list_input_files(filter_ext=list(image_exts)) for i, f in enumerate(images): pct = int((i / max(len(images), 1)) * 30) tm.update_progress(task.id, pct, f"OCR: {f.name}") try: ocr_svc.process_single(str(f), str(_output_dir)) tm.add_log(task.id, f"[OCR] 完成: {f.name}") except Exception as e: tm.add_log(task.id, f"[OCR] 失败: {f.name} - {e}") # Step 2: Excel conversion tm.update_progress(task.id, 35, "步骤 2/3: Excel标准化") tm.add_log(task.id, "[Pipeline] 开始Excel处理") from app.services.order_service import OrderService order_svc = OrderService() excel_files = list(_output_dir.glob("*.xls")) + list(_output_dir.glob("*.xlsx")) for i, f in enumerate(excel_files): pct = 35 + int((i / max(len(excel_files), 1)) * 35) tm.update_progress(task.id, pct, f"Excel: {f.name}") try: order_svc.process_excel(str(f), str(_result_dir)) tm.add_log(task.id, f"[Excel] 完成: {f.name}") except Exception as e: tm.add_log(task.id, f"[Excel] 失败: {f.name} - {e}") # Step 3: Merge tm.update_progress(task.id, 75, "步骤 3/3: 合并采购单") tm.add_log(task.id, "[Pipeline] 开始合并") try: result = order_svc.merge_orders(str(_result_dir)) tm.add_log(task.id, f"[合并] 完成: {result}") except Exception as e: tm.add_log(task.id, f"[合并] 失败: {e}") result = None result_files = [f.name for f in _result_dir.iterdir() if f.is_file()] tm.set_completed(task.id, result_files=result_files, message="全流程处理完成") except Exception as e: tb = traceback.format_exc() tm.add_log(task.id, f"[错误] {tb}") tm.set_failed(task.id, str(e)) import asyncio asyncio.create_task(_run()) return TaskResponse(task_id=task.id, status="accepted", message="全流程任务已创建") @router.get("/status/{task_id}") async def get_task_status( task_id: str, request: Request, current_user: dict = Depends(get_current_user), ): tm = _get_task_manager(request) task = tm.get_task(task_id) if not task: raise HTTPException(404, "任务不存在") return task.to_dict() def _list_input_files_from(directory: Path, filter_ext: List[str] = None) -> List[Path]: if not directory.is_dir(): return [] files = [] for f in sorted(directory.iterdir()): if f.is_file(): if filter_ext is None or f.suffix.lower() in filter_ext: files.append(f) return files