orc-order-v2/web/backend/routers/processing.py

"""Processing endpoints: OCR, Excel conversion, merge, and full pipeline."""

import asyncio
import os
import sys
import traceback
from pathlib import Path
from typing import Optional, List

from fastapi import APIRouter, HTTPException, Depends, Request
from pydantic import BaseModel

from ..auth.dependencies import get_current_user
from ..services.service_wrapper import ServiceWrapper
from ..services.db_schema import upsert_file_relation

# Router that groups every endpoint in this module under /api/processing.
router = APIRouter(prefix="/api/processing", tags=["processing"])

# Shared wrapper whose run_sync() the endpoints use to execute their blocking
# work functions. NOTE(review): presumably a pool of 3 workers — confirm
# against ServiceWrapper.
_wrapper = ServiceWrapper(max_workers=3)

# Four directory levels above this file; the data/ directories hang off it.
_project_root = Path(__file__).resolve().parent.parent.parent.parent
# Images waiting for OCR.
_input_dir = _project_root / "data" / "input"
# Excel files looked up after OCR (same stem as the image).
_output_dir = _project_root / "data" / "output"
# Purchase-order files ("采购单_<stem>.xls") and merged results.
_result_dir = _project_root / "data" / "result"


class PipelineRequest(BaseModel):
    # Optional body of the /excel and /pipeline endpoints.
    # (Documented with comments on purpose: a class docstring would become
    # the model's description in the generated schema.)

    # File names relative to the output directory. /excel processes only
    # these when given, otherwise every .xls/.xlsx file there. /pipeline
    # accepts the body but does not read this field.
    files: Optional[List[str]] = None
    # Not read by any endpoint visible in this file.
    supplier: Optional[str] = None


class SingleFileRequest(BaseModel):
    # Body of the single-file endpoints (/ocr-single, /excel-single,
    # /pipeline-single).

    # Name of the file to process; joined to the input directory for OCR and
    # pipeline requests, and to the output directory for /excel-single.
    filename: str


class MergeBatchRequest(BaseModel):
    # Body of the /merge and /merge-batch endpoints.

    # Purchase-order file names relative to the result directory. /merge
    # treats an empty list as "merge every 采购单_*.xls file".
    filenames: List[str]


class TaskResponse(BaseModel):
    # Response returned by every task-creating endpoint in this module.

    # Id of the created task; pass it to /status/{task_id} to poll progress.
    task_id: str
    # The endpoints in this file always return "accepted".
    status: str
    # Human-readable confirmation text.
    message: str


def _get_task_manager(request: Request):
    """Return the task manager stored on the request's ``state``."""
    state = request.state
    return state.task_manager


def _list_input_files(filter_ext: Optional[List[str]] = None) -> List[Path]:
    """Return the regular files of the input directory in sorted order.

    Args:
        filter_ext: Lower-case suffixes (with the leading dot) to keep. Each
            file's suffix is lower-cased before the comparison. ``None``
            keeps every file.

    Returns:
        The matching paths, or an empty list when the input directory does
        not exist.
    """
    if not _input_dir.is_dir():
        return []
    return [
        entry
        for entry in sorted(_input_dir.iterdir())
        if entry.is_file()
        and (filter_ext is None or entry.suffix.lower() in filter_ext)
    ]


def _list_files_in(directory: Path, filter_ext: List[str] = None) -> List[Path]:
    """Collect the regular files directly inside ``directory``, sorted.

    Args:
        directory: Folder to scan (not recursive).
        filter_ext: Lower-case suffixes (with the leading dot) to keep; a
            file's suffix is lower-cased before it is compared. ``None``
            disables the filter.

    Returns:
        The matching paths; empty when ``directory`` is not a directory.
    """
    matched: List[Path] = []
    if directory.is_dir():
        for candidate in sorted(directory.iterdir()):
            if not candidate.is_file():
                continue
            if filter_ext is not None and candidate.suffix.lower() not in filter_ext:
                continue
            matched.append(candidate)
    return matched


# Strong references to in-flight background tasks. The event loop itself only
# keeps weak references to tasks, so one that nobody else holds may be
# garbage-collected before it finishes.
_background_tasks: set = set()


def _run_background(coro):
    """Schedule a coroutine as a background task.

    The scheduled task is kept in ``_background_tasks`` until it completes so
    that it cannot be garbage-collected mid-execution; it removes itself from
    the set once done.

    Args:
        coro: Coroutine (or other awaitable accepted by
            ``asyncio.ensure_future``) to run on the current event loop.
    """
    task = asyncio.ensure_future(coro)
    _background_tasks.add(task)
    task.add_done_callback(_background_tasks.discard)


# ---------------------------------------------------------------------------
# Batch endpoints
# ---------------------------------------------------------------------------

@router.post("/ocr-batch", response_model=TaskResponse)
async def ocr_batch(
    request: Request,
    current_user: dict = Depends(get_current_user),
):
    """Run OCR on all images in input/.

    Images that already have a same-stem .xlsx/.xls file in the output
    directory are skipped. The work runs in the background; the response
    carries the id of the task that tracks it.

    Raises:
        HTTPException: 400 when input/ contains no image files.
    """
    tm = _get_task_manager(request)

    image_exts = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif'}
    files = _list_input_files(filter_ext=list(image_exts))
    if not files:
        raise HTTPException(400, "input/ 目录中没有图片文件")

    # Create the task only after validation so that a rejected request does
    # not leave behind a task that never starts.
    task = tm.create_task("批量OCR识别")

    async def _bg():
        def do_work():
            # The whole job is guarded: a failure outside the per-file
            # handling (service import/construction, listing the output
            # directory) must mark the task failed instead of leaving it
            # without a final state.
            try:
                from app.services.ocr_service import OCRService
                svc = OCRService()
                total = len(files)
                for i, f in enumerate(files):
                    # Skip images whose OCR output already exists.
                    out_stem = f.stem
                    # OCR output could be .xlsx or .xls
                    out_xlsx = _output_dir / f"{out_stem}.xlsx"
                    out_xls = _output_dir / f"{out_stem}.xls"
                    if out_xlsx.exists() or out_xls.exists():
                        out_name = out_xlsx.name if out_xlsx.exists() else out_xls.name
                        tm.add_log(task.id, f"[跳过] {f.name} 已OCR过 → {out_name}")
                        upsert_file_relation(input_image=f.name, output_excel=out_name, status='ocr_done')
                        continue

                    tm.update_progress(task.id, int((i / total) * 100), f"正在识别: {f.name}")
                    tm.add_log(task.id, f"[OCR] 处理 {f.name}")
                    # A failure on one image is logged and the batch goes on.
                    try:
                        svc.process_image(str(f))
                        # Record whichever output file the service produced.
                        for ext in ['.xlsx', '.xls']:
                            candidate = _output_dir / f"{out_stem}{ext}"
                            if candidate.exists():
                                upsert_file_relation(input_image=f.name, output_excel=candidate.name, status='ocr_done')
                                break
                        tm.add_log(task.id, f"[OCR] 完成: {f.name}")
                    except Exception as e:
                        tm.add_log(task.id, f"[OCR] 失败: {f.name} - {e}")

                result_files = [f.name for f in _output_dir.iterdir() if f.is_file()]
                tm.set_completed(task.id, result_files=result_files, message=f"OCR完成，共处理 {total} 个文件")
            except Exception as e:
                tm.add_log(task.id, f"[错误] {traceback.format_exc()}")
                tm.set_failed(task.id, str(e))

        await _wrapper.run_sync(do_work)

    _run_background(_bg())
    return TaskResponse(task_id=task.id, status="accepted", message="OCR任务已创建")


@router.post("/excel", response_model=TaskResponse)
async def process_excel(
    request: Request,
    body: PipelineRequest = PipelineRequest(),
    current_user: dict = Depends(get_current_user),
):
    """Convert OCR output Excel files to standardized purchase orders.

    Processes the files named in ``body.files`` (relative to the output
    directory) or, when none are given, every .xls/.xlsx file there. Files
    whose purchase order already exists in the result directory are skipped.

    Raises:
        HTTPException: 400 when no file is left to process.
    """
    tm = _get_task_manager(request)

    excel_exts = {'.xls', '.xlsx'}
    if body.files:
        # Names come from the client: ignore absolute paths and anything
        # containing ".." so the request cannot reach outside output/.
        requested = (Path(name) for name in body.files)
        files = [
            _output_dir / rel
            for rel in requested
            if not rel.anchor and ".." not in rel.parts and (_output_dir / rel).is_file()
        ]
    else:
        files = _list_files_in(_output_dir, filter_ext=list(excel_exts))

    if not files:
        raise HTTPException(400, "output/ 目录中没有Excel文件")

    # Create the task only after validation so that a rejected request does
    # not leave behind a task that never starts.
    task = tm.create_task("Excel标准化处理")

    async def _bg():
        def do_work():
            # Guard the whole job so that failures outside the per-file
            # handling still move the task to a final (failed) state.
            try:
                from app.services.order_service import OrderService
                svc = OrderService()
                total = len(files)
                for i, f in enumerate(files):
                    # Skip files whose purchase order already exists.
                    result_name = f"采购单_{f.stem}.xls"
                    result_path = _result_dir / result_name
                    if result_path.exists():
                        tm.add_log(task.id, f"[跳过] {f.name} 已处理过 → {result_name}")
                        upsert_file_relation(output_excel=f.name, result_purchase=result_name, status='done')
                        continue

                    tm.update_progress(task.id, int((i / total) * 100), f"正在处理: {f.name}")
                    tm.add_log(task.id, f"[Excel] 处理 {f.name}")
                    # A failure on one file is logged and the batch goes on.
                    try:
                        svc.process_excel(str(f))
                        # Only record the relation if the result was written.
                        if result_path.exists():
                            upsert_file_relation(output_excel=f.name, result_purchase=result_name, status='done')
                        tm.add_log(task.id, f"[Excel] 完成: {f.name}")
                    except Exception as e:
                        tm.add_log(task.id, f"[Excel] 失败: {f.name} - {e}")

                result_files = [f.name for f in _result_dir.iterdir() if f.is_file()]
                tm.set_completed(task.id, result_files=result_files, message=f"Excel处理完成，共 {total} 个文件")
            except Exception as e:
                tm.add_log(task.id, f"[错误] {traceback.format_exc()}")
                tm.set_failed(task.id, str(e))

        await _wrapper.run_sync(do_work)

    _run_background(_bg())
    return TaskResponse(task_id=task.id, status="accepted", message="Excel处理任务已创建")


@router.post("/merge", response_model=TaskResponse)
async def merge_orders(
    request: Request,
    body: MergeBatchRequest = MergeBatchRequest(filenames=[]),
    current_user: dict = Depends(get_current_user),
):
    """Merge selected purchase order files into one PosPal template.

    Merges the files named in ``body.filenames`` (relative to the result
    directory) or, when the list is empty, every ``采购单_*.xls`` file there.

    Raises:
        HTTPException: 400 when no file is left to merge.
    """
    tm = _get_task_manager(request)

    # If specific files provided, use them; otherwise merge all
    if body.filenames:
        # Names come from the client: ignore absolute paths and anything
        # containing ".." so the request cannot reach outside result/.
        requested = (Path(name) for name in body.filenames)
        file_paths = [
            _result_dir / rel
            for rel in requested
            if not rel.anchor and ".." not in rel.parts and (_result_dir / rel).is_file()
        ]
    else:
        # glob() yields files in arbitrary order; sort for a stable merge.
        file_paths = sorted(_result_dir.glob("采购单_*.xls"))

    if not file_paths:
        raise HTTPException(400, "没有找到可合并的采购单文件")

    # Create the task only after validation so that a rejected request does
    # not leave behind a task that never starts.
    task = tm.create_task("合并采购单")

    async def _bg():
        def do_work():
            # Imports live inside the try so that an import failure marks the
            # task failed instead of leaving it without a final state.
            try:
                from app.core.excel.merger import PurchaseOrderMerger
                from app.config.settings import ConfigManager
                tm.update_progress(task.id, 20, "正在合并采购单...")
                tm.add_log(task.id, f"[合并] 合并 {len(file_paths)} 个文件")
                merger = PurchaseOrderMerger(ConfigManager())
                result = merger.process([str(f) for f in file_paths])
                if result:
                    merged_name = Path(result).name
                    upsert_file_relation(result_purchase=merged_name, status='merged')
                    tm.add_log(task.id, f"[合并] 完成: {merged_name}")
                    tm.set_completed(task.id, result_files=[merged_name], message="合并完成")
                else:
                    tm.set_failed(task.id, "合并返回空结果")
            except Exception as e:
                tm.add_log(task.id, f"[合并] 失败: {e}")
                tm.set_failed(task.id, str(e))

        await _wrapper.run_sync(do_work)

    _run_background(_bg())
    return TaskResponse(task_id=task.id, status="accepted", message="合并任务已创建")


@router.post("/pipeline", response_model=TaskResponse)
async def full_pipeline(
    request: Request,
    body: PipelineRequest = PipelineRequest(),
    current_user: dict = Depends(get_current_user),
):
    """Run the full pipeline: OCR -> Excel -> Result (NO merge)."""
    # ``body`` is part of the endpoint signature but is not read here: the
    # pipeline always walks every image in input/ and every workbook in
    # output/.
    manager = _get_task_manager(request)
    job = manager.create_task("一键全流程处理")

    def _execute():
        try:
            # ---- Step 1: OCR (progress 0-40) ----
            manager.update_progress(job.id, 0, "步骤 1/2: OCR识别")
            manager.add_log(job.id, "[Pipeline] 开始OCR识别")
            from app.services.ocr_service import OCRService
            ocr = OCRService()

            image_exts = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif'}
            pictures = _list_input_files(filter_ext=list(image_exts))
            picture_count = max(len(pictures), 1)
            for index, picture in enumerate(pictures):
                progress = int((index / picture_count) * 40)

                # Possible OCR outputs for this image, .xlsx preferred.
                outputs = (
                    _output_dir / f"{picture.stem}.xlsx",
                    _output_dir / f"{picture.stem}.xls",
                )
                existing = next((path for path in outputs if path.exists()), None)
                if existing is not None:
                    # Already recognised earlier: just refresh the relation.
                    manager.add_log(job.id, f"[跳过] {picture.name} 已OCR过 → {existing.name}")
                    upsert_file_relation(input_image=picture.name, output_excel=existing.name, status='ocr_done')
                    manager.update_progress(job.id, progress, f"跳过: {picture.name}")
                else:
                    manager.update_progress(job.id, progress, f"OCR: {picture.name}")
                    # One bad image is logged and does not stop the run.
                    try:
                        ocr.process_image(str(picture))
                        produced = next((path for path in outputs if path.exists()), None)
                        if produced is not None:
                            upsert_file_relation(input_image=picture.name, output_excel=produced.name, status='ocr_done')
                        manager.add_log(job.id, f"[OCR] 完成: {picture.name}")
                    except Exception as exc:
                        manager.add_log(job.id, f"[OCR] 失败: {picture.name} - {exc}")

            # ---- Step 2: Excel conversion (progress 45-100) ----
            manager.update_progress(job.id, 45, "步骤 2/2: Excel标准化")
            manager.add_log(job.id, "[Pipeline] 开始Excel处理")
            from app.services.order_service import OrderService
            orders = OrderService()

            workbooks = [*_output_dir.glob("*.xls"), *_output_dir.glob("*.xlsx")]
            workbook_count = max(len(workbooks), 1)
            for index, workbook in enumerate(workbooks):
                progress = 45 + int((index / workbook_count) * 55)

                purchase_name = f"采购单_{workbook.stem}.xls"
                purchase_path = _result_dir / purchase_name
                if purchase_path.exists():
                    # Purchase order already generated: refresh the relation.
                    manager.add_log(job.id, f"[跳过] {workbook.name} 已处理过 → {purchase_name}")
                    upsert_file_relation(output_excel=workbook.name, result_purchase=purchase_name, status='done')
                    manager.update_progress(job.id, progress, f"跳过: {workbook.name}")
                else:
                    manager.update_progress(job.id, progress, f"Excel: {workbook.name}")
                    # One bad workbook is logged and does not stop the run.
                    try:
                        orders.process_excel(str(workbook))
                        if purchase_path.exists():
                            upsert_file_relation(output_excel=workbook.name, result_purchase=purchase_name, status='done')
                        manager.add_log(job.id, f"[Excel] 完成: {workbook.name}")
                    except Exception as exc:
                        manager.add_log(job.id, f"[Excel] 失败: {workbook.name} - {exc}")

            produced_names = [entry.name for entry in _result_dir.iterdir() if entry.is_file()]
            manager.set_completed(job.id, result_files=produced_names, message="全流程处理完成（不含合并）")
        except Exception as exc:
            # Anything outside the per-file handlers fails the whole task and
            # leaves the traceback in the task log.
            trace_text = traceback.format_exc()
            manager.add_log(job.id, f"[错误] {trace_text}")
            manager.set_failed(job.id, str(exc))

    async def _runner():
        await _wrapper.run_sync(_execute)

    _run_background(_runner())
    return TaskResponse(task_id=job.id, status="accepted", message="全流程任务已创建")


# ---------------------------------------------------------------------------
# Single-file endpoints
# ---------------------------------------------------------------------------

@router.post("/ocr-single", response_model=TaskResponse)
async def ocr_single(
    request: Request,
    body: SingleFileRequest,
    current_user: dict = Depends(get_current_user),
):
    """OCR a single image file.

    ``body.filename`` is resolved relative to the input directory. The work
    runs in the background; the response carries the id of the tracking task.

    Raises:
        HTTPException: 404 when the name is absolute, contains "..", or does
            not refer to an existing file.
    """
    tm = _get_task_manager(request)

    # The name comes from the client: reject absolute paths and ".." so the
    # request cannot reach files outside input/.
    requested = Path(body.filename)
    file_path = _input_dir / requested
    if requested.anchor or ".." in requested.parts or not file_path.is_file():
        raise HTTPException(404, f"文件不存在: {body.filename}")

    # Create the task only after validation so that a rejected request does
    # not leave behind a task that never starts.
    task = tm.create_task(f"OCR: {body.filename}")

    async def _bg():
        def do_work():
            # Import and construction live inside the try so that a failure
            # there marks the task failed instead of leaving it hanging.
            try:
                from app.services.ocr_service import OCRService
                svc = OCRService()
                tm.update_progress(task.id, 10, f"正在识别: {body.filename}")
                tm.add_log(task.id, f"[OCR] 处理 {body.filename}")
                svc.process_image(str(file_path))
                # Record whichever output file the service produced.
                stem = file_path.stem
                for ext in ['.xlsx', '.xls']:
                    candidate = _output_dir / f"{stem}{ext}"
                    if candidate.exists():
                        upsert_file_relation(input_image=body.filename, output_excel=candidate.name, status='ocr_done')
                        break
                tm.add_log(task.id, f"[OCR] 完成: {body.filename}")
                result_files = [f.name for f in _output_dir.iterdir() if f.is_file()]
                tm.set_completed(task.id, result_files=result_files, message=f"OCR完成: {body.filename}")
            except Exception as e:
                tm.add_log(task.id, f"[OCR] 失败: {e}")
                tm.set_failed(task.id, str(e))

        await _wrapper.run_sync(do_work)

    _run_background(_bg())
    return TaskResponse(task_id=task.id, status="accepted", message=f"OCR任务已创建: {body.filename}")


@router.post("/excel-single", response_model=TaskResponse)
async def excel_single(
    request: Request,
    body: SingleFileRequest,
    current_user: dict = Depends(get_current_user),
):
    """Process a single Excel file to purchase order.

    ``body.filename`` is resolved relative to the output directory. The work
    runs in the background; the response carries the id of the tracking task.

    Raises:
        HTTPException: 404 when the name is absolute, contains "..", or does
            not refer to an existing file.
    """
    tm = _get_task_manager(request)

    # The name comes from the client: reject absolute paths and ".." so the
    # request cannot reach files outside output/.
    requested = Path(body.filename)
    file_path = _output_dir / requested
    if requested.anchor or ".." in requested.parts or not file_path.is_file():
        raise HTTPException(404, f"文件不存在: {body.filename}")

    # Create the task only after validation so that a rejected request does
    # not leave behind a task that never starts.
    task = tm.create_task(f"Excel处理: {body.filename}")

    async def _bg():
        def do_work():
            # Import and construction live inside the try so that a failure
            # there marks the task failed instead of leaving it hanging.
            try:
                from app.services.order_service import OrderService
                svc = OrderService()
                tm.update_progress(task.id, 10, f"正在处理: {body.filename}")
                tm.add_log(task.id, f"[Excel] 处理 {body.filename}")
                svc.process_excel(str(file_path))
                # Only record the relation if the result file was written.
                result_name = f"采购单_{file_path.stem}.xls"
                if (_result_dir / result_name).exists():
                    upsert_file_relation(output_excel=body.filename, result_purchase=result_name, status='done')
                tm.add_log(task.id, f"[Excel] 完成: {body.filename}")
                result_files = [f.name for f in _result_dir.iterdir() if f.is_file()]
                tm.set_completed(task.id, result_files=result_files, message=f"Excel处理完成: {body.filename}")
            except Exception as e:
                tm.add_log(task.id, f"[Excel] 失败: {e}")
                tm.set_failed(task.id, str(e))

        await _wrapper.run_sync(do_work)

    _run_background(_bg())
    return TaskResponse(task_id=task.id, status="accepted", message=f"Excel处理任务已创建: {body.filename}")


@router.post("/pipeline-single", response_model=TaskResponse)
async def pipeline_single(
    request: Request,
    body: SingleFileRequest,
    current_user: dict = Depends(get_current_user),
):
    """Full pipeline for a single image: OCR -> Excel -> Result (no merge).

    ``body.filename`` is resolved relative to the input directory. Each step
    is skipped when its output file already exists. The task is marked failed
    when OCR leaves no Excel file to convert.

    Raises:
        HTTPException: 404 when the name is absolute, contains "..", or does
            not refer to an existing file.
    """
    tm = _get_task_manager(request)

    # The name comes from the client: reject absolute paths and ".." so the
    # request cannot reach files outside input/.
    requested = Path(body.filename)
    file_path = _input_dir / requested
    if requested.anchor or ".." in requested.parts or not file_path.is_file():
        raise HTTPException(404, f"文件不存在: {body.filename}")

    # Create the task only after validation so that a rejected request does
    # not leave behind a task that never starts.
    task = tm.create_task(f"全流程: {body.filename}")

    async def _bg():
        def do_work():
            try:
                stem = file_path.stem

                # Step 1: OCR
                tm.update_progress(task.id, 10, "步骤 1/2: OCR识别")
                tm.add_log(task.id, f"[Pipeline] OCR: {body.filename}")
                from app.services.ocr_service import OCRService
                ocr_svc = OCRService()

                out_xlsx = _output_dir / f"{stem}.xlsx"
                out_xls = _output_dir / f"{stem}.xls"
                if out_xlsx.exists() or out_xls.exists():
                    out_name = out_xlsx.name if out_xlsx.exists() else out_xls.name
                    tm.add_log(task.id, f"[跳过] 已OCR过 → {out_name}")
                    upsert_file_relation(input_image=body.filename, output_excel=out_name, status='ocr_done')
                else:
                    ocr_svc.process_image(str(file_path))
                    for ext in ['.xlsx', '.xls']:
                        candidate = _output_dir / f"{stem}{ext}"
                        if candidate.exists():
                            upsert_file_relation(input_image=body.filename, output_excel=candidate.name, status='ocr_done')
                            break
                    tm.add_log(task.id, f"[OCR] 完成")

                # The OCR output that actually exists now (.xlsx preferred),
                # or None when step 1 produced nothing.
                excel_file = out_xlsx if out_xlsx.exists() else (out_xls if out_xls.exists() else None)

                # Step 2: Excel
                tm.update_progress(task.id, 50, "步骤 2/2: Excel处理")
                tm.add_log(task.id, f"[Pipeline] Excel处理")
                from app.services.order_service import OrderService
                order_svc = OrderService()

                result_name = f"采购单_{stem}.xls"
                result_path = _result_dir / result_name
                if result_path.exists():
                    tm.add_log(task.id, f"[跳过] 已处理过 → {result_name}")
                    # Record the real OCR output name (it may be .xls); fall
                    # back to the .xlsx name only when no output is on disk.
                    recorded_name = excel_file.name if excel_file else f"{stem}.xlsx"
                    upsert_file_relation(output_excel=recorded_name, result_purchase=result_name, status='done')
                elif excel_file:
                    order_svc.process_excel(str(excel_file))
                    if result_path.exists():
                        upsert_file_relation(output_excel=excel_file.name, result_purchase=result_name, status='done')
                    tm.add_log(task.id, f"[Excel] 完成")
                else:
                    # Nothing to convert: report failure, not a successful
                    # completion.
                    tm.add_log(task.id, f"[错误] OCR未生成Excel文件")
                    tm.set_failed(task.id, "OCR未生成Excel文件")
                    return

                result_files = [f.name for f in _result_dir.iterdir() if f.is_file()]
                tm.set_completed(task.id, result_files=result_files, message=f"全流程完成: {body.filename}")
            except Exception as e:
                tb = traceback.format_exc()
                tm.add_log(task.id, f"[错误] {tb}")
                tm.set_failed(task.id, str(e))

        await _wrapper.run_sync(do_work)

    _run_background(_bg())
    return TaskResponse(task_id=task.id, status="accepted", message=f"全流程任务已创建: {body.filename}")


@router.post("/merge-batch", response_model=TaskResponse)
async def merge_batch(
    request: Request,
    body: MergeBatchRequest,
    current_user: dict = Depends(get_current_user),
):
    """Merge selected purchase order files into one PosPal template.

    Only the files named in ``body.filenames`` (relative to the result
    directory) are merged; names that do not refer to an existing file are
    ignored.

    Raises:
        HTTPException: 400 when no named file is left to merge.
    """
    tm = _get_task_manager(request)

    # Names come from the client: ignore absolute paths and anything
    # containing ".." so the request cannot reach outside result/.
    requested = (Path(name) for name in body.filenames)
    file_paths = [
        _result_dir / rel
        for rel in requested
        if not rel.anchor and ".." not in rel.parts and (_result_dir / rel).is_file()
    ]
    if not file_paths:
        raise HTTPException(400, "没有找到可合并的采购单文件")

    # Create the task only after validation so that a rejected request does
    # not leave behind a task that never starts.
    task = tm.create_task("批量合并采购单")

    async def _bg():
        def do_work():
            # Imports live inside the try so that an import failure marks the
            # task failed instead of leaving it without a final state.
            try:
                from app.core.excel.merger import PurchaseOrderMerger
                from app.config.settings import ConfigManager
                tm.update_progress(task.id, 20, f"正在合并 {len(file_paths)} 个采购单...")
                tm.add_log(task.id, f"[合并] 合并文件: {', '.join(f.name for f in file_paths)}")
                merger = PurchaseOrderMerger(ConfigManager())
                result = merger.process([str(f) for f in file_paths])
                if result:
                    merged_name = Path(result).name
                    upsert_file_relation(result_purchase=merged_name, status='merged')
                    tm.add_log(task.id, f"[合并] 完成: {merged_name}")
                    tm.set_completed(task.id, result_files=[merged_name], message="批量合并完成")
                else:
                    tm.set_failed(task.id, "合并返回空结果")
            except Exception as e:
                tm.add_log(task.id, f"[合并] 失败: {e}")
                tm.set_failed(task.id, str(e))

        await _wrapper.run_sync(do_work)

    _run_background(_bg())
    return TaskResponse(task_id=task.id, status="accepted", message="批量合并任务已创建")


# ---------------------------------------------------------------------------
# Status endpoint
# ---------------------------------------------------------------------------

@router.get("/status/{task_id}")
async def get_task_status(
    task_id: str,
    request: Request,
    current_user: dict = Depends(get_current_user),
):
    # Look the task up in the task manager attached to the request and return
    # its dict form; an unknown id yields a 404.
    found = _get_task_manager(request).get_task(task_id)
    if found:
        return found.to_dict()
    raise HTTPException(404, "任务不存在")