WebhockTransfer/app/main.py
auto-bot 6f4793a330 feat: add comprehensive health checks and Prometheus metrics
- Add app/health.py with HealthChecker and MetricsCollector classes
- Implement composite /health endpoint checking DB and HTTP client
- Add /metrics endpoint with Prometheus exposition format
- Add webhook request metrics (counters, histograms, gauges)
- Create app/http_client.py for shared AsyncClient management
- Update app/main.py lifespan to init/close HTTP client
- Add comprehensive tests in tests/test_health.py

This enables proper observability with health checks and metrics
for monitoring system status and webhook processing performance.
2025-12-24 11:10:54 +08:00

142 lines
5.2 KiB
Python

import asyncio
import time
from contextlib import asynccontextmanager

from fastapi import FastAPI, Request, BackgroundTasks, Body
from fastapi.responses import JSONResponse

from app.logging import get_logger
from app.admin import router as admin_router
from app.db import SessionLocal, WebhookEndpoint, RequestLog, DeliveryLog, init_db
from app.services.engine import engine
from app.models import IncomingPayload
from app.health import health_checker, metrics_collector
from app.http_client import init_http_client, close_http_client
logger = get_logger("app")
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook.

    On startup, initializes the database schema and the shared async HTTP
    client; on shutdown, closes the HTTP client even if the app exited with
    an error.
    """
    logger.info("Initializing database...")
    init_db()
    logger.info("Initializing http client...")
    await init_http_client()
    try:
        yield  # application serves requests here
    finally:
        # Always release the shared HTTP client, even on abnormal exit.
        logger.info("Shutting down http client...")
        await close_http_client()
# FastAPI application instance; `lifespan` manages DB/HTTP-client setup and teardown.
app = FastAPI(lifespan=lifespan)
# Mount the admin API routes alongside the webhook endpoints.
app.include_router(admin_router)
def save_logs(namespace: str, payload_dict: dict, routed: list, notified: list):
    """Persist a processed webhook request and its per-target delivery results.

    Runs as a FastAPI background task, so any failure is logged and swallowed —
    it must never affect the HTTP response that was already sent.

    Args:
        namespace: Webhook namespace the request arrived on.
        payload_dict: Validated request body as a plain dict.
        routed: Relay results from the engine (dicts with "target"/"ok"/"error"/
            "_retry_attempts"/"_retry_latency" keys).
        notified: Notification results from the engine (dicts with "channel"/"ok"/
            "error"/"_retry_attempts"/"_retry_latency" keys).
    """
    try:
        db = SessionLocal()
        try:
            # Parent row: one RequestLog per incoming webhook call.
            req_log = RequestLog(
                namespace=namespace,
                remark=str(payload_dict.get("remark", "")),
                event_no=str(payload_dict.get("event_define_no", "")),
                raw_body=payload_dict,
                status="success",
            )
            db.add(req_log)
            db.commit()
            db.refresh(req_log)  # populate req_log.id for the child rows below

            # Child rows: one DeliveryLog per relay target...
            for r in routed:
                db.add(DeliveryLog(
                    request_id=req_log.id,
                    target_name=r.get("target"),
                    type="relay",
                    status="success" if r.get("ok") else "failed",
                    response_summary=str(r.get("error") or "OK"),
                    attempt_count=r.get("_retry_attempts", 1),
                    latency_seconds=r.get("_retry_latency", 0.0),
                ))
            # ...and one per notification channel.
            for n in notified:
                db.add(DeliveryLog(
                    request_id=req_log.id,
                    target_name=n.get("channel"),
                    type="notify",
                    status="success" if n.get("ok") else "failed",
                    response_summary=str(n.get("error") or "OK"),
                    attempt_count=n.get("_retry_attempts", 1),
                    latency_seconds=n.get("_retry_latency", 0.0),
                ))
            db.commit()
        finally:
            # BUGFIX: the session previously leaked when any DB call above
            # raised, because close() was only reached on the happy path.
            db.close()
    except Exception as e:
        logger.error(f"Failed to save logs: {e}")
@app.get("/health")
async def health():
    """Composite health check: delegates to the shared HealthChecker."""
    overall = await health_checker.check_overall()
    return overall
@app.get("/metrics")
async def metrics():
    """Expose collected metrics in Prometheus exposition format."""
    exposition = metrics_collector.get_prometheus_format()
    return exposition
@app.post("/webhook/{namespace}")
async def webhook(namespace: str, payload: IncomingPayload = Body(...), background_tasks: BackgroundTasks = BackgroundTasks()):
    """Receive a webhook for *namespace*, fan it out via the engine, and record metrics.

    Looks up the endpoint configuration, rejects unknown (404) or inactive (403)
    namespaces, processes the payload through the delivery engine, records
    Prometheus-style counters/histograms, and schedules DB log persistence as a
    background task so the response is not blocked on it.

    NOTE(review): the `BackgroundTasks()` default is a mutable default argument;
    FastAPI injects a per-request BackgroundTasks for annotated parameters, but
    confirm the default is actually unused before removing it.

    Args:
        namespace: Path parameter selecting the configured WebhookEndpoint.
        payload: Request body, already validated by Pydantic.
        background_tasks: FastAPI-injected task queue for post-response work.

    Returns:
        JSONResponse with the namespace and the per-target routed/notified results.
    """
    start_time = time.time()

    # Endpoint lookup is short-lived; release the session before the
    # potentially slow fan-out below.
    db = SessionLocal()
    try:
        endpoint = db.query(WebhookEndpoint).filter(WebhookEndpoint.namespace == namespace).first()
        if not endpoint:
            metrics_collector.increment_counter("webhook_requests_total", labels={"status": "endpoint_not_found"})
            return JSONResponse({"error": "Endpoint not found"}, status_code=404)
        if not endpoint.is_active:
            metrics_collector.increment_counter("webhook_requests_total", labels={"status": "endpoint_inactive"})
            return JSONResponse({"error": "Endpoint inactive"}, status_code=403)
        # Capture the id while the ORM object is still session-bound.
        endpoint_id = endpoint.id
    finally:
        # BUGFIX: the session previously leaked if the query raised.
        db.close()

    # payload is validated by Pydantic; convert to a plain dict for the engine.
    try:
        body_dict = payload.model_dump()
    except Exception:
        metrics_collector.increment_counter("webhook_requests_total", labels={"status": "invalid_payload"})
        return JSONResponse({"error": "Invalid payload"}, status_code=400)

    # Fan out to relay targets and notification channels.
    routed, notified = await engine.process(endpoint_id, body_dict)

    # Request-level metrics: outcome counter and end-to-end processing time.
    metrics_collector.increment_counter("webhook_requests_total", labels={"status": "success"})
    metrics_collector.observe_histogram("webhook_processing_duration_seconds", time.time() - start_time)

    # Delivery-level metrics: total attempts (incl. retries) and hard failures.
    total_attempts = sum(r.get("_retry_attempts", 1) for r in routed) + sum(n.get("_retry_attempts", 1) for n in notified)
    total_failures = sum(1 for r in routed if not r.get("ok", False)) + sum(1 for n in notified if not n.get("ok", False))
    metrics_collector.increment_counter("delivery_attempts_total", total_attempts)
    metrics_collector.increment_counter("delivery_failures_total", total_failures)

    # Persist request/delivery logs after the response is sent.
    background_tasks.add_task(save_logs, namespace, body_dict, routed, notified)

    logger.info({"event": "webhook_processed", "namespace": namespace, "routed": len(routed), "notified": len(notified)})
    return JSONResponse({
        "namespace": namespace,
        "routed": routed,
        "notified": notified
    })