WebhockTransfer/app/health.py
auto-bot 6f4793a330 feat: add comprehensive health checks and Prometheus metrics
- Add app/health.py with HealthChecker and MetricsCollector classes
- Implement composite /health endpoint checking DB and HTTP client
- Add /metrics endpoint with Prometheus exposition format
- Add webhook request metrics (counters, histograms, gauges)
- Create app/http_client.py for shared AsyncClient management
- Update app/main.py lifespan to init/close HTTP client
- Add comprehensive tests in tests/test_health.py

This enables proper observability with health checks and metrics
for monitoring system status and webhook processing performance.
2025-12-24 11:10:54 +08:00

212 lines
7.1 KiB
Python

"""
Health check and metrics collection utilities for observability.
"""
import time
from typing import Dict, Any
from app.db import SessionLocal, engine
from app.http_client import get_http_client
from app.logging import get_logger
# Module-level logger; the health probes below use it to report failures.
logger = get_logger(__name__)
class HealthChecker:
    """Composite health checker for the database and the HTTP client.

    Each probe remembers its last outcome and reuses it for
    ``_check_interval`` seconds, so frequent polling does not run a real
    probe on every call.
    """

    def __init__(self):
        # Timestamps start at 0 so the first call to each probe is never
        # answered from cache.
        self._last_db_check = 0
        self._last_http_check = 0
        # Last observed outcomes; False until a probe has succeeded.
        self._db_status = False
        self._http_status = False
        self._check_interval = 30  # seconds a probe result stays cached

    @staticmethod
    def _cached_result(healthy: bool, checked_at: float) -> Dict[str, Any]:
        """Build the response for a probe answered from cache."""
        return {
            "status": "ok" if healthy else "error",
            "cached": True,
            "last_check": checked_at,
        }

    @staticmethod
    def _fresh_result(checked_at: float, error: str = None) -> Dict[str, Any]:
        """Build the response for a probe that was just executed.

        Args:
            checked_at: Wall-clock time (``time.time()``) of the probe.
            error: Failure description, or ``None`` when the probe passed.
        """
        if error is None:
            return {"status": "ok", "cached": False, "last_check": checked_at}
        return {
            "status": "error",
            "error": error,
            "cached": False,
            "last_check": checked_at,
        }

    async def check_database(self) -> Dict[str, Any]:
        """Check database connectivity by running ``SELECT 1``.

        Returns:
            A dict with ``status`` ("ok"/"error"), ``cached`` and
            ``last_check``; failed fresh probes also carry ``error``.
            Failures are logged and reported, never raised.
        """
        now = time.time()
        if now - self._last_db_check < self._check_interval:
            return self._cached_result(self._db_status, self._last_db_check)

        try:
            db = SessionLocal()
            try:
                # exec_driver_sql accepts a plain SQL string; passing a raw
                # string to Session.execute() is rejected by SQLAlchemy 2.x.
                # NOTE(review): assumes SessionLocal yields a synchronous
                # SQLAlchemy (>= 1.4) Session - confirm against app.db.
                # NOTE(review): this call is synchronous and blocks the event
                # loop while the query runs.
                db.connection().exec_driver_sql("SELECT 1")
            finally:
                # Always release the session, even when the query fails.
                db.close()
        except Exception as e:
            logger.error(f"Database health check failed: {e}")
            self._db_status = False
            self._last_db_check = now
            return self._fresh_result(now, error=str(e))

        self._db_status = True
        self._last_db_check = now
        return self._fresh_result(now)

    async def check_http_client(self) -> Dict[str, Any]:
        """Check HTTP client availability.

        Returns:
            A dict with ``status`` ("ok"/"error"), ``cached`` and
            ``last_check``; failed fresh probes also carry ``error``.
            Failures are logged and reported, never raised.
        """
        now = time.time()
        if now - self._last_http_check < self._check_interval:
            return self._cached_result(self._http_status, self._last_http_check)

        try:
            if get_http_client() is None:
                # No shared client is available: fall back to verifying that
                # a temporary client can be created and closed cleanly.
                import httpx

                async with httpx.AsyncClient(timeout=5):
                    pass
        except Exception as e:
            logger.error(f"HTTP client health check failed: {e}")
            self._http_status = False
            self._last_http_check = now
            return self._fresh_result(now, error=str(e))

        self._http_status = True
        self._last_http_check = now
        return self._fresh_result(now)

    async def check_overall(self) -> Dict[str, Any]:
        """Run every component probe and combine them into one report.

        Returns:
            A dict with the overall ``status`` ("ok" only if every component
            is "ok"), a ``timestamp``, the per-component ``checks``, and the
            static ``version`` and ``service`` identifiers.
        """
        checks = {
            "database": await self.check_database(),
            "http_client": await self.check_http_client(),
        }
        all_ok = all(result["status"] == "ok" for result in checks.values())
        return {
            "status": "ok" if all_ok else "error",
            "timestamp": time.time(),
            "checks": checks,
            "version": "2.2.0",  # From dashboard
            "service": "webhook-relay",
        }
# Global health checker instance, created at import time; code that imports
# this name shares its cached probe results.
health_checker = HealthChecker()
class MetricsCollector:
    """Simple in-process metrics registry rendered in Prometheus text format.

    Series live in ``_metrics`` keyed by ``name`` or ``name{k=v,...}``
    (labels sorted by key). Each entry stores its ``type`` and current
    ``value`` together with the metric ``name`` and ``labels`` dict, so the
    exposition output never has to be re-parsed from the key.
    """

    # HELP text emitted for metric names this service is known to produce.
    _HELP_TEXT = {
        "webhook_requests_total": "Total number of webhook requests processed",
        "webhook_processing_duration_seconds": "Time spent processing webhooks",
        "delivery_attempts_total": "Total number of delivery attempts",
        "delivery_failures_total": "Total number of delivery failures",
        "uptime_seconds": "Service uptime in seconds",
    }

    def __init__(self):
        self._metrics = {}
        # Reference point for the uptime_seconds gauge.
        self._start_time = time.time()

    @staticmethod
    def _series_key(name: str, labels) -> str:
        """Return the storage key identifying one (name, labels) series."""
        if not labels:
            return name
        label_str = ",".join(f"{k}={v}" for k, v in sorted(labels.items()))
        return f"{name}{{{label_str}}}"

    @staticmethod
    def _new_entry(metric_type: str, name: str, labels, value) -> Dict[str, Any]:
        """Create a registry entry; labels are copied to avoid aliasing."""
        return {
            "type": metric_type,
            "value": value,
            "name": name,
            "labels": dict(labels) if labels else {},
        }

    @staticmethod
    def _escape_label_value(value) -> str:
        """Escape backslash, double quote and newline for a label value."""
        return (
            str(value)
            .replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("\n", "\\n")
        )

    @classmethod
    def _format_labels(cls, labels, extra=()) -> str:
        """Render ``{k="v",...}`` (or an empty string when there are none).

        ``extra`` is a sequence of (name, value) pairs appended after the
        sorted series labels, e.g. the ``le`` label of a histogram bucket.
        """
        pairs = sorted(labels.items()) + list(extra)
        if not pairs:
            return ""
        rendered = ",".join(
            f'{k}="{cls._escape_label_value(v)}"' for k, v in pairs
        )
        return "{" + rendered + "}"

    @classmethod
    def _render_series(cls, name: str, entry: Dict[str, Any]) -> list:
        """Return the sample lines for a single series."""
        labels = entry.get("labels") or {}
        if entry["type"] != "histogram":
            return [f"{name}{cls._format_labels(labels)} {entry['value']}"]
        # A histogram must expose _bucket/_sum/_count samples; only the
        # mandatory +Inf bucket is tracked here.
        count = entry.get("count", 1)
        total = entry.get("sum", entry["value"])
        plain = cls._format_labels(labels)
        inf_bucket = cls._format_labels(labels, extra=(("le", "+Inf"),))
        return [
            f"{name}_bucket{inf_bucket} {count}",
            f"{name}_sum{plain} {total}",
            f"{name}_count{plain} {count}",
        ]

    def increment_counter(self, name: str, value: float = 1.0, labels: Dict[str, str] = None):
        """Increment a counter metric, creating it at 0.0 on first use.

        Args:
            name: Metric name.
            value: Amount to add (defaults to 1.0).
            labels: Optional label set identifying the series.
        """
        key = self._series_key(name, labels)
        entry = self._metrics.get(key)
        if entry is None:
            entry = self._metrics[key] = self._new_entry("counter", name, labels, 0.0)
        entry["value"] += value

    def set_gauge(self, name: str, value: float, labels: Dict[str, str] = None):
        """Set a gauge metric to ``value``, replacing any previous entry."""
        key = self._series_key(name, labels)
        self._metrics[key] = self._new_entry("gauge", name, labels, value)

    def observe_histogram(self, name: str, value: float, labels: Dict[str, str] = None):
        """Record one observation for a histogram metric.

        The entry keeps the last observed ``value`` and additionally
        accumulates ``count`` and ``sum`` so a valid (single-bucket)
        histogram can be exposed.
        """
        key = self._series_key(name, labels)
        previous = self._metrics.get(key)
        entry = self._new_entry("histogram", name, labels, value)
        if previous is not None and previous.get("type") == "histogram":
            entry["count"] = previous.get("count", 0) + 1
            entry["sum"] = previous.get("sum", 0.0) + value
        else:
            entry["count"] = 1
            entry["sum"] = value
        self._metrics[key] = entry

    def get_prometheus_format(self) -> str:
        """Generate Prometheus text exposition output.

        Returns:
            The ``uptime_seconds`` gauge followed by every registered metric.
            Series sharing a metric name are grouped under a single
            HELP/TYPE header, label values are quoted and escaped, and the
            output ends with a newline.
        """
        lines = [
            "# HELP uptime_seconds Service uptime in seconds",
            "# TYPE uptime_seconds gauge",
            f"uptime_seconds {time.time() - self._start_time}",
            "",
        ]

        # Group series by metric name (first-seen order) because the format
        # allows only one TYPE line per metric name.
        families = {}
        for key, entry in self._metrics.items():
            name = entry.get("name") or key.split("{", 1)[0]
            families.setdefault(name, []).append(entry)

        for name, series in families.items():
            help_text = self._HELP_TEXT.get(name)
            if help_text:
                lines.append(f"# HELP {name} {help_text}")
            lines.append(f"# TYPE {name} {series[0]['type']}")
            for entry in series:
                lines.extend(self._render_series(name, entry))
            lines.append("")  # Empty line between metrics

        return "\n".join(lines)
# Global metrics collector instance; its uptime reference time is captured
# when this module is imported.
metrics_collector = MetricsCollector()