feat: add async retry mechanism with exponential backoff

- Add app/utils/retry.py with configurable async retry decorator
- Update DeliveryLog model to track attempt_count and latency_seconds
- Apply @http_retry to engine._exec_forward and _exec_notify methods
- Update save_logs to record retry metadata
- Add comprehensive unit tests for retry functionality
- Support configuration via environment variables (RETRY_*)

This improves reliability for downstream HTTP calls by automatically
retrying transient failures with exponential backoff and jitter.
This commit is contained in:
auto-bot
2025-12-24 11:04:41 +08:00
parent 0def77dc30
commit b11c39f3bf
5 changed files with 269 additions and 33 deletions
+40 -30
View File
@@ -1,8 +1,9 @@
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Tuple
import asyncio
import re
from app.db import SessionLocal, ProcessingRule, RuleAction, Target, NotificationChannel, MessageTemplate
from app.logging import get_logger
from app.utils.retry import http_retry
from app.templates import safe_render
logger = get_logger("engine")
@@ -83,8 +84,8 @@ class RuleEngine:
for action in rule.actions:
if action.action_type == 'forward' and action.target:
t_dict = {"name": action.target.name, "url": action.target.url, "timeout_ms": action.target.timeout_ms}
tasks.append(self._exec_forward(t_dict, payload))
tasks.append(self._wrap_retry_task(self._exec_forward, t_dict, payload, action=action))
elif action.action_type == 'notify':
# Check if we have a valid channel
if action.channel:
@@ -94,18 +95,18 @@ class RuleEngine:
template_content = action.template.template_content
else:
template_content = current_context.get("template_content")
if template_content:
try:
# Flatten payload + merge current context vars
render_context = self._flatten_payload(payload)
render_context.update(current_context["vars"])
# Use safe Jinja2 rendering (supports legacy {var} by conversion)
msg = safe_render(template_content, render_context)
c_dict = {"channel": action.channel.channel_type, "url": action.channel.webhook_url}
tasks.append(self._exec_notify(c_dict, msg))
tasks.append(self._wrap_retry_task(self._exec_notify, c_dict, msg, action=action))
except Exception as e:
logger.exception(f"Template render failed for action {action.id}: {e}")
tasks.append(self._return_error("notify", action.channel.name, str(e)))
@@ -199,30 +200,39 @@ class RuleEngine:
return out
async def _exec_forward(self, target: dict, payload: dict):
    """POST ``payload`` as JSON to a forward target and report the outcome.

    Args:
        target: Mapping with ``name`` and ``url`` keys and an optional
            ``timeout_ms`` entry (milliseconds).
        payload: Body sent as the JSON request payload.

    Returns:
        dict: ``{"type": "forward", "target": <name>, "ok": True}`` when the
        request succeeds with a non-error status, otherwise the same shape
        with ``"ok": False`` and an ``"error"`` string.
    """
    try:
        import httpx

        # The caller builds ``target`` with the ``timeout_ms`` key always
        # present, so ``.get('timeout_ms', 5000)`` would not fall back when the
        # stored value is None and ``None / 1000`` would raise TypeError.
        timeout_ms = target.get('timeout_ms')
        if timeout_ms is None:
            timeout_ms = 5000

        async with httpx.AsyncClient() as client:
            resp = await client.post(target['url'], json=payload, timeout=timeout_ms / 1000)
            # Treat 4xx/5xx responses as failures.
            resp.raise_for_status()
            return {"type": "forward", "target": target['name'], "ok": True}
    except Exception as e:
        # Best-effort delivery: failures are reported in the result, not raised.
        return {"type": "forward", "target": target['name'], "ok": False, "error": str(e)}
@http_retry()
async def _exec_forward(self, target: dict, payload: dict) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """POST ``payload`` as JSON to a forward target.

    Nothing is caught here: request errors and error statuses propagate out
    of this coroutine to the ``http_retry`` wrapper (its retry policy lives in
    ``app.utils.retry`` and is not visible from this block).

    Args:
        target: Mapping with ``name`` and ``url`` keys and an optional
            ``timeout_ms`` entry (milliseconds).
        payload: Body sent as the JSON request payload.

    Returns:
        A ``(result_dict, retry_metadata)`` pair. This body always returns an
        empty metadata dict; presumably the decorator fills it in -- confirm
        against ``app.utils.retry``.
    """
    import httpx

    # The caller builds ``target`` with the ``timeout_ms`` key always present,
    # so ``.get('timeout_ms', 5000)`` would not fall back when the stored value
    # is None, and ``None / 1000`` would raise a TypeError on every attempt.
    timeout_ms = target.get('timeout_ms')
    if timeout_ms is None:
        timeout_ms = 5000

    async with httpx.AsyncClient() as client:
        resp = await client.post(target['url'], json=payload, timeout=timeout_ms / 1000)
        # Treat 4xx/5xx responses as failures.
        resp.raise_for_status()
        return {"type": "forward", "target": target['name'], "ok": True}, {}
async def _exec_notify(self, channel: dict, msg: str):
    """Send ``msg`` to a notification channel and report the outcome.

    Args:
        channel: Mapping with a ``channel`` entry (``'feishu'`` or
            ``'wecom'``) and a ``url`` entry (the webhook URL).
        msg: Already-rendered message text.

    Returns:
        dict: ``{"type": "notify", "channel": <type>, "ok": True}`` after a
        successful send, otherwise the same shape with ``"ok": False`` and an
        ``"error"`` string.
    """
    channel_type = channel.get('channel')
    url = channel.get('url')

    # An unknown channel type used to fall through both branches and be
    # reported as a success although nothing was sent.
    if channel_type not in ('feishu', 'wecom'):
        return {
            "type": "notify",
            "channel": channel_type,
            "ok": False,
            "error": f"Unsupported channel type: {channel_type!r}",
        }

    try:
        from app.services.notify import send_feishu, send_wecom

        if channel_type == 'feishu':
            await send_feishu(url, msg)
        else:
            await send_wecom(url, msg)
        return {"type": "notify", "channel": channel_type, "ok": True}
    except Exception as e:
        # Best-effort delivery: log the traceback and report the failure.
        logger.exception(f"Notification failed for {channel.get('channel')}: {e}")
        return {"type": "notify", "channel": channel.get('channel'), "ok": False, "error": str(e)}
@http_retry()
async def _exec_notify(self, channel: dict, msg: str) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Send ``msg`` to a notification channel.

    Exceptions raised by the sender functions are not caught here, so they
    propagate to the ``http_retry`` wrapper (its retry policy lives in
    ``app.utils.retry`` and is not visible from this block).

    Args:
        channel: Mapping with a ``channel`` entry (``'feishu'`` or
            ``'wecom'``) and a ``url`` entry (the webhook URL).
        msg: Already-rendered message text.

    Returns:
        A ``(result_dict, retry_metadata)`` pair. This body always returns an
        empty metadata dict; presumably the decorator fills it in -- confirm
        against ``app.utils.retry``.
    """
    channel_type = channel.get('channel')
    url = channel.get('url')

    # An unknown channel type used to fall through both branches and be
    # reported as a success although nothing was sent. It is returned as a
    # failure (not raised) because retrying cannot fix a configuration error.
    if channel_type not in ('feishu', 'wecom'):
        return {
            "type": "notify",
            "channel": channel_type,
            "ok": False,
            "error": f"Unsupported channel type: {channel_type!r}",
        }, {}

    from app.services.notify import send_feishu, send_wecom

    if channel_type == 'feishu':
        await send_feishu(url, msg)
    else:
        await send_wecom(url, msg)
    return {"type": "notify", "channel": channel_type, "ok": True}, {}
async def _wrap_retry_task(self, func, *args, **kwargs):
    """Await a retry-enabled executor and flatten its output into one result dict.

    ``func`` is awaited with ``*args`` and ``**kwargs`` and is expected to
    return a ``(result_dict, retry_metadata)`` pair. Non-empty metadata is
    copied onto the result under ``_retry_attempts`` and ``_retry_latency``.

    If ``func`` raises, the exception is converted into a failure dict in the
    same shape the executors use for success, so one failed action is reported
    as ``"ok": False`` instead of escaping to whoever awaits the task.

    Args:
        func: Async callable returning ``(result_dict, retry_metadata)``.
        *args: Positional arguments forwarded to ``func``; the first one is
            expected to be the target dict (``name`` key) or the channel dict
            (``channel`` key) and is used to label a failure result.
        **kwargs: Keyword arguments forwarded to ``func``, except ``action``,
            which is dropped.

    Returns:
        dict: The executor's result dict, optionally extended with retry
        metadata, or a failure dict with ``"ok": False`` and ``"error"``.
    """
    # ``action`` is caller-side context only and must not reach ``func``.
    kwargs.pop('action', None)

    try:
        outcome = await func(*args, **kwargs)
    except Exception as e:
        # Label the failure from the dict the executor was called with:
        # channel dicts carry a 'channel' key, target dicts carry 'name'.
        subject = args[0] if args and isinstance(args[0], dict) else {}
        if 'channel' in subject:
            return {"type": "notify", "channel": subject.get('channel'), "ok": False, "error": str(e)}
        return {"type": "forward", "target": subject.get('name'), "ok": False, "error": str(e)}

    # Unpack outside the try block so a malformed return value still surfaces
    # as a programming error rather than being reported as a delivery failure.
    result, metadata = outcome

    # Add retry metadata to the result dict for logging.
    if metadata:
        result['_retry_attempts'] = metadata.get('attempts', 1)
        result['_retry_latency'] = metadata.get('total_latency', 0.0)
    return result
async def _return_error(self, type_str, name, err):
    """Build a failure result dict without performing any work.

    The subject is stored under ``"target"`` when ``type_str`` is
    ``'forward'`` and under ``"channel"`` for every other type.

    Args:
        type_str: Action type placed in the ``"type"`` entry.
        name: Name of the target or channel the failure refers to.
        err: Error description placed in the ``"error"`` entry.

    Returns:
        dict: ``{"type": ..., "target" | "channel": ..., "ok": False, "error": ...}``.
    """
    subject_key = "channel"
    if type_str == 'forward':
        subject_key = "target"

    failure = {"type": type_str}
    failure[subject_key] = name
    failure["ok"] = False
    failure["error"] = err
    return failure