feat/retry-mechanism #1
@ -91,6 +91,8 @@ class DeliveryLog(Base):
|
|||||||
type = Column(String) # relay, notify
|
type = Column(String) # relay, notify
|
||||||
status = Column(String) # success, failed
|
status = Column(String) # success, failed
|
||||||
response_summary = Column(Text, nullable=True)
|
response_summary = Column(Text, nullable=True)
|
||||||
|
attempt_count = Column(Integer, default=1) # Number of attempts made
|
||||||
|
latency_seconds = Column(Float, nullable=True) # Total latency for all attempts
|
||||||
created_at = Column(DateTime, default=datetime.utcnow)
|
created_at = Column(DateTime, default=datetime.utcnow)
|
||||||
request_log = relationship("RequestLog", back_populates="delivery_logs")
|
request_log = relationship("RequestLog", back_populates="delivery_logs")
|
||||||
|
|
||||||
|
|||||||
@ -43,7 +43,9 @@ def save_logs(namespace: str, payload_dict: dict, routed: list, notified: list):
|
|||||||
target_name=r.get("target"),
|
target_name=r.get("target"),
|
||||||
type="relay",
|
type="relay",
|
||||||
status="success" if r.get("ok") else "failed",
|
status="success" if r.get("ok") else "failed",
|
||||||
response_summary=str(r.get("error") or "OK")
|
response_summary=str(r.get("error") or "OK"),
|
||||||
|
attempt_count=r.get("_retry_attempts", 1),
|
||||||
|
latency_seconds=r.get("_retry_latency", 0.0)
|
||||||
))
|
))
|
||||||
|
|
||||||
for n in notified:
|
for n in notified:
|
||||||
@ -52,7 +54,9 @@ def save_logs(namespace: str, payload_dict: dict, routed: list, notified: list):
|
|||||||
target_name=n.get("channel"),
|
target_name=n.get("channel"),
|
||||||
type="notify",
|
type="notify",
|
||||||
status="success" if n.get("ok") else "failed",
|
status="success" if n.get("ok") else "failed",
|
||||||
response_summary=str(n.get("error") or "OK")
|
response_summary=str(n.get("error") or "OK"),
|
||||||
|
attempt_count=n.get("_retry_attempts", 1),
|
||||||
|
latency_seconds=n.get("_retry_latency", 0.0)
|
||||||
))
|
))
|
||||||
|
|
||||||
db.commit()
|
db.commit()
|
||||||
|
|||||||
@ -1,8 +1,9 @@
|
|||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
import asyncio
|
import asyncio
|
||||||
import re
|
import re
|
||||||
from app.db import SessionLocal, ProcessingRule, RuleAction, Target, NotificationChannel, MessageTemplate
|
from app.db import SessionLocal, ProcessingRule, RuleAction, Target, NotificationChannel, MessageTemplate
|
||||||
from app.logging import get_logger
|
from app.logging import get_logger
|
||||||
|
from app.utils.retry import http_retry
|
||||||
from app.templates import safe_render
|
from app.templates import safe_render
|
||||||
|
|
||||||
logger = get_logger("engine")
|
logger = get_logger("engine")
|
||||||
@ -83,7 +84,7 @@ class RuleEngine:
|
|||||||
for action in rule.actions:
|
for action in rule.actions:
|
||||||
if action.action_type == 'forward' and action.target:
|
if action.action_type == 'forward' and action.target:
|
||||||
t_dict = {"name": action.target.name, "url": action.target.url, "timeout_ms": action.target.timeout_ms}
|
t_dict = {"name": action.target.name, "url": action.target.url, "timeout_ms": action.target.timeout_ms}
|
||||||
tasks.append(self._exec_forward(t_dict, payload))
|
tasks.append(self._wrap_retry_task(self._exec_forward, t_dict, payload, action=action))
|
||||||
|
|
||||||
elif action.action_type == 'notify':
|
elif action.action_type == 'notify':
|
||||||
# Check if we have a valid channel
|
# Check if we have a valid channel
|
||||||
@ -105,7 +106,7 @@ class RuleEngine:
|
|||||||
msg = safe_render(template_content, render_context)
|
msg = safe_render(template_content, render_context)
|
||||||
|
|
||||||
c_dict = {"channel": action.channel.channel_type, "url": action.channel.webhook_url}
|
c_dict = {"channel": action.channel.channel_type, "url": action.channel.webhook_url}
|
||||||
tasks.append(self._exec_notify(c_dict, msg))
|
tasks.append(self._wrap_retry_task(self._exec_notify, c_dict, msg, action=action))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.exception(f"Template render failed for action {action.id}: {e}")
|
logger.exception(f"Template render failed for action {action.id}: {e}")
|
||||||
tasks.append(self._return_error("notify", action.channel.name, str(e)))
|
tasks.append(self._return_error("notify", action.channel.name, str(e)))
|
||||||
@ -199,18 +200,18 @@ class RuleEngine:
|
|||||||
|
|
||||||
return out
|
return out
|
||||||
|
|
||||||
@http_retry()
async def _exec_forward(self, target: dict, payload: dict) -> Dict[str, Any]:
    """POST *payload* to the configured HTTP target (retried by @http_retry).

    Args:
        target: dict with 'name', 'url' and optional 'timeout_ms' (defaults 5000).
        payload: JSON-serializable body to forward.

    Note: the @http_retry decorator wraps the return value, so callers actually
    receive ``(result_dict_or_None, retry_metadata)`` — see app.utils.retry.
    """
    import httpx
    async with httpx.AsyncClient() as client:
        # timeout_ms is milliseconds; httpx expects seconds.
        resp = await client.post(target['url'], json=payload, timeout=target.get('timeout_ms', 5000) / 1000)
        resp.raise_for_status()
        # Return ONLY the result dict. Returning `(dict, {})` here gets
        # double-wrapped by @http_retry into ((dict, {}), meta), which breaks
        # the `result, metadata = await func(...)` unpacking in _wrap_retry_task.
        return {"type": "forward", "target": target['name'], "ok": True}
||||||
|
|
||||||
async def _exec_notify(self, channel: dict, msg: str):
|
@http_retry()
|
||||||
try:
|
async def _exec_notify(self, channel: dict, msg: str) -> Tuple[Dict[str, Any], Dict[str, Any]]:
|
||||||
|
"""Execute notify with retry logic. Returns (result_dict, retry_metadata)."""
|
||||||
from app.services.notify import send_feishu, send_wecom
|
from app.services.notify import send_feishu, send_wecom
|
||||||
channel_type = channel.get('channel')
|
channel_type = channel.get('channel')
|
||||||
url = channel.get('url')
|
url = channel.get('url')
|
||||||
@ -219,10 +220,19 @@ class RuleEngine:
|
|||||||
await send_feishu(url, msg)
|
await send_feishu(url, msg)
|
||||||
elif channel_type == 'wecom':
|
elif channel_type == 'wecom':
|
||||||
await send_wecom(url, msg)
|
await send_wecom(url, msg)
|
||||||
return {"type": "notify", "channel": channel_type, "ok": True}
|
return {"type": "notify", "channel": channel_type, "ok": True}, {}
|
||||||
except Exception as e:
|
|
||||||
logger.exception(f"Notification failed for {channel.get('channel')}: {e}")
|
async def _wrap_retry_task(self, func, *args, **kwargs):
|
||||||
return {"type": "notify", "channel": channel.get('channel'), "ok": False, "error": str(e)}
|
"""Wrap retry-enabled task to handle metadata and return standard result format."""
|
||||||
|
action = kwargs.pop('action', None) # Remove action from kwargs
|
||||||
|
result, metadata = await func(*args, **kwargs)
|
||||||
|
|
||||||
|
# Add retry metadata to result dict for logging
|
||||||
|
if metadata:
|
||||||
|
result['_retry_attempts'] = metadata.get('attempts', 1)
|
||||||
|
result['_retry_latency'] = metadata.get('total_latency', 0.0)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
async def _return_error(self, type_str, name, err):
|
async def _return_error(self, type_str, name, err):
|
||||||
return {"type": type_str, "target" if type_str == 'forward' else "channel": name, "ok": False, "error": err}
|
return {"type": type_str, "target" if type_str == 'forward' else "channel": name, "ok": False, "error": err}
|
||||||
|
|||||||
99
app/utils/retry.py
Normal file
99
app/utils/retry.py
Normal file
@ -0,0 +1,99 @@
|
|||||||
|
"""
|
||||||
|
Async retry decorator with exponential backoff and configurable parameters.
|
||||||
|
"""
|
||||||
|
import asyncio
import logging
import os
import random
import time
from functools import wraps
from typing import Any, Callable, Optional
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)


def async_retry(
    max_attempts: int = 3,
    initial_delay: float = 1.0,
    backoff_factor: float = 2.0,
    max_delay: float = 60.0,
    retry_on: tuple = (Exception,),
    jitter: bool = True
):
    """
    Decorator for async functions that implements exponential backoff retry logic.

    Args:
        max_attempts: Maximum number of attempts (including the initial call).
        initial_delay: Delay in seconds before the first retry.
        backoff_factor: Multiplier applied to the delay after each failure.
        max_delay: Upper bound on the delay between retries.
        retry_on: Exception types that trigger a retry; anything else propagates.
        jitter: Randomize each delay by +/-25% to prevent thundering herd.
    """
    def decorator(func: Callable) -> Callable:
        @wraps(func)
        async def wrapper(*args, **kwargs) -> tuple[Any, dict]:
            """
            Returns:
                tuple: (result, metadata_dict)
                metadata_dict contains: attempts, total_latency, last_error,
                success. On total failure result is None — exceptions listed
                in retry_on are swallowed, not re-raised.
            """
            last_error = None
            # time.monotonic() is immune to wall-clock adjustments (NTP steps,
            # DST), so the reported latency can't jump or go negative the way
            # time.time() deltas can.
            start_time = time.monotonic()

            for attempt in range(max_attempts):
                try:
                    result = await func(*args, **kwargs)
                    total_latency = time.monotonic() - start_time
                    return result, {
                        'attempts': attempt + 1,
                        'total_latency': round(total_latency, 3),
                        'last_error': None,
                        'success': True
                    }
                except retry_on as e:
                    last_error = str(e)
                    if attempt < max_attempts - 1:  # Don't sleep after last attempt
                        delay = min(initial_delay * (backoff_factor ** attempt), max_delay)
                        if jitter:
                            # Add random jitter (+/-25% of delay) to de-synchronize
                            # concurrent retriers; lower bound stays positive.
                            jitter_range = delay * 0.25
                            delay += random.uniform(-jitter_range, jitter_range)

                        logger.warning(f"Attempt {attempt + 1}/{max_attempts} failed for {func.__name__}: {e}. Retrying in {delay:.2f}s")
                        await asyncio.sleep(delay)
                    else:
                        logger.error(f"All {max_attempts} attempts failed for {func.__name__}: {e}")

            total_latency = time.monotonic() - start_time
            return None, {
                'attempts': max_attempts,
                'total_latency': round(total_latency, 3),
                'last_error': last_error,
                'success': False
            }

        return wrapper
    return decorator
||||||
|
|
||||||
|
|
||||||
|
# Configuration from environment
|
||||||
|
def get_retry_config():
    """Read retry settings from environment variables, falling back to defaults."""
    # (config key, env var, caster, default-as-string)
    spec = (
        ('max_attempts', 'RETRY_MAX_ATTEMPTS', int, '3'),
        ('initial_delay', 'RETRY_INITIAL_DELAY', float, '1.0'),
        ('backoff_factor', 'RETRY_BACKOFF_FACTOR', float, '2.0'),
        ('max_delay', 'RETRY_MAX_DELAY', float, '30.0'),
    )
    return {name: cast(os.getenv(var, default)) for name, var, cast, default in spec}
||||||
|
|
||||||
|
|
||||||
|
# Pre-configured decorators for common use cases
|
||||||
|
def http_retry(**kwargs):
    """
    Retry decorator pre-configured for HTTP operations.

    Defaults come from the environment (see get_retry_config); any keyword
    accepted by async_retry may be passed to override them, including retry_on.
    """
    config = get_retry_config()
    config.update(kwargs)
    # setdefault instead of a separate keyword argument: the original
    # `async_retry(retry_on=(Exception,), **config)` raised
    # "TypeError: multiple values for keyword argument 'retry_on'"
    # whenever a caller supplied retry_on through **kwargs.
    config.setdefault('retry_on', (Exception,))  # by default retry any exception for HTTP calls
    return async_retry(**config)
||||||
121
tests/test_retry.py
Normal file
121
tests/test_retry.py
Normal file
@ -0,0 +1,121 @@
|
|||||||
|
import pytest
|
||||||
|
import asyncio
|
||||||
|
from unittest.mock import AsyncMock, patch
|
||||||
|
from app.utils.retry import async_retry, http_retry, get_retry_config
|
||||||
|
|
||||||
|
|
||||||
|
class TestAsyncRetry:
    @pytest.mark.asyncio
    async def test_success_on_first_attempt(self):
        """A first-try success returns immediately with attempts == 1."""

        @async_retry(max_attempts=3)
        async def succeed():
            return "success"

        value, meta = await succeed()

        assert value == "success"
        assert meta["attempts"] == 1
        assert meta["success"] is True
        assert meta["last_error"] is None
        assert "total_latency" in meta

    @pytest.mark.asyncio
    async def test_retry_on_failure_then_success(self):
        """One transient failure followed by success results in two attempts."""

        attempts_seen = 0

        @async_retry(max_attempts=3, initial_delay=0.01)
        async def eventually_ok():
            nonlocal attempts_seen
            attempts_seen += 1
            if attempts_seen < 2:
                raise ValueError("Temporary failure")
            return "success"

        value, meta = await eventually_ok()

        assert value == "success"
        assert meta["attempts"] == 2
        assert meta["success"] is True
        assert meta["last_error"] is None

    @pytest.mark.asyncio
    async def test_all_attempts_fail(self):
        """Exhausting every attempt yields (None, failure-metadata)."""

        @async_retry(max_attempts=2, initial_delay=0.01)
        async def never_ok():
            raise ConnectionError("Persistent failure")

        value, meta = await never_ok()

        assert value is None
        assert meta["attempts"] == 2
        assert meta["success"] is False
        assert meta["last_error"] == "Persistent failure"

    @pytest.mark.asyncio
    async def test_retry_on_specific_exceptions_only_value_error_retried(self):
        """ValueError is retried; RuntimeError escapes the retry loop."""

        attempts_seen = 0

        @async_retry(max_attempts=3, retry_on=(ValueError,), initial_delay=0.01)
        async def mixed_failures():
            nonlocal attempts_seen
            attempts_seen += 1
            if attempts_seen == 1:
                raise ValueError("Retry this")
            elif attempts_seen == 2:
                raise RuntimeError("Don't retry this")
            return "success"

        # The RuntimeError raised on the second call must propagate unretried.
        with pytest.raises(RuntimeError, match="Don't retry this"):
            await mixed_failures()

    @pytest.mark.asyncio
    async def test_backoff_and_jitter(self):
        """Repeated failures accumulate backoff delay across attempts."""

        @async_retry(max_attempts=4, initial_delay=0.1, backoff_factor=2, jitter=True)
        async def always_down():
            raise ConnectionError("Always fail")

        loop = asyncio.get_event_loop()
        started = loop.time()
        value, meta = await always_down()
        elapsed = loop.time() - started

        # The sleeps between attempts must add up to at least the initial delay.
        assert elapsed >= 0.1
        assert meta["attempts"] == 4
        assert meta["success"] is False

    def test_http_retry_decorator(self):
        """http_retry produces a named, awaitable coroutine function."""

        @http_retry(max_attempts=5)
        async def ping():
            return "ok"

        assert hasattr(ping, '__name__')
        # The wrapper must still be a coroutine function (i.e. awaitable).
        assert asyncio.iscoroutinefunction(ping)

    def test_get_retry_config(self):
        """All config keys are present and environment variables override defaults."""

        config = get_retry_config()

        for key in ("max_attempts", "initial_delay", "backoff_factor", "max_delay"):
            assert key in config

        # Environment overrides take effect on a fresh read.
        with patch.dict('os.environ', {'RETRY_MAX_ATTEMPTS': '5', 'RETRY_INITIAL_DELAY': '2.0'}):
            config = get_retry_config()
            assert config["max_attempts"] == 5
            assert config["initial_delay"] == 2.0
Loading…
Reference in New Issue
Block a user