feat: add async retry mechanism with exponential backoff

- Add app/utils/retry.py with configurable async retry decorator
- Update DeliveryLog model to track attempt_count and latency_seconds
- Apply @http_retry to engine._exec_forward and _exec_notify methods
- Update save_logs to record retry metadata
- Add comprehensive unit tests for retry functionality
- Support configuration via environment variables (RETRY_*)

This improves reliability for downstream HTTP calls by automatically
retrying transient failures with exponential backoff and jitter.
This commit is contained in:
auto-bot
2025-12-24 11:04:41 +08:00
parent 0def77dc30
commit b11c39f3bf
5 changed files with 269 additions and 33 deletions
+99
View File
@@ -0,0 +1,99 @@
"""
Async retry decorator with exponential backoff and configurable parameters.
"""
import asyncio
import logging
import os
import random
import time
from functools import wraps
from typing import Any, Callable, Optional
logger = logging.getLogger(__name__)
def async_retry(
max_attempts: int = 3,
initial_delay: float = 1.0,
backoff_factor: float = 2.0,
max_delay: float = 60.0,
retry_on: tuple = (Exception,),
jitter: bool = True
):
"""
Decorator for async functions that implements exponential backoff retry logic.
Args:
max_attempts: Maximum number of retry attempts (including initial call)
initial_delay: Initial delay in seconds before first retry
backoff_factor: Factor by which delay increases each retry
max_delay: Maximum delay between retries
retry_on: Tuple of exception types to retry on
jitter: Add random jitter to delay to prevent thundering herd
"""
def decorator(func: Callable) -> Callable:
@wraps(func)
async def wrapper(*args, **kwargs) -> tuple[Any, dict]:
"""
Returns:
tuple: (result, metadata_dict)
metadata_dict contains: attempts, total_latency, last_error
"""
last_error = None
start_time = time.time()
for attempt in range(max_attempts):
try:
result = await func(*args, **kwargs)
total_latency = time.time() - start_time
return result, {
'attempts': attempt + 1,
'total_latency': round(total_latency, 3),
'last_error': None,
'success': True
}
except retry_on as e:
last_error = str(e)
if attempt < max_attempts - 1: # Don't sleep after last attempt
delay = min(initial_delay * (backoff_factor ** attempt), max_delay)
if jitter:
# Add random jitter (±25% of delay)
import random
jitter_range = delay * 0.25
delay += random.uniform(-jitter_range, jitter_range)
logger.warning(f"Attempt {attempt + 1}/{max_attempts} failed for {func.__name__}: {e}. Retrying in {delay:.2f}s")
await asyncio.sleep(delay)
else:
logger.error(f"All {max_attempts} attempts failed for {func.__name__}: {e}")
total_latency = time.time() - start_time
return None, {
'attempts': max_attempts,
'total_latency': round(total_latency, 3),
'last_error': last_error,
'success': False
}
return wrapper
return decorator
# Configuration from environment
def get_retry_config():
    """Read retry settings from the RETRY_* environment variables.

    Unset variables fall back to the documented defaults; values are
    converted with the appropriate type for each setting.
    """
    spec = (
        ('max_attempts', 'RETRY_MAX_ATTEMPTS', int, '3'),
        ('initial_delay', 'RETRY_INITIAL_DELAY', float, '1.0'),
        ('backoff_factor', 'RETRY_BACKOFF_FACTOR', float, '2.0'),
        ('max_delay', 'RETRY_MAX_DELAY', float, '30.0'),
    )
    return {
        key: convert(os.getenv(env_var, default))
        for key, env_var, convert, default in spec
    }
# Pre-configured decorators for common use cases
def http_retry(**kwargs):
    """Retry decorator pre-configured for HTTP operations.

    Defaults come from the RETRY_* environment variables (see
    ``get_retry_config``); any keyword argument overrides them.

    Args:
        **kwargs: Overrides forwarded to ``async_retry`` — including
            ``retry_on``, which previously raised
            ``TypeError: got multiple values for keyword argument`` because
            it was passed both explicitly and via ``**config``.
    """
    config = get_retry_config()
    config.update(kwargs)
    # Retry on any exception by default for HTTP calls, but let callers
    # narrow it via kwargs without a duplicate-keyword TypeError.
    config.setdefault('retry_on', (Exception,))
    return async_retry(**config)