v4.0.0 大大大更新

This commit is contained in:
sansan
2025-12-13 13:44:35 +08:00
parent 97c05aa33c
commit c7bacdfff7
61 changed files with 12407 additions and 5889 deletions
+44
View File
@@ -0,0 +1,44 @@
# coding=utf-8
"""
Storage module - supports multiple storage backends.

Supported backends:
- local: local SQLite + TXT/HTML files
- remote: remote cloud storage (S3-compatible protocols: R2/OSS/COS/S3, etc.)
- auto: pick automatically from the environment (remote on GitHub Actions, local otherwise)
"""
from trendradar.storage.base import (
    StorageBackend,
    NewsItem,
    NewsData,
    convert_crawl_results_to_news_data,
    convert_news_data_to_results,
)
from trendradar.storage.local import LocalStorageBackend
from trendradar.storage.manager import StorageManager, get_storage_manager

# The remote backend is optional (it requires boto3); fall back gracefully
# so the package still imports when boto3 is not installed.
try:
    from trendradar.storage.remote import RemoteStorageBackend
    HAS_REMOTE = True
except ImportError:
    RemoteStorageBackend = None
    HAS_REMOTE = False

__all__ = [
    # Base classes
    "StorageBackend",
    "NewsItem",
    "NewsData",
    # Conversion helpers
    "convert_crawl_results_to_news_data",
    "convert_news_data_to_results",
    # Backend implementations
    "LocalStorageBackend",
    "RemoteStorageBackend",
    "HAS_REMOTE",
    # Manager
    "StorageManager",
    "get_storage_manager",
]
+457
View File
@@ -0,0 +1,457 @@
# coding=utf-8
"""
存储后端抽象基类和数据模型
定义统一的存储接口,所有存储后端都需要实现这些方法
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from typing import Dict, List, Optional, Any
import json
@dataclass
class NewsItem:
    """A single news entry crawled from one source platform."""

    title: str                  # headline text
    source_id: str              # platform identifier (e.g. toutiao, baidu)
    source_name: str = ""       # platform display name (runtime only, not persisted)
    rank: int = 0               # current rank on the platform
    url: str = ""               # canonical link URL
    mobile_url: str = ""        # mobile link URL
    crawl_time: str = ""        # crawl timestamp (HH:MM)
    # Aggregated statistics (used by analysis code).
    ranks: List[int] = field(default_factory=list)  # historical ranks
    first_time: str = ""        # first time this title was seen
    last_time: str = ""         # last time this title was seen
    count: int = 1              # number of appearances

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict (field values are shared, not copied)."""
        keys = (
            "title", "source_id", "source_name", "rank", "url",
            "mobile_url", "crawl_time", "ranks", "first_time",
            "last_time", "count",
        )
        return {key: getattr(self, key) for key in keys}

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "NewsItem":
        """Build an instance from a dict, tolerating missing keys."""
        scalar_defaults = {
            "title": "", "source_id": "", "source_name": "", "rank": 0,
            "url": "", "mobile_url": "", "crawl_time": "",
            "first_time": "", "last_time": "", "count": 1,
        }
        kwargs = {key: data.get(key, fallback) for key, fallback in scalar_defaults.items()}
        kwargs["ranks"] = data.get("ranks", [])
        return cls(**kwargs)
@dataclass
class NewsData:
    """
    A collection of news items from one crawl (or several merged crawls).

    Structure:
    - date: date string (YYYY-MM-DD)
    - crawl_time: crawl timestamp
    - items: news entries grouped by source id
    - id_to_name: source-id -> display-name mapping
    - failed_ids: source ids that failed to crawl
    """

    date: str
    crawl_time: str
    items: Dict[str, List[NewsItem]]
    id_to_name: Dict[str, str] = field(default_factory=dict)
    failed_ids: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict; nested items are serialized recursively."""
        return {
            "date": self.date,
            "crawl_time": self.crawl_time,
            "items": {
                sid: [entry.to_dict() for entry in entries]
                for sid, entries in self.items.items()
            },
            "id_to_name": self.id_to_name,
            "failed_ids": self.failed_ids,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "NewsData":
        """Build an instance from a dict produced by ``to_dict``."""
        grouped = {
            sid: [NewsItem.from_dict(raw) for raw in raw_list]
            for sid, raw_list in data.get("items", {}).items()
        }
        return cls(
            date=data.get("date", ""),
            crawl_time=data.get("crawl_time", ""),
            items=grouped,
            id_to_name=data.get("id_to_name", {}),
            failed_ids=data.get("failed_ids", []),
        )

    def get_total_count(self) -> int:
        """Return the total number of news entries across all sources."""
        return sum(map(len, self.items.values()))

    def merge_with(self, other: "NewsData") -> "NewsData":
        """
        Merge *other* into this dataset and return the combined result.

        Merge rules:
        - entries with the same source_id + title get their rank histories
          merged (deduplicated, sorted);
        - first_time keeps the earlier value, last_time the later one;
        - count is incremented once per merged duplicate;
        - URLs already present are kept; missing ones are backfilled.

        Note: items owned by ``self`` are mutated in place during the merge.
        """
        # Index current entries by source then by title.
        buckets: Dict[str, Dict[str, NewsItem]] = {
            sid: {entry.title: entry for entry in entries}
            for sid, entries in self.items.items()
        }
        # Fold the other dataset in.
        for sid, entries in other.items.items():
            bucket = buckets.setdefault(sid, {})
            for incoming in entries:
                kept = bucket.get(incoming.title)
                if kept is None:
                    # Brand-new title for this source.
                    bucket[incoming.title] = incoming
                    continue
                # Union the rank histories, deduplicated and sorted.
                kept.ranks = sorted(set(kept.ranks or []) | set(incoming.ranks or []))
                # Keep the earlier first_time / later last_time.
                if incoming.first_time and (
                    not kept.first_time or incoming.first_time < kept.first_time
                ):
                    kept.first_time = incoming.first_time
                if incoming.last_time and (
                    not kept.last_time or incoming.last_time > kept.last_time
                ):
                    kept.last_time = incoming.last_time
                kept.count += 1
                # Backfill URLs only when missing.
                if not kept.url and incoming.url:
                    kept.url = incoming.url
                if not kept.mobile_url and incoming.mobile_url:
                    kept.mobile_url = incoming.mobile_url
        return NewsData(
            date=self.date or other.date,
            crawl_time=other.crawl_time,  # use the newer crawl time
            items={sid: list(bucket.values()) for sid, bucket in buckets.items()},
            id_to_name={**self.id_to_name, **other.id_to_name},
            failed_ids=list(set(self.failed_ids + other.failed_ids)),
        )
class StorageBackend(ABC):
    """
    Abstract base class for storage backends.

    Every backend must implement this interface so the application can:
    - persist crawled news data,
    - read back everything stored for a day,
    - detect newly appeared titles,
    - produce report files (TXT/HTML).
    """

    @abstractmethod
    def save_news_data(self, data: NewsData) -> bool:
        """Persist one crawl's news data; return True on success."""

    @abstractmethod
    def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
        """Return the merged news data for *date* (YYYY-MM-DD, default today), or None when absent."""

    @abstractmethod
    def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
        """Return the most recent crawl's data for *date* (default today)."""

    @abstractmethod
    def detect_new_titles(self, current_data: NewsData) -> Dict[str, Dict]:
        """Return titles not seen before, shaped as {source_id: {title: title_data}}."""

    @abstractmethod
    def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
        """Write a TXT snapshot (optional feature); return the file path, or None when unsupported."""

    @abstractmethod
    def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
        """Write an HTML report named *filename*; return the saved file path."""

    @abstractmethod
    def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
        """Return True when this is the first crawl for *date* (default today)."""

    @abstractmethod
    def cleanup(self) -> None:
        """Release held resources (temporary files, database connections, ...)."""

    @abstractmethod
    def cleanup_old_data(self, retention_days: int) -> int:
        """Delete data older than *retention_days* (0 disables cleanup); return the number of date folders removed."""

    @property
    @abstractmethod
    def backend_name(self) -> str:
        """Short identifier of this backend."""

    @property
    @abstractmethod
    def supports_txt(self) -> bool:
        """Whether this backend can produce TXT snapshots."""

    # === push-record bookkeeping ===

    @abstractmethod
    def has_pushed_today(self, date: Optional[str] = None) -> bool:
        """Return True when a push was already recorded for *date* (YYYY-MM-DD, default today)."""

    @abstractmethod
    def record_push(self, report_type: str, date: Optional[str] = None) -> bool:
        """Record that a push of *report_type* happened on *date*; return True on success."""
def convert_crawl_results_to_news_data(
    results: Dict[str, Dict],
    id_to_name: Dict[str, str],
    failed_ids: List[str],
    crawl_time: str,
    crawl_date: str,
) -> NewsData:
    """
    Convert raw crawler output into a NewsData object.

    Args:
        results: crawler output, {source_id: {title: {ranks: [], url: "", mobileUrl: ""}}}
        id_to_name: source-id -> display-name mapping
        failed_ids: source ids that failed to crawl
        crawl_time: crawl timestamp (HH:MM)
        crawl_date: crawl date (YYYY-MM-DD)

    Returns:
        The populated NewsData object.
    """
    grouped: Dict[str, List[NewsItem]] = {}
    for source_id, titles_data in results.items():
        display_name = id_to_name.get(source_id, source_id)
        entries: List[NewsItem] = []
        for title, payload in titles_data.items():
            if isinstance(payload, dict):
                rank_list = payload.get("ranks", [])
                link = payload.get("url", "")
                mobile_link = payload.get("mobileUrl", "")
            else:
                # Legacy format: the payload is just a list of ranks.
                rank_list = payload if isinstance(payload, list) else []
                link = ""
                mobile_link = ""
            entries.append(
                NewsItem(
                    title=title,
                    source_id=source_id,
                    source_name=display_name,
                    rank=rank_list[0] if rank_list else 99,  # 99 = "unranked" sentinel
                    url=link,
                    mobile_url=mobile_link,
                    crawl_time=crawl_time,
                    ranks=rank_list,
                    first_time=crawl_time,
                    last_time=crawl_time,
                    count=1,
                )
            )
        grouped[source_id] = entries
    return NewsData(
        date=crawl_date,
        crawl_time=crawl_time,
        items=grouped,
        id_to_name=id_to_name,
        failed_ids=failed_ids,
    )
def convert_news_data_to_results(data: NewsData) -> tuple:
    """
    Convert a NewsData object back to the legacy ``results`` format
    (kept for compatibility with existing callers).

    Args:
        data: NewsData object.

    Returns:
        A ``(results, id_to_name, title_info)`` tuple, where ``results`` maps
        {source_id: {title: {ranks, url, mobileUrl}}} and ``title_info`` adds
        first/last seen times and appearance counts per title.
    """
    results: Dict[str, Dict] = {}
    title_info: Dict[str, Dict] = {}
    for source_id, entries in data.items.items():
        per_source: Dict[str, Dict] = {}
        per_source_info: Dict[str, Dict] = {}
        for entry in entries:
            per_source[entry.title] = {
                "ranks": entry.ranks,
                "url": entry.url,
                "mobileUrl": entry.mobile_url,
            }
            per_source_info[entry.title] = {
                "first_time": entry.first_time,
                "last_time": entry.last_time,
                "count": entry.count,
                "ranks": entry.ranks,
                "url": entry.url,
                "mobileUrl": entry.mobile_url,
            }
        results[source_id] = per_source
        title_info[source_id] = per_source_info
    return results, data.id_to_name, title_info
+869
View File
@@ -0,0 +1,869 @@
# coding=utf-8
"""
本地存储后端 - SQLite + TXT/HTML
使用 SQLite 作为主存储,支持可选的 TXT 快照和 HTML 报告
"""
import sqlite3
import os
import shutil
import pytz
import re
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Any
from trendradar.storage.base import StorageBackend, NewsItem, NewsData
from trendradar.utils.time import (
get_configured_time,
format_date_folder,
format_time_filename,
)
class LocalStorageBackend(StorageBackend):
    """
    Local storage backend (SQLite + TXT/HTML).

    Uses SQLite as the primary store, supporting:
    - one SQLite database file per date folder
    - optional TXT snapshots (for debugging)
    - HTML report generation
    """

    def __init__(
        self,
        data_dir: str = "output",
        enable_txt: bool = True,
        enable_html: bool = True,
        timezone: str = "Asia/Shanghai",
    ):
        """
        Initialize the local storage backend.

        Args:
            data_dir: data directory path
            enable_txt: whether TXT snapshots are enabled
            enable_html: whether HTML reports are enabled
            timezone: timezone name (default Asia/Shanghai)
        """
        self.data_dir = Path(data_dir)
        self.enable_txt = enable_txt
        self.enable_html = enable_html
        self.timezone = timezone
        # Cache of open sqlite3 connections, keyed by database file path.
        self._db_connections: Dict[str, sqlite3.Connection] = {}

    @property
    def backend_name(self) -> str:
        # Identifier reported to the storage manager / logs.
        return "local"

    @property
    def supports_txt(self) -> bool:
        # TXT snapshots are available only when enabled in the config.
        return self.enable_txt

    def _get_configured_time(self) -> datetime:
        """Return the current time in the configured timezone."""
        return get_configured_time(self.timezone)

    def _format_date_folder(self, date: Optional[str] = None) -> str:
        """Format the date folder name (ISO format: YYYY-MM-DD)."""
        return format_date_folder(date, self.timezone)

    def _format_time_filename(self) -> str:
        """Format the time-based file name (format: HH-MM)."""
        return format_time_filename(self.timezone)

    def _get_db_path(self, date: Optional[str] = None) -> Path:
        """Return the SQLite database path for *date*, creating the date folder if needed."""
        date_folder = self._format_date_folder(date)
        db_dir = self.data_dir / date_folder
        db_dir.mkdir(parents=True, exist_ok=True)
        return db_dir / "news.db"

    def _get_connection(self, date: Optional[str] = None) -> sqlite3.Connection:
        """Return a cached database connection for *date* (tables initialized on first open)."""
        db_path = str(self._get_db_path(date))
        if db_path not in self._db_connections:
            conn = sqlite3.connect(db_path)
            conn.row_factory = sqlite3.Row
            self._init_tables(conn)
            self._db_connections[db_path] = conn
        return self._db_connections[db_path]

    def _get_schema_path(self) -> Path:
        """Return the path of the schema.sql file shipped next to this module."""
        return Path(__file__).parent / "schema.sql"

    def _init_tables(self, conn: sqlite3.Connection) -> None:
        """Initialize the database table structure from schema.sql.

        Raises:
            FileNotFoundError: when schema.sql is missing next to this module.
        """
        schema_path = self._get_schema_path()
        if schema_path.exists():
            with open(schema_path, "r", encoding="utf-8") as f:
                schema_sql = f.read()
            conn.executescript(schema_sql)
        else:
            raise FileNotFoundError(f"Schema file not found: {schema_path}")
        conn.commit()

    def save_news_data(self, data: NewsData) -> bool:
        """
        Save news data into SQLite (URL is the dedup key, so title changes can be detected).

        Args:
            data: news data for one crawl

        Returns:
            True when saving succeeded.
        """
        try:
            conn = self._get_connection(data.date)
            cursor = conn.cursor()
            # Current time in the configured timezone.
            now_str = self._get_configured_time().strftime("%Y-%m-%d %H:%M:%S")
            # First, sync platform info into the platforms table.
            for source_id, source_name in data.id_to_name.items():
                cursor.execute("""
                    INSERT INTO platforms (id, name, updated_at)
                    VALUES (?, ?, ?)
                    ON CONFLICT(id) DO UPDATE SET
                        name = excluded.name,
                        updated_at = excluded.updated_at
                """, (source_id, source_name, now_str))
            # Statistics counters.
            new_count = 0
            updated_count = 0
            title_changed_count = 0
            success_sources = []
            for source_id, news_list in data.items.items():
                success_sources.append(source_id)
                for item in news_list:
                    try:
                        # Look for an existing row (by URL + platform_id).
                        if item.url:
                            cursor.execute("""
                                SELECT id, title FROM news_items
                                WHERE url = ? AND platform_id = ?
                            """, (item.url, source_id))
                            existing = cursor.fetchone()
                            if existing:
                                # Row already exists: update it.
                                existing_id, existing_title = existing
                                # Did the title change?
                                if existing_title != item.title:
                                    # Record the title change.
                                    cursor.execute("""
                                        INSERT INTO title_changes
                                        (news_item_id, old_title, new_title, changed_at)
                                        VALUES (?, ?, ?, ?)
                                    """, (existing_id, existing_title, item.title, now_str))
                                    title_changed_count += 1
                                # Append to the rank history.
                                cursor.execute("""
                                    INSERT INTO rank_history
                                    (news_item_id, rank, crawl_time, created_at)
                                    VALUES (?, ?, ?, ?)
                                """, (existing_id, item.rank, data.crawl_time, now_str))
                                # Update the existing row in place.
                                cursor.execute("""
                                    UPDATE news_items SET
                                        title = ?,
                                        rank = ?,
                                        mobile_url = ?,
                                        last_crawl_time = ?,
                                        crawl_count = crawl_count + 1,
                                        updated_at = ?
                                    WHERE id = ?
                                """, (item.title, item.rank, item.mobile_url,
                                      data.crawl_time, now_str, existing_id))
                                updated_count += 1
                            else:
                                # No row yet: insert a fresh one.
                                cursor.execute("""
                                    INSERT INTO news_items
                                    (title, platform_id, rank, url, mobile_url,
                                     first_crawl_time, last_crawl_time, crawl_count,
                                     created_at, updated_at)
                                    VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?)
                                """, (item.title, source_id, item.rank, item.url,
                                      item.mobile_url, data.crawl_time, data.crawl_time,
                                      now_str, now_str))
                                new_id = cursor.lastrowid
                                # Record the initial rank.
                                cursor.execute("""
                                    INSERT INTO rank_history
                                    (news_item_id, rank, crawl_time, created_at)
                                    VALUES (?, ?, ?, ?)
                                """, (new_id, item.rank, data.crawl_time, now_str))
                                new_count += 1
                        else:
                            # Empty URL: insert directly (no dedup possible).
                            cursor.execute("""
                                INSERT INTO news_items
                                (title, platform_id, rank, url, mobile_url,
                                 first_crawl_time, last_crawl_time, crawl_count,
                                 created_at, updated_at)
                                VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?)
                            """, (item.title, source_id, item.rank, item.url,
                                  item.mobile_url, data.crawl_time, data.crawl_time,
                                  now_str, now_str))
                            new_id = cursor.lastrowid
                            # Record the initial rank.
                            cursor.execute("""
                                INSERT INTO rank_history
                                (news_item_id, rank, crawl_time, created_at)
                                VALUES (?, ?, ?, ?)
                            """, (new_id, item.rank, data.crawl_time, now_str))
                            new_count += 1
                    except sqlite3.Error as e:
                        # Per-item failure is logged and skipped so one bad row
                        # does not abort the whole batch.
                        print(f"保存新闻条目失败 [{item.title[:30]}...]: {e}")
            total_items = new_count + updated_count
            # Record this crawl (keyed by crawl_time, replaced on re-run).
            cursor.execute("""
                INSERT OR REPLACE INTO crawl_records
                (crawl_time, total_items, created_at)
                VALUES (?, ?, ?)
            """, (data.crawl_time, total_items, now_str))
            # Fetch the id of the crawl record just written.
            cursor.execute("""
                SELECT id FROM crawl_records WHERE crawl_time = ?
            """, (data.crawl_time,))
            record_row = cursor.fetchone()
            if record_row:
                crawl_record_id = record_row[0]
                # Record sources that succeeded.
                for source_id in success_sources:
                    cursor.execute("""
                        INSERT OR REPLACE INTO crawl_source_status
                        (crawl_record_id, platform_id, status)
                        VALUES (?, ?, 'success')
                    """, (crawl_record_id, source_id))
                # Record sources that failed.
                for failed_id in data.failed_ids:
                    # Make sure failed platforms also exist in the platforms table.
                    cursor.execute("""
                        INSERT OR IGNORE INTO platforms (id, name, updated_at)
                        VALUES (?, ?, ?)
                    """, (failed_id, failed_id, now_str))
                    cursor.execute("""
                        INSERT OR REPLACE INTO crawl_source_status
                        (crawl_record_id, platform_id, status)
                        VALUES (?, ?, 'failed')
                    """, (crawl_record_id, failed_id))
            conn.commit()
            # Emit a detailed storage statistics log line.
            log_parts = [f"[本地存储] 处理完成:新增 {new_count}"]
            if updated_count > 0:
                log_parts.append(f"更新 {updated_count}")
            if title_changed_count > 0:
                log_parts.append(f"标题变更 {title_changed_count}")
            # NOTE(review): parts are joined with an empty separator — possibly a
            # dropped delimiter (e.g. "、"); confirm the intended log format.
            print("".join(log_parts))
            return True
        except Exception as e:
            print(f"[本地存储] 保存失败: {e}")
            return False

    def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
        """
        Return all news data for *date*, merged across crawls.

        Args:
            date: date string, default today

        Returns:
            The merged news data, or None when the database is absent/empty.
        """
        try:
            db_path = self._get_db_path(date)
            if not db_path.exists():
                return None
            conn = self._get_connection(date)
            cursor = conn.cursor()
            # Fetch all news rows (id included so rank history can be joined in).
            cursor.execute("""
                SELECT n.id, n.title, n.platform_id, p.name as platform_name,
                       n.rank, n.url, n.mobile_url,
                       n.first_crawl_time, n.last_crawl_time, n.crawl_count
                FROM news_items n
                LEFT JOIN platforms p ON n.platform_id = p.id
                ORDER BY n.platform_id, n.last_crawl_time
            """)
            rows = cursor.fetchall()
            if not rows:
                return None
            # Collect all news_item ids.
            news_ids = [row[0] for row in rows]
            # Batch-load the rank history for every row.
            rank_history_map: Dict[int, List[int]] = {}
            if news_ids:
                placeholders = ",".join("?" * len(news_ids))
                cursor.execute(f"""
                    SELECT news_item_id, rank FROM rank_history
                    WHERE news_item_id IN ({placeholders})
                    ORDER BY news_item_id, crawl_time
                """, news_ids)
                for rh_row in cursor.fetchall():
                    news_id, rank = rh_row[0], rh_row[1]
                    if news_id not in rank_history_map:
                        rank_history_map[news_id] = []
                    # Deduplicate ranks while preserving first-seen order.
                    if rank not in rank_history_map[news_id]:
                        rank_history_map[news_id].append(rank)
            # Group rows by platform_id.
            items: Dict[str, List[NewsItem]] = {}
            id_to_name: Dict[str, str] = {}
            crawl_date = self._format_date_folder(date)
            for row in rows:
                news_id = row[0]
                platform_id = row[2]
                title = row[1]
                platform_name = row[3] or platform_id
                id_to_name[platform_id] = platform_name
                if platform_id not in items:
                    items[platform_id] = []
                # Use the rank history when present; fall back to the current rank.
                ranks = rank_history_map.get(news_id, [row[4]])
                items[platform_id].append(NewsItem(
                    title=title,
                    source_id=platform_id,
                    source_name=platform_name,
                    rank=row[4],
                    url=row[5] or "",
                    mobile_url=row[6] or "",
                    crawl_time=row[8],  # last_crawl_time
                    ranks=ranks,
                    first_time=row[7],  # first_crawl_time
                    last_time=row[8],  # last_crawl_time
                    count=row[9],  # crawl_count
                ))
            final_items = items  # no extra filtering applied
            # Collect sources that failed at any point during the day.
            cursor.execute("""
                SELECT DISTINCT css.platform_id
                FROM crawl_source_status css
                JOIN crawl_records cr ON css.crawl_record_id = cr.id
                WHERE css.status = 'failed'
            """)
            failed_ids = [row[0] for row in cursor.fetchall()]
            # Use the latest crawl time on record.
            cursor.execute("""
                SELECT crawl_time FROM crawl_records
                ORDER BY crawl_time DESC
                LIMIT 1
            """)
            time_row = cursor.fetchone()
            crawl_time = time_row[0] if time_row else self._format_time_filename()
            return NewsData(
                date=crawl_date,
                crawl_time=crawl_time,
                items=final_items,
                id_to_name=id_to_name,
                failed_ids=failed_ids,
            )
        except Exception as e:
            print(f"[本地存储] 读取数据失败: {e}")
            return None

    def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
        """
        Return the data from the most recent crawl only.

        Args:
            date: date string, default today

        Returns:
            The latest crawl's news data, or None when nothing was stored.
        """
        try:
            db_path = self._get_db_path(date)
            if not db_path.exists():
                return None
            conn = self._get_connection(date)
            cursor = conn.cursor()
            # Find the most recent crawl time.
            cursor.execute("""
                SELECT crawl_time FROM crawl_records
                ORDER BY crawl_time DESC
                LIMIT 1
            """)
            time_row = cursor.fetchone()
            if not time_row:
                return None
            latest_time = time_row[0]
            # Fetch the news rows touched by that crawl (id included for rank history).
            cursor.execute("""
                SELECT n.id, n.title, n.platform_id, p.name as platform_name,
                       n.rank, n.url, n.mobile_url,
                       n.first_crawl_time, n.last_crawl_time, n.crawl_count
                FROM news_items n
                LEFT JOIN platforms p ON n.platform_id = p.id
                WHERE n.last_crawl_time = ?
            """, (latest_time,))
            rows = cursor.fetchall()
            if not rows:
                return None
            # Collect all news_item ids.
            news_ids = [row[0] for row in rows]
            # Batch-load the rank history for every row.
            rank_history_map: Dict[int, List[int]] = {}
            if news_ids:
                placeholders = ",".join("?" * len(news_ids))
                cursor.execute(f"""
                    SELECT news_item_id, rank FROM rank_history
                    WHERE news_item_id IN ({placeholders})
                    ORDER BY news_item_id, crawl_time
                """, news_ids)
                for rh_row in cursor.fetchall():
                    news_id, rank = rh_row[0], rh_row[1]
                    if news_id not in rank_history_map:
                        rank_history_map[news_id] = []
                    # Deduplicate ranks while preserving first-seen order.
                    if rank not in rank_history_map[news_id]:
                        rank_history_map[news_id].append(rank)
            items: Dict[str, List[NewsItem]] = {}
            id_to_name: Dict[str, str] = {}
            crawl_date = self._format_date_folder(date)
            for row in rows:
                news_id = row[0]
                platform_id = row[2]
                platform_name = row[3] or platform_id
                id_to_name[platform_id] = platform_name
                if platform_id not in items:
                    items[platform_id] = []
                # Use the rank history when present; fall back to the current rank.
                ranks = rank_history_map.get(news_id, [row[4]])
                items[platform_id].append(NewsItem(
                    title=row[1],
                    source_id=platform_id,
                    source_name=platform_name,
                    rank=row[4],
                    url=row[5] or "",
                    mobile_url=row[6] or "",
                    crawl_time=row[8],  # last_crawl_time
                    ranks=ranks,
                    first_time=row[7],  # first_crawl_time
                    last_time=row[8],  # last_crawl_time
                    count=row[9],  # crawl_count
                ))
            # Sources that failed during that specific crawl.
            cursor.execute("""
                SELECT css.platform_id
                FROM crawl_source_status css
                JOIN crawl_records cr ON css.crawl_record_id = cr.id
                WHERE cr.crawl_time = ? AND css.status = 'failed'
            """, (latest_time,))
            failed_ids = [row[0] for row in cursor.fetchall()]
            return NewsData(
                date=crawl_date,
                crawl_time=latest_time,
                items=items,
                id_to_name=id_to_name,
                failed_ids=failed_ids,
            )
        except Exception as e:
            print(f"[本地存储] 获取最新数据失败: {e}")
            return None

    def detect_new_titles(self, current_data: NewsData) -> Dict[str, Dict]:
        """
        Detect titles that appear in *current_data* but not in today's stored data.

        Args:
            current_data: the data from the current crawl

        Returns:
            New titles, shaped as {source_id: {title: NewsItem}}.
        """
        try:
            # Load what was already stored today.
            historical_data = self.get_today_all_data(current_data.date)
            if not historical_data:
                # No history yet: everything counts as new.
                new_titles = {}
                for source_id, news_list in current_data.items.items():
                    new_titles[source_id] = {item.title: item for item in news_list}
                return new_titles
            # Collect the historical title sets per source.
            historical_titles: Dict[str, set] = {}
            for source_id, news_list in historical_data.items.items():
                historical_titles[source_id] = {item.title for item in news_list}
            # Keep only titles absent from history.
            new_titles = {}
            for source_id, news_list in current_data.items.items():
                hist_set = historical_titles.get(source_id, set())
                for item in news_list:
                    if item.title not in hist_set:
                        if source_id not in new_titles:
                            new_titles[source_id] = {}
                        new_titles[source_id][item.title] = item
            return new_titles
        except Exception as e:
            print(f"[本地存储] 检测新标题失败: {e}")
            return {}

    def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
        """
        Write a plain-text snapshot of one crawl.

        Args:
            data: news data

        Returns:
            The saved file path, or None when disabled or on failure.
        """
        if not self.enable_txt:
            return None
        try:
            date_folder = self._format_date_folder(data.date)
            txt_dir = self.data_dir / date_folder / "txt"
            txt_dir.mkdir(parents=True, exist_ok=True)
            file_path = txt_dir / f"{data.crawl_time}.txt"
            with open(file_path, "w", encoding="utf-8") as f:
                for source_id, news_list in data.items.items():
                    source_name = data.id_to_name.get(source_id, source_id)
                    # Write the source header line.
                    if source_name and source_name != source_id:
                        f.write(f"{source_id} | {source_name}\n")
                    else:
                        f.write(f"{source_id}\n")
                    # Entries sorted by rank.
                    sorted_news = sorted(news_list, key=lambda x: x.rank)
                    for item in sorted_news:
                        line = f"{item.rank}. {item.title}"
                        if item.url:
                            line += f" [URL:{item.url}]"
                        if item.mobile_url:
                            line += f" [MOBILE:{item.mobile_url}]"
                        f.write(line + "\n")
                    f.write("\n")
                # Append the sources that failed.
                if data.failed_ids:
                    f.write("==== 以下ID请求失败 ====\n")
                    for failed_id in data.failed_ids:
                        f.write(f"{failed_id}\n")
            print(f"[本地存储] TXT 快照已保存: {file_path}")
            return str(file_path)
        except Exception as e:
            print(f"[本地存储] 保存 TXT 快照失败: {e}")
            return None

    def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
        """
        Write an HTML report under today's date folder.

        Args:
            html_content: HTML body to write
            filename: target file name
            is_summary: whether this is a summary report
                (currently unused by the local backend)

        Returns:
            The saved file path, or None when disabled or on failure.
        """
        if not self.enable_html:
            return None
        try:
            # NOTE(review): uses today's date folder, not a date carried by the
            # report — confirm reports are never written for past dates.
            date_folder = self._format_date_folder()
            html_dir = self.data_dir / date_folder / "html"
            html_dir.mkdir(parents=True, exist_ok=True)
            file_path = html_dir / filename
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(html_content)
            print(f"[本地存储] HTML 报告已保存: {file_path}")
            return str(file_path)
        except Exception as e:
            print(f"[本地存储] 保存 HTML 报告失败: {e}")
            return None

    def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
        """
        Return whether this is the first crawl of the day.

        Args:
            date: date string, default today

        Returns:
            True when at most one crawl record exists (or on error).
        """
        try:
            db_path = self._get_db_path(date)
            if not db_path.exists():
                return True
            conn = self._get_connection(date)
            cursor = conn.cursor()
            cursor.execute("""
                SELECT COUNT(*) as count FROM crawl_records
            """)
            row = cursor.fetchone()
            count = row[0] if row else 0
            # Zero or one record counts as the first crawl — presumably because
            # the current crawl may already have been recorded; confirm with callers.
            return count <= 1
        except Exception as e:
            print(f"[本地存储] 检查首次抓取失败: {e}")
            return True

    def get_crawl_times(self, date: Optional[str] = None) -> List[str]:
        """
        Return every crawl time recorded for *date*.

        Args:
            date: date string, default today

        Returns:
            Crawl times sorted ascending (empty on missing database or error).
        """
        try:
            db_path = self._get_db_path(date)
            if not db_path.exists():
                return []
            conn = self._get_connection(date)
            cursor = conn.cursor()
            cursor.execute("""
                SELECT crawl_time FROM crawl_records
                ORDER BY crawl_time
            """)
            rows = cursor.fetchall()
            return [row[0] for row in rows]
        except Exception as e:
            print(f"[本地存储] 获取抓取时间列表失败: {e}")
            return []

    def cleanup(self) -> None:
        """Release resources: close every cached database connection."""
        for db_path, conn in self._db_connections.items():
            try:
                conn.close()
                print(f"[本地存储] 关闭数据库连接: {db_path}")
            except Exception as e:
                print(f"[本地存储] 关闭连接失败 {db_path}: {e}")
        self._db_connections.clear()

    def cleanup_old_data(self, retention_days: int) -> int:
        """
        Delete date folders older than the retention window.

        Args:
            retention_days: days to keep (0 disables cleanup)

        Returns:
            Number of date folders deleted.
        """
        if retention_days <= 0:
            return 0
        deleted_count = 0
        cutoff_date = self._get_configured_time() - timedelta(days=retention_days)
        try:
            if not self.data_dir.exists():
                return 0
            for date_folder in self.data_dir.iterdir():
                if not date_folder.is_dir() or date_folder.name.startswith('.'):
                    continue
                # Parse the folder name (two supported formats).
                folder_date = None
                try:
                    # ISO format: YYYY-MM-DD
                    # NOTE(review): the timezone here is hard-coded to Asia/Shanghai
                    # while the rest of the class uses self.timezone; also, passing a
                    # pytz zone via tzinfo= yields the zone's LMT offset (pytz expects
                    # localize()), so the cutoff comparison is approximate — confirm
                    # this is acceptable for day-granularity cleanup.
                    date_match = re.match(r'(\d{4})-(\d{2})-(\d{2})', date_folder.name)
                    if date_match:
                        folder_date = datetime(
                            int(date_match.group(1)),
                            int(date_match.group(2)),
                            int(date_match.group(3)),
                            tzinfo=pytz.timezone("Asia/Shanghai")
                        )
                    else:
                        # Legacy Chinese format: YYYY年MM月DD日
                        date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_folder.name)
                        if date_match:
                            folder_date = datetime(
                                int(date_match.group(1)),
                                int(date_match.group(2)),
                                int(date_match.group(3)),
                                tzinfo=pytz.timezone("Asia/Shanghai")
                            )
                except Exception:
                    continue
                if folder_date and folder_date < cutoff_date:
                    # Close any cached connection for this date first.
                    db_path = str(self._get_db_path(date_folder.name))
                    if db_path in self._db_connections:
                        try:
                            self._db_connections[db_path].close()
                            del self._db_connections[db_path]
                        except Exception:
                            pass
                    # Remove the whole date directory.
                    try:
                        shutil.rmtree(date_folder)
                        deleted_count += 1
                        print(f"[本地存储] 清理过期数据: {date_folder.name}")
                    except Exception as e:
                        print(f"[本地存储] 删除目录失败 {date_folder.name}: {e}")
            if deleted_count > 0:
                print(f"[本地存储] 共清理 {deleted_count} 个过期日期目录")
            return deleted_count
        except Exception as e:
            print(f"[本地存储] 清理过期数据失败: {e}")
            return deleted_count

    def has_pushed_today(self, date: Optional[str] = None) -> bool:
        """
        Return whether a push was already recorded for *date*.

        Args:
            date: date string (YYYY-MM-DD), default today

        Returns:
            True when the push_records table marks the date as pushed.
        """
        try:
            conn = self._get_connection(date)
            cursor = conn.cursor()
            target_date = self._format_date_folder(date)
            cursor.execute("""
                SELECT pushed FROM push_records WHERE date = ?
            """, (target_date,))
            row = cursor.fetchone()
            if row:
                return bool(row[0])
            return False
        except Exception as e:
            print(f"[本地存储] 检查推送记录失败: {e}")
            return False

    def record_push(self, report_type: str, date: Optional[str] = None) -> bool:
        """
        Record a push for *date* (upsert keyed by date).

        Args:
            report_type: report type that was pushed
            date: date string (YYYY-MM-DD), default today

        Returns:
            True when the record was written.
        """
        try:
            conn = self._get_connection(date)
            cursor = conn.cursor()
            target_date = self._format_date_folder(date)
            now_str = self._get_configured_time().strftime("%Y-%m-%d %H:%M:%S")
            cursor.execute("""
                INSERT INTO push_records (date, pushed, push_time, report_type, created_at)
                VALUES (?, 1, ?, ?, ?)
                ON CONFLICT(date) DO UPDATE SET
                    pushed = 1,
                    push_time = excluded.push_time,
                    report_type = excluded.report_type
            """, (target_date, now_str, report_type, now_str))
            conn.commit()
            print(f"[本地存储] 推送记录已保存: {report_type} at {now_str}")
            return True
        except Exception as e:
            print(f"[本地存储] 记录推送失败: {e}")
            return False

    def __del__(self):
        """Destructor: make sure all connections are closed."""
        self.cleanup()
+316
View File
@@ -0,0 +1,316 @@
# coding=utf-8
"""
存储管理器 - 统一管理存储后端
根据环境和配置自动选择合适的存储后端
"""
import os
from typing import Optional
from trendradar.storage.base import StorageBackend, NewsData
# Module-level singleton holder for the StorageManager
# (presumably populated by a get_storage_manager() factory — not visible in this chunk).
_storage_manager: Optional["StorageManager"] = None
class StorageManager:
"""
存储管理器
功能:
- 自动检测运行环境(GitHub Actions / Docker / 本地)
- 根据配置选择存储后端(local / remote / auto
- 提供统一的存储接口
- 支持从远程拉取数据到本地
"""
def __init__(
self,
backend_type: str = "auto",
data_dir: str = "output",
enable_txt: bool = True,
enable_html: bool = True,
remote_config: Optional[dict] = None,
local_retention_days: int = 0,
remote_retention_days: int = 0,
pull_enabled: bool = False,
pull_days: int = 0,
timezone: str = "Asia/Shanghai",
):
"""
初始化存储管理器
Args:
backend_type: 存储后端类型 (local / remote / auto)
data_dir: 本地数据目录
enable_txt: 是否启用 TXT 快照
enable_html: 是否启用 HTML 报告
remote_config: 远程存储配置(endpoint_url, bucket_name, access_key_id 等)
local_retention_days: 本地数据保留天数(0 = 无限制)
remote_retention_days: 远程数据保留天数(0 = 无限制)
pull_enabled: 是否启用启动时自动拉取
pull_days: 拉取最近 N 天的数据
timezone: 时区配置(默认 Asia/Shanghai
"""
self.backend_type = backend_type
self.data_dir = data_dir
self.enable_txt = enable_txt
self.enable_html = enable_html
self.remote_config = remote_config or {}
self.local_retention_days = local_retention_days
self.remote_retention_days = remote_retention_days
self.pull_enabled = pull_enabled
self.pull_days = pull_days
self.timezone = timezone
self._backend: Optional[StorageBackend] = None
self._remote_backend: Optional[StorageBackend] = None
@staticmethod
def is_github_actions() -> bool:
"""检测是否在 GitHub Actions 环境中运行"""
return os.environ.get("GITHUB_ACTIONS") == "true"
@staticmethod
def is_docker() -> bool:
"""检测是否在 Docker 容器中运行"""
# 方法1: 检查 /.dockerenv 文件
if os.path.exists("/.dockerenv"):
return True
# 方法2: 检查 cgroupLinux
try:
with open("/proc/1/cgroup", "r") as f:
return "docker" in f.read()
except (FileNotFoundError, PermissionError):
pass
# 方法3: 检查环境变量
return os.environ.get("DOCKER_CONTAINER") == "true"
def _resolve_backend_type(self) -> str:
"""解析实际使用的后端类型"""
if self.backend_type == "auto":
if self.is_github_actions():
# GitHub Actions 环境,检查是否配置了远程存储
if self._has_remote_config():
return "remote"
else:
print("[存储管理器] GitHub Actions 环境但未配置远程存储,使用本地存储")
return "local"
else:
return "local"
return self.backend_type
def _has_remote_config(self) -> bool:
"""检查是否有有效的远程存储配置"""
# 检查配置或环境变量
bucket_name = self.remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME")
access_key = self.remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID")
secret_key = self.remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY")
endpoint = self.remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL")
# 调试日志
has_config = bool(bucket_name and access_key and secret_key and endpoint)
if not has_config:
print(f"[存储管理器] 远程存储配置检查失败:")
print(f" - bucket_name: {'已配置' if bucket_name else '未配置'}")
print(f" - access_key_id: {'已配置' if access_key else '未配置'}")
print(f" - secret_access_key: {'已配置' if secret_key else '未配置'}")
print(f" - endpoint_url: {'已配置' if endpoint else '未配置'}")
return has_config
def _create_remote_backend(self) -> Optional[StorageBackend]:
"""创建远程存储后端"""
try:
from trendradar.storage.remote import RemoteStorageBackend
return RemoteStorageBackend(
bucket_name=self.remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME", ""),
access_key_id=self.remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID", ""),
secret_access_key=self.remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY", ""),
endpoint_url=self.remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL", ""),
region=self.remote_config.get("region") or os.environ.get("S3_REGION", ""),
enable_txt=self.enable_txt,
enable_html=self.enable_html,
timezone=self.timezone,
)
except ImportError as e:
print(f"[存储管理器] 远程后端导入失败: {e}")
print("[存储管理器] 请确保已安装 boto3: pip install boto3")
return None
except Exception as e:
print(f"[存储管理器] 远程后端初始化失败: {e}")
return None
def get_backend(self) -> StorageBackend:
    """Return the active storage backend, creating it lazily on first call.

    When the resolved type is "remote", the remote backend is tried first;
    if it cannot be created, we fall back to the local backend. The result
    is cached on ``self._backend`` so subsequent calls are cheap.
    """
    if self._backend is not None:
        return self._backend

    resolved = self._resolve_backend_type()
    if resolved == "remote":
        self._backend = self._create_remote_backend()
        if self._backend:
            print(f"[存储管理器] 使用远程存储后端")
        else:
            # Remote creation failed — degrade gracefully to local storage.
            print("[存储管理器] 回退到本地存储")
            resolved = "local"
    if resolved == "local" or self._backend is None:
        from trendradar.storage.local import LocalStorageBackend

        self._backend = LocalStorageBackend(
            data_dir=self.data_dir,
            enable_txt=self.enable_txt,
            enable_html=self.enable_html,
            timezone=self.timezone,
        )
        print(f"[存储管理器] 使用本地存储后端 (数据目录: {self.data_dir})")
    return self._backend
def pull_from_remote(self) -> int:
    """Pull recent data files from remote storage into the local data dir.

    Returns:
        Number of files successfully pulled; 0 when pulling is disabled,
        remote storage is not configured, or the remote backend cannot
        be created.
    """
    # Pulling must be enabled AND request a positive day window.
    if not (self.pull_enabled and self.pull_days > 0):
        return 0

    if not self._has_remote_config():
        print("[存储管理器] 未配置远程存储,无法拉取")
        return 0

    # Build the remote backend lazily and cache it for reuse.
    if self._remote_backend is None:
        self._remote_backend = self._create_remote_backend()
        if self._remote_backend is None:
            print("[存储管理器] 无法创建远程后端,拉取失败")
            return 0

    return self._remote_backend.pull_recent_days(self.pull_days, self.data_dir)
def save_news_data(self, data: NewsData) -> bool:
    """Persist one crawl's news data via the active backend; True on success."""
    return self.get_backend().save_news_data(data)
def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
    """Return all data for *date* (defaults to today), or None; delegates to the backend."""
    return self.get_backend().get_today_all_data(date)
def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
    """Return the most recent crawl's data for *date*, or None; delegates to the backend."""
    return self.get_backend().get_latest_crawl_data(date)
def detect_new_titles(self, current_data: NewsData) -> dict:
    """Detect titles that are new in *current_data*; dict layout is defined by the backend."""
    return self.get_backend().detect_new_titles(current_data)
def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
    """Write a TXT snapshot of *data*; returns the saved path or None."""
    return self.get_backend().save_txt_snapshot(data)
def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
    """Save an HTML report as *filename*; returns the saved path or None."""
    return self.get_backend().save_html_report(html_content, filename, is_summary)
def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
    """Return True if this is the first recorded crawl for *date* (today by default)."""
    return self.get_backend().is_first_crawl_today(date)
def cleanup(self) -> None:
    """Release resources held by any backend instances that were created."""
    # Primary backend first, then the standalone remote backend (if any).
    for backend in (self._backend, self._remote_backend):
        if backend:
            backend.cleanup()
def cleanup_old_data(self) -> int:
    """Delete data older than the configured retention windows.

    Returns:
        Total number of date directories removed (local plus remote).
    """
    deleted = 0

    # NOTE(review): "local" cleanup goes through get_backend(), which may
    # resolve to the remote backend when backend_type is "remote" — confirm
    # local_retention_days is meant to apply to whichever backend is active.
    if self.local_retention_days > 0:
        deleted += self.get_backend().cleanup_old_data(self.local_retention_days)

    # Remote cleanup runs only when a retention window and credentials exist.
    if self.remote_retention_days > 0 and self._has_remote_config():
        if self._remote_backend is None:
            self._remote_backend = self._create_remote_backend()
        if self._remote_backend:
            deleted += self._remote_backend.cleanup_old_data(self.remote_retention_days)

    return deleted
@property
def backend_name(self) -> str:
    """Name of the currently active storage backend (delegates to the backend)."""
    return self.get_backend().backend_name
@property
def supports_txt(self) -> bool:
    """Whether the active backend supports writing TXT snapshots."""
    return self.get_backend().supports_txt
def get_storage_manager(
    backend_type: str = "auto",
    data_dir: str = "output",
    enable_txt: bool = True,
    enable_html: bool = True,
    remote_config: Optional[dict] = None,
    local_retention_days: int = 0,
    remote_retention_days: int = 0,
    pull_enabled: bool = False,
    pull_days: int = 0,
    timezone: str = "Asia/Shanghai",
    force_new: bool = False,
) -> StorageManager:
    """Return the process-wide StorageManager singleton.

    A new instance is built on the first call or when *force_new* is True;
    otherwise the cached instance is returned and the other arguments are
    ignored.

    Args:
        backend_type: Storage backend type ("auto" / "local" / "remote").
        data_dir: Local data directory.
        enable_txt: Whether TXT snapshots are enabled.
        enable_html: Whether HTML reports are enabled.
        remote_config: Remote storage configuration mapping.
        local_retention_days: Days to keep local data (0 = unlimited).
        remote_retention_days: Days to keep remote data (0 = unlimited).
        pull_enabled: Whether to auto-pull from remote at startup.
        pull_days: Pull the most recent N days of data.
        timezone: Timezone name (default "Asia/Shanghai").
        force_new: Force creation of a fresh instance.

    Returns:
        The shared StorageManager instance.
    """
    global _storage_manager

    if force_new or _storage_manager is None:
        _storage_manager = StorageManager(
            backend_type=backend_type,
            data_dir=data_dir,
            enable_txt=enable_txt,
            enable_html=enable_html,
            remote_config=remote_config,
            local_retention_days=local_retention_days,
            remote_retention_days=remote_retention_days,
            pull_enabled=pull_enabled,
            pull_days=pull_days,
            timezone=timezone,
        )
    return _storage_manager
File diff suppressed because it is too large Load Diff
+117
View File
@@ -0,0 +1,117 @@
-- TrendRadar database schema
-- ============================================
-- Platforms table
-- Invariant: id never changes; name may change
-- ============================================
CREATE TABLE IF NOT EXISTS platforms (
    id TEXT PRIMARY KEY,
    name TEXT NOT NULL,
    is_active INTEGER DEFAULT 1,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- ============================================
-- News items table
-- Uniquely identified by URL + platform_id, enabling deduplicated storage
-- ============================================
CREATE TABLE IF NOT EXISTS news_items (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    title TEXT NOT NULL,
    platform_id TEXT NOT NULL,
    rank INTEGER NOT NULL,
    url TEXT DEFAULT '',
    mobile_url TEXT DEFAULT '',
    first_crawl_time TEXT NOT NULL, -- time of first crawl
    last_crawl_time TEXT NOT NULL, -- time of most recent crawl
    crawl_count INTEGER DEFAULT 1, -- number of crawls that saw this item
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (platform_id) REFERENCES platforms(id)
);

-- ============================================
-- Title change history
-- Records title changes observed for the same URL
-- ============================================
CREATE TABLE IF NOT EXISTS title_changes (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    news_item_id INTEGER NOT NULL,
    old_title TEXT NOT NULL,
    new_title TEXT NOT NULL,
    changed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (news_item_id) REFERENCES news_items(id)
);

-- ============================================
-- Rank history
-- Records the rank observed at each crawl
-- ============================================
CREATE TABLE IF NOT EXISTS rank_history (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    news_item_id INTEGER NOT NULL,
    rank INTEGER NOT NULL,
    crawl_time TEXT NOT NULL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (news_item_id) REFERENCES news_items(id)
);

-- ============================================
-- Crawl records
-- One row per crawl: its time and total item count
-- ============================================
CREATE TABLE IF NOT EXISTS crawl_records (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    crawl_time TEXT NOT NULL UNIQUE,
    total_items INTEGER DEFAULT 0,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- ============================================
-- Per-source crawl status
-- Records each platform's success/failure for every crawl
-- ============================================
CREATE TABLE IF NOT EXISTS crawl_source_status (
    crawl_record_id INTEGER NOT NULL,
    platform_id TEXT NOT NULL,
    status TEXT NOT NULL CHECK(status IN ('success', 'failed')),
    PRIMARY KEY (crawl_record_id, platform_id),
    FOREIGN KEY (crawl_record_id) REFERENCES crawl_records(id),
    FOREIGN KEY (platform_id) REFERENCES platforms(id)
);

-- ============================================
-- Push records
-- Supports the push_window "once_per_day" feature
-- ============================================
CREATE TABLE IF NOT EXISTS push_records (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    date TEXT NOT NULL UNIQUE,
    pushed INTEGER DEFAULT 0,
    push_time TEXT,
    report_type TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- ============================================
-- Index definitions
-- ============================================
-- Platform index
CREATE INDEX IF NOT EXISTS idx_news_platform ON news_items(platform_id);

-- Time index (for querying the latest data)
CREATE INDEX IF NOT EXISTS idx_news_crawl_time ON news_items(last_crawl_time);

-- Title index (for title search)
CREATE INDEX IF NOT EXISTS idx_news_title ON news_items(title);

-- Unique URL + platform_id index (non-empty URLs only; implements dedup)
CREATE UNIQUE INDEX IF NOT EXISTS idx_news_url_platform
    ON news_items(url, platform_id) WHERE url != '';

-- Crawl status index
CREATE INDEX IF NOT EXISTS idx_crawl_status_record ON crawl_source_status(crawl_record_id);

-- Rank history index
CREATE INDEX IF NOT EXISTS idx_rank_history_news ON rank_history(news_item_id);