v4.0.0 大大大更新

This commit is contained in:
sansan
2025-12-13 13:44:35 +08:00
parent 97c05aa33c
commit c7bacdfff7
61 changed files with 12407 additions and 5889 deletions
+1 -1
View File
@@ -4,4 +4,4 @@ TrendRadar MCP Server
提供基于MCP协议的新闻聚合数据查询和系统管理接口。
"""
__version__ = "1.0.0"
__version__ = "1.1.0"
+128
View File
@@ -15,6 +15,7 @@ from .tools.analytics import AnalyticsTools
from .tools.search_tools import SearchTools
from .tools.config_mgmt import ConfigManagementTools
from .tools.system import SystemManagementTools
from .tools.storage_sync import StorageSyncTools
from .utils.date_parser import DateParser
from .utils.errors import MCPError
@@ -34,6 +35,7 @@ def _get_tools(project_root: Optional[str] = None):
_tools_instances['search'] = SearchTools(project_root)
_tools_instances['config'] = ConfigManagementTools(project_root)
_tools_instances['system'] = SystemManagementTools(project_root)
_tools_instances['storage'] = StorageSyncTools(project_root)
return _tools_instances
@@ -657,6 +659,127 @@ async def trigger_crawl(
return json.dumps(result, ensure_ascii=False, indent=2)
# ==================== 存储同步工具 ====================
@mcp.tool
async def sync_from_remote(
    days: int = 7
) -> str:
    """Pull data from remote storage into the local data directory.

    Intended for MCP Server deployments: the crawler persists to remote
    cloud storage (e.g. Cloudflare R2) and the MCP Server pulls the data
    down locally for analysis and querying.

    Args:
        days: Pull the most recent N days of data (default 7).
            - 0: pull nothing
            - 7: pull the last week of data
            - 30: pull the last month of data

    Returns:
        JSON-formatted sync result containing:
        - success: whether the operation succeeded
        - synced_files: number of files synced successfully
        - synced_dates: list of dates synced successfully
        - skipped_dates: dates skipped (already present locally)
        - failed_dates: dates that failed, with error details
        - message: human-readable summary of the operation

    Examples:
        - sync_from_remote()          # pull the last 7 days
        - sync_from_remote(days=30)   # pull the last 30 days

    Note:
        Requires remote storage configured in config/config.yaml
        (storage.remote) or via environment variables:
        - S3_ENDPOINT_URL: service endpoint
        - S3_BUCKET_NAME: bucket name
        - S3_ACCESS_KEY_ID: access key ID
        - S3_SECRET_ACCESS_KEY: secret access key
    """
    storage_tools = _get_tools()['storage']
    payload = storage_tools.sync_from_remote(days=days)
    return json.dumps(payload, ensure_ascii=False, indent=2)
@mcp.tool
async def get_storage_status() -> str:
    """Report the storage configuration and status.

    Inspects the configured storage backend plus the state of both the
    local and the remote store.

    Returns:
        JSON-formatted storage status containing:
        - backend: active backend type (local/remote/auto)
        - local: local storage status
            - data_dir: data directory
            - retention_days: retention period in days
            - total_size: total size on disk
            - date_count: number of date folders
            - earliest_date: earliest available date
            - latest_date: latest available date
        - remote: remote storage status
            - configured: whether remote storage is configured
            - endpoint_url: service endpoint
            - bucket_name: bucket name
            - date_count: number of dates available remotely
        - pull: pull configuration
            - enabled: whether automatic pulling is enabled
            - days: number of days pulled automatically

    Examples:
        - get_storage_status()  # inspect all storage state
    """
    storage_tools = _get_tools()['storage']
    payload = storage_tools.get_storage_status()
    return json.dumps(payload, ensure_ascii=False, indent=2)
@mcp.tool
async def list_available_dates(
    source: str = "both"
) -> str:
    """List the date ranges available locally and/or remotely.

    Shows which dates have data in local and remote storage, to help
    understand data coverage and sync state.

    Args:
        source: Data source to inspect:
            - "local": list only locally available dates
            - "remote": list only remotely available dates
            - "both": list both and compare them (default)

    Returns:
        JSON-formatted date listing containing:
        - local: local date info (when source includes local)
            - dates: date list (newest first)
            - count: number of dates
            - earliest: earliest date
            - latest: latest date
        - remote: remote date info (when source includes remote)
            - configured: whether remote storage is configured
            - dates: date list
            - count: number of dates
            - earliest: earliest date
            - latest: latest date
        - comparison: diff result (only when source="both")
            - only_local: dates present only locally
            - only_remote: dates present only remotely
            - both: dates present on both sides

    Examples:
        - list_available_dates()                  # compare local and remote
        - list_available_dates(source="local")    # local only
        - list_available_dates(source="remote")   # remote only
    """
    storage_tools = _get_tools()['storage']
    payload = storage_tools.list_available_dates(source=source)
    return json.dumps(payload, ensure_ascii=False, indent=2)
# ==================== 启动入口 ====================
def run_server(
@@ -721,6 +844,11 @@ def run_server(
print(" 11. get_current_config - 获取当前系统配置")
print(" 12. get_system_status - 获取系统运行状态")
print(" 13. trigger_crawl - 手动触发爬取任务")
print()
print(" === 存储同步工具 ===")
print(" 14. sync_from_remote - 从远程存储拉取数据到本地")
print(" 15. get_storage_status - 获取存储配置和状态")
print(" 16. list_available_dates - 列出本地/远程可用日期")
print("=" * 60)
print()
+51 -32
View File
@@ -517,24 +517,55 @@ class DataService:
# 遍历日期文件夹
for date_folder in output_dir.iterdir():
if date_folder.is_dir() and not date_folder.name.startswith('.'):
# 解析日期(格式: YYYY年MM月DD日)
try:
date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_folder.name)
if date_match:
folder_date = datetime(
int(date_match.group(1)),
int(date_match.group(2)),
int(date_match.group(3))
)
available_dates.append(folder_date)
except Exception:
pass
folder_date = self._parse_date_folder_name(date_folder.name)
if folder_date:
available_dates.append(folder_date)
if not available_dates:
return (None, None)
return (min(available_dates), max(available_dates))
def _parse_date_folder_name(self, folder_name: str) -> Optional[datetime]:
"""
解析日期文件夹名称(兼容中文和ISO格式)
支持两种格式:
- 中文格式:YYYY年MM月DD日
- ISO格式:YYYY-MM-DD
Args:
folder_name: 文件夹名称
Returns:
datetime 对象,解析失败返回 None
"""
# 尝试中文格式:YYYY年MM月DD日
chinese_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', folder_name)
if chinese_match:
try:
return datetime(
int(chinese_match.group(1)),
int(chinese_match.group(2)),
int(chinese_match.group(3))
)
except ValueError:
pass
# 尝试 ISO 格式:YYYY-MM-DD
iso_match = re.match(r'(\d{4})-(\d{2})-(\d{2})', folder_name)
if iso_match:
try:
return datetime(
int(iso_match.group(1)),
int(iso_match.group(2)),
int(iso_match.group(3))
)
except ValueError:
pass
return None
def get_system_status(self) -> Dict:
"""
获取系统运行状态
@@ -553,26 +584,14 @@ class DataService:
if output_dir.exists():
# 遍历日期文件夹
for date_folder in output_dir.iterdir():
if date_folder.is_dir():
# 解析日期
try:
date_str = date_folder.name
# 格式: YYYY年MM月DD日
date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_str)
if date_match:
folder_date = datetime(
int(date_match.group(1)),
int(date_match.group(2)),
int(date_match.group(3))
)
if oldest_record is None or folder_date < oldest_record:
oldest_record = folder_date
if latest_record is None or folder_date > latest_record:
latest_record = folder_date
except:
pass
if date_folder.is_dir() and not date_folder.name.startswith('.'):
# 解析日期(兼容中文和ISO格式)
folder_date = self._parse_date_folder_name(date_folder.name)
if folder_date:
if oldest_record is None or folder_date < oldest_record:
oldest_record = folder_date
if latest_record is None or folder_date > latest_record:
latest_record = folder_date
# 计算存储大小
for item in date_folder.rglob("*"):
+315 -67
View File
@@ -2,9 +2,12 @@
文件解析服务
提供txt格式新闻数据和YAML配置文件的解析功能。
支持从 SQLite 数据库和 TXT 文件两种数据源读取。
"""
import json
import re
import sqlite3
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from datetime import datetime
@@ -145,17 +148,310 @@ class ParserService:
def get_date_folder_name(self, date: datetime = None) -> str:
"""
获取日期文件夹名称
获取日期文件夹名称(兼容中文和ISO格式)
Args:
date: 日期对象,默认为今天
Returns:
文件夹名称,格式: YYYY年MM月DD日
实际存在的文件夹名称,优先返回中文格式(YYYY年MM月DD日),
若不存在则返回 ISO 格式(YYYY-MM-DD
"""
if date is None:
date = datetime.now()
return date.strftime("%Y年%m月%d")
return self._find_date_folder(date)
def _get_date_folder_name(self, date: datetime = None) -> str:
"""
获取日期文件夹名称(兼容中文和ISO格式)
Args:
date: 日期对象,默认为今天
Returns:
实际存在的文件夹名称,优先返回中文格式(YYYY年MM月DD日),
若不存在则返回 ISO 格式(YYYY-MM-DD
"""
if date is None:
date = datetime.now()
return self._find_date_folder(date)
def _find_date_folder(self, date: datetime) -> str:
"""
查找实际存在的日期文件夹
支持两种格式:
- 中文格式:YYYY年MM月DD日(优先)
- ISO格式:YYYY-MM-DD
Args:
date: 日期对象
Returns:
实际存在的文件夹名称,若都不存在则返回中文格式
"""
output_dir = self.project_root / "output"
# 中文格式:YYYY年MM月DD日
chinese_format = date.strftime("%Y年%m月%d")
# ISO格式:YYYY-MM-DD
iso_format = date.strftime("%Y-%m-%d")
# 优先检查中文格式
if (output_dir / chinese_format).exists():
return chinese_format
# 其次检查 ISO 格式
if (output_dir / iso_format).exists():
return iso_format
# 都不存在,返回中文格式(与项目现有风格一致)
return chinese_format
def _get_sqlite_db_path(self, date: datetime = None) -> Optional[Path]:
"""
获取 SQLite 数据库文件路径
Args:
date: 日期对象,默认为今天
Returns:
数据库文件路径,如果不存在则返回 None
"""
date_folder = self._get_date_folder_name(date)
db_path = self.project_root / "output" / date_folder / "news.db"
if db_path.exists():
return db_path
return None
def _get_txt_folder_path(self, date: datetime = None) -> Optional[Path]:
"""
获取 TXT 文件夹路径
Args:
date: 日期对象,默认为今天
Returns:
TXT 文件夹路径,如果不存在则返回 None
"""
date_folder = self._get_date_folder_name(date)
txt_path = self.project_root / "output" / date_folder / "txt"
if txt_path.exists() and txt_path.is_dir():
return txt_path
return None
    def _read_from_txt(
        self,
        date: datetime = None,
        platform_ids: Optional[List[str]] = None
    ) -> Optional[Tuple[Dict, Dict, Dict]]:
        """Read and merge a day's news data from its TXT snapshot files.

        Each TXT file is one crawl snapshot; snapshots are processed in
        sorted (chronological filename) order and merged per platform/title.

        Args:
            date: Target date; defaults to today.
            platform_ids: Platform IDs to include; None means all platforms.

        Returns:
            (all_titles, id_to_name, all_timestamps) tuple, or None when no
            TXT folder/files exist or nothing could be parsed.
        """
        txt_folder = self._get_txt_folder_path(date)
        if txt_folder is None:
            return None
        # All TXT snapshots, sorted so earlier crawls are merged first.
        txt_files = sorted(txt_folder.glob("*.txt"))
        if not txt_files:
            return None
        all_titles = {}       # platform_id -> {title -> merged info dict}
        id_to_name = {}       # platform_id -> display name
        all_timestamps = {}   # txt filename -> file mtime
        for txt_file in txt_files:
            try:
                titles_by_id, file_id_to_name = self.parse_txt_file(txt_file)
                # Record the snapshot file's modification time.
                all_timestamps[txt_file.name] = txt_file.stat().st_mtime
                # Merge the platform-id -> name mapping.
                id_to_name.update(file_id_to_name)
                # Merge the title data.
                for source_id, titles in titles_by_id.items():
                    # Apply the optional platform filter.
                    if platform_ids and source_id not in platform_ids:
                        continue
                    if source_id not in all_titles:
                        all_titles[source_id] = {}
                    for title, data in titles.items():
                        if title not in all_titles[source_id]:
                            # First sighting of this title.
                            all_titles[source_id][title] = {
                                "ranks": data.get("ranks", []),
                                "url": data.get("url", ""),
                                "mobileUrl": data.get("mobileUrl", ""),
                                # Filename stem (crawl time) serves as the
                                # first/last-seen marker.
                                "first_time": txt_file.stem,
                                "last_time": txt_file.stem,
                                "count": 1,
                            }
                        else:
                            # Merge into the existing entry for this title.
                            existing = all_titles[source_id][title]
                            # Merge ranks, avoiding duplicates.
                            for rank in data.get("ranks", []):
                                if rank not in existing["ranks"]:
                                    existing["ranks"].append(rank)
                            # Update last-seen time and occurrence count.
                            existing["last_time"] = txt_file.stem
                            existing["count"] += 1
                            # Keep the first non-empty URLs encountered.
                            if not existing["url"] and data.get("url"):
                                existing["url"] = data["url"]
                            if not existing["mobileUrl"] and data.get("mobileUrl"):
                                existing["mobileUrl"] = data["mobileUrl"]
            except Exception as e:
                # Best-effort: a bad snapshot is logged and skipped so the
                # remaining files can still be merged.
                print(f"Warning: 解析 TXT 文件失败 {txt_file}: {e}")
                continue
        if not all_titles:
            return None
        return (all_titles, id_to_name, all_timestamps)
    def _read_from_sqlite(
        self,
        date: datetime = None,
        platform_ids: Optional[List[str]] = None
    ) -> Optional[Tuple[Dict, Dict, Dict]]:
        """Read a day's news data from the per-day SQLite database.

        Rows in the new schema are already de-duplicated by URL and carry:
        - first_crawl_time: first crawl that saw the item
        - last_crawl_time: most recent crawl that saw the item
        - crawl_count: number of crawls that included the item

        Args:
            date: Target date; defaults to today.
            platform_ids: Platform IDs to include; None means all platforms.

        Returns:
            (all_titles, id_to_name, all_timestamps) tuple, or None when the
            database/table is missing, empty, or unreadable.
        """
        db_path = self._get_sqlite_db_path(date)
        if db_path is None:
            return None
        all_titles = {}
        id_to_name = {}
        all_timestamps = {}
        try:
            conn = sqlite3.connect(str(db_path))
            conn.row_factory = sqlite3.Row  # allow access to columns by name
            cursor = conn.cursor()
            # Verify the expected table exists (guards against old layouts).
            cursor.execute("""
                SELECT name FROM sqlite_master
                WHERE type='table' AND name='news_items'
            """)
            if not cursor.fetchone():
                conn.close()
                return None
            # Build the main query, optionally filtered by platform.
            if platform_ids:
                placeholders = ','.join(['?' for _ in platform_ids])
                query = f"""
                    SELECT n.id, n.platform_id, p.name as platform_name, n.title,
                           n.rank, n.url, n.mobile_url,
                           n.first_crawl_time, n.last_crawl_time, n.crawl_count
                    FROM news_items n
                    LEFT JOIN platforms p ON n.platform_id = p.id
                    WHERE n.platform_id IN ({placeholders})
                """
                cursor.execute(query, platform_ids)
            else:
                cursor.execute("""
                    SELECT n.id, n.platform_id, p.name as platform_name, n.title,
                           n.rank, n.url, n.mobile_url,
                           n.first_crawl_time, n.last_crawl_time, n.crawl_count
                    FROM news_items n
                    LEFT JOIN platforms p ON n.platform_id = p.id
                """)
            rows = cursor.fetchall()
            # Collect all news-item ids so rank history can be fetched in one
            # bulk query instead of one query per row.
            news_ids = [row['id'] for row in rows]
            rank_history_map = {}
            if news_ids:
                placeholders = ",".join("?" * len(news_ids))
                cursor.execute(f"""
                    SELECT news_item_id, rank FROM rank_history
                    WHERE news_item_id IN ({placeholders})
                    ORDER BY news_item_id, crawl_time
                """, news_ids)
                for rh_row in cursor.fetchall():
                    news_id = rh_row['news_item_id']
                    rank = rh_row['rank']
                    if news_id not in rank_history_map:
                        rank_history_map[news_id] = []
                    rank_history_map[news_id].append(rank)
            for row in rows:
                news_id = row['id']
                platform_id = row['platform_id']
                platform_name = row['platform_name'] or platform_id
                title = row['title']
                # Record the platform display name once.
                if platform_id not in id_to_name:
                    id_to_name[platform_id] = platform_name
                # Initialize the per-platform dict.
                if platform_id not in all_titles:
                    all_titles[platform_id] = {}
                # Use the full rank history; fall back to the current rank.
                ranks = rank_history_map.get(news_id, [row['rank']])
                # Rows are already URL-deduplicated, so assign directly.
                all_titles[platform_id][title] = {
                    "ranks": ranks,
                    "url": row['url'] or "",
                    "mobileUrl": row['mobile_url'] or "",
                    "first_time": row['first_crawl_time'] or "",
                    "last_time": row['last_crawl_time'] or "",
                    "count": row['crawl_count'] or 1,
                }
            # Use crawl times as the timestamp keys.
            cursor.execute("""
                SELECT crawl_time FROM crawl_records
                ORDER BY crawl_time
            """)
            for row in cursor.fetchall():
                crawl_time = row['crawl_time']
                all_timestamps[f"{crawl_time}.db"] = 0  # placeholder mtime
            conn.close()
            if not all_titles:
                return None
            return (all_titles, id_to_name, all_timestamps)
        except Exception as e:
            # Best-effort: log and signal "no SQLite data" so callers can
            # fall back to the TXT snapshots.
            print(f"Warning: 从 SQLite 读取数据失败: {e}")
            return None
def read_all_titles_for_date(
self,
@@ -163,7 +459,7 @@ class ParserService:
platform_ids: Optional[List[str]] = None
) -> Tuple[Dict, Dict, Dict]:
"""
读取指定日期的所有标题文件(带缓存)
读取指定日期的所有标题(带缓存)
Args:
date: 日期对象,默认为今天
@@ -193,71 +489,23 @@ class ParserService:
if cached:
return cached
# 缓存未命中,读取文件
date_folder = self.get_date_folder_name(date)
txt_dir = self.project_root / "output" / date_folder / "txt"
# 优先从 SQLite 读取
sqlite_result = self._read_from_sqlite(date, platform_ids)
if sqlite_result:
self.cache.set(cache_key, sqlite_result)
return sqlite_result
if not txt_dir.exists():
raise DataNotFoundError(
f"未找到 {date_folder} 的数据目录",
suggestion="请先运行爬虫或检查日期是否正确"
)
# SQLite 不存在,尝试从 TXT 读取
txt_result = self._read_from_txt(date, platform_ids)
if txt_result:
self.cache.set(cache_key, txt_result)
return txt_result
all_titles = {}
id_to_name = {}
all_timestamps = {}
# 读取所有txt文件
txt_files = sorted(txt_dir.glob("*.txt"))
if not txt_files:
raise DataNotFoundError(
f"{date_folder} 没有数据文件",
suggestion="请等待爬虫任务完成"
)
for txt_file in txt_files:
try:
titles_by_id, file_id_to_name = self.parse_txt_file(txt_file)
# 更新id_to_name
id_to_name.update(file_id_to_name)
# 合并标题数据
for platform_id, titles in titles_by_id.items():
# 如果指定了平台过滤
if platform_ids and platform_id not in platform_ids:
continue
if platform_id not in all_titles:
all_titles[platform_id] = {}
for title, info in titles.items():
if title in all_titles[platform_id]:
# 合并排名
all_titles[platform_id][title]["ranks"].extend(info["ranks"])
else:
all_titles[platform_id][title] = info.copy()
# 记录文件时间戳
all_timestamps[txt_file.name] = txt_file.stat().st_mtime
except Exception as e:
# 忽略单个文件的解析错误,继续处理其他文件
print(f"Warning: 解析文件 {txt_file} 失败: {e}")
continue
if not all_titles:
raise DataNotFoundError(
f"{date_folder} 没有有效的数据",
suggestion="请检查数据文件格式或重新运行爬虫"
)
# 缓存结果
result = (all_titles, id_to_name, all_timestamps)
self.cache.set(cache_key, result)
return result
# 两种数据源都不存在
raise DataNotFoundError(
f"未找到 {date_str} 的数据",
suggestion="请先运行爬虫或检查日期是否正确"
)
def parse_yaml_config(self, config_path: str = None) -> dict:
"""
-1
View File
@@ -25,7 +25,6 @@ def calculate_news_weight(news_data: Dict, rank_threshold: int = 5) -> float:
"""
计算新闻权重(用于排序)
基于 main.py 的权重算法实现,综合考虑:
- 排名权重 (60%):新闻在榜单中的排名
- 频次权重 (30%):新闻出现的次数
- 热度权重 (10%):高排名出现的比例
+468
View File
@@ -0,0 +1,468 @@
# coding=utf-8
"""
存储同步工具
实现从远程存储拉取数据到本地、获取存储状态、列出可用日期等功能。
"""
import os
import re
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Optional
import yaml
from ..utils.errors import MCPError
class StorageSyncTools:
    """Storage synchronization tools.

    Pulls crawled news data from a remote S3-compatible store into the
    local data directory, reports storage configuration/status, and lists
    the dates available locally and remotely.
    """

    def __init__(self, project_root: str = None):
        """Initialize the storage sync tools.

        Args:
            project_root: Project root directory. When omitted, it is
                derived from this file's location (three levels up).
        """
        if project_root:
            self.project_root = Path(project_root)
        else:
            current_file = Path(__file__)
            self.project_root = current_file.parent.parent.parent
        # Lazy caches: parsed YAML config and the remote backend instance.
        self._config = None
        self._remote_backend = None

    def _load_config(self) -> dict:
        """Load and cache config/config.yaml.

        Returns:
            The parsed config dict; {} when the file is missing or empty.
        """
        if self._config is None:
            config_path = self.project_root / "config" / "config.yaml"
            if config_path.exists():
                with open(config_path, "r", encoding="utf-8") as f:
                    # BUGFIX: yaml.safe_load returns None for an empty file,
                    # which would make every .get() call below crash and also
                    # defeat the cache (None re-triggers loading). Coerce to {}.
                    self._config = yaml.safe_load(f) or {}
            else:
                self._config = {}
        return self._config

    def _get_storage_config(self) -> dict:
        """Return the ``storage`` section of the config ({} when absent)."""
        config = self._load_config()
        return config.get("storage", {})

    def _get_remote_config(self) -> dict:
        """Return remote-storage settings, merging config file and env vars.

        Config-file values (storage.remote.*) take precedence; the S3_*
        environment variables serve as fallbacks.
        """
        storage_config = self._get_storage_config()
        remote_config = storage_config.get("remote", {})
        return {
            "endpoint_url": remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL", ""),
            "bucket_name": remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME", ""),
            "access_key_id": remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID", ""),
            "secret_access_key": remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY", ""),
            "region": remote_config.get("region") or os.environ.get("S3_REGION", ""),
        }

    def _has_remote_config(self) -> bool:
        """Return True when all four required remote settings are present."""
        config = self._get_remote_config()
        return bool(
            config.get("bucket_name") and
            config.get("access_key_id") and
            config.get("secret_access_key") and
            config.get("endpoint_url")
        )

    def _get_remote_backend(self):
        """Create (once) and return the remote storage backend.

        Returns:
            A RemoteStorageBackend instance, or None when remote storage is
            not configured, boto3 is not installed, or construction fails.
        """
        if self._remote_backend is not None:
            return self._remote_backend
        if not self._has_remote_config():
            return None
        try:
            # Imported lazily so boto3 is only required when remote storage
            # is actually used.
            from trendradar.storage.remote import RemoteStorageBackend
            remote_config = self._get_remote_config()
            config = self._load_config()
            timezone = config.get("app", {}).get("timezone", "Asia/Shanghai")
            self._remote_backend = RemoteStorageBackend(
                bucket_name=remote_config["bucket_name"],
                access_key_id=remote_config["access_key_id"],
                secret_access_key=remote_config["secret_access_key"],
                endpoint_url=remote_config["endpoint_url"],
                region=remote_config.get("region", ""),
                timezone=timezone,
            )
            return self._remote_backend
        except ImportError:
            print("[存储同步] 远程存储后端需要安装 boto3: pip install boto3")
            return None
        except Exception as e:
            print(f"[存储同步] 创建远程后端失败: {e}")
            return None

    def _get_local_data_dir(self) -> Path:
        """Return the local data dir (storage.local.data_dir, default 'output')."""
        storage_config = self._get_storage_config()
        local_config = storage_config.get("local", {})
        data_dir = local_config.get("data_dir", "output")
        return self.project_root / data_dir

    def _parse_date_folder_name(self, folder_name: str) -> Optional[datetime]:
        """Parse a date folder name (supports Chinese and ISO formats).

        Supported formats:
        - Chinese: YYYY年MM月DD日
        - ISO: YYYY-MM-DD

        Returns:
            A datetime for the folder's date, or None when unparseable.
        """
        # Try ISO format first.
        iso_match = re.match(r'(\d{4})-(\d{2})-(\d{2})', folder_name)
        if iso_match:
            try:
                return datetime(
                    int(iso_match.group(1)),
                    int(iso_match.group(2)),
                    int(iso_match.group(3))
                )
            except ValueError:
                pass
        # Fall back to the Chinese format.
        chinese_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', folder_name)
        if chinese_match:
            try:
                return datetime(
                    int(chinese_match.group(1)),
                    int(chinese_match.group(2)),
                    int(chinese_match.group(3))
                )
            except ValueError:
                pass
        return None

    def _get_local_dates(self) -> List[str]:
        """Return locally available dates as ISO strings, newest first."""
        local_dir = self._get_local_data_dir()
        dates = []
        if not local_dir.exists():
            return dates
        for item in local_dir.iterdir():
            # Skip files and hidden directories.
            if item.is_dir() and not item.name.startswith('.'):
                folder_date = self._parse_date_folder_name(item.name)
                if folder_date:
                    dates.append(folder_date.strftime("%Y-%m-%d"))
        return sorted(dates, reverse=True)

    def _calculate_dir_size(self, path: Path) -> int:
        """Return the total size in bytes of all files under *path*."""
        total_size = 0
        if path.exists():
            for item in path.rglob("*"):
                if item.is_file():
                    total_size += item.stat().st_size
        return total_size

    def sync_from_remote(self, days: int = 7) -> Dict:
        """Pull data from remote storage into the local data directory.

        Args:
            days: Pull the most recent N days of data (default 7).

        Returns:
            Sync result dict with success flag, synced/skipped/failed dates
            and a human-readable message.
        """
        try:
            # Require a usable remote configuration.
            if not self._has_remote_config():
                return {
                    "success": False,
                    "error": {
                        "code": "REMOTE_NOT_CONFIGURED",
                        "message": "未配置远程存储",
                        "suggestion": "请在 config/config.yaml 中配置 storage.remote 或设置环境变量"
                    }
                }
            # Build the remote backend.
            remote_backend = self._get_remote_backend()
            if remote_backend is None:
                return {
                    "success": False,
                    "error": {
                        "code": "REMOTE_BACKEND_FAILED",
                        "message": "无法创建远程存储后端",
                        "suggestion": "请检查远程存储配置和 boto3 是否已安装"
                    }
                }
            # Ensure the local data directory exists.
            local_dir = self._get_local_data_dir()
            local_dir.mkdir(parents=True, exist_ok=True)
            # Dates available remotely.
            remote_dates = remote_backend.list_remote_dates()
            # Dates already present locally (as ISO strings).
            local_dates = set(self._get_local_dates())
            # Compute the target dates: the most recent N days that exist
            # remotely, using the configured timezone for "today".
            from trendradar.utils.time import get_configured_time
            config = self._load_config()
            timezone = config.get("app", {}).get("timezone", "Asia/Shanghai")
            now = get_configured_time(timezone)
            target_dates = []
            for i in range(days):
                date = now - timedelta(days=i)
                date_str = date.strftime("%Y-%m-%d")
                if date_str in remote_dates:
                    target_dates.append(date_str)
            # Pull each target date, skipping those already local.
            synced_dates = []
            skipped_dates = []
            failed_dates = []
            for date_str in target_dates:
                if date_str in local_dates:
                    skipped_dates.append(date_str)
                    continue
                # Download the per-day database for this date.
                try:
                    local_date_dir = local_dir / date_str
                    local_db_path = local_date_dir / "news.db"
                    remote_key = f"news/{date_str}.db"
                    local_date_dir.mkdir(parents=True, exist_ok=True)
                    remote_backend.s3_client.download_file(
                        remote_backend.bucket_name,
                        remote_key,
                        str(local_db_path)
                    )
                    synced_dates.append(date_str)
                    print(f"[存储同步] 已拉取: {date_str}")
                except Exception as e:
                    # A single failed date must not abort the whole sync.
                    failed_dates.append({"date": date_str, "error": str(e)})
                    print(f"[存储同步] 拉取失败 ({date_str}): {e}")
            return {
                "success": True,
                "synced_files": len(synced_dates),
                "synced_dates": synced_dates,
                "skipped_dates": skipped_dates,
                "failed_dates": failed_dates,
                "message": f"成功同步 {len(synced_dates)} 天数据" + (
                    f",跳过 {len(skipped_dates)} 天(本地已存在)" if skipped_dates else ""
                ) + (
                    f",失败 {len(failed_dates)}" if failed_dates else ""
                )
            }
        except MCPError as e:
            return {
                "success": False,
                "error": e.to_dict()
            }
        except Exception as e:
            return {
                "success": False,
                "error": {
                    "code": "INTERNAL_ERROR",
                    "message": str(e)
                }
            }

    def get_storage_status(self) -> Dict:
        """Report the storage configuration and state of both stores.

        Returns:
            Status dict with backend type, local stats, remote stats and the
            pull configuration.
        """
        try:
            storage_config = self._get_storage_config()
            config = self._load_config()
            # Local storage status.
            local_config = storage_config.get("local", {})
            local_dir = self._get_local_data_dir()
            local_size = self._calculate_dir_size(local_dir)
            local_dates = self._get_local_dates()
            local_status = {
                "data_dir": local_config.get("data_dir", "output"),
                "retention_days": local_config.get("retention_days", 0),
                "total_size": f"{local_size / 1024 / 1024:.2f} MB",
                "total_size_bytes": local_size,
                "date_count": len(local_dates),
                # local_dates is sorted newest first.
                "earliest_date": local_dates[-1] if local_dates else None,
                "latest_date": local_dates[0] if local_dates else None,
            }
            # Remote storage status.
            remote_config = storage_config.get("remote", {})
            has_remote = self._has_remote_config()
            remote_status = {
                "configured": has_remote,
                "retention_days": remote_config.get("retention_days", 0),
            }
            if has_remote:
                merged_config = self._get_remote_config()
                # NOTE(review): values are returned as-is — no masking is
                # applied; confirm whether endpoint/bucket are sensitive.
                endpoint = merged_config.get("endpoint_url", "")
                bucket = merged_config.get("bucket_name", "")
                remote_status["endpoint_url"] = endpoint
                remote_status["bucket_name"] = bucket
                # Try to list remote dates; report errors without failing.
                remote_backend = self._get_remote_backend()
                if remote_backend:
                    try:
                        remote_dates = remote_backend.list_remote_dates()
                        remote_status["date_count"] = len(remote_dates)
                        remote_status["earliest_date"] = remote_dates[-1] if remote_dates else None
                        remote_status["latest_date"] = remote_dates[0] if remote_dates else None
                    except Exception as e:
                        remote_status["error"] = str(e)
            # Automatic-pull configuration.
            pull_config = storage_config.get("pull", {})
            pull_status = {
                "enabled": pull_config.get("enabled", False),
                "days": pull_config.get("days", 7),
            }
            return {
                "success": True,
                "backend": storage_config.get("backend", "auto"),
                "local": local_status,
                "remote": remote_status,
                "pull": pull_status,
            }
        except MCPError as e:
            return {
                "success": False,
                "error": e.to_dict()
            }
        except Exception as e:
            return {
                "success": False,
                "error": {
                    "code": "INTERNAL_ERROR",
                    "message": str(e)
                }
            }

    def list_available_dates(self, source: str = "both") -> Dict:
        """List the dates available locally and/or remotely.

        Args:
            source: Data source to inspect:
                - "local": local only
                - "remote": remote only
                - "both": both, plus a comparison (default)

        Returns:
            Dict with per-source date lists and, for "both", a comparison of
            which dates exist only locally, only remotely, or on both sides.
        """
        try:
            result = {
                "success": True,
            }
            # Local dates.
            if source in ("local", "both"):
                local_dates = self._get_local_dates()
                result["local"] = {
                    "dates": local_dates,
                    "count": len(local_dates),
                    "earliest": local_dates[-1] if local_dates else None,
                    "latest": local_dates[0] if local_dates else None,
                }
            # Remote dates.
            if source in ("remote", "both"):
                if not self._has_remote_config():
                    result["remote"] = {
                        "configured": False,
                        "dates": [],
                        "count": 0,
                        "earliest": None,
                        "latest": None,
                        "error": "未配置远程存储"
                    }
                else:
                    remote_backend = self._get_remote_backend()
                    if remote_backend:
                        try:
                            remote_dates = remote_backend.list_remote_dates()
                            result["remote"] = {
                                "configured": True,
                                "dates": remote_dates,
                                "count": len(remote_dates),
                                "earliest": remote_dates[-1] if remote_dates else None,
                                "latest": remote_dates[0] if remote_dates else None,
                            }
                        except Exception as e:
                            result["remote"] = {
                                "configured": True,
                                "dates": [],
                                "count": 0,
                                "earliest": None,
                                "latest": None,
                                "error": str(e)
                            }
                    else:
                        result["remote"] = {
                            "configured": True,
                            "dates": [],
                            "count": 0,
                            "earliest": None,
                            "latest": None,
                            "error": "无法创建远程存储后端"
                        }
            # When both sides were queried, compute the set differences.
            if source == "both" and "local" in result and "remote" in result:
                local_set = set(result["local"]["dates"])
                remote_set = set(result["remote"].get("dates", []))
                result["comparison"] = {
                    "only_local": sorted(list(local_set - remote_set), reverse=True),
                    "only_remote": sorted(list(remote_set - local_set), reverse=True),
                    "both": sorted(list(local_set & remote_set), reverse=True),
                }
            return result
        except MCPError as e:
            return {
                "success": False,
                "error": e.to_dict()
            }
        except Exception as e:
            return {
                "success": False,
                "error": {
                    "code": "INTERNAL_ERROR",
                    "message": str(e)
                }
            }
+93 -190
View File
@@ -87,13 +87,13 @@ class SystemManagementTools:
>>> print(result['saved_files'])
"""
try:
import json
import time
import random
import requests
from datetime import datetime
import pytz
import yaml
from trendradar.crawler.fetcher import DataFetcher
from trendradar.storage.local import LocalStorageBackend
from trendradar.storage.base import convert_crawl_results_to_news_data
from trendradar.utils.time import get_configured_time, format_date_folder, format_time_filename
from ..services.cache_service import get_cache
# 参数验证
platforms = validate_platforms(platforms)
@@ -129,9 +129,6 @@ class SystemManagementTools:
else:
target_platforms = all_platforms
# 获取请求间隔
request_interval = config_data.get("crawler", {}).get("request_interval", 100)
# 构建平台ID列表
ids = []
for platform in target_platforms:
@@ -142,87 +139,82 @@ class SystemManagementTools:
print(f"开始临时爬取,平台: {[p.get('name', p['id']) for p in target_platforms]}")
# 爬取数据
results = {}
id_to_name = {}
failed_ids = []
# 初始化数据获取器
crawler_config = config_data.get("crawler", {})
proxy_url = None
if crawler_config.get("use_proxy"):
proxy_url = crawler_config.get("proxy_url")
fetcher = DataFetcher(proxy_url=proxy_url)
request_interval = crawler_config.get("request_interval", 100)
for i, id_info in enumerate(ids):
if isinstance(id_info, tuple):
id_value, name = id_info
else:
id_value = id_info
name = id_value
# 执行爬取
results, id_to_name, failed_ids = fetcher.crawl_websites(
ids_list=ids,
request_interval=request_interval
)
id_to_name[id_value] = name
# 获取当前时间(统一使用 trendradar 的时间工具)
# 从配置中读取时区,默认为 Asia/Shanghai
timezone = config_data.get("app", {}).get("timezone", "Asia/Shanghai")
current_time = get_configured_time(timezone)
crawl_date = format_date_folder(None, timezone)
crawl_time_str = format_time_filename(timezone)
# 构建请求URL
url = f"https://newsnow.busiyi.world/api/s?id={id_value}&latest"
# 转换为标准数据模型
news_data = convert_crawl_results_to_news_data(
results=results,
id_to_name=id_to_name,
failed_ids=failed_ids,
crawl_time=crawl_time_str,
crawl_date=crawl_date
)
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
"Accept": "application/json, text/plain, */*",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Connection": "keep-alive",
"Cache-Control": "no-cache",
}
# 初始化存储后端
storage = LocalStorageBackend(
data_dir=str(self.project_root / "output"),
enable_txt=True,
enable_html=True,
timezone=timezone
)
# 重试机制
max_retries = 2
retries = 0
success = False
# 尝试持久化数据
save_success = False
save_error_msg = ""
saved_files = {}
while retries <= max_retries and not success:
try:
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
try:
# 1. 保存到 SQLite (核心持久化)
if storage.save_news_data(news_data):
save_success = True
# 2. 如果请求保存到本地,生成 TXT/HTML 快照
if save_to_local:
# 保存 TXT
txt_path = storage.save_txt_snapshot(news_data)
if txt_path:
saved_files["txt"] = txt_path
data_text = response.text
data_json = json.loads(data_text)
# 保存 HTML (使用简化版生成器)
html_content = self._generate_simple_html(results, id_to_name, failed_ids, current_time)
html_filename = f"{crawl_time_str}.html"
html_path = storage.save_html_report(html_content, html_filename)
if html_path:
saved_files["html"] = html_path
status = data_json.get("status", "未知")
if status not in ["success", "cache"]:
raise ValueError(f"响应状态异常: {status}")
except Exception as e:
# 捕获所有保存错误(特别是 Docker 只读卷导致的 PermissionError
print(f"[System] 数据保存失败: {e}")
save_success = False
save_error_msg = str(e)
status_info = "最新数据" if status == "success" else "缓存数据"
print(f"获取 {id_value} 成功({status_info}")
# 3. 清除缓存,确保下次查询获取最新数据
# 即使保存失败,内存中的数据可能已经通过其他方式更新,或者是临时的
get_cache().clear()
print("[System] 缓存已清除")
# 解析数据
results[id_value] = {}
for index, item in enumerate(data_json.get("items", []), 1):
title = item["title"]
url_link = item.get("url", "")
mobile_url = item.get("mobileUrl", "")
if title in results[id_value]:
results[id_value][title]["ranks"].append(index)
else:
results[id_value][title] = {
"ranks": [index],
"url": url_link,
"mobileUrl": mobile_url,
}
success = True
except Exception as e:
retries += 1
if retries <= max_retries:
wait_time = random.uniform(3, 5)
print(f"请求 {id_value} 失败: {e}. {wait_time:.2f}秒后重试...")
time.sleep(wait_time)
else:
print(f"请求 {id_value} 失败: {e}")
failed_ids.append(id_value)
# 请求间隔
if i < len(ids) - 1:
actual_interval = request_interval + random.randint(-10, 20)
actual_interval = max(50, actual_interval)
time.sleep(actual_interval / 1000)
# 格式化返回数据
news_data = []
# 构建返回结果
news_response_data = []
for platform_id, titles_data in results.items():
platform_name = id_to_name.get(platform_id, platform_id)
for title, info in titles_data.items():
@@ -230,131 +222,42 @@ class SystemManagementTools:
"platform_id": platform_id,
"platform_name": platform_name,
"title": title,
"ranks": info["ranks"]
"ranks": info.get("ranks", [])
}
# 条件性添加 URL 字段
if include_url:
news_item["url"] = info.get("url", "")
news_item["mobile_url"] = info.get("mobileUrl", "")
news_response_data.append(news_item)
news_data.append(news_item)
# 获取北京时间
beijing_tz = pytz.timezone("Asia/Shanghai")
now = datetime.now(beijing_tz)
# 构建返回结果
result = {
"success": True,
"task_id": f"crawl_{int(time.time())}",
"status": "completed",
"crawl_time": now.strftime("%Y-%m-%d %H:%M:%S"),
"crawl_time": current_time.strftime("%Y-%m-%d %H:%M:%S"),
"platforms": list(results.keys()),
"total_news": len(news_data),
"total_news": len(news_response_data),
"failed_platforms": failed_ids,
"data": news_data,
"saved_to_local": save_to_local
"data": news_response_data,
"saved_to_local": save_success and save_to_local
}
# 如果需要持久化,调用保存逻辑
if save_to_local:
try:
import re
# 辅助函数:清理标题
def clean_title(title: str) -> str:
"""清理标题中的特殊字符"""
if not isinstance(title, str):
title = str(title)
cleaned_title = title.replace("\n", " ").replace("\r", " ")
cleaned_title = re.sub(r"\s+", " ", cleaned_title)
cleaned_title = cleaned_title.strip()
return cleaned_title
# 辅助函数:创建目录
def ensure_directory_exists(directory: str):
"""确保目录存在"""
Path(directory).mkdir(parents=True, exist_ok=True)
# 格式化日期和时间
date_folder = now.strftime("%Y年%m月%d")
time_filename = now.strftime("%H时%M分")
# 创建 txt 文件路径
txt_dir = self.project_root / "output" / date_folder / "txt"
ensure_directory_exists(str(txt_dir))
txt_file_path = txt_dir / f"{time_filename}.txt"
# 创建 html 文件路径
html_dir = self.project_root / "output" / date_folder / "html"
ensure_directory_exists(str(html_dir))
html_file_path = html_dir / f"{time_filename}.html"
# 保存 txt 文件(按照 main.py 的格式)
with open(txt_file_path, "w", encoding="utf-8") as f:
for id_value, title_data in results.items():
# id | name 或 id
name = id_to_name.get(id_value)
if name and name != id_value:
f.write(f"{id_value} | {name}\n")
else:
f.write(f"{id_value}\n")
# 按排名排序标题
sorted_titles = []
for title, info in title_data.items():
cleaned = clean_title(title)
if isinstance(info, dict):
ranks = info.get("ranks", [])
url = info.get("url", "")
mobile_url = info.get("mobileUrl", "")
else:
ranks = info if isinstance(info, list) else []
url = ""
mobile_url = ""
rank = ranks[0] if ranks else 1
sorted_titles.append((rank, cleaned, url, mobile_url))
sorted_titles.sort(key=lambda x: x[0])
for rank, cleaned, url, mobile_url in sorted_titles:
line = f"{rank}. {cleaned}"
if url:
line += f" [URL:{url}]"
if mobile_url:
line += f" [MOBILE:{mobile_url}]"
f.write(line + "\n")
f.write("\n")
if failed_ids:
f.write("==== 以下ID请求失败 ====\n")
for id_value in failed_ids:
f.write(f"{id_value}\n")
# 保存 html 文件(简化版)
html_content = self._generate_simple_html(results, id_to_name, failed_ids, now)
with open(html_file_path, "w", encoding="utf-8") as f:
f.write(html_content)
print(f"数据已保存到:")
print(f" TXT: {txt_file_path}")
print(f" HTML: {html_file_path}")
result["saved_files"] = {
"txt": str(txt_file_path),
"html": str(html_file_path)
}
result["note"] = "数据已持久化到 output 文件夹"
except Exception as e:
print(f"保存文件失败: {e}")
result["save_error"] = str(e)
result["note"] = "爬取成功但保存失败,数据仅在内存中"
if save_success:
if save_to_local:
result["saved_files"] = saved_files
result["note"] = "数据已保存到 SQLite 数据库及 output 文件夹"
else:
result["note"] = "数据已保存到 SQLite 数据库 (仅内存中返回结果,未生成TXT快照)"
else:
result["note"] = "临时爬取结果,未持久化到output文件夹"
# 明确告知用户保存失败
result["saved_to_local"] = False
result["save_error"] = save_error_msg
if "Read-only file system" in save_error_msg or "Permission denied" in save_error_msg:
result["note"] = "爬取成功,但无法写入数据库(Docker只读模式)。数据仅在本次返回中有效。"
else:
result["note"] = f"爬取成功但保存失败: {save_error_msg}"
# 清理资源
storage.cleanup()
return result
+3 -3
View File
@@ -283,13 +283,13 @@ class DateParser:
date: datetime对象
Returns:
文件夹名称,格式: YYYYMMDD
文件夹名称,格式: YYYY-MM-DD
Examples:
>>> DateParser.format_date_folder(datetime(2025, 10, 11))
'2025年10月11日'
'2025-10-11'
"""
return date.strftime("%Y年%m月%d")
return date.strftime("%Y-%m-%d")
@staticmethod
def validate_date_not_future(date: datetime) -> None: