v3.0.0 AI 智能分析功能

2026-05-01 01:12:42 +08:00 · 2025-10-20 21:41:24 +08:00
parent da81d69309
commit 2afc24e6fb
29 changed files with 6931 additions and 54 deletions
@@ -0,0 +1,5 @@
+"""
+服务层模块
+
+提供数据访问、缓存、解析等核心服务。
+"""
@@ -0,0 +1,136 @@
+"""
+缓存服务
+
+实现TTL缓存机制，提升数据访问性能。
+"""
+
+import time
+from typing import Any, Optional
+from threading import Lock
+
+
+class CacheService:
+    """缓存服务类"""
+
+    def __init__(self):
+        """初始化缓存服务"""
+        self._cache = {}
+        self._timestamps = {}
+        self._lock = Lock()
+
+    def get(self, key: str, ttl: int = 900) -> Optional[Any]:
+        """
+        获取缓存数据
+
+        Args:
+            key: 缓存键
+            ttl: 存活时间（秒），默认15分钟
+
+        Returns:
+            缓存的值，如果不存在或已过期则返回None
+        """
+        with self._lock:
+            if key in self._cache:
+                # 检查是否过期
+                if time.time() - self._timestamps[key] < ttl:
+                    return self._cache[key]
+                else:
+                    # 已过期，删除缓存
+                    del self._cache[key]
+                    del self._timestamps[key]
+        return None
+
+    def set(self, key: str, value: Any) -> None:
+        """
+        设置缓存数据
+
+        Args:
+            key: 缓存键
+            value: 缓存值
+        """
+        with self._lock:
+            self._cache[key] = value
+            self._timestamps[key] = time.time()
+
+    def delete(self, key: str) -> bool:
+        """
+        删除缓存
+
+        Args:
+            key: 缓存键
+
+        Returns:
+            是否成功删除
+        """
+        with self._lock:
+            if key in self._cache:
+                del self._cache[key]
+                del self._timestamps[key]
+                return True
+        return False
+
+    def clear(self) -> None:
+        """清空所有缓存"""
+        with self._lock:
+            self._cache.clear()
+            self._timestamps.clear()
+
+    def cleanup_expired(self, ttl: int = 900) -> int:
+        """
+        清理过期缓存
+
+        Args:
+            ttl: 存活时间（秒）
+
+        Returns:
+            清理的条目数量
+        """
+        with self._lock:
+            current_time = time.time()
+            expired_keys = [
+                key for key, timestamp in self._timestamps.items()
+                if current_time - timestamp >= ttl
+            ]
+
+            for key in expired_keys:
+                del self._cache[key]
+                del self._timestamps[key]
+
+            return len(expired_keys)
+
+    def get_stats(self) -> dict:
+        """
+        获取缓存统计信息
+
+        Returns:
+            统计信息字典
+        """
+        with self._lock:
+            return {
+                "total_entries": len(self._cache),
+                "oldest_entry_age": (
+                    time.time() - min(self._timestamps.values())
+                    if self._timestamps else 0
+                ),
+                "newest_entry_age": (
+                    time.time() - max(self._timestamps.values())
+                    if self._timestamps else 0
+                )
+            }
+
+
+# 全局缓存实例
+_global_cache = None
+
+
+def get_cache() -> CacheService:
+    """
+    获取全局缓存实例
+
+    Returns:
+        全局缓存服务实例
+    """
+    global _global_cache
+    if _global_cache is None:
+        _global_cache = CacheService()
+    return _global_cache
@@ -0,0 +1,564 @@
+"""
+数据访问服务
+
+提供统一的数据查询接口,封装数据访问逻辑。
+"""
+
+import re
+from collections import Counter
+from datetime import datetime, timedelta
+from typing import Dict, List, Optional, Tuple
+
+from .cache_service import get_cache
+from .parser_service import ParserService
+from ..utils.errors import DataNotFoundError
+
+
+class DataService:
+    """数据访问服务类"""
+
+    def __init__(self, project_root: str = None):
+        """
+        初始化数据服务
+
+        Args:
+            project_root: 项目根目录
+        """
+        self.parser = ParserService(project_root)
+        self.cache = get_cache()
+
+    def get_latest_news(
+        self,
+        platforms: Optional[List[str]] = None,
+        limit: int = 50,
+        include_url: bool = False
+    ) -> List[Dict]:
+        """
+        获取最新一批爬取的新闻数据
+
+        Args:
+            platforms: 平台ID列表,None表示所有平台
+            limit: 返回条数限制
+            include_url: 是否包含URL链接,默认False(节省token)
+
+        Returns:
+            新闻列表
+
+        Raises:
+            DataNotFoundError: 数据不存在
+        """
+        # 尝试从缓存获取
+        cache_key = f"latest_news:{','.join(platforms or [])}:{limit}:{include_url}"
+        cached = self.cache.get(cache_key, ttl=900)  # 15分钟缓存
+        if cached:
+            return cached
+
+        # 读取今天的数据
+        all_titles, id_to_name, timestamps = self.parser.read_all_titles_for_date(
+            date=None,
+            platform_ids=platforms
+        )
+
+        # 获取最新的文件时间
+        if timestamps:
+            latest_timestamp = max(timestamps.values())
+            fetch_time = datetime.fromtimestamp(latest_timestamp)
+        else:
+            fetch_time = datetime.now()
+
+        # 转换为新闻列表
+        news_list = []
+        for platform_id, titles in all_titles.items():
+            platform_name = id_to_name.get(platform_id, platform_id)
+
+            for title, info in titles.items():
+                # 取第一个排名
+                rank = info["ranks"][0] if info["ranks"] else 0
+
+                news_item = {
+                    "title": title,
+                    "platform": platform_id,
+                    "platform_name": platform_name,
+                    "rank": rank,
+                    "timestamp": fetch_time.strftime("%Y-%m-%d %H:%M:%S")
+                }
+
+                # 条件性添加 URL 字段
+                if include_url:
+                    news_item["url"] = info.get("url", "")
+                    news_item["mobileUrl"] = info.get("mobileUrl", "")
+
+                news_list.append(news_item)
+
+        # 按排名排序
+        news_list.sort(key=lambda x: x["rank"])
+
+        # 限制返回数量
+        result = news_list[:limit]
+
+        # 缓存结果
+        self.cache.set(cache_key, result)
+
+        return result
+
+    def get_news_by_date(
+        self,
+        target_date: datetime,
+        platforms: Optional[List[str]] = None,
+        limit: int = 50,
+        include_url: bool = False
+    ) -> List[Dict]:
+        """
+        按指定日期获取新闻
+
+        Args:
+            target_date: 目标日期
+            platforms: 平台ID列表,None表示所有平台
+            limit: 返回条数限制
+            include_url: 是否包含URL链接,默认False(节省token)
+
+        Returns:
+            新闻列表
+
+        Raises:
+            DataNotFoundError: 数据不存在
+
+        Examples:
+            >>> service = DataService()
+            >>> news = service.get_news_by_date(
+            ...     target_date=datetime(2025, 10, 10),
+            ...     platforms=['zhihu'],
+            ...     limit=20
+            ... )
+        """
+        # 尝试从缓存获取
+        date_str = target_date.strftime("%Y-%m-%d")
+        cache_key = f"news_by_date:{date_str}:{','.join(platforms or [])}:{limit}:{include_url}"
+        cached = self.cache.get(cache_key, ttl=1800)  # 30分钟缓存
+        if cached:
+            return cached
+
+        # 读取指定日期的数据
+        all_titles, id_to_name, timestamps = self.parser.read_all_titles_for_date(
+            date=target_date,
+            platform_ids=platforms
+        )
+
+        # 转换为新闻列表
+        news_list = []
+        for platform_id, titles in all_titles.items():
+            platform_name = id_to_name.get(platform_id, platform_id)
+
+            for title, info in titles.items():
+                # 计算平均排名
+                avg_rank = sum(info["ranks"]) / len(info["ranks"]) if info["ranks"] else 0
+
+                news_item = {
+                    "title": title,
+                    "platform": platform_id,
+                    "platform_name": platform_name,
+                    "rank": info["ranks"][0] if info["ranks"] else 0,
+                    "avg_rank": round(avg_rank, 2),
+                    "count": len(info["ranks"]),
+                    "date": date_str
+                }
+
+                # 条件性添加 URL 字段
+                if include_url:
+                    news_item["url"] = info.get("url", "")
+                    news_item["mobileUrl"] = info.get("mobileUrl", "")
+
+                news_list.append(news_item)
+
+        # 按排名排序
+        news_list.sort(key=lambda x: x["rank"])
+
+        # 限制返回数量
+        result = news_list[:limit]
+
+        # 缓存结果(历史数据缓存更久)
+        self.cache.set(cache_key, result)
+
+        return result
+
+    def search_news_by_keyword(
+        self,
+        keyword: str,
+        date_range: Optional[Tuple[datetime, datetime]] = None,
+        platforms: Optional[List[str]] = None,
+        limit: Optional[int] = None
+    ) -> Dict:
+        """
+        按关键词搜索新闻
+
+        Args:
+            keyword: 搜索关键词
+            date_range: 日期范围 (start_date, end_date)
+            platforms: 平台过滤列表
+            limit: 返回条数限制(可选)
+
+        Returns:
+            搜索结果字典
+
+        Raises:
+            DataNotFoundError: 数据不存在
+        """
+        # 确定搜索日期范围
+        if date_range:
+            start_date, end_date = date_range
+        else:
+            # 默认搜索今天
+            start_date = end_date = datetime.now()
+
+        # 收集所有匹配的新闻
+        results = []
+        platform_distribution = Counter()
+
+        # 遍历日期范围
+        current_date = start_date
+        while current_date <= end_date:
+            try:
+                all_titles, id_to_name, _ = self.parser.read_all_titles_for_date(
+                    date=current_date,
+                    platform_ids=platforms
+                )
+
+                # 搜索包含关键词的标题
+                for platform_id, titles in all_titles.items():
+                    platform_name = id_to_name.get(platform_id, platform_id)
+
+                    for title, info in titles.items():
+                        if keyword.lower() in title.lower():
+                            # 计算平均排名
+                            avg_rank = sum(info["ranks"]) / len(info["ranks"]) if info["ranks"] else 0
+
+                            results.append({
+                                "title": title,
+                                "platform": platform_id,
+                                "platform_name": platform_name,
+                                "ranks": info["ranks"],
+                                "count": len(info["ranks"]),
+                                "avg_rank": round(avg_rank, 2),
+                                "url": info.get("url", ""),
+                                "mobileUrl": info.get("mobileUrl", ""),
+                                "date": current_date.strftime("%Y-%m-%d")
+                            })
+
+                            platform_distribution[platform_id] += 1
+
+            except DataNotFoundError:
+                # 该日期没有数据,继续下一天
+                pass
+
+            # 下一天
+            current_date += timedelta(days=1)
+
+        if not results:
+            raise DataNotFoundError(
+                f"未找到包含关键词 '{keyword}' 的新闻",
+                suggestion="请尝试其他关键词或扩大日期范围"
+            )
+
+        # 计算统计信息
+        total_ranks = []
+        for item in results:
+            total_ranks.extend(item["ranks"])
+
+        avg_rank = sum(total_ranks) / len(total_ranks) if total_ranks else 0
+
+        # 限制返回数量(如果指定)
+        total_found = len(results)
+        if limit is not None and limit > 0:
+            results = results[:limit]
+
+        return {
+            "results": results,
+            "total": len(results),
+            "total_found": total_found,
+            "statistics": {
+                "platform_distribution": dict(platform_distribution),
+                "avg_rank": round(avg_rank, 2),
+                "keyword": keyword
+            }
+        }
+
+    def get_trending_topics(
+        self,
+        top_n: int = 10,
+        mode: str = "current"
+    ) -> Dict:
+        """
+        获取个人关注词的新闻出现频率统计
+
+        注意:本工具基于 config/frequency_words.txt 中的个人关注词列表进行统计,
+        而不是自动从新闻中提取热点话题。用户可以自定义这个关注词列表。
+
+        Args:
+            top_n: 返回TOP N关注词
+            mode: 模式 - daily(当日累计), current(最新一批)
+
+        Returns:
+            关注词频率统计字典
+
+        Raises:
+            DataNotFoundError: 数据不存在
+        """
+        # 尝试从缓存获取
+        cache_key = f"trending_topics:{top_n}:{mode}"
+        cached = self.cache.get(cache_key, ttl=1800)  # 30分钟缓存
+        if cached:
+            return cached
+
+        # 读取今天的数据
+        all_titles, id_to_name, timestamps = self.parser.read_all_titles_for_date()
+
+        if not all_titles:
+            raise DataNotFoundError(
+                "未找到今天的新闻数据",
+                suggestion="请确保爬虫已经运行并生成了数据"
+            )
+
+        # 加载关键词配置
+        word_groups = self.parser.parse_frequency_words()
+
+        # 根据mode选择要处理的标题数据
+        titles_to_process = {}
+
+        if mode == "daily":
+            # daily模式:处理当天所有累计数据
+            titles_to_process = all_titles
+
+        elif mode == "current":
+            # current模式:只处理最新一批数据(最新时间戳的文件)
+            if timestamps:
+                # 找出最新的时间戳
+                latest_timestamp = max(timestamps.values())
+
+                # 重新读取,只获取最新时间的数据
+                # 这里我们通过timestamps字典反查找最新文件对应的平台
+                latest_titles, _, _ = self.parser.read_all_titles_for_date()
+
+                # 由于read_all_titles_for_date返回所有文件的合并数据,
+                # 我们需要通过timestamps来过滤出最新批次
+                # 简化实现:使用当前所有数据作为最新批次
+                # (更精确的实现需要解析服务支持按时间过滤)
+                titles_to_process = latest_titles
+            else:
+                titles_to_process = all_titles
+
+        else:
+            raise ValueError(
+                f"不支持的模式: {mode}。支持的模式: daily, current"
+            )
+
+        # 统计词频
+        word_frequency = Counter()
+        keyword_to_news = {}
+
+        # 遍历要处理的标题
+        for platform_id, titles in titles_to_process.items():
+            for title in titles.keys():
+                # 对每个关键词组进行匹配
+                for group in word_groups:
+                    all_words = group.get("required", []) + group.get("normal", [])
+
+                    for word in all_words:
+                        if word and word in title:
+                            word_frequency[word] += 1
+
+                            if word not in keyword_to_news:
+                                keyword_to_news[word] = []
+                            keyword_to_news[word].append(title)
+
+        # 获取TOP N关键词
+        top_keywords = word_frequency.most_common(top_n)
+
+        # 构建话题列表
+        topics = []
+        for keyword, frequency in top_keywords:
+            matched_news = keyword_to_news.get(keyword, [])
+
+            topics.append({
+                "keyword": keyword,
+                "frequency": frequency,
+                "matched_news": len(set(matched_news)),  # 去重后的新闻数量
+                "trend": "stable",  # TODO: 需要历史数据来计算趋势
+                "weight_score": 0.0  # TODO: 需要实现权重计算
+            })
+
+        # 构建结果
+        result = {
+            "topics": topics,
+            "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+            "mode": mode,
+            "total_keywords": len(word_frequency),
+            "description": self._get_mode_description(mode)
+        }
+
+        # 缓存结果
+        self.cache.set(cache_key, result)
+
+        return result
+
+    def _get_mode_description(self, mode: str) -> str:
+        """获取模式描述"""
+        descriptions = {
+            "daily": "当日累计统计",
+            "current": "最新一批统计"
+        }
+        return descriptions.get(mode, "未知模式")
+
+    def get_current_config(self, section: str = "all") -> Dict:
+        """
+        获取当前系统配置
+
+        Args:
+            section: 配置节 - all/crawler/push/keywords/weights
+
+        Returns:
+            配置字典
+
+        Raises:
+            FileParseError: 配置文件解析错误
+        """
+        # 尝试从缓存获取
+        cache_key = f"config:{section}"
+        cached = self.cache.get(cache_key, ttl=3600)  # 1小时缓存
+        if cached:
+            return cached
+
+        # 解析配置文件
+        config_data = self.parser.parse_yaml_config()
+        word_groups = self.parser.parse_frequency_words()
+
+        # 根据section返回对应配置
+        if section == "all" or section == "crawler":
+            crawler_config = {
+                "enable_crawler": config_data.get("crawler", {}).get("enable_crawler", True),
+                "use_proxy": config_data.get("crawler", {}).get("use_proxy", False),
+                "request_interval": config_data.get("crawler", {}).get("request_interval", 1),
+                "retry_times": 3,
+                "platforms": [p["id"] for p in config_data.get("platforms", [])]
+            }
+
+        if section == "all" or section == "push":
+            push_config = {
+                "enable_notification": config_data.get("notification", {}).get("enable_notification", True),
+                "enabled_channels": [],
+                "message_batch_size": config_data.get("notification", {}).get("message_batch_size", 20),
+                "push_window": config_data.get("notification", {}).get("push_window", {})
+            }
+
+            # 检测已配置的通知渠道
+            webhooks = config_data.get("notification", {}).get("webhooks", {})
+            if webhooks.get("feishu_url"):
+                push_config["enabled_channels"].append("feishu")
+            if webhooks.get("dingtalk_url"):
+                push_config["enabled_channels"].append("dingtalk")
+            if webhooks.get("wework_url"):
+                push_config["enabled_channels"].append("wework")
+
+        if section == "all" or section == "keywords":
+            keywords_config = {
+                "word_groups": word_groups,
+                "total_groups": len(word_groups)
+            }
+
+        if section == "all" or section == "weights":
+            weights_config = {
+                "rank_weight": config_data.get("weight", {}).get("rank_weight", 0.6),
+                "frequency_weight": config_data.get("weight", {}).get("frequency_weight", 0.3),
+                "hotness_weight": config_data.get("weight", {}).get("hotness_weight", 0.1)
+            }
+
+        # 组装结果
+        if section == "all":
+            result = {
+                "crawler": crawler_config,
+                "push": push_config,
+                "keywords": keywords_config,
+                "weights": weights_config
+            }
+        elif section == "crawler":
+            result = crawler_config
+        elif section == "push":
+            result = push_config
+        elif section == "keywords":
+            result = keywords_config
+        elif section == "weights":
+            result = weights_config
+        else:
+            result = {}
+
+        # 缓存结果
+        self.cache.set(cache_key, result)
+
+        return result
+
+    def get_system_status(self) -> Dict:
+        """
+        获取系统运行状态
+
+        Returns:
+            系统状态字典
+        """
+        # 获取数据统计
+        output_dir = self.parser.project_root / "output"
+
+        total_storage = 0
+        oldest_record = None
+        latest_record = None
+        total_news = 0
+
+        if output_dir.exists():
+            # 遍历日期文件夹
+            for date_folder in output_dir.iterdir():
+                if date_folder.is_dir():
+                    # 解析日期
+                    try:
+                        date_str = date_folder.name
+                        # 格式: YYYY年MM月DD日
+                        date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_str)
+                        if date_match:
+                            folder_date = datetime(
+                                int(date_match.group(1)),
+                                int(date_match.group(2)),
+                                int(date_match.group(3))
+                            )
+
+                            if oldest_record is None or folder_date < oldest_record:
+                                oldest_record = folder_date
+                            if latest_record is None or folder_date > latest_record:
+                                latest_record = folder_date
+
+                    except:
+                        pass
+
+                    # 计算存储大小
+                    for item in date_folder.rglob("*"):
+                        if item.is_file():
+                            total_storage += item.stat().st_size
+
+        # 读取版本信息
+        version_file = self.parser.project_root / "version"
+        version = "unknown"
+        if version_file.exists():
+            try:
+                with open(version_file, "r") as f:
+                    version = f.read().strip()
+            except:
+                pass
+
+        return {
+            "system": {
+                "version": version,
+                "project_root": str(self.parser.project_root)
+            },
+            "data": {
+                "total_storage": f"{total_storage / 1024 / 1024:.2f} MB",
+                "oldest_record": oldest_record.strftime("%Y-%m-%d") if oldest_record else None,
+                "latest_record": latest_record.strftime("%Y-%m-%d") if latest_record else None,
+            },
+            "cache": self.cache.get_stats(),
+            "health": "healthy"
+        }
@@ -0,0 +1,355 @@
+"""
+文件解析服务
+
+提供txt格式新闻数据和YAML配置文件的解析功能。
+"""
+
+import re
+from pathlib import Path
+from typing import Dict, List, Tuple, Optional
+from datetime import datetime
+
+import yaml
+
+from ..utils.errors import FileParseError, DataNotFoundError
+from .cache_service import get_cache
+
+
+class ParserService:
+    """文件解析服务类"""
+
+    def __init__(self, project_root: str = None):
+        """
+        初始化解析服务
+
+        Args:
+            project_root: 项目根目录，默认为当前目录的父目录
+        """
+        if project_root is None:
+            # 获取当前文件所在目录的父目录的父目录
+            current_file = Path(__file__)
+            self.project_root = current_file.parent.parent.parent
+        else:
+            self.project_root = Path(project_root)
+
+        # 初始化缓存服务
+        self.cache = get_cache()
+
+    @staticmethod
+    def clean_title(title: str) -> str:
+        """
+        清理标题文本
+
+        Args:
+            title: 原始标题
+
+        Returns:
+            清理后的标题
+        """
+        # 移除多余空白
+        title = re.sub(r'\s+', ' ', title)
+        # 移除特殊字符
+        title = title.strip()
+        return title
+
+    def parse_txt_file(self, file_path: Path) -> Tuple[Dict, Dict]:
+        """
+        解析单个txt文件的标题数据
+
+        Args:
+            file_path: txt文件路径
+
+        Returns:
+            (titles_by_id, id_to_name) 元组
+            - titles_by_id: {platform_id: {title: {ranks, url, mobileUrl}}}
+            - id_to_name: {platform_id: platform_name}
+
+        Raises:
+            FileParseError: 文件解析错误
+        """
+        if not file_path.exists():
+            raise FileParseError(str(file_path), "文件不存在")
+
+        titles_by_id = {}
+        id_to_name = {}
+
+        try:
+            with open(file_path, "r", encoding="utf-8") as f:
+                content = f.read()
+                sections = content.split("\n\n")
+
+                for section in sections:
+                    if not section.strip() or "==== 以下ID请求失败 ====" in section:
+                        continue
+
+                    lines = section.strip().split("\n")
+                    if len(lines) < 2:
+                        continue
+
+                    # 解析header: id | name 或 id
+                    header_line = lines[0].strip()
+                    if " | " in header_line:
+                        parts = header_line.split(" | ", 1)
+                        source_id = parts[0].strip()
+                        name = parts[1].strip()
+                        id_to_name[source_id] = name
+                    else:
+                        source_id = header_line
+                        id_to_name[source_id] = source_id
+
+                    titles_by_id[source_id] = {}
+
+                    # 解析标题行
+                    for line in lines[1:]:
+                        if line.strip():
+                            try:
+                                title_part = line.strip()
+                                rank = None
+
+                                # 提取排名
+                                if ". " in title_part and title_part.split(". ")[0].isdigit():
+                                    rank_str, title_part = title_part.split(". ", 1)
+                                    rank = int(rank_str)
+
+                                # 提取 MOBILE URL
+                                mobile_url = ""
+                                if " [MOBILE:" in title_part:
+                                    title_part, mobile_part = title_part.rsplit(" [MOBILE:", 1)
+                                    if mobile_part.endswith("]"):
+                                        mobile_url = mobile_part[:-1]
+
+                                # 提取 URL
+                                url = ""
+                                if " [URL:" in title_part:
+                                    title_part, url_part = title_part.rsplit(" [URL:", 1)
+                                    if url_part.endswith("]"):
+                                        url = url_part[:-1]
+
+                                title = self.clean_title(title_part.strip())
+                                ranks = [rank] if rank is not None else [1]
+
+                                titles_by_id[source_id][title] = {
+                                    "ranks": ranks,
+                                    "url": url,
+                                    "mobileUrl": mobile_url,
+                                }
+
+                            except Exception as e:
+                                # 忽略单行解析错误
+                                continue
+
+        except Exception as e:
+            raise FileParseError(str(file_path), str(e))
+
+        return titles_by_id, id_to_name
+
+    def get_date_folder_name(self, date: datetime = None) -> str:
+        """
+        获取日期文件夹名称
+
+        Args:
+            date: 日期对象，默认为今天
+
+        Returns:
+            文件夹名称，格式: YYYY年MM月DD日
+        """
+        if date is None:
+            date = datetime.now()
+        return date.strftime("%Y年%m月%d日")
+
+    def read_all_titles_for_date(
+        self,
+        date: datetime = None,
+        platform_ids: Optional[List[str]] = None
+    ) -> Tuple[Dict, Dict, Dict]:
+        """
+        读取指定日期的所有标题文件（带缓存）
+
+        Args:
+            date: 日期对象，默认为今天
+            platform_ids: 平台ID列表，None表示所有平台
+
+        Returns:
+            (all_titles, id_to_name, all_timestamps) 元组
+            - all_titles: {platform_id: {title: {ranks, url, mobileUrl, ...}}}
+            - id_to_name: {platform_id: platform_name}
+            - all_timestamps: {filename: timestamp}
+
+        Raises:
+            DataNotFoundError: 数据不存在
+        """
+        # 生成缓存键
+        date_str = self.get_date_folder_name(date)
+        platform_key = ','.join(sorted(platform_ids)) if platform_ids else 'all'
+        cache_key = f"read_all_titles:{date_str}:{platform_key}"
+
+        # 尝试从缓存获取
+        # 对于历史数据（非今天），使用更长的缓存时间（1小时）
+        # 对于今天的数据，使用较短的缓存时间（15分钟），因为可能有新数据
+        is_today = (date is None) or (date.date() == datetime.now().date())
+        ttl = 900 if is_today else 3600  # 15分钟 vs 1小时
+
+        cached = self.cache.get(cache_key, ttl=ttl)
+        if cached:
+            return cached
+
+        # 缓存未命中，读取文件
+        date_folder = self.get_date_folder_name(date)
+        txt_dir = self.project_root / "output" / date_folder / "txt"
+
+        if not txt_dir.exists():
+            raise DataNotFoundError(
+                f"未找到 {date_folder} 的数据目录",
+                suggestion="请先运行爬虫或检查日期是否正确"
+            )
+
+        all_titles = {}
+        id_to_name = {}
+        all_timestamps = {}
+
+        # 读取所有txt文件
+        txt_files = sorted(txt_dir.glob("*.txt"))
+
+        if not txt_files:
+            raise DataNotFoundError(
+                f"{date_folder} 没有数据文件",
+                suggestion="请等待爬虫任务完成"
+            )
+
+        for txt_file in txt_files:
+            try:
+                titles_by_id, file_id_to_name = self.parse_txt_file(txt_file)
+
+                # 更新id_to_name
+                id_to_name.update(file_id_to_name)
+
+                # 合并标题数据
+                for platform_id, titles in titles_by_id.items():
+                    # 如果指定了平台过滤
+                    if platform_ids and platform_id not in platform_ids:
+                        continue
+
+                    if platform_id not in all_titles:
+                        all_titles[platform_id] = {}
+
+                    for title, info in titles.items():
+                        if title in all_titles[platform_id]:
+                            # 合并排名
+                            all_titles[platform_id][title]["ranks"].extend(info["ranks"])
+                        else:
+                            all_titles[platform_id][title] = info.copy()
+
+                # 记录文件时间戳
+                all_timestamps[txt_file.name] = txt_file.stat().st_mtime
+
+            except Exception as e:
+                # 忽略单个文件的解析错误，继续处理其他文件
+                print(f"Warning: 解析文件 {txt_file} 失败: {e}")
+                continue
+
+        if not all_titles:
+            raise DataNotFoundError(
+                f"{date_folder} 没有有效的数据",
+                suggestion="请检查数据文件格式或重新运行爬虫"
+            )
+
+        # 缓存结果
+        result = (all_titles, id_to_name, all_timestamps)
+        self.cache.set(cache_key, result)
+
+        return result
+
+    def parse_yaml_config(self, config_path: str = None) -> dict:
+        """
+        解析YAML配置文件
+
+        Args:
+            config_path: 配置文件路径，默认为 config/config.yaml
+
+        Returns:
+            配置字典
+
+        Raises:
+            FileParseError: 配置文件解析错误
+        """
+        if config_path is None:
+            config_path = self.project_root / "config" / "config.yaml"
+        else:
+            config_path = Path(config_path)
+
+        if not config_path.exists():
+            raise FileParseError(str(config_path), "配置文件不存在")
+
+        try:
+            with open(config_path, "r", encoding="utf-8") as f:
+                config_data = yaml.safe_load(f)
+            return config_data
+        except Exception as e:
+            raise FileParseError(str(config_path), str(e))
+
+    def parse_frequency_words(self, words_file: str = None) -> List[Dict]:
+        """
+        解析关键词配置文件
+
+        Args:
+            words_file: 关键词文件路径，默认为 config/frequency_words.txt
+
+        Returns:
+            词组列表
+
+        Raises:
+            FileParseError: 文件解析错误
+        """
+        if words_file is None:
+            words_file = self.project_root / "config" / "frequency_words.txt"
+        else:
+            words_file = Path(words_file)
+
+        if not words_file.exists():
+            return []
+
+        word_groups = []
+
+        try:
+            with open(words_file, "r", encoding="utf-8") as f:
+                for line in f:
+                    line = line.strip()
+                    if not line or line.startswith("#"):
+                        continue
+
+                    # 使用 | 分隔符
+                    parts = [p.strip() for p in line.split("|")]
+                    if not parts:
+                        continue
+
+                    group = {
+                        "required": [],
+                        "normal": [],
+                        "filter_words": []
+                    }
+
+                    for part in parts:
+                        if not part:
+                            continue
+
+                        words = [w.strip() for w in part.split(",")]
+                        for word in words:
+                            if not word:
+                                continue
+                            if word.endswith("+"):
+                                # 必须词
+                                group["required"].append(word[:-1])
+                            elif word.endswith("!"):
+                                # 过滤词
+                                group["filter_words"].append(word[:-1])
+                            else:
+                                # 普通词
+                                group["normal"].append(word)
+
+                    if group["required"] or group["normal"]:
+                        word_groups.append(group)
+
+        except Exception as e:
+            raise FileParseError(str(words_file), str(e))
+
+        return word_groups