v3.0.0 AI 智能分析功能

sansan
2025-10-20 21:41:24 +08:00
parent da81d69309
commit 2afc24e6fb
29 changed files with 6931 additions and 54 deletions
+5
@@ -0,0 +1,5 @@
"""
MCP 工具模块
包含所有MCP工具的实现。
"""
File diff suppressed because it is too large
+66
@@ -0,0 +1,66 @@
"""
配置管理工具
实现配置查询和管理功能。
"""
from typing import Dict, Optional
from ..services.data_service import DataService
from ..utils.validators import validate_config_section
from ..utils.errors import MCPError
class ConfigManagementTools:
"""配置管理工具类"""
def __init__(self, project_root: str = None):
"""
初始化配置管理工具
Args:
project_root: 项目根目录
"""
self.data_service = DataService(project_root)
def get_current_config(self, section: Optional[str] = None) -> Dict:
"""
获取当前系统配置
Args:
section: 配置节 - all/crawler/push/keywords/weights,默认all
Returns:
配置字典
Example:
>>> tools = ConfigManagementTools()
>>> result = tools.get_current_config(section="crawler")
>>> print(result['config']['crawler']['platforms'])
"""
try:
# 参数验证
section = validate_config_section(section)
# 获取配置
config = self.data_service.get_current_config(section=section)
return {
"config": config,
"section": section,
"success": True
}
except MCPError as e:
return {
"success": False,
"error": e.to_dict()
}
except Exception as e:
return {
"success": False,
"error": {
"code": "INTERNAL_ERROR",
"message": str(e)
}
}
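All tools in this commit share one return convention: the payload fields sit alongside "success": True, while failures come back as "success": False plus an "error" dict. The keys "code" and "message" are shown in the generic-exception branch above; MCPError.to_dict() is assumed to produce a compatible shape. A minimal caller sketch:

tools = ConfigManagementTools()  # project_root=None falls through to DataService's own default handling
resp = tools.get_current_config(section="crawler")

if resp["success"]:
    print(resp["section"], resp["config"])
else:
    err = resp["error"]  # assumed shape: {"code": ..., "message": ..., ...}
    print(f"{err.get('code', 'UNKNOWN')}: {err.get('message', '')}")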
+284
@@ -0,0 +1,284 @@
"""
数据查询工具
实现P0核心的数据查询工具。
"""
from typing import Dict, List, Optional
from ..services.data_service import DataService
from ..utils.validators import (
validate_platforms,
validate_limit,
validate_keyword,
validate_date_range,
validate_top_n,
validate_mode,
validate_date_query
)
from ..utils.errors import MCPError
class DataQueryTools:
"""数据查询工具类"""
def __init__(self, project_root: str = None):
"""
初始化数据查询工具
Args:
project_root: 项目根目录
"""
self.data_service = DataService(project_root)
def get_latest_news(
self,
platforms: Optional[List[str]] = None,
limit: Optional[int] = None,
include_url: bool = False
) -> Dict:
"""
获取最新一批爬取的新闻数据
Args:
platforms: 平台ID列表,如 ['zhihu', 'weibo']
limit: 返回条数限制,默认50
include_url: 是否包含URL链接,默认False(节省token)
Returns:
新闻列表字典
Example:
>>> tools = DataQueryTools()
>>> result = tools.get_latest_news(platforms=['zhihu'], limit=10)
>>> print(result['total'])
10
"""
try:
# 参数验证
platforms = validate_platforms(platforms)
limit = validate_limit(limit, default=50)
# 获取数据
news_list = self.data_service.get_latest_news(
platforms=platforms,
limit=limit,
include_url=include_url
)
return {
"news": news_list,
"total": len(news_list),
"platforms": platforms,
"success": True
}
except MCPError as e:
return {
"success": False,
"error": e.to_dict()
}
except Exception as e:
return {
"success": False,
"error": {
"code": "INTERNAL_ERROR",
"message": str(e)
}
}
def search_news_by_keyword(
self,
keyword: str,
date_range: Optional[Dict] = None,
platforms: Optional[List[str]] = None,
limit: Optional[int] = None
) -> Dict:
"""
按关键词搜索历史新闻
Args:
keyword: 搜索关键词(必需)
date_range: 日期范围,格式: {"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"}
platforms: 平台过滤列表
limit: 返回条数限制(可选,默认返回所有)
Returns:
搜索结果字典
Example:
>>> tools = DataQueryTools()
>>> result = tools.search_news_by_keyword(
... keyword="人工智能",
... date_range={"start": "2025-10-01", "end": "2025-10-11"},
... limit=50
... )
>>> print(result['total'])
"""
try:
# 参数验证
keyword = validate_keyword(keyword)
date_range_tuple = validate_date_range(date_range)
platforms = validate_platforms(platforms)
if limit is not None:
limit = validate_limit(limit, default=100)
# 搜索数据
search_result = self.data_service.search_news_by_keyword(
keyword=keyword,
date_range=date_range_tuple,
platforms=platforms,
limit=limit
)
return {
**search_result,
"success": True
}
except MCPError as e:
return {
"success": False,
"error": e.to_dict()
}
except Exception as e:
return {
"success": False,
"error": {
"code": "INTERNAL_ERROR",
"message": str(e)
}
}
def get_trending_topics(
self,
top_n: Optional[int] = None,
mode: Optional[str] = None
) -> Dict:
"""
获取个人关注词的新闻出现频率统计
注意:本工具基于 config/frequency_words.txt 中的个人关注词列表进行统计,
而不是自动从新闻中提取热点话题。这是一个个人可定制的关注词列表,
用户可以根据自己的兴趣添加或删除关注词。
Args:
top_n: 返回TOP N关注词,默认10
mode: 模式 - daily(当日累计), current(最新一批), incremental(增量)
Returns:
关注词频率统计字典,包含每个关注词在新闻中出现的次数
Example:
>>> tools = DataQueryTools()
>>> result = tools.get_trending_topics(top_n=5, mode="current")
>>> print(len(result['topics']))
5
>>> # 返回的是你在 frequency_words.txt 中设置的关注词的频率统计
"""
try:
# 参数验证
top_n = validate_top_n(top_n, default=10)
valid_modes = ["daily", "current", "incremental"]
mode = validate_mode(mode, valid_modes, default="current")
# 获取趋势话题
trending_result = self.data_service.get_trending_topics(
top_n=top_n,
mode=mode
)
return {
**trending_result,
"success": True
}
except MCPError as e:
return {
"success": False,
"error": e.to_dict()
}
except Exception as e:
return {
"success": False,
"error": {
"code": "INTERNAL_ERROR",
"message": str(e)
}
}
def get_news_by_date(
self,
date_query: Optional[str] = None,
platforms: Optional[List[str]] = None,
limit: Optional[int] = None,
include_url: bool = False
) -> Dict:
"""
按日期查询新闻,支持自然语言日期
Args:
date_query: 日期查询字符串(可选,默认"今天"),支持:
- 相对日期:今天、昨天、前天、3天前、yesterday、3 days ago
- 星期:上周一、本周三、last monday、this friday
- 绝对日期:2025-10-10、10月10日、2025年10月10日
platforms: 平台ID列表,如 ['zhihu', 'weibo']
limit: 返回条数限制,默认50
include_url: 是否包含URL链接,默认False(节省token)
Returns:
新闻列表字典
Example:
>>> tools = DataQueryTools()
>>> # 不指定日期,默认查询今天
>>> result = tools.get_news_by_date(platforms=['zhihu'], limit=20)
>>> # 指定日期
>>> result = tools.get_news_by_date(
... date_query="昨天",
... platforms=['zhihu'],
... limit=20
... )
>>> print(result['total'])
20
"""
try:
# 参数验证 - 默认今天
if date_query is None:
date_query = "今天"
target_date = validate_date_query(date_query)
platforms = validate_platforms(platforms)
limit = validate_limit(limit, default=50)
# 获取数据
news_list = self.data_service.get_news_by_date(
target_date=target_date,
platforms=platforms,
limit=limit,
include_url=include_url
)
return {
"news": news_list,
"total": len(news_list),
"date": target_date.strftime("%Y-%m-%d"),
"date_query": date_query,
"platforms": platforms,
"success": True
}
except MCPError as e:
return {
"success": False,
"error": e.to_dict()
}
except Exception as e:
return {
"success": False,
"error": {
"code": "INTERNAL_ERROR",
"message": str(e)
}
}
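These classes only build and return plain dicts; exposing them as MCP tools happens in the server entry point, which is not part of this hunk. As a hedged illustration only, one of the methods could be registered with the reference MCP Python SDK (FastMCP) roughly as below; the server name and the direct import of DataQueryTools are assumptions, not taken from this diff:

from typing import Dict, List, Optional

from mcp.server.fastmcp import FastMCP

mcp = FastMCP("news-mcp")       # illustrative server name
query_tools = DataQueryTools()  # assumes the class is importable where the server is defined

@mcp.tool()
def get_latest_news(platforms: Optional[List[str]] = None,
                    limit: Optional[int] = None,
                    include_url: bool = False) -> Dict:
    """获取最新一批爬取的新闻数据"""
    return query_tools.get_latest_news(platforms=platforms, limit=limit, include_url=include_url)

if __name__ == "__main__":
    mcp.run()  # stdio transport by default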
+664
@@ -0,0 +1,664 @@
"""
智能新闻检索工具
提供模糊搜索、链接查询、历史相关新闻检索等高级搜索功能。
"""
import re
from collections import Counter
from datetime import datetime, timedelta
from difflib import SequenceMatcher
from typing import Dict, List, Optional, Tuple
from ..services.data_service import DataService
from ..utils.validators import validate_keyword, validate_limit
from ..utils.errors import MCPError, InvalidParameterError, DataNotFoundError
class SearchTools:
"""智能新闻检索工具类"""
def __init__(self, project_root: str = None):
"""
初始化智能检索工具
Args:
project_root: 项目根目录
"""
self.data_service = DataService(project_root)
# 中文停用词列表
self.stopwords = {
'的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一',
'一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有',
'看', '好', '自己', '这', '那', '他', '她', '它', '们', '个', '中', '为',
'对', '把', '被', '让', '从', '向', '与', '及', '等', '而', '可以', '还',
'已经', '又', '再', '才', '并', '因为', '所以', '如果', '虽然', '然而'
}
def search_news_unified(
self,
query: str,
search_mode: str = "keyword",
date_range: Optional[Dict[str, str]] = None,
platforms: Optional[List[str]] = None,
limit: int = 50,
sort_by: str = "relevance",
threshold: float = 0.6,
include_url: bool = False
) -> Dict:
"""
统一新闻搜索工具 - 整合多种搜索模式
Args:
query: 查询内容(必需)- 关键词、内容片段或实体名称
search_mode: 搜索模式,可选值:
- "keyword": 精确关键词匹配(默认)
- "fuzzy": 模糊内容匹配(使用相似度算法)
- "entity": 实体名称搜索(自动按权重排序)
date_range: 日期范围,格式: {"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"}
不指定则默认查询今天
platforms: 平台过滤列表,如 ['zhihu', 'weibo']
limit: 返回条数限制,默认50
sort_by: 排序方式,可选值:
- "relevance": 按相关度排序(默认)
- "weight": 按新闻权重排序
- "date": 按日期排序
threshold: 相似度阈值(仅fuzzy模式有效),0-1之间,默认0.6
include_url: 是否包含URL链接,默认False(节省token)
Returns:
搜索结果字典,包含匹配的新闻列表
Examples:
- search_news_unified(query="人工智能", search_mode="keyword")
- search_news_unified(query="特斯拉降价", search_mode="fuzzy", threshold=0.4)
- search_news_unified(query="马斯克", search_mode="entity", limit=20)
- search_news_unified(query="iPhone 16发布", search_mode="keyword")
"""
try:
# 参数验证
query = validate_keyword(query)
if search_mode not in ["keyword", "fuzzy", "entity"]:
raise InvalidParameterError(
f"无效的搜索模式: {search_mode}",
suggestion="支持的模式: keyword, fuzzy, entity"
)
if sort_by not in ["relevance", "weight", "date"]:
raise InvalidParameterError(
f"无效的排序方式: {sort_by}",
suggestion="支持的排序: relevance, weight, date"
)
limit = validate_limit(limit, default=50)
threshold = max(0.0, min(1.0, threshold))
# 处理日期范围
if date_range:
from ..utils.validators import validate_date_range
date_range_tuple = validate_date_range(date_range)
start_date, end_date = date_range_tuple
else:
# 默认今天
start_date = end_date = datetime.now()
# 收集所有匹配的新闻
all_matches = []
current_date = start_date
while current_date <= end_date:
try:
all_titles, id_to_name, timestamps = self.data_service.parser.read_all_titles_for_date(
date=current_date,
platform_ids=platforms
)
# 根据搜索模式执行不同的搜索逻辑
if search_mode == "keyword":
matches = self._search_by_keyword_mode(
query, all_titles, id_to_name, current_date, include_url
)
elif search_mode == "fuzzy":
matches = self._search_by_fuzzy_mode(
query, all_titles, id_to_name, current_date, threshold, include_url
)
else: # entity
matches = self._search_by_entity_mode(
query, all_titles, id_to_name, current_date, include_url
)
all_matches.extend(matches)
except DataNotFoundError:
# 该日期没有数据,继续下一天
pass
current_date += timedelta(days=1)
if not all_matches:
time_desc = "今天" if start_date == end_date else f"{start_date.strftime('%Y-%m-%d')}{end_date.strftime('%Y-%m-%d')}"
return {
"success": True,
"results": [],
"total": 0,
"query": query,
"search_mode": search_mode,
"time_range": time_desc,
"message": f"未找到匹配的新闻({time_desc}"
}
# 统一排序逻辑
if sort_by == "relevance":
all_matches.sort(key=lambda x: x.get("similarity_score", 1.0), reverse=True)
elif sort_by == "weight":
from .analytics import calculate_news_weight
all_matches.sort(key=lambda x: calculate_news_weight(x), reverse=True)
elif sort_by == "date":
all_matches.sort(key=lambda x: x.get("date", ""), reverse=True)
# 限制返回数量
results = all_matches[:limit]
# 构建时间范围描述
if start_date == end_date:
time_range_desc = start_date.strftime("%Y-%m-%d")
else:
time_range_desc = f"{start_date.strftime('%Y-%m-%d')}{end_date.strftime('%Y-%m-%d')}"
result = {
"success": True,
"summary": {
"total_found": len(all_matches),
"returned_count": len(results),
"requested_limit": limit,
"search_mode": search_mode,
"query": query,
"platforms": platforms or "所有平台",
"time_range": time_range_desc,
"sort_by": sort_by
},
"results": results
}
if search_mode == "fuzzy":
result["summary"]["threshold"] = threshold
if len(all_matches) < limit:
result["note"] = f"模糊搜索模式下,相似度阈值 {threshold} 仅匹配到 {len(all_matches)} 条结果"
return result
except MCPError as e:
return {
"success": False,
"error": e.to_dict()
}
except Exception as e:
return {
"success": False,
"error": {
"code": "INTERNAL_ERROR",
"message": str(e)
}
}
def _search_by_keyword_mode(
self,
query: str,
all_titles: Dict,
id_to_name: Dict,
current_date: datetime,
include_url: bool
) -> List[Dict]:
"""
关键词搜索模式(精确匹配)
Args:
query: 搜索关键词
all_titles: 所有标题字典
id_to_name: 平台ID到名称映射
current_date: 当前日期
Returns:
匹配的新闻列表
"""
matches = []
query_lower = query.lower()
for platform_id, titles in all_titles.items():
platform_name = id_to_name.get(platform_id, platform_id)
for title, info in titles.items():
# 精确包含判断
if query_lower in title.lower():
news_item = {
"title": title,
"platform": platform_id,
"platform_name": platform_name,
"date": current_date.strftime("%Y-%m-%d"),
"similarity_score": 1.0, # 精确匹配,相似度为1
"ranks": info.get("ranks", []),
"count": len(info.get("ranks", [])),
"rank": info["ranks"][0] if info["ranks"] else 999
}
# 条件性添加 URL 字段
if include_url:
news_item["url"] = info.get("url", "")
news_item["mobileUrl"] = info.get("mobileUrl", "")
matches.append(news_item)
return matches
def _search_by_fuzzy_mode(
self,
query: str,
all_titles: Dict,
id_to_name: Dict,
current_date: datetime,
threshold: float,
include_url: bool
) -> List[Dict]:
"""
模糊搜索模式(使用相似度算法)
Args:
query: 搜索内容
all_titles: 所有标题字典
id_to_name: 平台ID到名称映射
current_date: 当前日期
threshold: 相似度阈值
Returns:
匹配的新闻列表
"""
matches = []
for platform_id, titles in all_titles.items():
platform_name = id_to_name.get(platform_id, platform_id)
for title, info in titles.items():
# 模糊匹配
is_match, similarity = self._fuzzy_match(query, title, threshold)
if is_match:
news_item = {
"title": title,
"platform": platform_id,
"platform_name": platform_name,
"date": current_date.strftime("%Y-%m-%d"),
"similarity_score": round(similarity, 4),
"ranks": info.get("ranks", []),
"count": len(info.get("ranks", [])),
"rank": info["ranks"][0] if info["ranks"] else 999
}
# 条件性添加 URL 字段
if include_url:
news_item["url"] = info.get("url", "")
news_item["mobileUrl"] = info.get("mobileUrl", "")
matches.append(news_item)
return matches
def _search_by_entity_mode(
self,
query: str,
all_titles: Dict,
id_to_name: Dict,
current_date: datetime,
include_url: bool
) -> List[Dict]:
"""
实体搜索模式(自动按权重排序)
Args:
query: 实体名称
all_titles: 所有标题字典
id_to_name: 平台ID到名称映射
current_date: 当前日期
Returns:
匹配的新闻列表
"""
matches = []
for platform_id, titles in all_titles.items():
platform_name = id_to_name.get(platform_id, platform_id)
for title, info in titles.items():
# 实体搜索:精确包含实体名称
if query in title:
news_item = {
"title": title,
"platform": platform_id,
"platform_name": platform_name,
"date": current_date.strftime("%Y-%m-%d"),
"similarity_score": 1.0,
"ranks": info.get("ranks", []),
"count": len(info.get("ranks", [])),
"rank": info["ranks"][0] if info["ranks"] else 999
}
# 条件性添加 URL 字段
if include_url:
news_item["url"] = info.get("url", "")
news_item["mobileUrl"] = info.get("mobileUrl", "")
matches.append(news_item)
return matches
def _calculate_similarity(self, text1: str, text2: str) -> float:
"""
计算两个文本的相似度
Args:
text1: 文本1
text2: 文本2
Returns:
相似度分数 (0-1之间)
"""
# 使用 difflib.SequenceMatcher 计算序列相似度
return SequenceMatcher(None, text1.lower(), text2.lower()).ratio()
def _fuzzy_match(self, query: str, text: str, threshold: float = 0.3) -> Tuple[bool, float]:
"""
模糊匹配函数
Args:
query: 查询文本
text: 待匹配文本
threshold: 匹配阈值
Returns:
(是否匹配, 相似度分数)
"""
# 直接包含判断
if query.lower() in text.lower():
return True, 1.0
# 计算整体相似度
similarity = self._calculate_similarity(query, text)
if similarity >= threshold:
return True, similarity
# 分词后的部分匹配
query_words = set(self._extract_keywords(query))
text_words = set(self._extract_keywords(text))
if not query_words or not text_words:
return False, 0.0
# 计算关键词重合度
common_words = query_words & text_words
keyword_overlap = len(common_words) / len(query_words)
if keyword_overlap >= 0.5: # 50%的关键词重合
return True, keyword_overlap
return False, similarity
def _extract_keywords(self, text: str, min_length: int = 2) -> List[str]:
"""
从文本中提取关键词
Args:
text: 输入文本
min_length: 最小词长
Returns:
关键词列表
"""
# 移除URL和特殊字符
text = re.sub(r'http[s]?://\S+', '', text)
text = re.sub(r'\[.*?\]', '', text) # 移除方括号内容
# 使用正则表达式分词(中文和英文)
words = re.findall(r'[\w]+', text)
# 过滤停用词和短词
keywords = [
word for word in words
if word and len(word) >= min_length and word not in self.stopwords
]
return keywords
def _calculate_keyword_overlap(self, keywords1: List[str], keywords2: List[str]) -> float:
"""
计算两个关键词列表的重合度
Args:
keywords1: 关键词列表1
keywords2: 关键词列表2
Returns:
重合度分数 (0-1之间)
"""
if not keywords1 or not keywords2:
return 0.0
set1 = set(keywords1)
set2 = set(keywords2)
# Jaccard 相似度
intersection = len(set1 & set2)
union = len(set1 | set2)
if union == 0:
return 0.0
return intersection / union
def search_related_news_history(
self,
reference_text: str,
time_range: str = "yesterday",
start_date: Optional[datetime] = None,
end_date: Optional[datetime] = None,
threshold: float = 0.4,
limit: int = 50,
include_url: bool = False
) -> Dict:
"""
在历史数据中搜索与给定新闻相关的新闻
Args:
reference_text: 参考新闻标题或内容
time_range: 时间范围预设值,可选:
- "yesterday": 昨天
- "last_week": 上周 (7天)
- "last_month": 上个月 (30天)
- "custom": 自定义日期范围(需要提供 start_date 和 end_date
start_date: 自定义开始日期(仅当 time_range="custom" 时有效)
end_date: 自定义结束日期(仅当 time_range="custom" 时有效)
threshold: 相似度阈值 (0-1之间),默认0.4
limit: 返回条数限制,默认50
include_url: 是否包含URL链接,默认False(节省token)
Returns:
搜索结果字典,包含相关新闻列表
Example:
>>> tools = SearchTools()
>>> result = tools.search_related_news_history(
... reference_text="人工智能技术突破",
... time_range="last_week",
... threshold=0.4,
... limit=50
... )
>>> for news in result['results']:
... print(f"{news['date']}: {news['title']} (相似度: {news['similarity_score']})")
"""
try:
# 参数验证
reference_text = validate_keyword(reference_text)
threshold = max(0.0, min(1.0, threshold))
limit = validate_limit(limit, default=50)
# 确定查询日期范围
today = datetime.now()
if time_range == "yesterday":
search_start = today - timedelta(days=1)
search_end = today - timedelta(days=1)
elif time_range == "last_week":
search_start = today - timedelta(days=7)
search_end = today - timedelta(days=1)
elif time_range == "last_month":
search_start = today - timedelta(days=30)
search_end = today - timedelta(days=1)
elif time_range == "custom":
if not start_date or not end_date:
raise InvalidParameterError(
"自定义时间范围需要提供 start_date 和 end_date",
suggestion="请提供 start_date 和 end_date 参数"
)
search_start = start_date
search_end = end_date
else:
raise InvalidParameterError(
f"不支持的时间范围: {time_range}",
suggestion="请使用 'yesterday', 'last_week', 'last_month''custom'"
)
# 提取参考文本的关键词
reference_keywords = self._extract_keywords(reference_text)
if not reference_keywords:
raise InvalidParameterError(
"无法从参考文本中提取关键词",
suggestion="请提供更详细的文本内容"
)
# 收集所有相关新闻
all_related_news = []
current_date = search_start
while current_date <= search_end:
try:
# 读取该日期的数据
all_titles, id_to_name, _ = self.data_service.parser.read_all_titles_for_date(current_date)
# 搜索相关新闻
for platform_id, titles in all_titles.items():
platform_name = id_to_name.get(platform_id, platform_id)
for title, info in titles.items():
# 计算标题相似度
title_similarity = self._calculate_similarity(reference_text, title)
# 提取标题关键词
title_keywords = self._extract_keywords(title)
# 计算关键词重合度
keyword_overlap = self._calculate_keyword_overlap(
reference_keywords,
title_keywords
)
# 综合相似度 (70% 关键词重合 + 30% 文本相似度)
combined_score = keyword_overlap * 0.7 + title_similarity * 0.3
if combined_score >= threshold:
news_item = {
"title": title,
"platform": platform_id,
"platform_name": platform_name,
"date": current_date.strftime("%Y-%m-%d"),
"similarity_score": round(combined_score, 4),
"keyword_overlap": round(keyword_overlap, 4),
"text_similarity": round(title_similarity, 4),
"common_keywords": list(set(reference_keywords) & set(title_keywords)),
"rank": info["ranks"][0] if info["ranks"] else 0
}
# 条件性添加 URL 字段
if include_url:
news_item["url"] = info.get("url", "")
news_item["mobileUrl"] = info.get("mobileUrl", "")
all_related_news.append(news_item)
except DataNotFoundError:
# 该日期没有数据,继续下一天
pass
except Exception as e:
# 记录错误但继续处理其他日期
print(f"Warning: 处理日期 {current_date.strftime('%Y-%m-%d')} 时出错: {e}")
# 移动到下一天
current_date += timedelta(days=1)
if not all_related_news:
return {
"success": True,
"results": [],
"total": 0,
"query": reference_text,
"time_range": time_range,
"date_range": {
"start": search_start.strftime("%Y-%m-%d"),
"end": search_end.strftime("%Y-%m-%d")
},
"message": "未找到相关新闻"
}
# 按相似度排序
all_related_news.sort(key=lambda x: x["similarity_score"], reverse=True)
# 限制返回数量
results = all_related_news[:limit]
# 统计信息
platform_distribution = Counter([news["platform"] for news in all_related_news])
date_distribution = Counter([news["date"] for news in all_related_news])
result = {
"success": True,
"summary": {
"total_found": len(all_related_news),
"returned_count": len(results),
"requested_limit": limit,
"threshold": threshold,
"reference_text": reference_text,
"reference_keywords": reference_keywords,
"time_range": time_range,
"date_range": {
"start": search_start.strftime("%Y-%m-%d"),
"end": search_end.strftime("%Y-%m-%d")
}
},
"results": results,
"statistics": {
"platform_distribution": dict(platform_distribution),
"date_distribution": dict(date_distribution),
"avg_similarity": round(
sum([news["similarity_score"] for news in all_related_news]) / len(all_related_news),
4
) if all_related_news else 0.0
}
}
if len(all_related_news) < limit:
result["note"] = f"相关性阈值 {threshold} 下仅找到 {len(all_related_news)} 条相关新闻"
return result
except MCPError as e:
return {
"success": False,
"error": e.to_dict()
}
except Exception as e:
return {
"success": False,
"error": {
"code": "INTERNAL_ERROR",
"message": str(e)
}
}
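The relevance score behind search_related_news_history combines keyword overlap (Jaccard similarity over the extracted keyword sets) with difflib's SequenceMatcher ratio at a fixed 70/30 weighting, and keeps a title only when the combined score clears the threshold (0.4 by default). A self-contained sketch of that same arithmetic, with simplified keyword extraction and no stopword filtering:

import re
from difflib import SequenceMatcher

def extract_keywords(text: str, min_length: int = 2) -> set:
    # same idea as SearchTools._extract_keywords: strip URLs, split on \w+ runs, drop short tokens
    text = re.sub(r'http[s]?://\S+', '', text)
    return {w for w in re.findall(r'[\w]+', text) if len(w) >= min_length}

def combined_score(reference: str, title: str) -> float:
    ref_kw, title_kw = extract_keywords(reference), extract_keywords(title)
    union = ref_kw | title_kw
    jaccard = len(ref_kw & title_kw) / len(union) if union else 0.0
    ratio = SequenceMatcher(None, reference.lower(), title.lower()).ratio()
    return round(0.7 * jaccard + 0.3 * ratio, 4)  # 70% keyword overlap + 30% text similarity

print(combined_score("OpenAI releases new model", "OpenAI new model release date"))  # clears the 0.4 default

Note that \w+ does not segment Chinese, so contiguous Chinese titles overlap only as whole runs; for Chinese text most of the signal in this sketch comes from the SequenceMatcher term.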
+465
@@ -0,0 +1,465 @@
"""
系统管理工具
实现系统状态查询和爬虫触发功能。
"""
from pathlib import Path
from typing import Dict, List, Optional
from ..services.data_service import DataService
from ..utils.validators import validate_platforms
from ..utils.errors import MCPError, CrawlTaskError
class SystemManagementTools:
"""系统管理工具类"""
def __init__(self, project_root: str = None):
"""
初始化系统管理工具
Args:
project_root: 项目根目录
"""
self.data_service = DataService(project_root)
if project_root:
self.project_root = Path(project_root)
else:
# 获取项目根目录
current_file = Path(__file__)
self.project_root = current_file.parent.parent.parent
def get_system_status(self) -> Dict:
"""
获取系统运行状态和健康检查信息
Returns:
系统状态字典
Example:
>>> tools = SystemManagementTools()
>>> result = tools.get_system_status()
>>> print(result['system']['version'])
"""
try:
# 获取系统状态
status = self.data_service.get_system_status()
return {
**status,
"success": True
}
except MCPError as e:
return {
"success": False,
"error": e.to_dict()
}
except Exception as e:
return {
"success": False,
"error": {
"code": "INTERNAL_ERROR",
"message": str(e)
}
}
def trigger_crawl(self, platforms: Optional[List[str]] = None, save_to_local: bool = False, include_url: bool = False) -> Dict:
"""
手动触发一次临时爬取任务(可选持久化)
Args:
platforms: 指定平台列表,为空则爬取所有平台
save_to_local: 是否保存到本地 output 目录,默认 False
include_url: 是否包含URL链接,默认False(节省token)
Returns:
爬取结果字典,包含新闻数据和保存路径(如果保存)
Example:
>>> tools = SystemManagementTools()
>>> # 临时爬取,不保存
>>> result = tools.trigger_crawl(platforms=['zhihu', 'weibo'])
>>> print(result['data'])
>>> # 爬取并保存到本地
>>> result = tools.trigger_crawl(platforms=['zhihu'], save_to_local=True)
>>> print(result['saved_files'])
"""
try:
import json
import time
import random
import requests
from datetime import datetime
import pytz
import yaml
# 参数验证
platforms = validate_platforms(platforms)
# 加载配置文件
config_path = self.project_root / "config" / "config.yaml"
if not config_path.exists():
raise CrawlTaskError(
"配置文件不存在",
suggestion=f"请确保配置文件存在: {config_path}"
)
# 读取配置
with open(config_path, "r", encoding="utf-8") as f:
config_data = yaml.safe_load(f)
# 获取平台配置
all_platforms = config_data.get("platforms", [])
if not all_platforms:
raise CrawlTaskError(
"配置文件中没有平台配置",
suggestion="请检查 config/config.yaml 中的 platforms 配置"
)
# 过滤平台
if platforms:
target_platforms = [p for p in all_platforms if p["id"] in platforms]
if not target_platforms:
raise CrawlTaskError(
f"指定的平台不存在: {platforms}",
suggestion=f"可用平台: {[p['id'] for p in all_platforms]}"
)
else:
target_platforms = all_platforms
# 获取请求间隔
request_interval = config_data.get("crawler", {}).get("request_interval", 100)
# 构建平台ID列表
ids = []
for platform in target_platforms:
if "name" in platform:
ids.append((platform["id"], platform["name"]))
else:
ids.append(platform["id"])
print(f"开始临时爬取,平台: {[p.get('name', p['id']) for p in target_platforms]}")
# 爬取数据
results = {}
id_to_name = {}
failed_ids = []
for i, id_info in enumerate(ids):
if isinstance(id_info, tuple):
id_value, name = id_info
else:
id_value = id_info
name = id_value
id_to_name[id_value] = name
# 构建请求URL
url = f"https://newsnow.busiyi.world/api/s?id={id_value}&latest"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
"Accept": "application/json, text/plain, */*",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Connection": "keep-alive",
"Cache-Control": "no-cache",
}
# 重试机制
max_retries = 2
retries = 0
success = False
while retries <= max_retries and not success:
try:
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
data_text = response.text
data_json = json.loads(data_text)
status = data_json.get("status", "未知")
if status not in ["success", "cache"]:
raise ValueError(f"响应状态异常: {status}")
status_info = "最新数据" if status == "success" else "缓存数据"
print(f"获取 {id_value} 成功({status_info}")
# 解析数据
results[id_value] = {}
for index, item in enumerate(data_json.get("items", []), 1):
title = item["title"]
url_link = item.get("url", "")
mobile_url = item.get("mobileUrl", "")
if title in results[id_value]:
results[id_value][title]["ranks"].append(index)
else:
results[id_value][title] = {
"ranks": [index],
"url": url_link,
"mobileUrl": mobile_url,
}
success = True
except Exception as e:
retries += 1
if retries <= max_retries:
wait_time = random.uniform(3, 5)
print(f"请求 {id_value} 失败: {e}. {wait_time:.2f}秒后重试...")
time.sleep(wait_time)
else:
print(f"请求 {id_value} 失败: {e}")
failed_ids.append(id_value)
# 请求间隔
if i < len(ids) - 1:
actual_interval = request_interval + random.randint(-10, 20)
actual_interval = max(50, actual_interval)
time.sleep(actual_interval / 1000)
# 格式化返回数据
news_data = []
for platform_id, titles_data in results.items():
platform_name = id_to_name.get(platform_id, platform_id)
for title, info in titles_data.items():
news_item = {
"platform_id": platform_id,
"platform_name": platform_name,
"title": title,
"ranks": info["ranks"]
}
# 条件性添加 URL 字段
if include_url:
news_item["url"] = info.get("url", "")
news_item["mobile_url"] = info.get("mobileUrl", "")
news_data.append(news_item)
# 获取北京时间
beijing_tz = pytz.timezone("Asia/Shanghai")
now = datetime.now(beijing_tz)
# 构建返回结果
result = {
"success": True,
"task_id": f"crawl_{int(time.time())}",
"status": "completed",
"crawl_time": now.strftime("%Y-%m-%d %H:%M:%S"),
"platforms": list(results.keys()),
"total_news": len(news_data),
"failed_platforms": failed_ids,
"data": news_data,
"saved_to_local": save_to_local
}
# 如果需要持久化,调用保存逻辑
if save_to_local:
try:
import re
# 辅助函数:清理标题
def clean_title(title: str) -> str:
"""清理标题中的特殊字符"""
if not isinstance(title, str):
title = str(title)
cleaned_title = title.replace("\n", " ").replace("\r", " ")
cleaned_title = re.sub(r"\s+", " ", cleaned_title)
cleaned_title = cleaned_title.strip()
return cleaned_title
# 辅助函数:创建目录
def ensure_directory_exists(directory: str):
"""确保目录存在"""
Path(directory).mkdir(parents=True, exist_ok=True)
# 格式化日期和时间
date_folder = now.strftime("%Y年%m月%d日")
time_filename = now.strftime("%H时%M分")
# 创建 txt 文件路径
txt_dir = self.project_root / "output" / date_folder / "txt"
ensure_directory_exists(str(txt_dir))
txt_file_path = txt_dir / f"{time_filename}.txt"
# 创建 html 文件路径
html_dir = self.project_root / "output" / date_folder / "html"
ensure_directory_exists(str(html_dir))
html_file_path = html_dir / f"{time_filename}.html"
# 保存 txt 文件(按照 main.py 的格式)
with open(txt_file_path, "w", encoding="utf-8") as f:
for id_value, title_data in results.items():
# id | name 或 id
name = id_to_name.get(id_value)
if name and name != id_value:
f.write(f"{id_value} | {name}\n")
else:
f.write(f"{id_value}\n")
# 按排名排序标题
sorted_titles = []
for title, info in title_data.items():
cleaned = clean_title(title)
if isinstance(info, dict):
ranks = info.get("ranks", [])
url = info.get("url", "")
mobile_url = info.get("mobileUrl", "")
else:
ranks = info if isinstance(info, list) else []
url = ""
mobile_url = ""
rank = ranks[0] if ranks else 1
sorted_titles.append((rank, cleaned, url, mobile_url))
sorted_titles.sort(key=lambda x: x[0])
for rank, cleaned, url, mobile_url in sorted_titles:
line = f"{rank}. {cleaned}"
if url:
line += f" [URL:{url}]"
if mobile_url:
line += f" [MOBILE:{mobile_url}]"
f.write(line + "\n")
f.write("\n")
if failed_ids:
f.write("==== 以下ID请求失败 ====\n")
for id_value in failed_ids:
f.write(f"{id_value}\n")
# 保存 html 文件(简化版)
html_content = self._generate_simple_html(results, id_to_name, failed_ids, now)
with open(html_file_path, "w", encoding="utf-8") as f:
f.write(html_content)
print(f"数据已保存到:")
print(f" TXT: {txt_file_path}")
print(f" HTML: {html_file_path}")
result["saved_files"] = {
"txt": str(txt_file_path),
"html": str(html_file_path)
}
result["note"] = "数据已持久化到 output 文件夹"
except Exception as e:
print(f"保存文件失败: {e}")
result["save_error"] = str(e)
result["note"] = "爬取成功但保存失败,数据仅在内存中"
else:
result["note"] = "临时爬取结果,未持久化到output文件夹"
return result
except MCPError as e:
return {
"success": False,
"error": e.to_dict()
}
except Exception as e:
import traceback
return {
"success": False,
"error": {
"code": "INTERNAL_ERROR",
"message": str(e),
"traceback": traceback.format_exc()
}
}
def _generate_simple_html(self, results: Dict, id_to_name: Dict, failed_ids: List, now) -> str:
"""生成简化的 HTML 报告"""
html = """<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>MCP 爬取结果</title>
<style>
body { font-family: Arial, sans-serif; margin: 20px; background: #f5f5f5; }
.container { max-width: 900px; margin: 0 auto; background: white; padding: 20px; border-radius: 8px; }
h1 { color: #333; border-bottom: 2px solid #4CAF50; padding-bottom: 10px; }
.platform { margin-bottom: 30px; }
.platform-name { background: #4CAF50; color: white; padding: 10px; border-radius: 5px; margin-bottom: 10px; }
.news-item { padding: 8px; border-bottom: 1px solid #eee; }
.rank { color: #666; font-weight: bold; margin-right: 10px; }
.title { color: #333; }
.link { color: #1976D2; text-decoration: none; margin-left: 10px; font-size: 0.9em; }
.link:hover { text-decoration: underline; }
.failed { background: #ffebee; padding: 10px; border-radius: 5px; margin-top: 20px; }
.failed h3 { color: #c62828; margin-top: 0; }
.timestamp { color: #666; font-size: 0.9em; text-align: right; margin-top: 20px; }
</style>
</head>
<body>
<div class="container">
<h1>MCP 爬取结果</h1>
"""
# 添加时间戳
html += f' <p class="timestamp">爬取时间: {now.strftime("%Y-%m-%d %H:%M:%S")}</p>\n\n'
# 遍历每个平台
for platform_id, titles_data in results.items():
platform_name = id_to_name.get(platform_id, platform_id)
html += f' <div class="platform">\n'
html += f' <div class="platform-name">{platform_name}</div>\n'
# 排序标题
sorted_items = []
for title, info in titles_data.items():
ranks = info.get("ranks", [])
url = info.get("url", "")
mobile_url = info.get("mobileUrl", "")
rank = ranks[0] if ranks else 999
sorted_items.append((rank, title, url, mobile_url))
sorted_items.sort(key=lambda x: x[0])
# 显示新闻
for rank, title, url, mobile_url in sorted_items:
html += f' <div class="news-item">\n'
html += f' <span class="rank">{rank}.</span>\n'
html += f' <span class="title">{self._html_escape(title)}</span>\n'
if url:
html += f' <a class="link" href="{self._html_escape(url)}" target="_blank">链接</a>\n'
if mobile_url and mobile_url != url:
html += f' <a class="link" href="{self._html_escape(mobile_url)}" target="_blank">移动版</a>\n'
html += ' </div>\n'
html += ' </div>\n\n'
# 失败的平台
if failed_ids:
html += ' <div class="failed">\n'
html += ' <h3>请求失败的平台</h3>\n'
html += ' <ul>\n'
for platform_id in failed_ids:
html += f' <li>{self._html_escape(platform_id)}</li>\n'
html += ' </ul>\n'
html += ' </div>\n'
html += """ </div>
</body>
</html>"""
return html
def _html_escape(self, text: str) -> str:
"""HTML 转义"""
if not isinstance(text, str):
text = str(text)
return (
text.replace("&", "&amp;")
.replace("<", "&lt;")
.replace(">", "&gt;")
.replace('"', "&quot;")
.replace("'", "&#x27;")
)
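When save_to_local=True, trigger_crawl persists the run to output/<YYYY年MM月DD日>/txt/<HH时MM分>.txt (a platform header of "id | name" or bare "id", then rank-sorted titles with optional [URL:...] / [MOBILE:...] suffixes) plus a matching HTML report, and reports both paths back in the result. A short usage sketch; the platform ID is illustrative and the actual module path of SystemManagementTools is not shown in this diff:

tools = SystemManagementTools()
resp = tools.trigger_crawl(platforms=["zhihu"], save_to_local=True)

if resp["success"]:
    print(resp["total_news"], "titles from", resp["platforms"])
    if "saved_files" in resp:
        print("txt:", resp["saved_files"]["txt"])
        print("html:", resp["saved_files"]["html"])
    elif "save_error" in resp:
        print("crawl ok, persistence failed:", resp["save_error"])
else:
    print(resp["error"])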