v3.0.0 AI 智能分析功能

sansan
2025-10-20 21:41:24 +08:00
parent da81d69309
commit 2afc24e6fb
29 changed files with 6931 additions and 54 deletions
+5
@@ -0,0 +1,5 @@
"""
MCP 工具模块
包含所有MCP工具的实现。
"""
File diff suppressed because it is too large
+66
@@ -0,0 +1,66 @@
"""
配置管理工具
实现配置查询和管理功能。
"""
from typing import Dict, Optional
from ..services.data_service import DataService
from ..utils.validators import validate_config_section
from ..utils.errors import MCPError
class ConfigManagementTools:
"""配置管理工具类"""
def __init__(self, project_root: str = None):
"""
初始化配置管理工具
Args:
project_root: 项目根目录
"""
self.data_service = DataService(project_root)
def get_current_config(self, section: Optional[str] = None) -> Dict:
"""
获取当前系统配置
Args:
section: 配置节 - all/crawler/push/keywords/weights,默认all
Returns:
配置字典
Example:
>>> tools = ConfigManagementTools()
>>> result = tools.get_current_config(section="crawler")
>>> print(result['config']['crawler']['platforms'])
"""
try:
# 参数验证
section = validate_config_section(section)
# 获取配置
config = self.data_service.get_current_config(section=section)
return {
"config": config,
"section": section,
"success": True
}
except MCPError as e:
return {
"success": False,
"error": e.to_dict()
}
except Exception as e:
return {
"success": False,
"error": {
"code": "INTERNAL_ERROR",
"message": str(e)
}
}
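All tools in this commit share one return convention: the payload fields sit alongside "success": True, while failures come back as "success": False plus an "error" dict. The keys "code" and "message" are shown in the generic-exception branch above; MCPError.to_dict() is assumed to produce a compatible shape. A minimal caller sketch:

tools = ConfigManagementTools()  # project_root=None falls through to DataService's own default handling
resp = tools.get_current_config(section="crawler")

if resp["success"]:
    print(resp["section"], resp["config"])
else:
    err = resp["error"]  # assumed shape: {"code": ..., "message": ..., ...}
    print(f"{err.get('code', 'UNKNOWN')}: {err.get('message', '')}")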
+284
@@ -0,0 +1,284 @@
"""
数据查询工具
实现P0核心的数据查询工具。
"""
from typing import Dict, List, Optional
from ..services.data_service import DataService
from ..utils.validators import (
validate_platforms,
validate_limit,
validate_keyword,
validate_date_range,
validate_top_n,
validate_mode,
validate_date_query
)
from ..utils.errors import MCPError
class DataQueryTools:
"""数据查询工具类"""
def __init__(self, project_root: str = None):
"""
初始化数据查询工具
Args:
project_root: 项目根目录
"""
self.data_service = DataService(project_root)
def get_latest_news(
self,
platforms: Optional[List[str]] = None,
limit: Optional[int] = None,
include_url: bool = False
) -> Dict:
"""
获取最新一批爬取的新闻数据
Args:
platforms: 平台ID列表,如 ['zhihu', 'weibo']
limit: 返回条数限制,默认50
include_url: 是否包含URL链接,默认False(节省token)
Returns:
新闻列表字典
Example:
>>> tools = DataQueryTools()
>>> result = tools.get_latest_news(platforms=['zhihu'], limit=10)
>>> print(result['total'])
10
"""
try:
# 参数验证
platforms = validate_platforms(platforms)
limit = validate_limit(limit, default=50)
# 获取数据
news_list = self.data_service.get_latest_news(
platforms=platforms,
limit=limit,
include_url=include_url
)
return {
"news": news_list,
"total": len(news_list),
"platforms": platforms,
"success": True
}
except MCPError as e:
return {
"success": False,
"error": e.to_dict()
}
except Exception as e:
return {
"success": False,
"error": {
"code": "INTERNAL_ERROR",
"message": str(e)
}
}
def search_news_by_keyword(
self,
keyword: str,
date_range: Optional[Dict] = None,
platforms: Optional[List[str]] = None,
limit: Optional[int] = None
) -> Dict:
"""
按关键词搜索历史新闻
Args:
keyword: 搜索关键词(必需)
date_range: 日期范围,格式: {"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"}
platforms: 平台过滤列表
limit: 返回条数限制(可选,默认返回所有)
Returns:
搜索结果字典
Example:
>>> tools = DataQueryTools()
>>> result = tools.search_news_by_keyword(
... keyword="人工智能",
... date_range={"start": "2025-10-01", "end": "2025-10-11"},
... limit=50
... )
>>> print(result['total'])
"""
try:
# 参数验证
keyword = validate_keyword(keyword)
date_range_tuple = validate_date_range(date_range)
platforms = validate_platforms(platforms)
if limit is not None:
limit = validate_limit(limit, default=100)
# 搜索数据
search_result = self.data_service.search_news_by_keyword(
keyword=keyword,
date_range=date_range_tuple,
platforms=platforms,
limit=limit
)
return {
**search_result,
"success": True
}
except MCPError as e:
return {
"success": False,
"error": e.to_dict()
}
except Exception as e:
return {
"success": False,
"error": {
"code": "INTERNAL_ERROR",
"message": str(e)
}
}
def get_trending_topics(
self,
top_n: Optional[int] = None,
mode: Optional[str] = None
) -> Dict:
"""
获取个人关注词的新闻出现频率统计
注意:本工具基于 config/frequency_words.txt 中的个人关注词列表进行统计,
而不是自动从新闻中提取热点话题。这是一个个人可定制的关注词列表,
用户可以根据自己的兴趣添加或删除关注词。
Args:
top_n: 返回TOP N关注词,默认10
mode: 模式 - daily(当日累计), current(最新一批), incremental(增量)
Returns:
关注词频率统计字典,包含每个关注词在新闻中出现的次数
Example:
>>> tools = DataQueryTools()
>>> result = tools.get_trending_topics(top_n=5, mode="current")
>>> print(len(result['topics']))
5
>>> # 返回的是你在 frequency_words.txt 中设置的关注词的频率统计
"""
try:
# 参数验证
top_n = validate_top_n(top_n, default=10)
valid_modes = ["daily", "current", "incremental"]
mode = validate_mode(mode, valid_modes, default="current")
# 获取趋势话题
trending_result = self.data_service.get_trending_topics(
top_n=top_n,
mode=mode
)
return {
**trending_result,
"success": True
}
except MCPError as e:
return {
"success": False,
"error": e.to_dict()
}
except Exception as e:
return {
"success": False,
"error": {
"code": "INTERNAL_ERROR",
"message": str(e)
}
}
def get_news_by_date(
self,
date_query: Optional[str] = None,
platforms: Optional[List[str]] = None,
limit: Optional[int] = None,
include_url: bool = False
) -> Dict:
"""
按日期查询新闻,支持自然语言日期
Args:
date_query: 日期查询字符串(可选,默认"今天"),支持:
- 相对日期:今天、昨天、前天、3天前、yesterday、3 days ago
- 星期:上周一、本周三、last monday、this friday
- 绝对日期:2025-10-10、10月10日、2025年10月10日
platforms: 平台ID列表,如 ['zhihu', 'weibo']
limit: 返回条数限制,默认50
include_url: 是否包含URL链接,默认False(节省token)
Returns:
新闻列表字典
Example:
>>> tools = DataQueryTools()
>>> # 不指定日期,默认查询今天
>>> result = tools.get_news_by_date(platforms=['zhihu'], limit=20)
>>> # 指定日期
>>> result = tools.get_news_by_date(
... date_query="昨天",
... platforms=['zhihu'],
... limit=20
... )
>>> print(result['total'])
20
"""
try:
# 参数验证 - 默认今天
if date_query is None:
date_query = "今天"
target_date = validate_date_query(date_query)
platforms = validate_platforms(platforms)
limit = validate_limit(limit, default=50)
# 获取数据
news_list = self.data_service.get_news_by_date(
target_date=target_date,
platforms=platforms,
limit=limit,
include_url=include_url
)
return {
"news": news_list,
"total": len(news_list),
"date": target_date.strftime("%Y-%m-%d"),
"date_query": date_query,
"platforms": platforms,
"success": True
}
except MCPError as e:
return {
"success": False,
"error": e.to_dict()
}
except Exception as e:
return {
"success": False,
"error": {
"code": "INTERNAL_ERROR",
"message": str(e)
}
}
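These classes only build and return plain dicts; exposing them as MCP tools happens in the server entry point, which is not part of this hunk. As a hedged illustration only, one of the methods could be registered with the reference MCP Python SDK (FastMCP) roughly as below; the server name and the direct import of DataQueryTools are assumptions, not taken from this diff:

from typing import Dict, List, Optional

from mcp.server.fastmcp import FastMCP

mcp = FastMCP("news-mcp")       # illustrative server name
query_tools = DataQueryTools()  # assumes the class is importable where the server is defined

@mcp.tool()
def get_latest_news(platforms: Optional[List[str]] = None,
                    limit: Optional[int] = None,
                    include_url: bool = False) -> Dict:
    """获取最新一批爬取的新闻数据"""
    return query_tools.get_latest_news(platforms=platforms, limit=limit, include_url=include_url)

if __name__ == "__main__":
    mcp.run()  # stdio transport by default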
+664
@@ -0,0 +1,664 @@
"""
智能新闻检索工具
提供模糊搜索、链接查询、历史相关新闻检索等高级搜索功能。
"""
import re
from collections import Counter
from datetime import datetime, timedelta
from difflib import SequenceMatcher
from typing import Dict, List, Optional, Tuple
from ..services.data_service import DataService
from ..utils.validators import validate_keyword, validate_limit
from ..utils.errors import MCPError, InvalidParameterError, DataNotFoundError
class SearchTools:
"""智能新闻检索工具类"""
def __init__(self, project_root: str = None):
"""
初始化智能检索工具
Args:
project_root: 项目根目录
"""
self.data_service = DataService(project_root)
# 中文停用词列表
self.stopwords = {
'的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一',
'一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有',
'看', '好', '自己', '这', '那', '他', '她', '它', '们', '个', '中', '为',
'对', '把', '被', '让', '从', '向', '与', '及', '等', '而', '可以', '还',
'已经', '又', '再', '才', '并', '因为', '所以', '如果', '虽然', '然而'
}
def search_news_unified(
self,
query: str,
search_mode: str = "keyword",
date_range: Optional[Dict[str, str]] = None,
platforms: Optional[List[str]] = None,
limit: int = 50,
sort_by: str = "relevance",
threshold: float = 0.6,
include_url: bool = False
) -> Dict:
"""
统一新闻搜索工具 - 整合多种搜索模式
Args:
query: 查询内容(必需)- 关键词、内容片段或实体名称
search_mode: 搜索模式,可选值:
- "keyword": 精确关键词匹配(默认)
- "fuzzy": 模糊内容匹配(使用相似度算法)
- "entity": 实体名称搜索(自动按权重排序)
date_range: 日期范围,格式: {"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"}
不指定则默认查询今天
platforms: 平台过滤列表,如 ['zhihu', 'weibo']
limit: 返回条数限制,默认50
sort_by: 排序方式,可选值:
- "relevance": 按相关度排序(默认)
- "weight": 按新闻权重排序
- "date": 按日期排序
threshold: 相似度阈值(仅fuzzy模式有效),0-1之间,默认0.6
include_url: 是否包含URL链接,默认False(节省token)
Returns:
搜索结果字典,包含匹配的新闻列表
Examples:
- search_news_unified(query="人工智能", search_mode="keyword")
- search_news_unified(query="特斯拉降价", search_mode="fuzzy", threshold=0.4)
- search_news_unified(query="马斯克", search_mode="entity", limit=20)
- search_news_unified(query="iPhone 16发布", search_mode="keyword")
"""
try:
# 参数验证
query = validate_keyword(query)
if search_mode not in ["keyword", "fuzzy", "entity"]:
raise InvalidParameterError(
f"无效的搜索模式: {search_mode}",
suggestion="支持的模式: keyword, fuzzy, entity"
)
if sort_by not in ["relevance", "weight", "date"]:
raise InvalidParameterError(
f"无效的排序方式: {sort_by}",
suggestion="支持的排序: relevance, weight, date"
)
limit = validate_limit(limit, default=50)
threshold = max(0.0, min(1.0, threshold))
# 处理日期范围
if date_range:
from ..utils.validators import validate_date_range
date_range_tuple = validate_date_range(date_range)
start_date, end_date = date_range_tuple
else:
# 默认今天
start_date = end_date = datetime.now()
# 收集所有匹配的新闻
all_matches = []
current_date = start_date
while current_date <= end_date:
try:
all_titles, id_to_name, timestamps = self.data_service.parser.read_all_titles_for_date(
date=current_date,
platform_ids=platforms
)
# 根据搜索模式执行不同的搜索逻辑
if search_mode == "keyword":
matches = self._search_by_keyword_mode(
query, all_titles, id_to_name, current_date, include_url
)
elif search_mode == "fuzzy":
matches = self._search_by_fuzzy_mode(
query, all_titles, id_to_name, current_date, threshold, include_url
)
else: # entity
matches = self._search_by_entity_mode(
query, all_titles, id_to_name, current_date, include_url
)
all_matches.extend(matches)
except DataNotFoundError:
# 该日期没有数据,继续下一天
pass
current_date += timedelta(days=1)
if not all_matches:
time_desc = "今天" if start_date == end_date else f"{start_date.strftime('%Y-%m-%d')}{end_date.strftime('%Y-%m-%d')}"
return {
"success": True,
"results": [],
"total": 0,
"query": query,
"search_mode": search_mode,
"time_range": time_desc,
"message": f"未找到匹配的新闻({time_desc}"
}
# 统一排序逻辑
if sort_by == "relevance":
all_matches.sort(key=lambda x: x.get("similarity_score", 1.0), reverse=True)
elif sort_by == "weight":
from .analytics import calculate_news_weight
all_matches.sort(key=lambda x: calculate_news_weight(x), reverse=True)
elif sort_by == "date":
all_matches.sort(key=lambda x: x.get("date", ""), reverse=True)
# 限制返回数量
results = all_matches[:limit]
# 构建时间范围描述
if start_date == end_date:
time_range_desc = start_date.strftime("%Y-%m-%d")
else:
time_range_desc = f"{start_date.strftime('%Y-%m-%d')}{end_date.strftime('%Y-%m-%d')}"
result = {
"success": True,
"summary": {
"total_found": len(all_matches),
"returned_count": len(results),
"requested_limit": limit,
"search_mode": search_mode,
"query": query,
"platforms": platforms or "所有平台",
"time_range": time_range_desc,
"sort_by": sort_by
},
"results": results
}
if search_mode == "fuzzy":
result["summary"]["threshold"] = threshold
if len(all_matches) < limit:
result["note"] = f"模糊搜索模式下,相似度阈值 {threshold} 仅匹配到 {len(all_matches)} 条结果"
return result
except MCPError as e:
return {
"success": False,
"error": e.to_dict()
}
except Exception as e:
return {
"success": False,
"error": {
"code": "INTERNAL_ERROR",
"message": str(e)
}
}
def _search_by_keyword_mode(
self,
query: str,
all_titles: Dict,
id_to_name: Dict,
current_date: datetime,
include_url: bool
) -> List[Dict]:
"""
关键词搜索模式(精确匹配)
Args:
query: 搜索关键词
all_titles: 所有标题字典
id_to_name: 平台ID到名称映射
current_date: 当前日期
Returns:
匹配的新闻列表
"""
matches = []
query_lower = query.lower()
for platform_id, titles in all_titles.items():
platform_name = id_to_name.get(platform_id, platform_id)
for title, info in titles.items():
# 精确包含判断
if query_lower in title.lower():
news_item = {
"title": title,
"platform": platform_id,
"platform_name": platform_name,
"date": current_date.strftime("%Y-%m-%d"),
"similarity_score": 1.0, # 精确匹配,相似度为1
"ranks": info.get("ranks", []),
"count": len(info.get("ranks", [])),
"rank": info["ranks"][0] if info["ranks"] else 999
}
# 条件性添加 URL 字段
if include_url:
news_item["url"] = info.get("url", "")
news_item["mobileUrl"] = info.get("mobileUrl", "")
matches.append(news_item)
return matches
def _search_by_fuzzy_mode(
self,
query: str,
all_titles: Dict,
id_to_name: Dict,
current_date: datetime,
threshold: float,
include_url: bool
) -> List[Dict]:
"""
模糊搜索模式(使用相似度算法)
Args:
query: 搜索内容
all_titles: 所有标题字典
id_to_name: 平台ID到名称映射
current_date: 当前日期
threshold: 相似度阈值
Returns:
匹配的新闻列表
"""
matches = []
for platform_id, titles in all_titles.items():
platform_name = id_to_name.get(platform_id, platform_id)
for title, info in titles.items():
# 模糊匹配
is_match, similarity = self._fuzzy_match(query, title, threshold)
if is_match:
news_item = {
"title": title,
"platform": platform_id,
"platform_name": platform_name,
"date": current_date.strftime("%Y-%m-%d"),
"similarity_score": round(similarity, 4),
"ranks": info.get("ranks", []),
"count": len(info.get("ranks", [])),
"rank": info["ranks"][0] if info["ranks"] else 999
}
# 条件性添加 URL 字段
if include_url:
news_item["url"] = info.get("url", "")
news_item["mobileUrl"] = info.get("mobileUrl", "")
matches.append(news_item)
return matches
def _search_by_entity_mode(
self,
query: str,
all_titles: Dict,
id_to_name: Dict,
current_date: datetime,
include_url: bool
) -> List[Dict]:
"""
实体搜索模式(自动按权重排序)
Args:
query: 实体名称
all_titles: 所有标题字典
id_to_name: 平台ID到名称映射
current_date: 当前日期
Returns:
匹配的新闻列表
"""
matches = []
for platform_id, titles in all_titles.items():
platform_name = id_to_name.get(platform_id, platform_id)
for title, info in titles.items():
# 实体搜索:精确包含实体名称
if query in title:
news_item = {
"title": title,
"platform": platform_id,
"platform_name": platform_name,
"date": current_date.strftime("%Y-%m-%d"),
"similarity_score": 1.0,
"ranks": info.get("ranks", []),
"count": len(info.get("ranks", [])),
"rank": info["ranks"][0] if info["ranks"] else 999
}
# 条件性添加 URL 字段
if include_url:
news_item["url"] = info.get("url", "")
news_item["mobileUrl"] = info.get("mobileUrl", "")
matches.append(news_item)
return matches
def _calculate_similarity(self, text1: str, text2: str) -> float:
"""
计算两个文本的相似度
Args:
text1: 文本1
text2: 文本2
Returns:
相似度分数 (0-1之间)
"""
# 使用 difflib.SequenceMatcher 计算序列相似度
return SequenceMatcher(None, text1.lower(), text2.lower()).ratio()
def _fuzzy_match(self, query: str, text: str, threshold: float = 0.3) -> Tuple[bool, float]:
"""
模糊匹配函数
Args:
query: 查询文本
text: 待匹配文本
threshold: 匹配阈值
Returns:
(是否匹配, 相似度分数)
"""
# 直接包含判断
if query.lower() in text.lower():
return True, 1.0
# 计算整体相似度
similarity = self._calculate_similarity(query, text)
if similarity >= threshold:
return True, similarity
# 分词后的部分匹配
query_words = set(self._extract_keywords(query))
text_words = set(self._extract_keywords(text))
if not query_words or not text_words:
return False, 0.0
# 计算关键词重合度
common_words = query_words & text_words
keyword_overlap = len(common_words) / len(query_words)
if keyword_overlap >= 0.5: # 50%的关键词重合
return True, keyword_overlap
return False, similarity
def _extract_keywords(self, text: str, min_length: int = 2) -> List[str]:
"""
从文本中提取关键词
Args:
text: 输入文本
min_length: 最小词长
Returns:
关键词列表
"""
# 移除URL和特殊字符
text = re.sub(r'http[s]?://\S+', '', text)
text = re.sub(r'\[.*?\]', '', text) # 移除方括号内容
# 使用正则表达式分词(中文和英文)
words = re.findall(r'[\w]+', text)
# 过滤停用词和短词
keywords = [
word for word in words
if word and len(word) >= min_length and word not in self.stopwords
]
return keywords
def _calculate_keyword_overlap(self, keywords1: List[str], keywords2: List[str]) -> float:
"""
计算两个关键词列表的重合度
Args:
keywords1: 关键词列表1
keywords2: 关键词列表2
Returns:
重合度分数 (0-1之间)
"""
if not keywords1 or not keywords2:
return 0.0
set1 = set(keywords1)
set2 = set(keywords2)
# Jaccard 相似度
intersection = len(set1 & set2)
union = len(set1 | set2)
if union == 0:
return 0.0
return intersection / union
def search_related_news_history(
self,
reference_text: str,
time_range: str = "yesterday",
start_date: Optional[datetime] = None,
end_date: Optional[datetime] = None,
threshold: float = 0.4,
limit: int = 50,
include_url: bool = False
) -> Dict:
"""
在历史数据中搜索与给定新闻相关的新闻
Args:
reference_text: 参考新闻标题或内容
time_range: 时间范围预设值,可选:
- "yesterday": 昨天
- "last_week": 上周 (7天)
- "last_month": 上个月 (30天)
- "custom": 自定义日期范围(需要提供 start_date 和 end_date
start_date: 自定义开始日期(仅当 time_range="custom" 时有效)
end_date: 自定义结束日期(仅当 time_range="custom" 时有效)
threshold: 相似度阈值 (0-1之间),默认0.4
limit: 返回条数限制,默认50
include_url: 是否包含URL链接,默认False(节省token)
Returns:
搜索结果字典,包含相关新闻列表
Example:
>>> tools = SearchTools()
>>> result = tools.search_related_news_history(
... reference_text="人工智能技术突破",
... time_range="last_week",
... threshold=0.4,
... limit=50
... )
>>> for news in result['results']:
... print(f"{news['date']}: {news['title']} (相似度: {news['similarity_score']})")
"""
try:
# 参数验证
reference_text = validate_keyword(reference_text)
threshold = max(0.0, min(1.0, threshold))
limit = validate_limit(limit, default=50)
# 确定查询日期范围
today = datetime.now()
if time_range == "yesterday":
search_start = today - timedelta(days=1)
search_end = today - timedelta(days=1)
elif time_range == "last_week":
search_start = today - timedelta(days=7)
search_end = today - timedelta(days=1)
elif time_range == "last_month":
search_start = today - timedelta(days=30)
search_end = today - timedelta(days=1)
elif time_range == "custom":
if not start_date or not end_date:
raise InvalidParameterError(
"自定义时间范围需要提供 start_date 和 end_date",
suggestion="请提供 start_date 和 end_date 参数"
)
search_start = start_date
search_end = end_date
else:
raise InvalidParameterError(
f"不支持的时间范围: {time_range}",
suggestion="请使用 'yesterday', 'last_week', 'last_month''custom'"
)
# 提取参考文本的关键词
reference_keywords = self._extract_keywords(reference_text)
if not reference_keywords:
raise InvalidParameterError(
"无法从参考文本中提取关键词",
suggestion="请提供更详细的文本内容"
)
# 收集所有相关新闻
all_related_news = []
current_date = search_start
while current_date <= search_end:
try:
# 读取该日期的数据
all_titles, id_to_name, _ = self.data_service.parser.read_all_titles_for_date(current_date)
# 搜索相关新闻
for platform_id, titles in all_titles.items():
platform_name = id_to_name.get(platform_id, platform_id)
for title, info in titles.items():
# 计算标题相似度
title_similarity = self._calculate_similarity(reference_text, title)
# 提取标题关键词
title_keywords = self._extract_keywords(title)
# 计算关键词重合度
keyword_overlap = self._calculate_keyword_overlap(
reference_keywords,
title_keywords
)
# 综合相似度 (70% 关键词重合 + 30% 文本相似度)
combined_score = keyword_overlap * 0.7 + title_similarity * 0.3
if combined_score >= threshold:
news_item = {
"title": title,
"platform": platform_id,
"platform_name": platform_name,
"date": current_date.strftime("%Y-%m-%d"),
"similarity_score": round(combined_score, 4),
"keyword_overlap": round(keyword_overlap, 4),
"text_similarity": round(title_similarity, 4),
"common_keywords": list(set(reference_keywords) & set(title_keywords)),
"rank": info["ranks"][0] if info["ranks"] else 0
}
# 条件性添加 URL 字段
if include_url:
news_item["url"] = info.get("url", "")
news_item["mobileUrl"] = info.get("mobileUrl", "")
all_related_news.append(news_item)
except DataNotFoundError:
# 该日期没有数据,继续下一天
pass
except Exception as e:
# 记录错误但继续处理其他日期
print(f"Warning: 处理日期 {current_date.strftime('%Y-%m-%d')} 时出错: {e}")
# 移动到下一天
current_date += timedelta(days=1)
if not all_related_news:
return {
"success": True,
"results": [],
"total": 0,
"query": reference_text,
"time_range": time_range,
"date_range": {
"start": search_start.strftime("%Y-%m-%d"),
"end": search_end.strftime("%Y-%m-%d")
},
"message": "未找到相关新闻"
}
# 按相似度排序
all_related_news.sort(key=lambda x: x["similarity_score"], reverse=True)
# 限制返回数量
results = all_related_news[:limit]
# 统计信息
platform_distribution = Counter([news["platform"] for news in all_related_news])
date_distribution = Counter([news["date"] for news in all_related_news])
result = {
"success": True,
"summary": {
"total_found": len(all_related_news),
"returned_count": len(results),
"requested_limit": limit,
"threshold": threshold,
"reference_text": reference_text,
"reference_keywords": reference_keywords,
"time_range": time_range,
"date_range": {
"start": search_start.strftime("%Y-%m-%d"),
"end": search_end.strftime("%Y-%m-%d")
}
},
"results": results,
"statistics": {
"platform_distribution": dict(platform_distribution),
"date_distribution": dict(date_distribution),
"avg_similarity": round(
sum([news["similarity_score"] for news in all_related_news]) / len(all_related_news),
4
) if all_related_news else 0.0
}
}
if len(all_related_news) < limit:
result["note"] = f"相关性阈值 {threshold} 下仅找到 {len(all_related_news)} 条相关新闻"
return result
except MCPError as e:
return {
"success": False,
"error": e.to_dict()
}
except Exception as e:
return {
"success": False,
"error": {
"code": "INTERNAL_ERROR",
"message": str(e)
}
}
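The relevance score behind search_related_news_history combines keyword overlap (Jaccard similarity over the extracted keyword sets) with difflib's SequenceMatcher ratio at a fixed 70/30 weighting, and keeps a title only when the combined score clears the threshold (0.4 by default). A self-contained sketch of that same arithmetic, with simplified keyword extraction and no stopword filtering:

import re
from difflib import SequenceMatcher

def extract_keywords(text: str, min_length: int = 2) -> set:
    # same idea as SearchTools._extract_keywords: strip URLs, split on \w+ runs, drop short tokens
    text = re.sub(r'http[s]?://\S+', '', text)
    return {w for w in re.findall(r'[\w]+', text) if len(w) >= min_length}

def combined_score(reference: str, title: str) -> float:
    ref_kw, title_kw = extract_keywords(reference), extract_keywords(title)
    union = ref_kw | title_kw
    jaccard = len(ref_kw & title_kw) / len(union) if union else 0.0
    ratio = SequenceMatcher(None, reference.lower(), title.lower()).ratio()
    return round(0.7 * jaccard + 0.3 * ratio, 4)  # 70% keyword overlap + 30% text similarity

print(combined_score("OpenAI releases new model", "OpenAI new model release date"))  # clears the 0.4 default

Note that \w+ does not segment Chinese, so contiguous Chinese titles overlap only as whole runs; for Chinese text most of the signal in this sketch comes from the SequenceMatcher term.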
+465
@@ -0,0 +1,465 @@
"""
系统管理工具
实现系统状态查询和爬虫触发功能。
"""
from pathlib import Path
from typing import Dict, List, Optional
from ..services.data_service import DataService
from ..utils.validators import validate_platforms
from ..utils.errors import MCPError, CrawlTaskError
class SystemManagementTools:
"""系统管理工具类"""
def __init__(self, project_root: str = None):
"""
初始化系统管理工具
Args:
project_root: 项目根目录
"""
self.data_service = DataService(project_root)
if project_root:
self.project_root = Path(project_root)
else:
# 获取项目根目录
current_file = Path(__file__)
self.project_root = current_file.parent.parent.parent
def get_system_status(self) -> Dict:
"""
获取系统运行状态和健康检查信息
Returns:
系统状态字典
Example:
>>> tools = SystemManagementTools()
>>> result = tools.get_system_status()
>>> print(result['system']['version'])
"""
try:
# 获取系统状态
status = self.data_service.get_system_status()
return {
**status,
"success": True
}
except MCPError as e:
return {
"success": False,
"error": e.to_dict()
}
except Exception as e:
return {
"success": False,
"error": {
"code": "INTERNAL_ERROR",
"message": str(e)
}
}
def trigger_crawl(self, platforms: Optional[List[str]] = None, save_to_local: bool = False, include_url: bool = False) -> Dict:
"""
手动触发一次临时爬取任务(可选持久化)
Args:
platforms: 指定平台列表,为空则爬取所有平台
save_to_local: 是否保存到本地 output 目录,默认 False
include_url: 是否包含URL链接,默认False(节省token)
Returns:
爬取结果字典,包含新闻数据和保存路径(如果保存)
Example:
>>> tools = SystemManagementTools()
>>> # 临时爬取,不保存
>>> result = tools.trigger_crawl(platforms=['zhihu', 'weibo'])
>>> print(result['data'])
>>> # 爬取并保存到本地
>>> result = tools.trigger_crawl(platforms=['zhihu'], save_to_local=True)
>>> print(result['saved_files'])
"""
try:
import json
import time
import random
import requests
from datetime import datetime
import pytz
import yaml
# 参数验证
platforms = validate_platforms(platforms)
# 加载配置文件
config_path = self.project_root / "config" / "config.yaml"
if not config_path.exists():
raise CrawlTaskError(
"配置文件不存在",
suggestion=f"请确保配置文件存在: {config_path}"
)
# 读取配置
with open(config_path, "r", encoding="utf-8") as f:
config_data = yaml.safe_load(f)
# 获取平台配置
all_platforms = config_data.get("platforms", [])
if not all_platforms:
raise CrawlTaskError(
"配置文件中没有平台配置",
suggestion="请检查 config/config.yaml 中的 platforms 配置"
)
# 过滤平台
if platforms:
target_platforms = [p for p in all_platforms if p["id"] in platforms]
if not target_platforms:
raise CrawlTaskError(
f"指定的平台不存在: {platforms}",
suggestion=f"可用平台: {[p['id'] for p in all_platforms]}"
)
else:
target_platforms = all_platforms
# 获取请求间隔
request_interval = config_data.get("crawler", {}).get("request_interval", 100)
# 构建平台ID列表
ids = []
for platform in target_platforms:
if "name" in platform:
ids.append((platform["id"], platform["name"]))
else:
ids.append(platform["id"])
print(f"开始临时爬取,平台: {[p.get('name', p['id']) for p in target_platforms]}")
# 爬取数据
results = {}
id_to_name = {}
failed_ids = []
for i, id_info in enumerate(ids):
if isinstance(id_info, tuple):
id_value, name = id_info
else:
id_value = id_info
name = id_value
id_to_name[id_value] = name
# 构建请求URL
url = f"https://newsnow.busiyi.world/api/s?id={id_value}&latest"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
"Accept": "application/json, text/plain, */*",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Connection": "keep-alive",
"Cache-Control": "no-cache",
}
# 重试机制
max_retries = 2
retries = 0
success = False
while retries <= max_retries and not success:
try:
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
data_text = response.text
data_json = json.loads(data_text)
status = data_json.get("status", "未知")
if status not in ["success", "cache"]:
raise ValueError(f"响应状态异常: {status}")
status_info = "最新数据" if status == "success" else "缓存数据"
print(f"获取 {id_value} 成功({status_info}")
# 解析数据
results[id_value] = {}
for index, item in enumerate(data_json.get("items", []), 1):
title = item["title"]
url_link = item.get("url", "")
mobile_url = item.get("mobileUrl", "")
if title in results[id_value]:
results[id_value][title]["ranks"].append(index)
else:
results[id_value][title] = {
"ranks": [index],
"url": url_link,
"mobileUrl": mobile_url,
}
success = True
except Exception as e:
retries += 1
if retries <= max_retries:
wait_time = random.uniform(3, 5)
print(f"请求 {id_value} 失败: {e}. {wait_time:.2f}秒后重试...")
time.sleep(wait_time)
else:
print(f"请求 {id_value} 失败: {e}")
failed_ids.append(id_value)
# 请求间隔
if i < len(ids) - 1:
actual_interval = request_interval + random.randint(-10, 20)
actual_interval = max(50, actual_interval)
time.sleep(actual_interval / 1000)
# 格式化返回数据
news_data = []
for platform_id, titles_data in results.items():
platform_name = id_to_name.get(platform_id, platform_id)
for title, info in titles_data.items():
news_item = {
"platform_id": platform_id,
"platform_name": platform_name,
"title": title,
"ranks": info["ranks"]
}
# 条件性添加 URL 字段
if include_url:
news_item["url"] = info.get("url", "")
news_item["mobile_url"] = info.get("mobileUrl", "")
news_data.append(news_item)
# 获取北京时间
beijing_tz = pytz.timezone("Asia/Shanghai")
now = datetime.now(beijing_tz)
# 构建返回结果
result = {
"success": True,
"task_id": f"crawl_{int(time.time())}",
"status": "completed",
"crawl_time": now.strftime("%Y-%m-%d %H:%M:%S"),
"platforms": list(results.keys()),
"total_news": len(news_data),
"failed_platforms": failed_ids,
"data": news_data,
"saved_to_local": save_to_local
}
# 如果需要持久化,调用保存逻辑
if save_to_local:
try:
import re
# 辅助函数:清理标题
def clean_title(title: str) -> str:
"""清理标题中的特殊字符"""
if not isinstance(title, str):
title = str(title)
cleaned_title = title.replace("\n", " ").replace("\r", " ")
cleaned_title = re.sub(r"\s+", " ", cleaned_title)
cleaned_title = cleaned_title.strip()
return cleaned_title
# 辅助函数:创建目录
def ensure_directory_exists(directory: str):
"""确保目录存在"""
Path(directory).mkdir(parents=True, exist_ok=True)
# 格式化日期和时间
date_folder = now.strftime("%Y年%m月%d日")
time_filename = now.strftime("%H时%M分")
# 创建 txt 文件路径
txt_dir = self.project_root / "output" / date_folder / "txt"
ensure_directory_exists(str(txt_dir))
txt_file_path = txt_dir / f"{time_filename}.txt"
# 创建 html 文件路径
html_dir = self.project_root / "output" / date_folder / "html"
ensure_directory_exists(str(html_dir))
html_file_path = html_dir / f"{time_filename}.html"
# 保存 txt 文件(按照 main.py 的格式)
with open(txt_file_path, "w", encoding="utf-8") as f:
for id_value, title_data in results.items():
# id | name 或 id
name = id_to_name.get(id_value)
if name and name != id_value:
f.write(f"{id_value} | {name}\n")
else:
f.write(f"{id_value}\n")
# 按排名排序标题
sorted_titles = []
for title, info in title_data.items():
cleaned = clean_title(title)
if isinstance(info, dict):
ranks = info.get("ranks", [])
url = info.get("url", "")
mobile_url = info.get("mobileUrl", "")
else:
ranks = info if isinstance(info, list) else []
url = ""
mobile_url = ""
rank = ranks[0] if ranks else 1
sorted_titles.append((rank, cleaned, url, mobile_url))
sorted_titles.sort(key=lambda x: x[0])
for rank, cleaned, url, mobile_url in sorted_titles:
line = f"{rank}. {cleaned}"
if url:
line += f" [URL:{url}]"
if mobile_url:
line += f" [MOBILE:{mobile_url}]"
f.write(line + "\n")
f.write("\n")
if failed_ids:
f.write("==== 以下ID请求失败 ====\n")
for id_value in failed_ids:
f.write(f"{id_value}\n")
# 保存 html 文件(简化版)
html_content = self._generate_simple_html(results, id_to_name, failed_ids, now)
with open(html_file_path, "w", encoding="utf-8") as f:
f.write(html_content)
print(f"数据已保存到:")
print(f" TXT: {txt_file_path}")
print(f" HTML: {html_file_path}")
result["saved_files"] = {
"txt": str(txt_file_path),
"html": str(html_file_path)
}
result["note"] = "数据已持久化到 output 文件夹"
except Exception as e:
print(f"保存文件失败: {e}")
result["save_error"] = str(e)
result["note"] = "爬取成功但保存失败,数据仅在内存中"
else:
result["note"] = "临时爬取结果,未持久化到output文件夹"
return result
except MCPError as e:
return {
"success": False,
"error": e.to_dict()
}
except Exception as e:
import traceback
return {
"success": False,
"error": {
"code": "INTERNAL_ERROR",
"message": str(e),
"traceback": traceback.format_exc()
}
}
def _generate_simple_html(self, results: Dict, id_to_name: Dict, failed_ids: List, now) -> str:
"""生成简化的 HTML 报告"""
html = """<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>MCP 爬取结果</title>
<style>
body { font-family: Arial, sans-serif; margin: 20px; background: #f5f5f5; }
.container { max-width: 900px; margin: 0 auto; background: white; padding: 20px; border-radius: 8px; }
h1 { color: #333; border-bottom: 2px solid #4CAF50; padding-bottom: 10px; }
.platform { margin-bottom: 30px; }
.platform-name { background: #4CAF50; color: white; padding: 10px; border-radius: 5px; margin-bottom: 10px; }
.news-item { padding: 8px; border-bottom: 1px solid #eee; }
.rank { color: #666; font-weight: bold; margin-right: 10px; }
.title { color: #333; }
.link { color: #1976D2; text-decoration: none; margin-left: 10px; font-size: 0.9em; }
.link:hover { text-decoration: underline; }
.failed { background: #ffebee; padding: 10px; border-radius: 5px; margin-top: 20px; }
.failed h3 { color: #c62828; margin-top: 0; }
.timestamp { color: #666; font-size: 0.9em; text-align: right; margin-top: 20px; }
</style>
</head>
<body>
<div class="container">
<h1>MCP 爬取结果</h1>
"""
# 添加时间戳
html += f' <p class="timestamp">爬取时间: {now.strftime("%Y-%m-%d %H:%M:%S")}</p>\n\n'
# 遍历每个平台
for platform_id, titles_data in results.items():
platform_name = id_to_name.get(platform_id, platform_id)
html += f' <div class="platform">\n'
html += f' <div class="platform-name">{platform_name}</div>\n'
# 排序标题
sorted_items = []
for title, info in titles_data.items():
ranks = info.get("ranks", [])
url = info.get("url", "")
mobile_url = info.get("mobileUrl", "")
rank = ranks[0] if ranks else 999
sorted_items.append((rank, title, url, mobile_url))
sorted_items.sort(key=lambda x: x[0])
# 显示新闻
for rank, title, url, mobile_url in sorted_items:
html += f' <div class="news-item">\n'
html += f' <span class="rank">{rank}.</span>\n'
html += f' <span class="title">{self._html_escape(title)}</span>\n'
if url:
html += f' <a class="link" href="{self._html_escape(url)}" target="_blank">链接</a>\n'
if mobile_url and mobile_url != url:
html += f' <a class="link" href="{self._html_escape(mobile_url)}" target="_blank">移动版</a>\n'
html += ' </div>\n'
html += ' </div>\n\n'
# 失败的平台
if failed_ids:
html += ' <div class="failed">\n'
html += ' <h3>请求失败的平台</h3>\n'
html += ' <ul>\n'
for platform_id in failed_ids:
html += f' <li>{self._html_escape(platform_id)}</li>\n'
html += ' </ul>\n'
html += ' </div>\n'
html += """ </div>
</body>
</html>"""
return html
def _html_escape(self, text: str) -> str:
"""HTML 转义"""
if not isinstance(text, str):
text = str(text)
return (
text.replace("&", "&amp;")
.replace("<", "&lt;")
.replace(">", "&gt;")
.replace('"', "&quot;")
.replace("'", "&#x27;")
)
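When save_to_local=True, trigger_crawl persists the run to output/<YYYY年MM月DD日>/txt/<HH时MM分>.txt (a platform header of "id | name" or bare "id", then rank-sorted titles with optional [URL:...] / [MOBILE:...] suffixes) plus a matching HTML report, and reports both paths back in the result. A short usage sketch; the platform ID is illustrative and the actual module path of SystemManagementTools is not shown in this diff:

tools = SystemManagementTools()
resp = tools.trigger_crawl(platforms=["zhihu"], save_to_local=True)

if resp["success"]:
    print(resp["total_news"], "titles from", resp["platforms"])
    if "saved_files" in resp:
        print("txt:", resp["saved_files"]["txt"])
        print("html:", resp["saved_files"]["html"])
    elif "save_error" in resp:
        print("crawl ok, persistence failed:", resp["save_error"])
else:
    print(resp["error"])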