mirror of
https://gitee.com/houhuan/TrendRadar.git
synced 2026-05-01 01:12:42 +08:00
v3.0.0 AI 智能分析功能
This commit is contained in:
@@ -0,0 +1,5 @@
|
||||
"""
|
||||
MCP 工具模块
|
||||
|
||||
包含所有MCP工具的实现。
|
||||
"""
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,66 @@
|
||||
"""
|
||||
配置管理工具
|
||||
|
||||
实现配置查询和管理功能。
|
||||
"""
|
||||
|
||||
from typing import Dict, Optional
|
||||
|
||||
from ..services.data_service import DataService
|
||||
from ..utils.validators import validate_config_section
|
||||
from ..utils.errors import MCPError
|
||||
|
||||
|
||||
class ConfigManagementTools:
    """Tools for querying the current system configuration via DataService."""

    def __init__(self, project_root: str = None):
        """Create the configuration tool set.

        Args:
            project_root: Project root directory, forwarded to the
                underlying DataService.
        """
        self.data_service = DataService(project_root)

    def get_current_config(self, section: Optional[str] = None) -> Dict:
        """Return the current system configuration.

        Args:
            section: Config section - all/crawler/push/keywords/weights;
                defaults to "all" (normalized by the validator).

        Returns:
            On success a dict with ``config``, ``section`` and
            ``success=True``; on failure ``success=False`` plus an
            ``error`` payload.

        Example:
            >>> tools = ConfigManagementTools()
            >>> result = tools.get_current_config(section="crawler")
            >>> print(result['crawler']['platforms'])
        """
        try:
            # Normalize and validate the requested section first.
            section = validate_config_section(section)
            # Delegate the actual lookup to the data service.
            fetched = self.data_service.get_current_config(section=section)
        except MCPError as e:
            # Known domain errors carry a structured payload.
            return {"success": False, "error": e.to_dict()}
        except Exception as e:
            # Unexpected failures are wrapped in a generic error envelope.
            return {
                "success": False,
                "error": {"code": "INTERNAL_ERROR", "message": str(e)},
            }
        return {"config": fetched, "section": section, "success": True}
|
||||
@@ -0,0 +1,284 @@
|
||||
"""
|
||||
数据查询工具
|
||||
|
||||
实现P0核心的数据查询工具。
|
||||
"""
|
||||
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from ..services.data_service import DataService
|
||||
from ..utils.validators import (
|
||||
validate_platforms,
|
||||
validate_limit,
|
||||
validate_keyword,
|
||||
validate_date_range,
|
||||
validate_top_n,
|
||||
validate_mode,
|
||||
validate_date_query
|
||||
)
|
||||
from ..utils.errors import MCPError
|
||||
|
||||
|
||||
class DataQueryTools:
    """P0 core data-query tools, backed by DataService."""

    def __init__(self, project_root: str = None):
        """Initialize the data-query tools.

        Args:
            project_root: Project root directory, forwarded to DataService.
        """
        self.data_service = DataService(project_root)

    def get_latest_news(
        self,
        platforms: Optional[List[str]] = None,
        limit: Optional[int] = None,
        include_url: bool = False
    ) -> Dict:
        """Fetch the most recent batch of crawled news.

        Args:
            platforms: Platform-id list, e.g. ['zhihu', 'weibo'].
            limit: Maximum items to return; defaults to 50.
                (Fixed: the old docstring claimed 20, but the code
                validates with default=50.)
            include_url: Include URL fields; defaults to False to save tokens.

        Returns:
            Dict with ``news``, ``total``, ``platforms`` and ``success``,
            or ``success=False`` plus an ``error`` payload.

        Example:
            >>> tools = DataQueryTools()
            >>> result = tools.get_latest_news(platforms=['zhihu'], limit=10)
            >>> print(result['total'])
            10
        """
        try:
            # Parameter validation.
            platforms = validate_platforms(platforms)
            limit = validate_limit(limit, default=50)

            # Fetch data through the service layer.
            news_list = self.data_service.get_latest_news(
                platforms=platforms,
                limit=limit,
                include_url=include_url
            )

            return {
                "news": news_list,
                "total": len(news_list),
                "platforms": platforms,
                "success": True
            }

        except MCPError as e:
            return {
                "success": False,
                "error": e.to_dict()
            }
        except Exception as e:
            return {
                "success": False,
                "error": {
                    "code": "INTERNAL_ERROR",
                    "message": str(e)
                }
            }

    def search_news_by_keyword(
        self,
        keyword: str,
        date_range: Optional[Dict] = None,
        platforms: Optional[List[str]] = None,
        limit: Optional[int] = None
    ) -> Dict:
        """Search historical news by keyword.

        Args:
            keyword: Search keyword (required).
            date_range: Date range, format: {"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"}.
            platforms: Platform filter list.
            limit: Maximum items to return (optional; all matches by default).

        Returns:
            Search-result dict (service payload plus ``success``),
            or ``success=False`` plus an ``error`` payload.

        Example:
            >>> tools = DataQueryTools()
            >>> result = tools.search_news_by_keyword(
            ...     keyword="人工智能",
            ...     date_range={"start": "2025-10-01", "end": "2025-10-11"},
            ...     limit=50
            ... )
            >>> print(result['total'])
        """
        try:
            # Parameter validation.
            keyword = validate_keyword(keyword)
            date_range_tuple = validate_date_range(date_range)
            platforms = validate_platforms(platforms)

            # Only clamp limit when the caller supplied one; None means "all".
            if limit is not None:
                limit = validate_limit(limit, default=100)

            # Run the search through the service layer.
            search_result = self.data_service.search_news_by_keyword(
                keyword=keyword,
                date_range=date_range_tuple,
                platforms=platforms,
                limit=limit
            )

            return {
                **search_result,
                "success": True
            }

        except MCPError as e:
            return {
                "success": False,
                "error": e.to_dict()
            }
        except Exception as e:
            return {
                "success": False,
                "error": {
                    "code": "INTERNAL_ERROR",
                    "message": str(e)
                }
            }

    def get_trending_topics(
        self,
        top_n: Optional[int] = None,
        mode: Optional[str] = None
    ) -> Dict:
        """Return frequency statistics for the personal watch-word list.

        NOTE: statistics are computed against the user-editable watch words
        in config/frequency_words.txt — topics are NOT auto-extracted from
        the news itself.

        Args:
            top_n: Return the TOP N watch words; defaults to 10.
            mode: One of daily (day cumulative), current (latest batch),
                incremental.

        Returns:
            Frequency-statistics dict (per-word appearance counts),
            or ``success=False`` plus an ``error`` payload.

        Example:
            >>> tools = DataQueryTools()
            >>> result = tools.get_trending_topics(top_n=5, mode="current")
            >>> print(len(result['topics']))
            5
        """
        try:
            # Parameter validation.
            top_n = validate_top_n(top_n, default=10)
            valid_modes = ["daily", "current", "incremental"]
            mode = validate_mode(mode, valid_modes, default="current")

            # Fetch the statistics through the service layer.
            trending_result = self.data_service.get_trending_topics(
                top_n=top_n,
                mode=mode
            )

            return {
                **trending_result,
                "success": True
            }

        except MCPError as e:
            return {
                "success": False,
                "error": e.to_dict()
            }
        except Exception as e:
            return {
                "success": False,
                "error": {
                    "code": "INTERNAL_ERROR",
                    "message": str(e)
                }
            }

    def get_news_by_date(
        self,
        date_query: Optional[str] = None,
        platforms: Optional[List[str]] = None,
        limit: Optional[int] = None,
        include_url: bool = False
    ) -> Dict:
        """Query news by date; supports natural-language date strings.

        Args:
            date_query: Date query string (optional, defaults to "今天"):
                - relative: 今天, 昨天, 前天, 3天前, yesterday, 3 days ago
                - weekday: 上周一, 本周三, last monday, this friday
                - absolute: 2025-10-10, 10月10日, 2025年10月10日
            platforms: Platform-id list, e.g. ['zhihu', 'weibo'].
            limit: Maximum items to return; defaults to 50.
            include_url: Include URL fields; defaults to False to save tokens.

        Returns:
            Dict with ``news``, ``total``, ``date``, ``date_query``,
            ``platforms`` and ``success``, or ``success=False`` plus an
            ``error`` payload.

        Example:
            >>> tools = DataQueryTools()
            >>> result = tools.get_news_by_date(
            ...     date_query="昨天",
            ...     platforms=['zhihu'],
            ...     limit=20
            ... )
            >>> print(result['total'])
            20
        """
        try:
            # Parameter validation — fall back to "today" when unspecified.
            if date_query is None:
                date_query = "今天"
            target_date = validate_date_query(date_query)
            platforms = validate_platforms(platforms)
            limit = validate_limit(limit, default=50)

            # Fetch data through the service layer.
            news_list = self.data_service.get_news_by_date(
                target_date=target_date,
                platforms=platforms,
                limit=limit,
                include_url=include_url
            )

            return {
                "news": news_list,
                "total": len(news_list),
                "date": target_date.strftime("%Y-%m-%d"),
                "date_query": date_query,
                "platforms": platforms,
                "success": True
            }

        except MCPError as e:
            return {
                "success": False,
                "error": e.to_dict()
            }
        except Exception as e:
            return {
                "success": False,
                "error": {
                    "code": "INTERNAL_ERROR",
                    "message": str(e)
                }
            }
|
||||
|
||||
@@ -0,0 +1,664 @@
|
||||
"""
|
||||
智能新闻检索工具
|
||||
|
||||
提供模糊搜索、链接查询、历史相关新闻检索等高级搜索功能。
|
||||
"""
|
||||
|
||||
import re
|
||||
from collections import Counter
|
||||
from datetime import datetime, timedelta
|
||||
from difflib import SequenceMatcher
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from ..services.data_service import DataService
|
||||
from ..utils.validators import validate_keyword, validate_limit
|
||||
from ..utils.errors import MCPError, InvalidParameterError, DataNotFoundError
|
||||
|
||||
|
||||
class SearchTools:
    """Smart news-retrieval tools: fuzzy search, entity search, history lookup."""

    def __init__(self, project_root: str = None):
        """Initialize the search tools.

        Args:
            project_root: Project root directory, forwarded to DataService.
        """
        self.data_service = DataService(project_root)
        # Chinese stopword list used by keyword extraction.
        self.stopwords = {
            '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一',
            '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有',
            '看', '好', '自己', '这', '那', '来', '被', '与', '为', '对', '将', '从',
            '以', '及', '等', '但', '或', '而', '于', '中', '由', '可', '可以', '已',
            '已经', '还', '更', '最', '再', '因为', '所以', '如果', '虽然', '然而'
        }

    def search_news_unified(
        self,
        query: str,
        search_mode: str = "keyword",
        date_range: Optional[Dict[str, str]] = None,
        platforms: Optional[List[str]] = None,
        limit: int = 50,
        sort_by: str = "relevance",
        threshold: float = 0.6,
        include_url: bool = False
    ) -> Dict:
        """Unified news search combining several search modes.

        Args:
            query: Query content (required) — keyword, snippet or entity name.
            search_mode: One of:
                - "keyword": exact keyword containment (default)
                - "fuzzy": similarity-based fuzzy matching
                - "entity": entity-name search
            date_range: {"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"};
                defaults to today when omitted.
            platforms: Platform filter list, e.g. ['zhihu', 'weibo'].
            limit: Maximum items to return; defaults to 50.
            sort_by: "relevance" (default), "weight" or "date".
            threshold: Similarity threshold (fuzzy mode only), 0-1, default 0.6.
            include_url: Include URL fields; defaults to False to save tokens.

        Returns:
            Search-result dict with a ``summary`` and matched ``results``,
            or ``success=False`` plus an ``error`` payload.

        Examples:
            - search_news_unified(query="人工智能", search_mode="keyword")
            - search_news_unified(query="特斯拉降价", search_mode="fuzzy", threshold=0.4)
            - search_news_unified(query="马斯克", search_mode="entity", limit=20)
        """
        try:
            # Parameter validation.
            query = validate_keyword(query)

            if search_mode not in ["keyword", "fuzzy", "entity"]:
                raise InvalidParameterError(
                    f"无效的搜索模式: {search_mode}",
                    suggestion="支持的模式: keyword, fuzzy, entity"
                )

            if sort_by not in ["relevance", "weight", "date"]:
                raise InvalidParameterError(
                    f"无效的排序方式: {sort_by}",
                    suggestion="支持的排序: relevance, weight, date"
                )

            limit = validate_limit(limit, default=50)
            # Clamp the threshold into [0, 1].
            threshold = max(0.0, min(1.0, threshold))

            # Resolve the date range; default to today.
            if date_range:
                from ..utils.validators import validate_date_range
                start_date, end_date = validate_date_range(date_range)
            else:
                start_date = end_date = datetime.now()

            # Collect matches day by day over the range.
            all_matches = []
            current_date = start_date

            while current_date <= end_date:
                try:
                    all_titles, id_to_name, timestamps = self.data_service.parser.read_all_titles_for_date(
                        date=current_date,
                        platform_ids=platforms
                    )

                    # Dispatch on the selected search mode.
                    if search_mode == "keyword":
                        matches = self._search_by_keyword_mode(
                            query, all_titles, id_to_name, current_date, include_url
                        )
                    elif search_mode == "fuzzy":
                        matches = self._search_by_fuzzy_mode(
                            query, all_titles, id_to_name, current_date, threshold, include_url
                        )
                    else:  # entity
                        matches = self._search_by_entity_mode(
                            query, all_titles, id_to_name, current_date, include_url
                        )

                    all_matches.extend(matches)

                except DataNotFoundError:
                    # No data for this day — continue with the next one.
                    pass

                current_date += timedelta(days=1)

            if not all_matches:
                time_desc = "今天" if start_date == end_date else f"{start_date.strftime('%Y-%m-%d')} 至 {end_date.strftime('%Y-%m-%d')}"
                return {
                    "success": True,
                    "results": [],
                    "total": 0,
                    "query": query,
                    "search_mode": search_mode,
                    "time_range": time_desc,
                    "message": f"未找到匹配的新闻({time_desc})"
                }

            # Unified sorting.
            if sort_by == "relevance":
                all_matches.sort(key=lambda x: x.get("similarity_score", 1.0), reverse=True)
            elif sort_by == "weight":
                from .analytics import calculate_news_weight
                all_matches.sort(key=lambda x: calculate_news_weight(x), reverse=True)
            elif sort_by == "date":
                all_matches.sort(key=lambda x: x.get("date", ""), reverse=True)

            # Truncate to the requested limit.
            results = all_matches[:limit]

            # Human-readable time-range description.
            if start_date == end_date:
                time_range_desc = start_date.strftime("%Y-%m-%d")
            else:
                time_range_desc = f"{start_date.strftime('%Y-%m-%d')} 至 {end_date.strftime('%Y-%m-%d')}"

            result = {
                "success": True,
                "summary": {
                    "total_found": len(all_matches),
                    "returned_count": len(results),
                    "requested_limit": limit,
                    "search_mode": search_mode,
                    "query": query,
                    "platforms": platforms or "所有平台",
                    "time_range": time_range_desc,
                    "sort_by": sort_by
                },
                "results": results
            }

            if search_mode == "fuzzy":
                result["summary"]["threshold"] = threshold
                if len(all_matches) < limit:
                    result["note"] = f"模糊搜索模式下,相似度阈值 {threshold} 仅匹配到 {len(all_matches)} 条结果"

            return result

        except MCPError as e:
            return {
                "success": False,
                "error": e.to_dict()
            }
        except Exception as e:
            return {
                "success": False,
                "error": {
                    "code": "INTERNAL_ERROR",
                    "message": str(e)
                }
            }

    def _build_news_item(
        self,
        title: str,
        info: Dict,
        platform_id: str,
        platform_name: str,
        current_date: datetime,
        similarity_score: float,
        include_url: bool
    ) -> Dict:
        """Build a single news-item dict shared by the search-mode helpers.

        FIX: the old per-mode copies indexed ``info["ranks"]`` directly,
        which raised KeyError when "ranks" was absent even though sibling
        lines already used ``info.get("ranks", [])``; now ``.get`` is used
        consistently.

        Args:
            title: News title.
            info: Raw per-title info dict (ranks/url/mobileUrl, if present).
            platform_id: Platform identifier.
            platform_name: Human-readable platform name.
            current_date: Date the title was crawled on.
            similarity_score: Match score to record for this item.
            include_url: Whether to attach URL fields.

        Returns:
            The assembled news-item dict.
        """
        ranks = info.get("ranks", [])
        news_item = {
            "title": title,
            "platform": platform_id,
            "platform_name": platform_name,
            "date": current_date.strftime("%Y-%m-%d"),
            "similarity_score": similarity_score,
            "ranks": ranks,
            "count": len(ranks),
            # 999 sorts unknown-rank items to the bottom.
            "rank": ranks[0] if ranks else 999
        }

        # URL fields are attached only on request (token saving).
        if include_url:
            news_item["url"] = info.get("url", "")
            news_item["mobileUrl"] = info.get("mobileUrl", "")

        return news_item

    def _search_by_keyword_mode(
        self,
        query: str,
        all_titles: Dict,
        id_to_name: Dict,
        current_date: datetime,
        include_url: bool
    ) -> List[Dict]:
        """Keyword mode: case-insensitive exact containment.

        Args:
            query: Search keyword.
            all_titles: Mapping platform_id -> {title: info}.
            id_to_name: Platform-id to display-name mapping.
            current_date: Date being scanned.
            include_url: Whether to attach URL fields.

        Returns:
            List of matched news items.
        """
        matches = []
        query_lower = query.lower()

        for platform_id, titles in all_titles.items():
            platform_name = id_to_name.get(platform_id, platform_id)

            for title, info in titles.items():
                # Exact (case-insensitive) containment check.
                if query_lower in title.lower():
                    matches.append(self._build_news_item(
                        title, info, platform_id, platform_name,
                        current_date, 1.0, include_url  # exact match -> score 1.0
                    ))

        return matches

    def _search_by_fuzzy_mode(
        self,
        query: str,
        all_titles: Dict,
        id_to_name: Dict,
        current_date: datetime,
        threshold: float,
        include_url: bool
    ) -> List[Dict]:
        """Fuzzy mode: similarity-based matching against each title.

        Args:
            query: Search content.
            all_titles: Mapping platform_id -> {title: info}.
            id_to_name: Platform-id to display-name mapping.
            current_date: Date being scanned.
            threshold: Similarity threshold in [0, 1].
            include_url: Whether to attach URL fields.

        Returns:
            List of matched news items.
        """
        matches = []

        for platform_id, titles in all_titles.items():
            platform_name = id_to_name.get(platform_id, platform_id)

            for title, info in titles.items():
                is_match, similarity = self._fuzzy_match(query, title, threshold)

                if is_match:
                    matches.append(self._build_news_item(
                        title, info, platform_id, platform_name,
                        current_date, round(similarity, 4), include_url
                    ))

        return matches

    def _search_by_entity_mode(
        self,
        query: str,
        all_titles: Dict,
        id_to_name: Dict,
        current_date: datetime,
        include_url: bool
    ) -> List[Dict]:
        """Entity mode: case-sensitive exact containment of the entity name.

        Args:
            query: Entity name.
            all_titles: Mapping platform_id -> {title: info}.
            id_to_name: Platform-id to display-name mapping.
            current_date: Date being scanned.
            include_url: Whether to attach URL fields.

        Returns:
            List of matched news items.
        """
        matches = []

        for platform_id, titles in all_titles.items():
            platform_name = id_to_name.get(platform_id, platform_id)

            for title, info in titles.items():
                # Entity search: exact containment of the entity name.
                if query in title:
                    matches.append(self._build_news_item(
                        title, info, platform_id, platform_name,
                        current_date, 1.0, include_url
                    ))

        return matches

    def _calculate_similarity(self, text1: str, text2: str) -> float:
        """Compute the similarity of two texts.

        Args:
            text1: First text.
            text2: Second text.

        Returns:
            Similarity score in [0, 1].
        """
        # difflib.SequenceMatcher sequence similarity, case-insensitive.
        return SequenceMatcher(None, text1.lower(), text2.lower()).ratio()

    def _fuzzy_match(self, query: str, text: str, threshold: float = 0.3) -> Tuple[bool, float]:
        """Fuzzy-match a query against a text.

        Args:
            query: Query text.
            text: Candidate text.
            threshold: Match threshold.

        Returns:
            (matched, similarity score)
        """
        # Fast path: direct (case-insensitive) containment.
        if query.lower() in text.lower():
            return True, 1.0

        # Whole-string similarity.
        similarity = self._calculate_similarity(query, text)
        if similarity >= threshold:
            return True, similarity

        # Token-level partial matching.
        query_words = set(self._extract_keywords(query))
        text_words = set(self._extract_keywords(text))

        if not query_words or not text_words:
            return False, 0.0

        # Fraction of query keywords that appear in the text.
        common_words = query_words & text_words
        keyword_overlap = len(common_words) / len(query_words)

        if keyword_overlap >= 0.5:  # at least 50% keyword overlap
            return True, keyword_overlap

        return False, similarity

    def _extract_keywords(self, text: str, min_length: int = 2) -> List[str]:
        """Extract keywords from a text.

        Args:
            text: Input text.
            min_length: Minimum token length to keep.

        Returns:
            List of keywords.
        """
        # Strip URLs and bracketed content before tokenizing.
        text = re.sub(r'http[s]?://\S+', '', text)
        text = re.sub(r'\[.*?\]', '', text)

        # Regex tokenization (works for both Chinese and English).
        words = re.findall(r'[\w]+', text)

        # Drop stopwords and too-short tokens.
        keywords = [
            word for word in words
            if word and len(word) >= min_length and word not in self.stopwords
        ]

        return keywords

    def _calculate_keyword_overlap(self, keywords1: List[str], keywords2: List[str]) -> float:
        """Compute the overlap of two keyword lists.

        Args:
            keywords1: First keyword list.
            keywords2: Second keyword list.

        Returns:
            Overlap score in [0, 1] (Jaccard similarity).
        """
        if not keywords1 or not keywords2:
            return 0.0

        set1 = set(keywords1)
        set2 = set(keywords2)

        # Jaccard similarity: |intersection| / |union|.
        intersection = len(set1 & set2)
        union = len(set1 | set2)

        if union == 0:
            return 0.0

        return intersection / union

    def search_related_news_history(
        self,
        reference_text: str,
        time_range: str = "yesterday",
        start_date: Optional[datetime] = None,
        end_date: Optional[datetime] = None,
        threshold: float = 0.4,
        limit: int = 50,
        include_url: bool = False
    ) -> Dict:
        """Search historical data for news related to a reference text.

        Args:
            reference_text: Reference news title or content.
            time_range: Preset range:
                - "yesterday": yesterday
                - "last_week": last 7 days
                - "last_month": last 30 days
                - "custom": custom range (requires start_date and end_date)
            start_date: Custom start date (only when time_range="custom").
            end_date: Custom end date (only when time_range="custom").
            threshold: Similarity threshold in [0, 1]; default 0.4.
            limit: Maximum items to return; default 50.
            include_url: Include URL fields; defaults to False to save tokens.

        Returns:
            Result dict with ``summary``, ``results`` and ``statistics``,
            or ``success=False`` plus an ``error`` payload.

        Example:
            >>> tools = SearchTools()
            >>> result = tools.search_related_news_history(
            ...     reference_text="人工智能技术突破",
            ...     time_range="last_week",
            ...     threshold=0.4,
            ...     limit=50
            ... )
        """
        try:
            # Parameter validation.
            reference_text = validate_keyword(reference_text)
            threshold = max(0.0, min(1.0, threshold))
            limit = validate_limit(limit, default=50)

            # Resolve the search window from the preset.
            today = datetime.now()

            if time_range == "yesterday":
                search_start = today - timedelta(days=1)
                search_end = today - timedelta(days=1)
            elif time_range == "last_week":
                search_start = today - timedelta(days=7)
                search_end = today - timedelta(days=1)
            elif time_range == "last_month":
                search_start = today - timedelta(days=30)
                search_end = today - timedelta(days=1)
            elif time_range == "custom":
                if not start_date or not end_date:
                    raise InvalidParameterError(
                        "自定义时间范围需要提供 start_date 和 end_date",
                        suggestion="请提供 start_date 和 end_date 参数"
                    )
                search_start = start_date
                search_end = end_date
            else:
                raise InvalidParameterError(
                    f"不支持的时间范围: {time_range}",
                    suggestion="请使用 'yesterday', 'last_week', 'last_month' 或 'custom'"
                )

            # Keywords of the reference text drive the overlap score.
            reference_keywords = self._extract_keywords(reference_text)

            if not reference_keywords:
                raise InvalidParameterError(
                    "无法从参考文本中提取关键词",
                    suggestion="请提供更详细的文本内容"
                )

            # Collect related news day by day.
            all_related_news = []
            current_date = search_start

            while current_date <= search_end:
                try:
                    # Read that day's data.
                    all_titles, id_to_name, _ = self.data_service.parser.read_all_titles_for_date(current_date)

                    for platform_id, titles in all_titles.items():
                        platform_name = id_to_name.get(platform_id, platform_id)

                        for title, info in titles.items():
                            # Whole-title text similarity.
                            title_similarity = self._calculate_similarity(reference_text, title)

                            # Keyword-set overlap.
                            title_keywords = self._extract_keywords(title)
                            keyword_overlap = self._calculate_keyword_overlap(
                                reference_keywords,
                                title_keywords
                            )

                            # Combined score: 70% keyword overlap + 30% text similarity.
                            combined_score = keyword_overlap * 0.7 + title_similarity * 0.3

                            if combined_score >= threshold:
                                # FIX: use .get to avoid KeyError when "ranks" is absent.
                                ranks = info.get("ranks", [])
                                news_item = {
                                    "title": title,
                                    "platform": platform_id,
                                    "platform_name": platform_name,
                                    "date": current_date.strftime("%Y-%m-%d"),
                                    "similarity_score": round(combined_score, 4),
                                    "keyword_overlap": round(keyword_overlap, 4),
                                    "text_similarity": round(title_similarity, 4),
                                    "common_keywords": list(set(reference_keywords) & set(title_keywords)),
                                    "rank": ranks[0] if ranks else 0
                                }

                                # URL fields only on request.
                                if include_url:
                                    news_item["url"] = info.get("url", "")
                                    news_item["mobileUrl"] = info.get("mobileUrl", "")

                                all_related_news.append(news_item)

                except DataNotFoundError:
                    # No data for this day — continue with the next one.
                    pass
                except Exception as e:
                    # Log the error but keep processing remaining dates.
                    print(f"Warning: 处理日期 {current_date.strftime('%Y-%m-%d')} 时出错: {e}")

                # Advance to the next day.
                current_date += timedelta(days=1)

            if not all_related_news:
                return {
                    "success": True,
                    "results": [],
                    "total": 0,
                    "query": reference_text,
                    "time_range": time_range,
                    "date_range": {
                        "start": search_start.strftime("%Y-%m-%d"),
                        "end": search_end.strftime("%Y-%m-%d")
                    },
                    "message": "未找到相关新闻"
                }

            # Sort by combined similarity, best first.
            all_related_news.sort(key=lambda x: x["similarity_score"], reverse=True)

            # Truncate to the requested limit.
            results = all_related_news[:limit]

            # Distribution statistics over ALL matches (not just the returned slice).
            platform_distribution = Counter([news["platform"] for news in all_related_news])
            date_distribution = Counter([news["date"] for news in all_related_news])

            result = {
                "success": True,
                "summary": {
                    "total_found": len(all_related_news),
                    "returned_count": len(results),
                    "requested_limit": limit,
                    "threshold": threshold,
                    "reference_text": reference_text,
                    "reference_keywords": reference_keywords,
                    "time_range": time_range,
                    "date_range": {
                        "start": search_start.strftime("%Y-%m-%d"),
                        "end": search_end.strftime("%Y-%m-%d")
                    }
                },
                "results": results,
                "statistics": {
                    "platform_distribution": dict(platform_distribution),
                    "date_distribution": dict(date_distribution),
                    "avg_similarity": round(
                        sum([news["similarity_score"] for news in all_related_news]) / len(all_related_news),
                        4
                    ) if all_related_news else 0.0
                }
            }

            if len(all_related_news) < limit:
                result["note"] = f"相关性阈值 {threshold} 下仅找到 {len(all_related_news)} 条相关新闻"

            return result

        except MCPError as e:
            return {
                "success": False,
                "error": e.to_dict()
            }
        except Exception as e:
            return {
                "success": False,
                "error": {
                    "code": "INTERNAL_ERROR",
                    "message": str(e)
                }
            }
|
||||
@@ -0,0 +1,465 @@
|
||||
"""
|
||||
系统管理工具
|
||||
|
||||
实现系统状态查询和爬虫触发功能。
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from ..services.data_service import DataService
|
||||
from ..utils.validators import validate_platforms
|
||||
from ..utils.errors import MCPError, CrawlTaskError
|
||||
|
||||
|
||||
class SystemManagementTools:
|
||||
"""系统管理工具类"""
|
||||
|
||||
def __init__(self, project_root: str = None):
    """Initialize the system-management tools.

    Args:
        project_root: Project root directory; when omitted, the root is
            derived from this file's location.
    """
    self.data_service = DataService(project_root)
    # Resolve the project root: honor an explicit argument, otherwise
    # walk three directory levels up from this source file.
    if project_root:
        root = Path(project_root)
    else:
        root = Path(__file__).parent.parent.parent
    self.project_root = root
|
||||
|
||||
def get_system_status(self) -> Dict:
    """Return system runtime status and health-check information.

    Returns:
        The service's status payload with ``success=True`` merged in,
        or ``success=False`` plus an ``error`` payload on failure.

    Example:
        >>> tools = SystemManagementTools()
        >>> result = tools.get_system_status()
        >>> print(result['system']['version'])
    """
    try:
        # Delegate to the data service, then tag the payload as successful.
        payload = dict(self.data_service.get_system_status())
        payload["success"] = True
        return payload
    except MCPError as e:
        # Known domain errors carry a structured payload.
        return {"success": False, "error": e.to_dict()}
    except Exception as e:
        # Unexpected failures are wrapped in a generic error envelope.
        return {
            "success": False,
            "error": {"code": "INTERNAL_ERROR", "message": str(e)},
        }
|
||||
|
||||
def trigger_crawl(self, platforms: Optional[List[str]] = None, save_to_local: bool = False, include_url: bool = False) -> Dict:
    """
    Manually trigger a one-off crawl task (optionally persisted).

    Args:
        platforms: Platform id list to crawl; crawls every configured
            platform when empty/None.
        save_to_local: Whether to save results under the local ``output``
            directory. Defaults to False.
        include_url: Whether to include URL links in returned items.
            Defaults to False (saves tokens).

    Returns:
        Crawl-result dict with the news data and, when persisted, the
        saved file paths.

    Example:
        >>> tools = SystemManagementTools()
        >>> # Temporary crawl, not persisted
        >>> result = tools.trigger_crawl(platforms=['zhihu', 'weibo'])
        >>> print(result['data'])
        >>> # Crawl and save locally
        >>> result = tools.trigger_crawl(platforms=['zhihu'], save_to_local=True)
        >>> print(result['saved_files'])
    """
    try:
        import json
        import time
        import random
        import requests
        from datetime import datetime
        import pytz
        import yaml

        # Validate arguments
        platforms = validate_platforms(platforms)

        # Load the configuration file
        config_path = self.project_root / "config" / "config.yaml"
        if not config_path.exists():
            raise CrawlTaskError(
                "配置文件不存在",
                suggestion=f"请确保配置文件存在: {config_path}"
            )

        # Read configuration
        with open(config_path, "r", encoding="utf-8") as f:
            config_data = yaml.safe_load(f)

        # Platform configuration
        all_platforms = config_data.get("platforms", [])
        if not all_platforms:
            raise CrawlTaskError(
                "配置文件中没有平台配置",
                suggestion="请检查 config/config.yaml 中的 platforms 配置"
            )

        # Filter down to the requested platforms (if any were given)
        if platforms:
            target_platforms = [p for p in all_platforms if p["id"] in platforms]
            if not target_platforms:
                raise CrawlTaskError(
                    f"指定的平台不存在: {platforms}",
                    suggestion=f"可用平台: {[p['id'] for p in all_platforms]}"
                )
        else:
            target_platforms = all_platforms

        # Per-request interval in milliseconds (default 100ms)
        request_interval = config_data.get("crawler", {}).get("request_interval", 100)

        # Build the platform id list; keep (id, name) tuples when a display name exists
        ids = []
        for platform in target_platforms:
            if "name" in platform:
                ids.append((platform["id"], platform["name"]))
            else:
                ids.append(platform["id"])

        print(f"开始临时爬取,平台: {[p.get('name', p['id']) for p in target_platforms]}")

        # Crawl the data
        results = {}
        id_to_name = {}
        failed_ids = []

        for i, id_info in enumerate(ids):
            if isinstance(id_info, tuple):
                id_value, name = id_info
            else:
                id_value = id_info
                name = id_value

            id_to_name[id_value] = name

            # Build the request URL
            url = f"https://newsnow.busiyi.world/api/s?id={id_value}&latest"

            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
                "Accept": "application/json, text/plain, */*",
                "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
                "Connection": "keep-alive",
                "Cache-Control": "no-cache",
            }

            # Retry loop: up to max_retries retries per platform
            max_retries = 2
            retries = 0
            success = False

            while retries <= max_retries and not success:
                try:
                    response = requests.get(url, headers=headers, timeout=10)
                    response.raise_for_status()

                    data_text = response.text
                    data_json = json.loads(data_text)

                    # Only "success" (fresh) or "cache" (cached) are acceptable
                    status = data_json.get("status", "未知")
                    if status not in ["success", "cache"]:
                        raise ValueError(f"响应状态异常: {status}")

                    status_info = "最新数据" if status == "success" else "缓存数据"
                    print(f"获取 {id_value} 成功({status_info})")

                    # Parse items; a duplicate title accumulates all of its ranks
                    results[id_value] = {}
                    for index, item in enumerate(data_json.get("items", []), 1):
                        title = item["title"]
                        url_link = item.get("url", "")
                        mobile_url = item.get("mobileUrl", "")

                        if title in results[id_value]:
                            results[id_value][title]["ranks"].append(index)
                        else:
                            results[id_value][title] = {
                                "ranks": [index],
                                "url": url_link,
                                "mobileUrl": mobile_url,
                            }

                    success = True

                except Exception as e:
                    retries += 1
                    if retries <= max_retries:
                        wait_time = random.uniform(3, 5)
                        print(f"请求 {id_value} 失败: {e}. {wait_time:.2f}秒后重试...")
                        time.sleep(wait_time)
                    else:
                        print(f"请求 {id_value} 失败: {e}")
                        failed_ids.append(id_value)

            # Jittered delay between platform requests (skipped after the last one)
            if i < len(ids) - 1:
                actual_interval = request_interval + random.randint(-10, 20)
                actual_interval = max(50, actual_interval)
                time.sleep(actual_interval / 1000)

        # Flatten the per-platform results into the returned item list
        news_data = []
        for platform_id, titles_data in results.items():
            platform_name = id_to_name.get(platform_id, platform_id)
            for title, info in titles_data.items():
                news_item = {
                    "platform_id": platform_id,
                    "platform_name": platform_name,
                    "title": title,
                    "ranks": info["ranks"]
                }

                # Conditionally attach URL fields (kept out by default to save tokens)
                if include_url:
                    news_item["url"] = info.get("url", "")
                    news_item["mobile_url"] = info.get("mobileUrl", "")

                news_data.append(news_item)

        # Use Beijing time for timestamps and output paths
        beijing_tz = pytz.timezone("Asia/Shanghai")
        now = datetime.now(beijing_tz)

        # Build the result payload
        result = {
            "success": True,
            "task_id": f"crawl_{int(time.time())}",
            "status": "completed",
            "crawl_time": now.strftime("%Y-%m-%d %H:%M:%S"),
            "platforms": list(results.keys()),
            "total_news": len(news_data),
            "failed_platforms": failed_ids,
            "data": news_data,
            "saved_to_local": save_to_local
        }

        # Persist to the output folder when requested
        if save_to_local:
            try:
                import re

                # Helper: normalize a title for file output
                def clean_title(title: str) -> str:
                    """Strip newlines and collapse runs of whitespace in a title."""
                    if not isinstance(title, str):
                        title = str(title)
                    cleaned_title = title.replace("\n", " ").replace("\r", " ")
                    cleaned_title = re.sub(r"\s+", " ", cleaned_title)
                    cleaned_title = cleaned_title.strip()
                    return cleaned_title

                # Helper: create a directory if missing
                def ensure_directory_exists(directory: str):
                    """Ensure the directory (and parents) exists."""
                    Path(directory).mkdir(parents=True, exist_ok=True)

                # Date folder / time filename (Chinese date format used by main.py)
                date_folder = now.strftime("%Y年%m月%d日")
                time_filename = now.strftime("%H时%M分")

                # txt output path
                txt_dir = self.project_root / "output" / date_folder / "txt"
                ensure_directory_exists(str(txt_dir))
                txt_file_path = txt_dir / f"{time_filename}.txt"

                # html output path
                html_dir = self.project_root / "output" / date_folder / "html"
                ensure_directory_exists(str(html_dir))
                html_file_path = html_dir / f"{time_filename}.html"

                # Save the txt file (same layout main.py produces)
                with open(txt_file_path, "w", encoding="utf-8") as f:
                    for id_value, title_data in results.items():
                        # Header line: "id | name", or bare "id" when no distinct name
                        name = id_to_name.get(id_value)
                        if name and name != id_value:
                            f.write(f"{id_value} | {name}\n")
                        else:
                            f.write(f"{id_value}\n")

                        # Sort titles by their first-seen rank
                        sorted_titles = []
                        for title, info in title_data.items():
                            cleaned = clean_title(title)
                            if isinstance(info, dict):
                                ranks = info.get("ranks", [])
                                url = info.get("url", "")
                                mobile_url = info.get("mobileUrl", "")
                            else:
                                # Legacy shape: info may be a bare rank list
                                ranks = info if isinstance(info, list) else []
                                url = ""
                                mobile_url = ""

                            rank = ranks[0] if ranks else 1
                            sorted_titles.append((rank, cleaned, url, mobile_url))

                        sorted_titles.sort(key=lambda x: x[0])

                        for rank, cleaned, url, mobile_url in sorted_titles:
                            line = f"{rank}. {cleaned}"
                            if url:
                                line += f" [URL:{url}]"
                            if mobile_url:
                                line += f" [MOBILE:{mobile_url}]"
                            f.write(line + "\n")

                        f.write("\n")

                    if failed_ids:
                        f.write("==== 以下ID请求失败 ====\n")
                        for id_value in failed_ids:
                            f.write(f"{id_value}\n")

                # Save the html file (simplified report)
                html_content = self._generate_simple_html(results, id_to_name, failed_ids, now)
                with open(html_file_path, "w", encoding="utf-8") as f:
                    f.write(html_content)

                print(f"数据已保存到:")
                print(f" TXT: {txt_file_path}")
                print(f" HTML: {html_file_path}")

                result["saved_files"] = {
                    "txt": str(txt_file_path),
                    "html": str(html_file_path)
                }
                result["note"] = "数据已持久化到 output 文件夹"

            except Exception as e:
                # Best-effort persistence: the crawl itself succeeded, so keep the data
                print(f"保存文件失败: {e}")
                result["save_error"] = str(e)
                result["note"] = "爬取成功但保存失败,数据仅在内存中"
        else:
            result["note"] = "临时爬取结果,未持久化到output文件夹"

        return result

    except MCPError as e:
        return {
            "success": False,
            "error": e.to_dict()
        }
    except Exception as e:
        import traceback
        return {
            "success": False,
            "error": {
                "code": "INTERNAL_ERROR",
                "message": str(e),
                "traceback": traceback.format_exc()
            }
        }
def _generate_simple_html(self, results: Dict, id_to_name: Dict, failed_ids: List, now) -> str:
|
||||
"""生成简化的 HTML 报告"""
|
||||
html = """<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>MCP 爬取结果</title>
|
||||
<style>
|
||||
body { font-family: Arial, sans-serif; margin: 20px; background: #f5f5f5; }
|
||||
.container { max-width: 900px; margin: 0 auto; background: white; padding: 20px; border-radius: 8px; }
|
||||
h1 { color: #333; border-bottom: 2px solid #4CAF50; padding-bottom: 10px; }
|
||||
.platform { margin-bottom: 30px; }
|
||||
.platform-name { background: #4CAF50; color: white; padding: 10px; border-radius: 5px; margin-bottom: 10px; }
|
||||
.news-item { padding: 8px; border-bottom: 1px solid #eee; }
|
||||
.rank { color: #666; font-weight: bold; margin-right: 10px; }
|
||||
.title { color: #333; }
|
||||
.link { color: #1976D2; text-decoration: none; margin-left: 10px; font-size: 0.9em; }
|
||||
.link:hover { text-decoration: underline; }
|
||||
.failed { background: #ffebee; padding: 10px; border-radius: 5px; margin-top: 20px; }
|
||||
.failed h3 { color: #c62828; margin-top: 0; }
|
||||
.timestamp { color: #666; font-size: 0.9em; text-align: right; margin-top: 20px; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<h1>MCP 爬取结果</h1>
|
||||
"""
|
||||
|
||||
# 添加时间戳
|
||||
html += f' <p class="timestamp">爬取时间: {now.strftime("%Y-%m-%d %H:%M:%S")}</p>\n\n'
|
||||
|
||||
# 遍历每个平台
|
||||
for platform_id, titles_data in results.items():
|
||||
platform_name = id_to_name.get(platform_id, platform_id)
|
||||
html += f' <div class="platform">\n'
|
||||
html += f' <div class="platform-name">{platform_name}</div>\n'
|
||||
|
||||
# 排序标题
|
||||
sorted_items = []
|
||||
for title, info in titles_data.items():
|
||||
ranks = info.get("ranks", [])
|
||||
url = info.get("url", "")
|
||||
mobile_url = info.get("mobileUrl", "")
|
||||
rank = ranks[0] if ranks else 999
|
||||
sorted_items.append((rank, title, url, mobile_url))
|
||||
|
||||
sorted_items.sort(key=lambda x: x[0])
|
||||
|
||||
# 显示新闻
|
||||
for rank, title, url, mobile_url in sorted_items:
|
||||
html += f' <div class="news-item">\n'
|
||||
html += f' <span class="rank">{rank}.</span>\n'
|
||||
html += f' <span class="title">{self._html_escape(title)}</span>\n'
|
||||
if url:
|
||||
html += f' <a class="link" href="{self._html_escape(url)}" target="_blank">链接</a>\n'
|
||||
if mobile_url and mobile_url != url:
|
||||
html += f' <a class="link" href="{self._html_escape(mobile_url)}" target="_blank">移动版</a>\n'
|
||||
html += ' </div>\n'
|
||||
|
||||
html += ' </div>\n\n'
|
||||
|
||||
# 失败的平台
|
||||
if failed_ids:
|
||||
html += ' <div class="failed">\n'
|
||||
html += ' <h3>请求失败的平台</h3>\n'
|
||||
html += ' <ul>\n'
|
||||
for platform_id in failed_ids:
|
||||
html += f' <li>{self._html_escape(platform_id)}</li>\n'
|
||||
html += ' </ul>\n'
|
||||
html += ' </div>\n'
|
||||
|
||||
html += """ </div>
|
||||
</body>
|
||||
</html>"""
|
||||
|
||||
return html
|
||||
|
||||
def _html_escape(self, text: str) -> str:
|
||||
"""HTML 转义"""
|
||||
if not isinstance(text, str):
|
||||
text = str(text)
|
||||
return (
|
||||
text.replace("&", "&")
|
||||
.replace("<", "<")
|
||||
.replace(">", ">")
|
||||
.replace('"', """)
|
||||
.replace("'", "'")
|
||||
)
|
||||
Reference in New Issue
Block a user