v4.0.0 大大大更新

This commit is contained in:
sansan
2025-12-13 13:44:35 +08:00
parent 97c05aa33c
commit c7bacdfff7
61 changed files with 12407 additions and 5889 deletions
+1 -1
View File
@@ -4,4 +4,4 @@ TrendRadar MCP Server
提供基于MCP协议的新闻聚合数据查询和系统管理接口。
"""
__version__ = "1.0.0"
__version__ = "1.1.0"
+128
View File
@@ -15,6 +15,7 @@ from .tools.analytics import AnalyticsTools
from .tools.search_tools import SearchTools
from .tools.config_mgmt import ConfigManagementTools
from .tools.system import SystemManagementTools
from .tools.storage_sync import StorageSyncTools
from .utils.date_parser import DateParser
from .utils.errors import MCPError
@@ -34,6 +35,7 @@ def _get_tools(project_root: Optional[str] = None):
_tools_instances['search'] = SearchTools(project_root)
_tools_instances['config'] = ConfigManagementTools(project_root)
_tools_instances['system'] = SystemManagementTools(project_root)
_tools_instances['storage'] = StorageSyncTools(project_root)
return _tools_instances
@@ -657,6 +659,127 @@ async def trigger_crawl(
return json.dumps(result, ensure_ascii=False, indent=2)
# ==================== 存储同步工具 ====================
@mcp.tool
async def sync_from_remote(
    days: int = 7
) -> str:
    """Pull data from remote storage into the local data directory.

    Intended for MCP Server deployments: the crawler persists to remote
    cloud storage (e.g. Cloudflare R2) and the MCP Server pulls the data
    down locally for analysis and querying.

    Args:
        days: Pull the most recent N days of data (default 7).
            - 0: pull nothing
            - 7: pull the last week of data
            - 30: pull the last month of data

    Returns:
        JSON-formatted sync result containing:
        - success: whether the operation succeeded
        - synced_files: number of files synced successfully
        - synced_dates: list of dates synced successfully
        - skipped_dates: dates skipped (already present locally)
        - failed_dates: dates that failed, with error details
        - message: human-readable summary of the operation

    Examples:
        - sync_from_remote()          # pull the last 7 days
        - sync_from_remote(days=30)   # pull the last 30 days

    Note:
        Requires remote storage configured in config/config.yaml
        (storage.remote) or via environment variables:
        - S3_ENDPOINT_URL: service endpoint
        - S3_BUCKET_NAME: bucket name
        - S3_ACCESS_KEY_ID: access key ID
        - S3_SECRET_ACCESS_KEY: secret access key
    """
    storage_tools = _get_tools()['storage']
    payload = storage_tools.sync_from_remote(days=days)
    return json.dumps(payload, ensure_ascii=False, indent=2)
@mcp.tool
async def get_storage_status() -> str:
    """Report the storage configuration and status.

    Inspects the configured storage backend plus the state of both the
    local and the remote store.

    Returns:
        JSON-formatted storage status containing:
        - backend: active backend type (local/remote/auto)
        - local: local storage status
            - data_dir: data directory
            - retention_days: retention period in days
            - total_size: total size on disk
            - date_count: number of date folders
            - earliest_date: earliest available date
            - latest_date: latest available date
        - remote: remote storage status
            - configured: whether remote storage is configured
            - endpoint_url: service endpoint
            - bucket_name: bucket name
            - date_count: number of dates available remotely
        - pull: pull configuration
            - enabled: whether automatic pulling is enabled
            - days: number of days pulled automatically

    Examples:
        - get_storage_status()  # inspect all storage state
    """
    storage_tools = _get_tools()['storage']
    payload = storage_tools.get_storage_status()
    return json.dumps(payload, ensure_ascii=False, indent=2)
@mcp.tool
async def list_available_dates(
    source: str = "both"
) -> str:
    """List the date ranges available locally and/or remotely.

    Shows which dates have data in local and remote storage, to help
    understand data coverage and sync state.

    Args:
        source: Data source to inspect:
            - "local": list only locally available dates
            - "remote": list only remotely available dates
            - "both": list both and compare them (default)

    Returns:
        JSON-formatted date listing containing:
        - local: local date info (when source includes local)
            - dates: date list (newest first)
            - count: number of dates
            - earliest: earliest date
            - latest: latest date
        - remote: remote date info (when source includes remote)
            - configured: whether remote storage is configured
            - dates: date list
            - count: number of dates
            - earliest: earliest date
            - latest: latest date
        - comparison: diff result (only when source="both")
            - only_local: dates present only locally
            - only_remote: dates present only remotely
            - both: dates present on both sides

    Examples:
        - list_available_dates()                  # compare local and remote
        - list_available_dates(source="local")    # local only
        - list_available_dates(source="remote")   # remote only
    """
    storage_tools = _get_tools()['storage']
    payload = storage_tools.list_available_dates(source=source)
    return json.dumps(payload, ensure_ascii=False, indent=2)
# ==================== 启动入口 ====================
def run_server(
@@ -721,6 +844,11 @@ def run_server(
print(" 11. get_current_config - 获取当前系统配置")
print(" 12. get_system_status - 获取系统运行状态")
print(" 13. trigger_crawl - 手动触发爬取任务")
print()
print(" === 存储同步工具 ===")
print(" 14. sync_from_remote - 从远程存储拉取数据到本地")
print(" 15. get_storage_status - 获取存储配置和状态")
print(" 16. list_available_dates - 列出本地/远程可用日期")
print("=" * 60)
print()
+51 -32
View File
@@ -517,24 +517,55 @@ class DataService:
# 遍历日期文件夹
for date_folder in output_dir.iterdir():
if date_folder.is_dir() and not date_folder.name.startswith('.'):
# 解析日期(格式: YYYY年MM月DD日)
try:
date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_folder.name)
if date_match:
folder_date = datetime(
int(date_match.group(1)),
int(date_match.group(2)),
int(date_match.group(3))
)
available_dates.append(folder_date)
except Exception:
pass
folder_date = self._parse_date_folder_name(date_folder.name)
if folder_date:
available_dates.append(folder_date)
if not available_dates:
return (None, None)
return (min(available_dates), max(available_dates))
def _parse_date_folder_name(self, folder_name: str) -> Optional[datetime]:
"""
解析日期文件夹名称(兼容中文和ISO格式)
支持两种格式:
- 中文格式:YYYY年MM月DD日
- ISO格式:YYYY-MM-DD
Args:
folder_name: 文件夹名称
Returns:
datetime 对象,解析失败返回 None
"""
# 尝试中文格式:YYYY年MM月DD日
chinese_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', folder_name)
if chinese_match:
try:
return datetime(
int(chinese_match.group(1)),
int(chinese_match.group(2)),
int(chinese_match.group(3))
)
except ValueError:
pass
# 尝试 ISO 格式:YYYY-MM-DD
iso_match = re.match(r'(\d{4})-(\d{2})-(\d{2})', folder_name)
if iso_match:
try:
return datetime(
int(iso_match.group(1)),
int(iso_match.group(2)),
int(iso_match.group(3))
)
except ValueError:
pass
return None
def get_system_status(self) -> Dict:
"""
获取系统运行状态
@@ -553,26 +584,14 @@ class DataService:
if output_dir.exists():
# 遍历日期文件夹
for date_folder in output_dir.iterdir():
if date_folder.is_dir():
# 解析日期
try:
date_str = date_folder.name
# 格式: YYYY年MM月DD日
date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_str)
if date_match:
folder_date = datetime(
int(date_match.group(1)),
int(date_match.group(2)),
int(date_match.group(3))
)
if oldest_record is None or folder_date < oldest_record:
oldest_record = folder_date
if latest_record is None or folder_date > latest_record:
latest_record = folder_date
except:
pass
if date_folder.is_dir() and not date_folder.name.startswith('.'):
# 解析日期(兼容中文和ISO格式)
folder_date = self._parse_date_folder_name(date_folder.name)
if folder_date:
if oldest_record is None or folder_date < oldest_record:
oldest_record = folder_date
if latest_record is None or folder_date > latest_record:
latest_record = folder_date
# 计算存储大小
for item in date_folder.rglob("*"):
+315 -67
View File
@@ -2,9 +2,12 @@
文件解析服务
提供txt格式新闻数据和YAML配置文件的解析功能。
支持从 SQLite 数据库和 TXT 文件两种数据源读取。
"""
import json
import re
import sqlite3
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from datetime import datetime
@@ -145,17 +148,310 @@ class ParserService:
def get_date_folder_name(self, date: datetime = None) -> str:
"""
获取日期文件夹名称
获取日期文件夹名称(兼容中文和ISO格式)
Args:
date: 日期对象,默认为今天
Returns:
文件夹名称,格式: YYYY年MM月DD日
实际存在的文件夹名称,优先返回中文格式(YYYY年MM月DD日),
若不存在则返回 ISO 格式(YYYY-MM-DD
"""
if date is None:
date = datetime.now()
return date.strftime("%Y年%m月%d")
return self._find_date_folder(date)
def _get_date_folder_name(self, date: datetime = None) -> str:
"""
获取日期文件夹名称(兼容中文和ISO格式)
Args:
date: 日期对象,默认为今天
Returns:
实际存在的文件夹名称,优先返回中文格式(YYYY年MM月DD日),
若不存在则返回 ISO 格式(YYYY-MM-DD
"""
if date is None:
date = datetime.now()
return self._find_date_folder(date)
def _find_date_folder(self, date: datetime) -> str:
"""
查找实际存在的日期文件夹
支持两种格式:
- 中文格式:YYYY年MM月DD日(优先)
- ISO格式:YYYY-MM-DD
Args:
date: 日期对象
Returns:
实际存在的文件夹名称,若都不存在则返回中文格式
"""
output_dir = self.project_root / "output"
# 中文格式:YYYY年MM月DD日
chinese_format = date.strftime("%Y年%m月%d")
# ISO格式:YYYY-MM-DD
iso_format = date.strftime("%Y-%m-%d")
# 优先检查中文格式
if (output_dir / chinese_format).exists():
return chinese_format
# 其次检查 ISO 格式
if (output_dir / iso_format).exists():
return iso_format
# 都不存在,返回中文格式(与项目现有风格一致)
return chinese_format
def _get_sqlite_db_path(self, date: datetime = None) -> Optional[Path]:
"""
获取 SQLite 数据库文件路径
Args:
date: 日期对象,默认为今天
Returns:
数据库文件路径,如果不存在则返回 None
"""
date_folder = self._get_date_folder_name(date)
db_path = self.project_root / "output" / date_folder / "news.db"
if db_path.exists():
return db_path
return None
def _get_txt_folder_path(self, date: datetime = None) -> Optional[Path]:
"""
获取 TXT 文件夹路径
Args:
date: 日期对象,默认为今天
Returns:
TXT 文件夹路径,如果不存在则返回 None
"""
date_folder = self._get_date_folder_name(date)
txt_path = self.project_root / "output" / date_folder / "txt"
if txt_path.exists() and txt_path.is_dir():
return txt_path
return None
    def _read_from_txt(
        self,
        date: datetime = None,
        platform_ids: Optional[List[str]] = None
    ) -> Optional[Tuple[Dict, Dict, Dict]]:
        """Read and merge a day's news data from its TXT snapshot files.

        Each TXT file is one crawl snapshot; snapshots are processed in
        sorted (chronological filename) order and merged per platform/title.

        Args:
            date: Target date; defaults to today.
            platform_ids: Platform IDs to include; None means all platforms.

        Returns:
            (all_titles, id_to_name, all_timestamps) tuple, or None when no
            TXT folder/files exist or nothing could be parsed.
        """
        txt_folder = self._get_txt_folder_path(date)
        if txt_folder is None:
            return None
        # All TXT snapshots, sorted so earlier crawls are merged first.
        txt_files = sorted(txt_folder.glob("*.txt"))
        if not txt_files:
            return None
        all_titles = {}       # platform_id -> {title -> merged info dict}
        id_to_name = {}       # platform_id -> display name
        all_timestamps = {}   # txt filename -> file mtime
        for txt_file in txt_files:
            try:
                titles_by_id, file_id_to_name = self.parse_txt_file(txt_file)
                # Record the snapshot file's modification time.
                all_timestamps[txt_file.name] = txt_file.stat().st_mtime
                # Merge the platform-id -> name mapping.
                id_to_name.update(file_id_to_name)
                # Merge the title data.
                for source_id, titles in titles_by_id.items():
                    # Apply the optional platform filter.
                    if platform_ids and source_id not in platform_ids:
                        continue
                    if source_id not in all_titles:
                        all_titles[source_id] = {}
                    for title, data in titles.items():
                        if title not in all_titles[source_id]:
                            # First sighting of this title.
                            all_titles[source_id][title] = {
                                "ranks": data.get("ranks", []),
                                "url": data.get("url", ""),
                                "mobileUrl": data.get("mobileUrl", ""),
                                # Filename stem (crawl time) serves as the
                                # first/last-seen marker.
                                "first_time": txt_file.stem,
                                "last_time": txt_file.stem,
                                "count": 1,
                            }
                        else:
                            # Merge into the existing entry for this title.
                            existing = all_titles[source_id][title]
                            # Merge ranks, avoiding duplicates.
                            for rank in data.get("ranks", []):
                                if rank not in existing["ranks"]:
                                    existing["ranks"].append(rank)
                            # Update last-seen time and occurrence count.
                            existing["last_time"] = txt_file.stem
                            existing["count"] += 1
                            # Keep the first non-empty URLs encountered.
                            if not existing["url"] and data.get("url"):
                                existing["url"] = data["url"]
                            if not existing["mobileUrl"] and data.get("mobileUrl"):
                                existing["mobileUrl"] = data["mobileUrl"]
            except Exception as e:
                # Best-effort: a bad snapshot is logged and skipped so the
                # remaining files can still be merged.
                print(f"Warning: 解析 TXT 文件失败 {txt_file}: {e}")
                continue
        if not all_titles:
            return None
        return (all_titles, id_to_name, all_timestamps)
    def _read_from_sqlite(
        self,
        date: datetime = None,
        platform_ids: Optional[List[str]] = None
    ) -> Optional[Tuple[Dict, Dict, Dict]]:
        """Read a day's news data from the per-day SQLite database.

        Rows in the new schema are already de-duplicated by URL and carry:
        - first_crawl_time: first crawl that saw the item
        - last_crawl_time: most recent crawl that saw the item
        - crawl_count: number of crawls that included the item

        Args:
            date: Target date; defaults to today.
            platform_ids: Platform IDs to include; None means all platforms.

        Returns:
            (all_titles, id_to_name, all_timestamps) tuple, or None when the
            database/table is missing, empty, or unreadable.
        """
        db_path = self._get_sqlite_db_path(date)
        if db_path is None:
            return None
        all_titles = {}
        id_to_name = {}
        all_timestamps = {}
        try:
            conn = sqlite3.connect(str(db_path))
            conn.row_factory = sqlite3.Row  # allow access to columns by name
            cursor = conn.cursor()
            # Verify the expected table exists (guards against old layouts).
            cursor.execute("""
                SELECT name FROM sqlite_master
                WHERE type='table' AND name='news_items'
            """)
            if not cursor.fetchone():
                conn.close()
                return None
            # Build the main query, optionally filtered by platform.
            if platform_ids:
                placeholders = ','.join(['?' for _ in platform_ids])
                query = f"""
                    SELECT n.id, n.platform_id, p.name as platform_name, n.title,
                           n.rank, n.url, n.mobile_url,
                           n.first_crawl_time, n.last_crawl_time, n.crawl_count
                    FROM news_items n
                    LEFT JOIN platforms p ON n.platform_id = p.id
                    WHERE n.platform_id IN ({placeholders})
                """
                cursor.execute(query, platform_ids)
            else:
                cursor.execute("""
                    SELECT n.id, n.platform_id, p.name as platform_name, n.title,
                           n.rank, n.url, n.mobile_url,
                           n.first_crawl_time, n.last_crawl_time, n.crawl_count
                    FROM news_items n
                    LEFT JOIN platforms p ON n.platform_id = p.id
                """)
            rows = cursor.fetchall()
            # Collect all news-item ids so rank history can be fetched in one
            # bulk query instead of one query per row.
            news_ids = [row['id'] for row in rows]
            rank_history_map = {}
            if news_ids:
                placeholders = ",".join("?" * len(news_ids))
                cursor.execute(f"""
                    SELECT news_item_id, rank FROM rank_history
                    WHERE news_item_id IN ({placeholders})
                    ORDER BY news_item_id, crawl_time
                """, news_ids)
                for rh_row in cursor.fetchall():
                    news_id = rh_row['news_item_id']
                    rank = rh_row['rank']
                    if news_id not in rank_history_map:
                        rank_history_map[news_id] = []
                    rank_history_map[news_id].append(rank)
            for row in rows:
                news_id = row['id']
                platform_id = row['platform_id']
                platform_name = row['platform_name'] or platform_id
                title = row['title']
                # Record the platform display name once.
                if platform_id not in id_to_name:
                    id_to_name[platform_id] = platform_name
                # Initialize the per-platform dict.
                if platform_id not in all_titles:
                    all_titles[platform_id] = {}
                # Use the full rank history; fall back to the current rank.
                ranks = rank_history_map.get(news_id, [row['rank']])
                # Rows are already URL-deduplicated, so assign directly.
                all_titles[platform_id][title] = {
                    "ranks": ranks,
                    "url": row['url'] or "",
                    "mobileUrl": row['mobile_url'] or "",
                    "first_time": row['first_crawl_time'] or "",
                    "last_time": row['last_crawl_time'] or "",
                    "count": row['crawl_count'] or 1,
                }
            # Use crawl times as the timestamp keys.
            cursor.execute("""
                SELECT crawl_time FROM crawl_records
                ORDER BY crawl_time
            """)
            for row in cursor.fetchall():
                crawl_time = row['crawl_time']
                all_timestamps[f"{crawl_time}.db"] = 0  # placeholder mtime
            conn.close()
            if not all_titles:
                return None
            return (all_titles, id_to_name, all_timestamps)
        except Exception as e:
            # Best-effort: log and signal "no SQLite data" so callers can
            # fall back to the TXT snapshots.
            print(f"Warning: 从 SQLite 读取数据失败: {e}")
            return None
def read_all_titles_for_date(
self,
@@ -163,7 +459,7 @@ class ParserService:
platform_ids: Optional[List[str]] = None
) -> Tuple[Dict, Dict, Dict]:
"""
读取指定日期的所有标题文件(带缓存)
读取指定日期的所有标题(带缓存)
Args:
date: 日期对象,默认为今天
@@ -193,71 +489,23 @@ class ParserService:
if cached:
return cached
# 缓存未命中,读取文件
date_folder = self.get_date_folder_name(date)
txt_dir = self.project_root / "output" / date_folder / "txt"
# 优先从 SQLite 读取
sqlite_result = self._read_from_sqlite(date, platform_ids)
if sqlite_result:
self.cache.set(cache_key, sqlite_result)
return sqlite_result
if not txt_dir.exists():
raise DataNotFoundError(
f"未找到 {date_folder} 的数据目录",
suggestion="请先运行爬虫或检查日期是否正确"
)
# SQLite 不存在,尝试从 TXT 读取
txt_result = self._read_from_txt(date, platform_ids)
if txt_result:
self.cache.set(cache_key, txt_result)
return txt_result
all_titles = {}
id_to_name = {}
all_timestamps = {}
# 读取所有txt文件
txt_files = sorted(txt_dir.glob("*.txt"))
if not txt_files:
raise DataNotFoundError(
f"{date_folder} 没有数据文件",
suggestion="请等待爬虫任务完成"
)
for txt_file in txt_files:
try:
titles_by_id, file_id_to_name = self.parse_txt_file(txt_file)
# 更新id_to_name
id_to_name.update(file_id_to_name)
# 合并标题数据
for platform_id, titles in titles_by_id.items():
# 如果指定了平台过滤
if platform_ids and platform_id not in platform_ids:
continue
if platform_id not in all_titles:
all_titles[platform_id] = {}
for title, info in titles.items():
if title in all_titles[platform_id]:
# 合并排名
all_titles[platform_id][title]["ranks"].extend(info["ranks"])
else:
all_titles[platform_id][title] = info.copy()
# 记录文件时间戳
all_timestamps[txt_file.name] = txt_file.stat().st_mtime
except Exception as e:
# 忽略单个文件的解析错误,继续处理其他文件
print(f"Warning: 解析文件 {txt_file} 失败: {e}")
continue
if not all_titles:
raise DataNotFoundError(
f"{date_folder} 没有有效的数据",
suggestion="请检查数据文件格式或重新运行爬虫"
)
# 缓存结果
result = (all_titles, id_to_name, all_timestamps)
self.cache.set(cache_key, result)
return result
# 两种数据源都不存在
raise DataNotFoundError(
f"未找到 {date_str} 的数据",
suggestion="请先运行爬虫或检查日期是否正确"
)
def parse_yaml_config(self, config_path: str = None) -> dict:
"""
-1
View File
@@ -25,7 +25,6 @@ def calculate_news_weight(news_data: Dict, rank_threshold: int = 5) -> float:
"""
计算新闻权重(用于排序)
基于 main.py 的权重算法实现,综合考虑:
- 排名权重 (60%):新闻在榜单中的排名
- 频次权重 (30%):新闻出现的次数
- 热度权重 (10%):高排名出现的比例
+468
View File
@@ -0,0 +1,468 @@
# coding=utf-8
"""
存储同步工具
实现从远程存储拉取数据到本地、获取存储状态、列出可用日期等功能。
"""
import os
import re
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Optional
import yaml
from ..utils.errors import MCPError
class StorageSyncTools:
    """Storage synchronization tools.

    Pulls crawled news data from a remote S3-compatible store into the
    local data directory, reports storage configuration/status, and lists
    the dates available locally and remotely.
    """

    def __init__(self, project_root: str = None):
        """Initialize the storage sync tools.

        Args:
            project_root: Project root directory. When omitted, it is
                derived from this file's location (three levels up).
        """
        if project_root:
            self.project_root = Path(project_root)
        else:
            current_file = Path(__file__)
            self.project_root = current_file.parent.parent.parent
        # Lazy caches: parsed YAML config and the remote backend instance.
        self._config = None
        self._remote_backend = None

    def _load_config(self) -> dict:
        """Load and cache config/config.yaml.

        Returns:
            The parsed config dict; {} when the file is missing or empty.
        """
        if self._config is None:
            config_path = self.project_root / "config" / "config.yaml"
            if config_path.exists():
                with open(config_path, "r", encoding="utf-8") as f:
                    # BUGFIX: yaml.safe_load returns None for an empty file,
                    # which would make every .get() call below crash and also
                    # defeat the cache (None re-triggers loading). Coerce to {}.
                    self._config = yaml.safe_load(f) or {}
            else:
                self._config = {}
        return self._config

    def _get_storage_config(self) -> dict:
        """Return the ``storage`` section of the config ({} when absent)."""
        config = self._load_config()
        return config.get("storage", {})

    def _get_remote_config(self) -> dict:
        """Return remote-storage settings, merging config file and env vars.

        Config-file values (storage.remote.*) take precedence; the S3_*
        environment variables serve as fallbacks.
        """
        storage_config = self._get_storage_config()
        remote_config = storage_config.get("remote", {})
        return {
            "endpoint_url": remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL", ""),
            "bucket_name": remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME", ""),
            "access_key_id": remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID", ""),
            "secret_access_key": remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY", ""),
            "region": remote_config.get("region") or os.environ.get("S3_REGION", ""),
        }

    def _has_remote_config(self) -> bool:
        """Return True when all four required remote settings are present."""
        config = self._get_remote_config()
        return bool(
            config.get("bucket_name") and
            config.get("access_key_id") and
            config.get("secret_access_key") and
            config.get("endpoint_url")
        )

    def _get_remote_backend(self):
        """Create (once) and return the remote storage backend.

        Returns:
            A RemoteStorageBackend instance, or None when remote storage is
            not configured, boto3 is not installed, or construction fails.
        """
        if self._remote_backend is not None:
            return self._remote_backend
        if not self._has_remote_config():
            return None
        try:
            # Imported lazily so boto3 is only required when remote storage
            # is actually used.
            from trendradar.storage.remote import RemoteStorageBackend
            remote_config = self._get_remote_config()
            config = self._load_config()
            timezone = config.get("app", {}).get("timezone", "Asia/Shanghai")
            self._remote_backend = RemoteStorageBackend(
                bucket_name=remote_config["bucket_name"],
                access_key_id=remote_config["access_key_id"],
                secret_access_key=remote_config["secret_access_key"],
                endpoint_url=remote_config["endpoint_url"],
                region=remote_config.get("region", ""),
                timezone=timezone,
            )
            return self._remote_backend
        except ImportError:
            print("[存储同步] 远程存储后端需要安装 boto3: pip install boto3")
            return None
        except Exception as e:
            print(f"[存储同步] 创建远程后端失败: {e}")
            return None

    def _get_local_data_dir(self) -> Path:
        """Return the local data dir (storage.local.data_dir, default 'output')."""
        storage_config = self._get_storage_config()
        local_config = storage_config.get("local", {})
        data_dir = local_config.get("data_dir", "output")
        return self.project_root / data_dir

    def _parse_date_folder_name(self, folder_name: str) -> Optional[datetime]:
        """Parse a date folder name (supports Chinese and ISO formats).

        Supported formats:
        - Chinese: YYYY年MM月DD日
        - ISO: YYYY-MM-DD

        Returns:
            A datetime for the folder's date, or None when unparseable.
        """
        # Try ISO format first.
        iso_match = re.match(r'(\d{4})-(\d{2})-(\d{2})', folder_name)
        if iso_match:
            try:
                return datetime(
                    int(iso_match.group(1)),
                    int(iso_match.group(2)),
                    int(iso_match.group(3))
                )
            except ValueError:
                pass
        # Fall back to the Chinese format.
        chinese_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', folder_name)
        if chinese_match:
            try:
                return datetime(
                    int(chinese_match.group(1)),
                    int(chinese_match.group(2)),
                    int(chinese_match.group(3))
                )
            except ValueError:
                pass
        return None

    def _get_local_dates(self) -> List[str]:
        """Return locally available dates as ISO strings, newest first."""
        local_dir = self._get_local_data_dir()
        dates = []
        if not local_dir.exists():
            return dates
        for item in local_dir.iterdir():
            # Skip files and hidden directories.
            if item.is_dir() and not item.name.startswith('.'):
                folder_date = self._parse_date_folder_name(item.name)
                if folder_date:
                    dates.append(folder_date.strftime("%Y-%m-%d"))
        return sorted(dates, reverse=True)

    def _calculate_dir_size(self, path: Path) -> int:
        """Return the total size in bytes of all files under *path*."""
        total_size = 0
        if path.exists():
            for item in path.rglob("*"):
                if item.is_file():
                    total_size += item.stat().st_size
        return total_size

    def sync_from_remote(self, days: int = 7) -> Dict:
        """Pull data from remote storage into the local data directory.

        Args:
            days: Pull the most recent N days of data (default 7).

        Returns:
            Sync result dict with success flag, synced/skipped/failed dates
            and a human-readable message.
        """
        try:
            # Require a usable remote configuration.
            if not self._has_remote_config():
                return {
                    "success": False,
                    "error": {
                        "code": "REMOTE_NOT_CONFIGURED",
                        "message": "未配置远程存储",
                        "suggestion": "请在 config/config.yaml 中配置 storage.remote 或设置环境变量"
                    }
                }
            # Build the remote backend.
            remote_backend = self._get_remote_backend()
            if remote_backend is None:
                return {
                    "success": False,
                    "error": {
                        "code": "REMOTE_BACKEND_FAILED",
                        "message": "无法创建远程存储后端",
                        "suggestion": "请检查远程存储配置和 boto3 是否已安装"
                    }
                }
            # Ensure the local data directory exists.
            local_dir = self._get_local_data_dir()
            local_dir.mkdir(parents=True, exist_ok=True)
            # Dates available remotely.
            remote_dates = remote_backend.list_remote_dates()
            # Dates already present locally (as ISO strings).
            local_dates = set(self._get_local_dates())
            # Compute the target dates: the most recent N days that exist
            # remotely, using the configured timezone for "today".
            from trendradar.utils.time import get_configured_time
            config = self._load_config()
            timezone = config.get("app", {}).get("timezone", "Asia/Shanghai")
            now = get_configured_time(timezone)
            target_dates = []
            for i in range(days):
                date = now - timedelta(days=i)
                date_str = date.strftime("%Y-%m-%d")
                if date_str in remote_dates:
                    target_dates.append(date_str)
            # Pull each target date, skipping those already local.
            synced_dates = []
            skipped_dates = []
            failed_dates = []
            for date_str in target_dates:
                if date_str in local_dates:
                    skipped_dates.append(date_str)
                    continue
                # Download the per-day database for this date.
                try:
                    local_date_dir = local_dir / date_str
                    local_db_path = local_date_dir / "news.db"
                    remote_key = f"news/{date_str}.db"
                    local_date_dir.mkdir(parents=True, exist_ok=True)
                    remote_backend.s3_client.download_file(
                        remote_backend.bucket_name,
                        remote_key,
                        str(local_db_path)
                    )
                    synced_dates.append(date_str)
                    print(f"[存储同步] 已拉取: {date_str}")
                except Exception as e:
                    # A single failed date must not abort the whole sync.
                    failed_dates.append({"date": date_str, "error": str(e)})
                    print(f"[存储同步] 拉取失败 ({date_str}): {e}")
            return {
                "success": True,
                "synced_files": len(synced_dates),
                "synced_dates": synced_dates,
                "skipped_dates": skipped_dates,
                "failed_dates": failed_dates,
                "message": f"成功同步 {len(synced_dates)} 天数据" + (
                    f",跳过 {len(skipped_dates)} 天(本地已存在)" if skipped_dates else ""
                ) + (
                    f",失败 {len(failed_dates)}" if failed_dates else ""
                )
            }
        except MCPError as e:
            return {
                "success": False,
                "error": e.to_dict()
            }
        except Exception as e:
            return {
                "success": False,
                "error": {
                    "code": "INTERNAL_ERROR",
                    "message": str(e)
                }
            }

    def get_storage_status(self) -> Dict:
        """Report the storage configuration and state of both stores.

        Returns:
            Status dict with backend type, local stats, remote stats and the
            pull configuration.
        """
        try:
            storage_config = self._get_storage_config()
            config = self._load_config()
            # Local storage status.
            local_config = storage_config.get("local", {})
            local_dir = self._get_local_data_dir()
            local_size = self._calculate_dir_size(local_dir)
            local_dates = self._get_local_dates()
            local_status = {
                "data_dir": local_config.get("data_dir", "output"),
                "retention_days": local_config.get("retention_days", 0),
                "total_size": f"{local_size / 1024 / 1024:.2f} MB",
                "total_size_bytes": local_size,
                "date_count": len(local_dates),
                # local_dates is sorted newest first.
                "earliest_date": local_dates[-1] if local_dates else None,
                "latest_date": local_dates[0] if local_dates else None,
            }
            # Remote storage status.
            remote_config = storage_config.get("remote", {})
            has_remote = self._has_remote_config()
            remote_status = {
                "configured": has_remote,
                "retention_days": remote_config.get("retention_days", 0),
            }
            if has_remote:
                merged_config = self._get_remote_config()
                # NOTE(review): values are returned as-is — no masking is
                # applied; confirm whether endpoint/bucket are sensitive.
                endpoint = merged_config.get("endpoint_url", "")
                bucket = merged_config.get("bucket_name", "")
                remote_status["endpoint_url"] = endpoint
                remote_status["bucket_name"] = bucket
                # Try to list remote dates; report errors without failing.
                remote_backend = self._get_remote_backend()
                if remote_backend:
                    try:
                        remote_dates = remote_backend.list_remote_dates()
                        remote_status["date_count"] = len(remote_dates)
                        remote_status["earliest_date"] = remote_dates[-1] if remote_dates else None
                        remote_status["latest_date"] = remote_dates[0] if remote_dates else None
                    except Exception as e:
                        remote_status["error"] = str(e)
            # Automatic-pull configuration.
            pull_config = storage_config.get("pull", {})
            pull_status = {
                "enabled": pull_config.get("enabled", False),
                "days": pull_config.get("days", 7),
            }
            return {
                "success": True,
                "backend": storage_config.get("backend", "auto"),
                "local": local_status,
                "remote": remote_status,
                "pull": pull_status,
            }
        except MCPError as e:
            return {
                "success": False,
                "error": e.to_dict()
            }
        except Exception as e:
            return {
                "success": False,
                "error": {
                    "code": "INTERNAL_ERROR",
                    "message": str(e)
                }
            }

    def list_available_dates(self, source: str = "both") -> Dict:
        """List the dates available locally and/or remotely.

        Args:
            source: Data source to inspect:
                - "local": local only
                - "remote": remote only
                - "both": both, plus a comparison (default)

        Returns:
            Dict with per-source date lists and, for "both", a comparison of
            which dates exist only locally, only remotely, or on both sides.
        """
        try:
            result = {
                "success": True,
            }
            # Local dates.
            if source in ("local", "both"):
                local_dates = self._get_local_dates()
                result["local"] = {
                    "dates": local_dates,
                    "count": len(local_dates),
                    "earliest": local_dates[-1] if local_dates else None,
                    "latest": local_dates[0] if local_dates else None,
                }
            # Remote dates.
            if source in ("remote", "both"):
                if not self._has_remote_config():
                    result["remote"] = {
                        "configured": False,
                        "dates": [],
                        "count": 0,
                        "earliest": None,
                        "latest": None,
                        "error": "未配置远程存储"
                    }
                else:
                    remote_backend = self._get_remote_backend()
                    if remote_backend:
                        try:
                            remote_dates = remote_backend.list_remote_dates()
                            result["remote"] = {
                                "configured": True,
                                "dates": remote_dates,
                                "count": len(remote_dates),
                                "earliest": remote_dates[-1] if remote_dates else None,
                                "latest": remote_dates[0] if remote_dates else None,
                            }
                        except Exception as e:
                            result["remote"] = {
                                "configured": True,
                                "dates": [],
                                "count": 0,
                                "earliest": None,
                                "latest": None,
                                "error": str(e)
                            }
                    else:
                        result["remote"] = {
                            "configured": True,
                            "dates": [],
                            "count": 0,
                            "earliest": None,
                            "latest": None,
                            "error": "无法创建远程存储后端"
                        }
            # When both sides were queried, compute the set differences.
            if source == "both" and "local" in result and "remote" in result:
                local_set = set(result["local"]["dates"])
                remote_set = set(result["remote"].get("dates", []))
                result["comparison"] = {
                    "only_local": sorted(list(local_set - remote_set), reverse=True),
                    "only_remote": sorted(list(remote_set - local_set), reverse=True),
                    "both": sorted(list(local_set & remote_set), reverse=True),
                }
            return result
        except MCPError as e:
            return {
                "success": False,
                "error": e.to_dict()
            }
        except Exception as e:
            return {
                "success": False,
                "error": {
                    "code": "INTERNAL_ERROR",
                    "message": str(e)
                }
            }
+93 -190
View File
@@ -87,13 +87,13 @@ class SystemManagementTools:
>>> print(result['saved_files'])
"""
try:
import json
import time
import random
import requests
from datetime import datetime
import pytz
import yaml
from trendradar.crawler.fetcher import DataFetcher
from trendradar.storage.local import LocalStorageBackend
from trendradar.storage.base import convert_crawl_results_to_news_data
from trendradar.utils.time import get_configured_time, format_date_folder, format_time_filename
from ..services.cache_service import get_cache
# 参数验证
platforms = validate_platforms(platforms)
@@ -129,9 +129,6 @@ class SystemManagementTools:
else:
target_platforms = all_platforms
# 获取请求间隔
request_interval = config_data.get("crawler", {}).get("request_interval", 100)
# 构建平台ID列表
ids = []
for platform in target_platforms:
@@ -142,87 +139,82 @@ class SystemManagementTools:
print(f"开始临时爬取,平台: {[p.get('name', p['id']) for p in target_platforms]}")
# 爬取数据
results = {}
id_to_name = {}
failed_ids = []
# 初始化数据获取器
crawler_config = config_data.get("crawler", {})
proxy_url = None
if crawler_config.get("use_proxy"):
proxy_url = crawler_config.get("proxy_url")
fetcher = DataFetcher(proxy_url=proxy_url)
request_interval = crawler_config.get("request_interval", 100)
for i, id_info in enumerate(ids):
if isinstance(id_info, tuple):
id_value, name = id_info
else:
id_value = id_info
name = id_value
# 执行爬取
results, id_to_name, failed_ids = fetcher.crawl_websites(
ids_list=ids,
request_interval=request_interval
)
id_to_name[id_value] = name
# 获取当前时间(统一使用 trendradar 的时间工具)
# 从配置中读取时区,默认为 Asia/Shanghai
timezone = config_data.get("app", {}).get("timezone", "Asia/Shanghai")
current_time = get_configured_time(timezone)
crawl_date = format_date_folder(None, timezone)
crawl_time_str = format_time_filename(timezone)
# 构建请求URL
url = f"https://newsnow.busiyi.world/api/s?id={id_value}&latest"
# 转换为标准数据模型
news_data = convert_crawl_results_to_news_data(
results=results,
id_to_name=id_to_name,
failed_ids=failed_ids,
crawl_time=crawl_time_str,
crawl_date=crawl_date
)
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
"Accept": "application/json, text/plain, */*",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Connection": "keep-alive",
"Cache-Control": "no-cache",
}
# 初始化存储后端
storage = LocalStorageBackend(
data_dir=str(self.project_root / "output"),
enable_txt=True,
enable_html=True,
timezone=timezone
)
# 重试机制
max_retries = 2
retries = 0
success = False
# 尝试持久化数据
save_success = False
save_error_msg = ""
saved_files = {}
while retries <= max_retries and not success:
try:
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
try:
# 1. 保存到 SQLite (核心持久化)
if storage.save_news_data(news_data):
save_success = True
# 2. 如果请求保存到本地,生成 TXT/HTML 快照
if save_to_local:
# 保存 TXT
txt_path = storage.save_txt_snapshot(news_data)
if txt_path:
saved_files["txt"] = txt_path
data_text = response.text
data_json = json.loads(data_text)
# 保存 HTML (使用简化版生成器)
html_content = self._generate_simple_html(results, id_to_name, failed_ids, current_time)
html_filename = f"{crawl_time_str}.html"
html_path = storage.save_html_report(html_content, html_filename)
if html_path:
saved_files["html"] = html_path
status = data_json.get("status", "未知")
if status not in ["success", "cache"]:
raise ValueError(f"响应状态异常: {status}")
except Exception as e:
# 捕获所有保存错误(特别是 Docker 只读卷导致的 PermissionError
print(f"[System] 数据保存失败: {e}")
save_success = False
save_error_msg = str(e)
status_info = "最新数据" if status == "success" else "缓存数据"
print(f"获取 {id_value} 成功({status_info}")
# 3. 清除缓存,确保下次查询获取最新数据
# 即使保存失败,内存中的数据可能已经通过其他方式更新,或者是临时的
get_cache().clear()
print("[System] 缓存已清除")
# 解析数据
results[id_value] = {}
for index, item in enumerate(data_json.get("items", []), 1):
title = item["title"]
url_link = item.get("url", "")
mobile_url = item.get("mobileUrl", "")
if title in results[id_value]:
results[id_value][title]["ranks"].append(index)
else:
results[id_value][title] = {
"ranks": [index],
"url": url_link,
"mobileUrl": mobile_url,
}
success = True
except Exception as e:
retries += 1
if retries <= max_retries:
wait_time = random.uniform(3, 5)
print(f"请求 {id_value} 失败: {e}. {wait_time:.2f}秒后重试...")
time.sleep(wait_time)
else:
print(f"请求 {id_value} 失败: {e}")
failed_ids.append(id_value)
# 请求间隔
if i < len(ids) - 1:
actual_interval = request_interval + random.randint(-10, 20)
actual_interval = max(50, actual_interval)
time.sleep(actual_interval / 1000)
# 格式化返回数据
news_data = []
# 构建返回结果
news_response_data = []
for platform_id, titles_data in results.items():
platform_name = id_to_name.get(platform_id, platform_id)
for title, info in titles_data.items():
@@ -230,131 +222,42 @@ class SystemManagementTools:
"platform_id": platform_id,
"platform_name": platform_name,
"title": title,
"ranks": info["ranks"]
"ranks": info.get("ranks", [])
}
# 条件性添加 URL 字段
if include_url:
news_item["url"] = info.get("url", "")
news_item["mobile_url"] = info.get("mobileUrl", "")
news_response_data.append(news_item)
news_data.append(news_item)
# 获取北京时间
beijing_tz = pytz.timezone("Asia/Shanghai")
now = datetime.now(beijing_tz)
# 构建返回结果
result = {
"success": True,
"task_id": f"crawl_{int(time.time())}",
"status": "completed",
"crawl_time": now.strftime("%Y-%m-%d %H:%M:%S"),
"crawl_time": current_time.strftime("%Y-%m-%d %H:%M:%S"),
"platforms": list(results.keys()),
"total_news": len(news_data),
"total_news": len(news_response_data),
"failed_platforms": failed_ids,
"data": news_data,
"saved_to_local": save_to_local
"data": news_response_data,
"saved_to_local": save_success and save_to_local
}
# 如果需要持久化,调用保存逻辑
if save_to_local:
try:
import re
# 辅助函数:清理标题
def clean_title(title: str) -> str:
"""清理标题中的特殊字符"""
if not isinstance(title, str):
title = str(title)
cleaned_title = title.replace("\n", " ").replace("\r", " ")
cleaned_title = re.sub(r"\s+", " ", cleaned_title)
cleaned_title = cleaned_title.strip()
return cleaned_title
# 辅助函数:创建目录
def ensure_directory_exists(directory: str):
"""确保目录存在"""
Path(directory).mkdir(parents=True, exist_ok=True)
# 格式化日期和时间
date_folder = now.strftime("%Y年%m月%d")
time_filename = now.strftime("%H时%M分")
# 创建 txt 文件路径
txt_dir = self.project_root / "output" / date_folder / "txt"
ensure_directory_exists(str(txt_dir))
txt_file_path = txt_dir / f"{time_filename}.txt"
# 创建 html 文件路径
html_dir = self.project_root / "output" / date_folder / "html"
ensure_directory_exists(str(html_dir))
html_file_path = html_dir / f"{time_filename}.html"
# 保存 txt 文件(按照 main.py 的格式)
with open(txt_file_path, "w", encoding="utf-8") as f:
for id_value, title_data in results.items():
# id | name 或 id
name = id_to_name.get(id_value)
if name and name != id_value:
f.write(f"{id_value} | {name}\n")
else:
f.write(f"{id_value}\n")
# 按排名排序标题
sorted_titles = []
for title, info in title_data.items():
cleaned = clean_title(title)
if isinstance(info, dict):
ranks = info.get("ranks", [])
url = info.get("url", "")
mobile_url = info.get("mobileUrl", "")
else:
ranks = info if isinstance(info, list) else []
url = ""
mobile_url = ""
rank = ranks[0] if ranks else 1
sorted_titles.append((rank, cleaned, url, mobile_url))
sorted_titles.sort(key=lambda x: x[0])
for rank, cleaned, url, mobile_url in sorted_titles:
line = f"{rank}. {cleaned}"
if url:
line += f" [URL:{url}]"
if mobile_url:
line += f" [MOBILE:{mobile_url}]"
f.write(line + "\n")
f.write("\n")
if failed_ids:
f.write("==== 以下ID请求失败 ====\n")
for id_value in failed_ids:
f.write(f"{id_value}\n")
# 保存 html 文件(简化版)
html_content = self._generate_simple_html(results, id_to_name, failed_ids, now)
with open(html_file_path, "w", encoding="utf-8") as f:
f.write(html_content)
print(f"数据已保存到:")
print(f" TXT: {txt_file_path}")
print(f" HTML: {html_file_path}")
result["saved_files"] = {
"txt": str(txt_file_path),
"html": str(html_file_path)
}
result["note"] = "数据已持久化到 output 文件夹"
except Exception as e:
print(f"保存文件失败: {e}")
result["save_error"] = str(e)
result["note"] = "爬取成功但保存失败,数据仅在内存中"
if save_success:
if save_to_local:
result["saved_files"] = saved_files
result["note"] = "数据已保存到 SQLite 数据库及 output 文件夹"
else:
result["note"] = "数据已保存到 SQLite 数据库 (仅内存中返回结果,未生成TXT快照)"
else:
result["note"] = "临时爬取结果,未持久化到output文件夹"
# 明确告知用户保存失败
result["saved_to_local"] = False
result["save_error"] = save_error_msg
if "Read-only file system" in save_error_msg or "Permission denied" in save_error_msg:
result["note"] = "爬取成功,但无法写入数据库(Docker只读模式)。数据仅在本次返回中有效。"
else:
result["note"] = f"爬取成功但保存失败: {save_error_msg}"
# 清理资源
storage.cleanup()
return result
+3 -3
View File
@@ -283,13 +283,13 @@ class DateParser:
date: datetime对象
Returns:
文件夹名称,格式: YYYYMMDD
文件夹名称,格式: YYYY-MM-DD
Examples:
>>> DateParser.format_date_folder(datetime(2025, 10, 11))
'2025年10月11日'
'2025-10-11'
"""
return date.strftime("%Y年%m月%d")
return date.strftime("%Y-%m-%d")
@staticmethod
def validate_date_not_future(date: datetime) -> None: