v4.0.0 大大大更新

This commit is contained in:
sansan
2025-12-13 13:44:35 +08:00
parent 97c05aa33c
commit c7bacdfff7
61 changed files with 12407 additions and 5889 deletions
+13
View File
@@ -0,0 +1,13 @@
# coding=utf-8
"""
TrendRadar - 热点新闻聚合与分析工具
使用方式:
python -m trendradar # 模块执行
trendradar # 安装后执行
"""
from trendradar.context import AppContext
__version__ = "4.0.0"
__all__ = ["AppContext", "__version__"]
+719
View File
@@ -0,0 +1,719 @@
# coding=utf-8
"""
TrendRadar 主程序
热点新闻聚合与分析工具
支持: python -m trendradar
"""
import os
import webbrowser
from pathlib import Path
from typing import Dict, List, Tuple, Optional
import requests
from trendradar.context import AppContext
# 版本号直接定义,避免循环导入
VERSION = "4.0.0"
from trendradar.core import load_config
from trendradar.crawler import DataFetcher
from trendradar.storage import convert_crawl_results_to_news_data
def check_version_update(
    current_version: str, version_url: str, proxy_url: Optional[str] = None
) -> Tuple[bool, Optional[str]]:
    """Check whether a newer version is published at ``version_url``.

    Fetches the remote version string (plain text ``X.Y.Z``) and compares
    it component-wise against ``current_version``.

    Args:
        current_version: Local version string, e.g. ``"4.0.0"``.
        version_url: URL returning the latest version as plain text.
        proxy_url: Optional proxy applied to both http and https.

    Returns:
        Tuple[bool, Optional[str]]: ``(need_update, remote_version)``;
        ``remote_version`` is ``None`` when no update is needed.
        Any failure (network, HTTP status, parsing) yields ``(False, None)``
        so version checking can never break the main flow.
    """

    def parse_version(version_str: str) -> Tuple[int, int, int]:
        # Malformed input degrades to (0, 0, 0) so a garbled remote payload
        # can never be reported as "newer" by accident.  Only ValueError is
        # possible here (from int() or the explicit raise); the original
        # bare `except:` would also have swallowed SystemExit/KeyboardInterrupt.
        try:
            parts = version_str.strip().split(".")
            if len(parts) != 3:
                raise ValueError("版本号格式不正确")
            return int(parts[0]), int(parts[1]), int(parts[2])
        except ValueError:
            return 0, 0, 0

    try:
        proxies = None
        if proxy_url:
            proxies = {"http": proxy_url, "https": proxy_url}
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            "Accept": "text/plain, */*",
            "Cache-Control": "no-cache",
        }
        response = requests.get(
            version_url, proxies=proxies, headers=headers, timeout=10
        )
        response.raise_for_status()
        remote_version = response.text.strip()
        print(f"当前版本: {current_version}, 远程版本: {remote_version}")
        # Tuple comparison gives correct lexicographic semver ordering.
        current_tuple = parse_version(current_version)
        remote_tuple = parse_version(remote_version)
        need_update = current_tuple < remote_tuple
        return need_update, remote_version if need_update else None
    except Exception as e:
        # Best-effort check: log and report "no update" on any failure.
        print(f"版本检查失败: {e}")
        return False, None
# === 主分析器 ===
class NewsAnalyzer:
    """News analyzer: orchestrates one full run — crawl the configured
    platforms, run keyword/frequency analysis, generate HTML reports and
    dispatch notifications according to the configured report mode."""

    # Strategy table keyed by REPORT_MODE.  Each entry drives which reports
    # are produced and pushed.  The Chinese string values are user-facing
    # labels and message fragments and must stay exactly as-is.
    MODE_STRATEGIES = {
        "incremental": {
            "mode_name": "增量模式",
            "description": "增量模式(只关注新增新闻,无新增时不推送)",
            "realtime_report_type": "实时增量",
            "summary_report_type": "当日汇总",
            "should_send_realtime": True,
            "should_generate_summary": True,
            "summary_mode": "daily",
        },
        "current": {
            "mode_name": "当前榜单模式",
            "description": "当前榜单模式(当前榜单匹配新闻 + 新增新闻区域 + 按时推送)",
            "realtime_report_type": "实时当前榜单",
            "summary_report_type": "当前榜单汇总",
            "should_send_realtime": True,
            "should_generate_summary": True,
            "summary_mode": "current",
        },
        "daily": {
            "mode_name": "当日汇总模式",
            "description": "当日汇总模式(所有匹配新闻 + 新增新闻区域 + 按时推送)",
            "realtime_report_type": "",
            "summary_report_type": "当日汇总",
            "should_send_realtime": False,
            "should_generate_summary": True,
            "summary_mode": "daily",
        },
    }

    def __init__(self):
        """Load configuration, build the AppContext and all collaborators."""
        # Load configuration first; everything else hangs off it.
        print("正在加载配置...")
        config = load_config()
        print(f"TrendRadar v{VERSION} 配置加载完成")
        print(f"监控平台数量: {len(config['PLATFORMS'])}")
        print(f"时区: {config.get('TIMEZONE', 'Asia/Shanghai')}")
        # AppContext wraps every config-dependent operation (time, storage,
        # reporting, notification) — see trendradar.context.
        self.ctx = AppContext(config)
        self.request_interval = self.ctx.config["REQUEST_INTERVAL"]
        self.report_mode = self.ctx.config["REPORT_MODE"]
        self.rank_threshold = self.ctx.rank_threshold
        # Environment detection controls proxy usage and browser opening.
        self.is_github_actions = os.environ.get("GITHUB_ACTIONS") == "true"
        self.is_docker_container = self._detect_docker_environment()
        self.update_info = None
        self.proxy_url = None
        self._setup_proxy()
        self.data_fetcher = DataFetcher(self.proxy_url)
        # Initialize the storage manager (via AppContext).
        self._init_storage_manager()
        # Version check only runs on CI to avoid extra network calls locally.
        if self.is_github_actions:
            self._check_version_update()

    def _init_storage_manager(self) -> None:
        """Initialize the storage manager (via AppContext)."""
        # Data retention days (overridable via environment variable).
        env_retention = os.environ.get("STORAGE_RETENTION_DAYS", "").strip()
        if env_retention:
            # Environment variable overrides the config file.
            # NOTE(review): int() raises ValueError on a non-numeric value —
            # confirm this fail-fast behavior is intended.
            self.ctx.config["STORAGE"]["RETENTION_DAYS"] = int(env_retention)
        self.storage_manager = self.ctx.get_storage_manager()
        print(f"存储后端: {self.storage_manager.backend_name}")
        retention_days = self.ctx.config.get("STORAGE", {}).get("RETENTION_DAYS", 0)
        if retention_days > 0:
            print(f"数据保留天数: {retention_days}")

    def _detect_docker_environment(self) -> bool:
        """Return True when running inside a Docker container."""
        try:
            # Explicit environment marker takes precedence.
            if os.environ.get("DOCKER_CONTAINER") == "true":
                return True
            # Docker creates /.dockerenv inside containers.
            if os.path.exists("/.dockerenv"):
                return True
            return False
        except Exception:
            # Detection is best-effort; default to "not in Docker".
            return False

    def _should_open_browser(self) -> bool:
        """Only open a browser on a local machine (not CI, not Docker)."""
        return not self.is_github_actions and not self.is_docker_container

    def _setup_proxy(self) -> None:
        """Configure the proxy URL based on environment and config."""
        if not self.is_github_actions and self.ctx.config["USE_PROXY"]:
            self.proxy_url = self.ctx.config["DEFAULT_PROXY"]
            print("本地环境,使用代理")
        elif not self.is_github_actions and not self.ctx.config["USE_PROXY"]:
            print("本地环境,未启用代理")
        else:
            # GitHub Actions runners have direct network access.
            print("GitHub Actions环境,不使用代理")

    def _check_version_update(self) -> None:
        """Check for a newer release and stash the result in self.update_info."""
        try:
            need_update, remote_version = check_version_update(
                VERSION, self.ctx.config["VERSION_CHECK_URL"], self.proxy_url
            )
            if need_update and remote_version:
                # Consumed later by report generation / notification dispatch.
                self.update_info = {
                    "current_version": VERSION,
                    "remote_version": remote_version,
                }
                print(f"发现新版本: {remote_version} (当前: {VERSION})")
            else:
                print("版本检查完成,当前为最新版本")
        except Exception as e:
            # Never let a failed version check abort the run.
            print(f"版本检查出错: {e}")

    def _get_mode_strategy(self) -> Dict:
        """Return the strategy dict for the current mode (fallback: daily)."""
        return self.MODE_STRATEGIES.get(self.report_mode, self.MODE_STRATEGIES["daily"])

    def _has_notification_configured(self) -> bool:
        """Return True if at least one notification channel is configured."""
        cfg = self.ctx.config
        # Channels needing paired credentials are AND-ed before the any().
        return any(
            [
                cfg["FEISHU_WEBHOOK_URL"],
                cfg["DINGTALK_WEBHOOK_URL"],
                cfg["WEWORK_WEBHOOK_URL"],
                (cfg["TELEGRAM_BOT_TOKEN"] and cfg["TELEGRAM_CHAT_ID"]),
                (
                    cfg["EMAIL_FROM"]
                    and cfg["EMAIL_PASSWORD"]
                    and cfg["EMAIL_TO"]
                ),
                (cfg["NTFY_SERVER_URL"] and cfg["NTFY_TOPIC"]),
                cfg["BARK_URL"],
                cfg["SLACK_WEBHOOK_URL"],
            ]
        )

    def _has_valid_content(
        self, stats: List[Dict], new_titles: Optional[Dict] = None
    ) -> bool:
        """Return True when there is news content worth pushing."""
        if self.report_mode in ["incremental", "current"]:
            # In incremental/current mode any non-empty stat means a match.
            return any(stat["count"] > 0 for stat in stats)
        else:
            # In daily-summary mode, matched keyword news OR newly appeared
            # titles both count as valid content.
            has_matched_news = any(stat["count"] > 0 for stat in stats)
            has_new_news = bool(
                new_titles and any(len(titles) > 0 for titles in new_titles.values())
            )
            return has_matched_news or has_new_news

    def _load_analysis_data(
        self,
    ) -> Optional[Tuple[Dict, Dict, Dict, Dict, List, List, List]]:
        """Load and pre-process today's data, filtered to current platforms.

        Returns a 7-tuple of (all_results, id_to_name, title_info,
        new_titles, word_groups, filter_words, global_filters), or None
        when there is no data for today or loading fails.
        """
        try:
            # Filter historical data down to the currently configured platforms.
            current_platform_ids = self.ctx.platform_ids
            print(f"当前监控平台: {current_platform_ids}")
            all_results, id_to_name, title_info = self.ctx.read_today_titles(
                current_platform_ids
            )
            if not all_results:
                print("没有找到当天的数据")
                return None
            total_titles = sum(len(titles) for titles in all_results.values())
            print(f"读取到 {total_titles} 个标题(已按当前监控平台过滤)")
            new_titles = self.ctx.detect_new_titles(current_platform_ids)
            word_groups, filter_words, global_filters = self.ctx.load_frequency_words()
            return (
                all_results,
                id_to_name,
                title_info,
                new_titles,
                word_groups,
                filter_words,
                global_filters,
            )
        except Exception as e:
            print(f"数据加载失败: {e}")
            return None

    def _prepare_current_title_info(self, results: Dict, time_info: str) -> Dict:
        """Build per-title metadata from a single (current) crawl batch.

        Every title gets count=1 and first/last time equal to this batch's
        timestamp, mirroring the shape produced by historical aggregation.
        """
        title_info = {}
        for source_id, titles_data in results.items():
            title_info[source_id] = {}
            for title, title_data in titles_data.items():
                ranks = title_data.get("ranks", [])
                url = title_data.get("url", "")
                mobile_url = title_data.get("mobileUrl", "")
                title_info[source_id][title] = {
                    "first_time": time_info,
                    "last_time": time_info,
                    "count": 1,
                    "ranks": ranks,
                    "url": url,
                    "mobileUrl": mobile_url,
                }
        return title_info

    def _run_analysis_pipeline(
        self,
        data_source: Dict,
        mode: str,
        title_info: Dict,
        new_titles: Dict,
        word_groups: List[Dict],
        filter_words: List[str],
        id_to_name: Dict,
        failed_ids: Optional[List] = None,
        is_daily_summary: bool = False,
        global_filters: Optional[List[str]] = None,
    ) -> Tuple[List[Dict], Optional[str]]:
        """Unified pipeline: statistics computation, then optional HTML output.

        Returns (stats, html_file) where html_file is None when HTML output
        is disabled in the STORAGE.FORMATS config.
        """
        # Frequency statistics (delegated to AppContext).
        stats, total_titles = self.ctx.count_frequency(
            data_source,
            word_groups,
            filter_words,
            id_to_name,
            title_info,
            new_titles,
            mode=mode,
            global_filters=global_filters,
        )
        # HTML generation (only when the HTML output format is enabled).
        html_file = None
        if self.ctx.config["STORAGE"]["FORMATS"]["HTML"]:
            html_file = self.ctx.generate_html(
                stats,
                total_titles,
                failed_ids=failed_ids,
                new_titles=new_titles,
                id_to_name=id_to_name,
                mode=mode,
                is_daily_summary=is_daily_summary,
                update_info=self.update_info if self.ctx.config["SHOW_VERSION_UPDATE"] else None,
            )
        return stats, html_file

    def _send_notification_if_needed(
        self,
        stats: List[Dict],
        report_type: str,
        mode: str,
        failed_ids: Optional[List] = None,
        new_titles: Optional[Dict] = None,
        id_to_name: Optional[Dict] = None,
        html_file_path: Optional[str] = None,
    ) -> bool:
        """Send notifications when enabled, configured, and content exists.

        Also enforces the push window (time range, once-per-day) and records
        successful pushes.  Returns True only when a push was dispatched.
        """
        has_notification = self._has_notification_configured()
        cfg = self.ctx.config
        if (
            cfg["ENABLE_NOTIFICATION"]
            and has_notification
            and self._has_valid_content(stats, new_titles)
        ):
            # Push-window control: time range and once-per-day gating.
            if cfg["PUSH_WINDOW"]["ENABLED"]:
                push_manager = self.ctx.create_push_manager()
                time_range_start = cfg["PUSH_WINDOW"]["TIME_RANGE"]["START"]
                time_range_end = cfg["PUSH_WINDOW"]["TIME_RANGE"]["END"]
                if not push_manager.is_in_time_range(time_range_start, time_range_end):
                    now = self.ctx.get_time()
                    print(
                        f"推送窗口控制:当前时间 {now.strftime('%H:%M')} 不在推送时间窗口 {time_range_start}-{time_range_end} 内,跳过推送"
                    )
                    return False
                if cfg["PUSH_WINDOW"]["ONCE_PER_DAY"]:
                    if push_manager.has_pushed_today():
                        print(f"推送窗口控制:今天已推送过,跳过本次推送")
                        return False
                    else:
                        print(f"推送窗口控制:今天首次推送")
            # Build the report payload shared by all channels.
            report_data = self.ctx.prepare_report(stats, failed_ids, new_titles, id_to_name, mode)
            # Whether to attach version-update info to the push.
            update_info_to_send = self.update_info if cfg["SHOW_VERSION_UPDATE"] else None
            # Fan out to every configured channel via NotificationDispatcher.
            dispatcher = self.ctx.create_notification_dispatcher()
            results = dispatcher.dispatch_all(
                report_data=report_data,
                report_type=report_type,
                update_info=update_info_to_send,
                proxy_url=self.proxy_url,
                mode=mode,
                html_file_path=html_file_path,
            )
            if not results:
                print("未配置任何通知渠道,跳过通知发送")
                return False
            # Record the push when once-per-day is on and any channel succeeded.
            if (
                cfg["PUSH_WINDOW"]["ENABLED"]
                and cfg["PUSH_WINDOW"]["ONCE_PER_DAY"]
                and any(results.values())
            ):
                push_manager = self.ctx.create_push_manager()
                push_manager.record_push(report_type)
            return True
        elif cfg["ENABLE_NOTIFICATION"] and not has_notification:
            print("⚠️ 警告:通知功能已启用但未配置任何通知渠道,将跳过通知发送")
        elif not cfg["ENABLE_NOTIFICATION"]:
            print(f"跳过{report_type}通知:通知功能已禁用")
        elif (
            cfg["ENABLE_NOTIFICATION"]
            and has_notification
            and not self._has_valid_content(stats, new_titles)
        ):
            mode_strategy = self._get_mode_strategy()
            # Distinguish realtime vs summary skip messages by report type.
            if "实时" in report_type:
                print(
                    f"跳过实时推送通知:{mode_strategy['mode_name']}下未检测到匹配的新闻"
                )
            else:
                print(
                    f"跳过{mode_strategy['summary_report_type']}通知:未匹配到有效的新闻内容"
                )
        return False

    def _generate_summary_report(self, mode_strategy: Dict) -> Optional[str]:
        """Generate the summary report AND send its notification."""
        summary_type = (
            "当前榜单汇总" if mode_strategy["summary_mode"] == "current" else "当日汇总"
        )
        print(f"生成{summary_type}报告...")
        # Load today's aggregated data.
        analysis_data = self._load_analysis_data()
        if not analysis_data:
            return None
        all_results, id_to_name, title_info, new_titles, word_groups, filter_words, global_filters = (
            analysis_data
        )
        # Run the shared analysis pipeline in summary mode.
        stats, html_file = self._run_analysis_pipeline(
            all_results,
            mode_strategy["summary_mode"],
            title_info,
            new_titles,
            word_groups,
            filter_words,
            id_to_name,
            is_daily_summary=True,
            global_filters=global_filters,
        )
        if html_file:
            print(f"{summary_type}报告已生成: {html_file}")
        # Dispatch the summary notification.
        self._send_notification_if_needed(
            stats,
            mode_strategy["summary_report_type"],
            mode_strategy["summary_mode"],
            failed_ids=[],
            new_titles=new_titles,
            id_to_name=id_to_name,
            html_file_path=html_file,
        )
        return html_file

    def _generate_summary_html(self, mode: str = "daily") -> Optional[str]:
        """Generate the summary HTML only (no notification)."""
        summary_type = "当前榜单汇总" if mode == "current" else "当日汇总"
        print(f"生成{summary_type}HTML...")
        # Load today's aggregated data.
        analysis_data = self._load_analysis_data()
        if not analysis_data:
            return None
        all_results, id_to_name, title_info, new_titles, word_groups, filter_words, global_filters = (
            analysis_data
        )
        # Run the shared analysis pipeline; stats are discarded here.
        _, html_file = self._run_analysis_pipeline(
            all_results,
            mode,
            title_info,
            new_titles,
            word_groups,
            filter_words,
            id_to_name,
            is_daily_summary=True,
            global_filters=global_filters,
        )
        if html_file:
            print(f"{summary_type}HTML已生成: {html_file}")
        return html_file

    def _initialize_and_check_config(self) -> None:
        """Print run-time environment/config status before the crawl."""
        now = self.ctx.get_time()
        print(f"当前北京时间: {now.strftime('%Y-%m-%d %H:%M:%S')}")
        if not self.ctx.config["ENABLE_CRAWLER"]:
            print("爬虫功能已禁用(ENABLE_CRAWLER=False),程序退出")
            # NOTE(review): this return only exits this helper — run() will
            # still proceed to crawl; confirm whether the whole run should
            # abort when ENABLE_CRAWLER is False.
            return
        has_notification = self._has_notification_configured()
        if not self.ctx.config["ENABLE_NOTIFICATION"]:
            print("通知功能已禁用(ENABLE_NOTIFICATION=False),将只进行数据抓取")
        elif not has_notification:
            print("未配置任何通知渠道,将只进行数据抓取,不发送通知")
        else:
            print("通知功能已启用,将发送通知")
        mode_strategy = self._get_mode_strategy()
        print(f"报告模式: {self.report_mode}")
        print(f"运行模式: {mode_strategy['description']}")

    def _crawl_data(self) -> Tuple[Dict, Dict, List]:
        """Crawl all configured platforms and persist the results.

        Returns (results, id_to_name, failed_ids) from the fetcher.
        """
        # NOTE(review): entries are (id, name) tuples when a display name is
        # configured and bare id strings otherwise — confirm crawl_websites
        # accepts both forms.
        ids = []
        for platform in self.ctx.platforms:
            if "name" in platform:
                ids.append((platform["id"], platform["name"]))
            else:
                ids.append(platform["id"])
        print(
            f"配置的监控平台: {[p.get('name', p['id']) for p in self.ctx.platforms]}"
        )
        print(f"开始爬取数据,请求间隔 {self.request_interval} 毫秒")
        Path("output").mkdir(parents=True, exist_ok=True)
        results, id_to_name, failed_ids = self.data_fetcher.crawl_websites(
            ids, self.request_interval
        )
        # Convert to NewsData and hand off to the storage backend.
        crawl_time = self.ctx.format_time()
        crawl_date = self.ctx.format_date()
        news_data = convert_crawl_results_to_news_data(
            results, id_to_name, failed_ids, crawl_time, crawl_date
        )
        # Persist to the storage backend (SQLite).
        if self.storage_manager.save_news_data(news_data):
            print(f"数据已保存到存储后端: {self.storage_manager.backend_name}")
        # Save a TXT snapshot if the backend has it enabled.
        txt_file = self.storage_manager.save_txt_snapshot(news_data)
        if txt_file:
            print(f"TXT 快照已保存: {txt_file}")
        # Compatibility: also write the legacy TXT format (backward compat).
        if self.ctx.config["STORAGE"]["FORMATS"]["TXT"]:
            title_file = self.ctx.save_titles(results, id_to_name, failed_ids)
            print(f"标题已保存到: {title_file}")
        return results, id_to_name, failed_ids

    def _execute_mode_strategy(
        self, mode_strategy: Dict, results: Dict, id_to_name: Dict, failed_ids: List
    ) -> Optional[str]:
        """Run the mode-specific reporting/notification logic after a crawl."""
        # Detect newly appeared titles against the current platform set.
        current_platform_ids = self.ctx.platform_ids
        new_titles = self.ctx.detect_new_titles(current_platform_ids)
        time_info = self.ctx.format_time()
        if self.ctx.config["STORAGE"]["FORMATS"]["TXT"]:
            self.ctx.save_titles(results, id_to_name, failed_ids)
        word_groups, filter_words, global_filters = self.ctx.load_frequency_words()
        # In "current" mode the realtime push must be computed over the full
        # day's history so per-title statistics stay complete.
        if self.report_mode == "current":
            # Load the full history (already filtered to current platforms).
            analysis_data = self._load_analysis_data()
            if analysis_data:
                (
                    all_results,
                    historical_id_to_name,
                    historical_title_info,
                    historical_new_titles,
                    _,
                    _,
                    _,
                ) = analysis_data
                print(
                    f"current模式:使用过滤后的历史数据,包含平台:{list(all_results.keys())}"
                )
                stats, html_file = self._run_analysis_pipeline(
                    all_results,
                    self.report_mode,
                    historical_title_info,
                    historical_new_titles,
                    word_groups,
                    filter_words,
                    historical_id_to_name,
                    failed_ids=failed_ids,
                    global_filters=global_filters,
                )
                # Current crawl's names take precedence over historical ones.
                combined_id_to_name = {**historical_id_to_name, **id_to_name}
                if html_file:
                    print(f"HTML报告已生成: {html_file}")
                # Realtime push using the full-history statistics.
                summary_html = None
                if mode_strategy["should_send_realtime"]:
                    self._send_notification_if_needed(
                        stats,
                        mode_strategy["realtime_report_type"],
                        self.report_mode,
                        failed_ids=failed_ids,
                        new_titles=historical_new_titles,
                        id_to_name=combined_id_to_name,
                        html_file_path=html_file,
                    )
            else:
                # We just saved this crawl; failing to read it back means the
                # storage layer is broken — abort loudly.
                print("❌ 严重错误:无法读取刚保存的数据文件")
                raise RuntimeError("数据一致性检查失败:保存后立即读取失败")
        else:
            # incremental/daily: analyze the current crawl batch directly.
            title_info = self._prepare_current_title_info(results, time_info)
            stats, html_file = self._run_analysis_pipeline(
                results,
                self.report_mode,
                title_info,
                new_titles,
                word_groups,
                filter_words,
                id_to_name,
                failed_ids=failed_ids,
                global_filters=global_filters,
            )
            if html_file:
                print(f"HTML报告已生成: {html_file}")
            # Realtime push (if this mode sends one).
            summary_html = None
            if mode_strategy["should_send_realtime"]:
                self._send_notification_if_needed(
                    stats,
                    mode_strategy["realtime_report_type"],
                    self.report_mode,
                    failed_ids=failed_ids,
                    new_titles=new_titles,
                    id_to_name=id_to_name,
                    html_file_path=html_file,
                )
        # Summary report generation (shared by all modes).
        summary_html = None
        if mode_strategy["should_generate_summary"]:
            if mode_strategy["should_send_realtime"]:
                # Realtime push already went out: summary is HTML-only.
                summary_html = self._generate_summary_html(
                    mode_strategy["summary_mode"]
                )
            else:
                # daily mode: generate the summary AND push it.
                summary_html = self._generate_summary_report(mode_strategy)
        # Open a browser only outside containers/CI.
        if self._should_open_browser() and html_file:
            if summary_html:
                summary_url = "file://" + str(Path(summary_html).resolve())
                print(f"正在打开汇总报告: {summary_url}")
                webbrowser.open(summary_url)
            else:
                file_url = "file://" + str(Path(html_file).resolve())
                print(f"正在打开HTML报告: {file_url}")
                webbrowser.open(file_url)
        elif self.is_docker_container and html_file:
            if summary_html:
                print(f"汇总报告已生成(Docker环境): {summary_html}")
            else:
                print(f"HTML报告已生成(Docker环境): {html_file}")
        return summary_html

    def run(self) -> None:
        """Execute the full analysis flow for one invocation."""
        try:
            self._initialize_and_check_config()
            mode_strategy = self._get_mode_strategy()
            results, id_to_name, failed_ids = self._crawl_data()
            self._execute_mode_strategy(mode_strategy, results, id_to_name, failed_ids)
        except Exception as e:
            print(f"分析流程执行出错: {e}")
            raise
        finally:
            # Cleanup always runs: expired-data purge and DB connection close.
            self.ctx.cleanup()
def main():
    """CLI entry point: build the analyzer and run one full cycle.

    Missing configuration files are reported with setup hints; any other
    error is logged and re-raised so the process exits non-zero.
    """
    try:
        app = NewsAnalyzer()
        app.run()
    except FileNotFoundError as err:
        print(f"❌ 配置文件错误: {err}")
        for hint in (
            "\n请确保以下文件存在:",
            " • config/config.yaml",
            " • config/frequency_words.txt",
            "\n参考项目文档进行正确配置",
        ):
            print(hint)
    except Exception as err:
        print(f"❌ 程序运行错误: {err}")
        raise


if __name__ == "__main__":
    main()
+388
View File
@@ -0,0 +1,388 @@
# coding=utf-8
"""
应用上下文模块
提供配置上下文类,封装所有依赖配置的操作,消除全局状态和包装函数。
"""
from datetime import datetime
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple
from trendradar.utils.time import (
get_configured_time,
format_date_folder,
format_time_filename,
get_current_time_display,
convert_time_for_display,
)
from trendradar.core import (
load_frequency_words,
matches_word_groups,
save_titles_to_file,
read_all_today_titles,
detect_latest_new_titles,
is_first_crawl_today,
count_word_frequency,
)
from trendradar.report import (
clean_title,
prepare_report_data,
generate_html_report,
render_html_content,
)
from trendradar.notification import (
render_feishu_content,
render_dingtalk_content,
split_content_into_batches,
NotificationDispatcher,
PushRecordManager,
)
from trendradar.storage import get_storage_manager
class AppContext:
    """Application context.

    Wraps every configuration-dependent operation behind one object,
    providing a single interface and removing reliance on a global CONFIG,
    which improves testability.

    Example:
        config = load_config()
        ctx = AppContext(config)

        # time helpers
        now = ctx.get_time()
        date_folder = ctx.format_date()

        # storage
        storage = ctx.get_storage_manager()

        # report generation
        html = ctx.generate_html(stats, total_titles, ...)
    """

    def __init__(self, config: Dict[str, Any]):
        """Initialize the application context.

        Args:
            config: The complete configuration dictionary.
        """
        self.config = config
        # Lazily created singleton — see get_storage_manager().
        self._storage_manager = None

    # === Configuration accessors ===

    @property
    def timezone(self) -> str:
        """Configured timezone name (default: Asia/Shanghai)."""
        return self.config.get("TIMEZONE", "Asia/Shanghai")

    @property
    def rank_threshold(self) -> int:
        """Rank threshold used for statistics/weighting (default 50)."""
        return self.config.get("RANK_THRESHOLD", 50)

    @property
    def weight_config(self) -> Dict:
        """Weight configuration for news scoring."""
        return self.config.get("WEIGHT_CONFIG", {})

    @property
    def platforms(self) -> List[Dict]:
        """List of configured platform dicts."""
        return self.config.get("PLATFORMS", [])

    @property
    def platform_ids(self) -> List[str]:
        """IDs of all configured platforms."""
        return [p["id"] for p in self.platforms]

    # === Time helpers ===

    def get_time(self) -> datetime:
        """Current time in the configured timezone."""
        return get_configured_time(self.timezone)

    def format_date(self) -> str:
        """Date folder name (YYYY-MM-DD) in the configured timezone."""
        return format_date_folder(timezone=self.timezone)

    def format_time(self) -> str:
        """Time filename component (HH-MM) in the configured timezone."""
        return format_time_filename(self.timezone)

    def get_time_display(self) -> str:
        """Current display time (HH:MM) in the configured timezone."""
        return get_current_time_display(self.timezone)

    @staticmethod
    def convert_time_display(time_str: str) -> str:
        """Convert a HH-MM filename timestamp to HH:MM display form."""
        return convert_time_for_display(time_str)

    # === Storage operations ===

    def get_storage_manager(self):
        """Storage manager (lazily initialized singleton).

        Assembles backend/local/remote/pull settings from the STORAGE
        section of the config on first access and caches the instance.
        """
        if self._storage_manager is None:
            storage_config = self.config.get("STORAGE", {})
            remote_config = storage_config.get("REMOTE", {})
            local_config = storage_config.get("LOCAL", {})
            pull_config = storage_config.get("PULL", {})
            self._storage_manager = get_storage_manager(
                backend_type=storage_config.get("BACKEND", "auto"),
                data_dir=local_config.get("DATA_DIR", "output"),
                enable_txt=storage_config.get("FORMATS", {}).get("TXT", True),
                enable_html=storage_config.get("FORMATS", {}).get("HTML", True),
                remote_config={
                    "bucket_name": remote_config.get("BUCKET_NAME", ""),
                    "access_key_id": remote_config.get("ACCESS_KEY_ID", ""),
                    "secret_access_key": remote_config.get("SECRET_ACCESS_KEY", ""),
                    "endpoint_url": remote_config.get("ENDPOINT_URL", ""),
                    "region": remote_config.get("REGION", ""),
                },
                local_retention_days=local_config.get("RETENTION_DAYS", 0),
                remote_retention_days=remote_config.get("RETENTION_DAYS", 0),
                pull_enabled=pull_config.get("ENABLED", False),
                pull_days=pull_config.get("DAYS", 7),
                timezone=self.timezone,
            )
        return self._storage_manager

    def get_output_path(self, subfolder: str, filename: str) -> str:
        """Build (and create) output/<date>/<subfolder>/ and return the file path."""
        output_dir = Path("output") / self.format_date() / subfolder
        output_dir.mkdir(parents=True, exist_ok=True)
        return str(output_dir / filename)

    # === Data handling ===

    def save_titles(self, results: Dict, id_to_name: Dict, failed_ids: List) -> str:
        """Save crawled titles to a timestamped TXT file; returns the path."""
        output_path = self.get_output_path("txt", f"{self.format_time()}.txt")
        return save_titles_to_file(results, id_to_name, failed_ids, output_path, clean_title)

    def read_today_titles(
        self, platform_ids: Optional[List[str]] = None
    ) -> Tuple[Dict, Dict, Dict]:
        """Read all of today's titles, optionally filtered by platform IDs."""
        return read_all_today_titles(self.get_storage_manager(), platform_ids)

    def detect_new_titles(
        self, platform_ids: Optional[List[str]] = None
    ) -> Dict:
        """Detect titles that first appeared in the latest crawl batch."""
        return detect_latest_new_titles(self.get_storage_manager(), platform_ids)

    def is_first_crawl(self) -> bool:
        """Return True when this is the first crawl of the current day."""
        return is_first_crawl_today("output", self.format_date())

    # === Frequency-word handling ===

    def load_frequency_words(
        self, frequency_file: Optional[str] = None
    ) -> Tuple[List[Dict], List[str], List[str]]:
        """Load (word_groups, filter_words, global_filters) from config."""
        return load_frequency_words(frequency_file)

    def matches_word_groups(
        self,
        title: str,
        word_groups: List[Dict],
        filter_words: List[str],
        global_filters: Optional[List[str]] = None,
    ) -> bool:
        """Return True when the title matches the word-group rules."""
        return matches_word_groups(title, word_groups, filter_words, global_filters)

    # === Statistics ===

    def count_frequency(
        self,
        results: Dict,
        word_groups: List[Dict],
        filter_words: List[str],
        id_to_name: Dict,
        title_info: Optional[Dict] = None,
        new_titles: Optional[Dict] = None,
        mode: str = "daily",
        global_filters: Optional[List[str]] = None,
    ) -> Tuple[List[Dict], int]:
        """Run word-frequency statistics with all config-derived parameters injected."""
        return count_word_frequency(
            results=results,
            word_groups=word_groups,
            filter_words=filter_words,
            id_to_name=id_to_name,
            title_info=title_info,
            rank_threshold=self.rank_threshold,
            new_titles=new_titles,
            mode=mode,
            global_filters=global_filters,
            weight_config=self.weight_config,
            max_news_per_keyword=self.config.get("MAX_NEWS_PER_KEYWORD", 0),
            sort_by_position_first=self.config.get("SORT_BY_POSITION_FIRST", False),
            is_first_crawl_func=self.is_first_crawl,
            convert_time_func=self.convert_time_display,
        )

    # === Report generation ===

    def prepare_report(
        self,
        stats: List[Dict],
        failed_ids: Optional[List] = None,
        new_titles: Optional[Dict] = None,
        id_to_name: Optional[Dict] = None,
        mode: str = "daily",
    ) -> Dict:
        """Assemble the report payload consumed by renderers and dispatchers."""
        return prepare_report_data(
            stats=stats,
            failed_ids=failed_ids,
            new_titles=new_titles,
            id_to_name=id_to_name,
            mode=mode,
            rank_threshold=self.rank_threshold,
            matches_word_groups_func=self.matches_word_groups,
            load_frequency_words_func=self.load_frequency_words,
        )

    def generate_html(
        self,
        stats: List[Dict],
        total_titles: int,
        failed_ids: Optional[List] = None,
        new_titles: Optional[Dict] = None,
        id_to_name: Optional[Dict] = None,
        mode: str = "daily",
        is_daily_summary: bool = False,
        update_info: Optional[Dict] = None,
    ) -> str:
        """Generate the HTML report file and return its path."""
        return generate_html_report(
            stats=stats,
            total_titles=total_titles,
            failed_ids=failed_ids,
            new_titles=new_titles,
            id_to_name=id_to_name,
            mode=mode,
            is_daily_summary=is_daily_summary,
            update_info=update_info,
            rank_threshold=self.rank_threshold,
            output_dir="output",
            date_folder=self.format_date(),
            time_filename=self.format_time(),
            # Bound lazily so render_html can be overridden on the instance.
            render_html_func=lambda *args, **kwargs: self.render_html(*args, **kwargs),
            matches_word_groups_func=self.matches_word_groups,
            load_frequency_words_func=self.load_frequency_words,
            enable_index_copy=True,
        )

    def render_html(
        self,
        report_data: Dict,
        total_titles: int,
        is_daily_summary: bool = False,
        mode: str = "daily",
        update_info: Optional[Dict] = None,
    ) -> str:
        """Render the HTML body for a report payload."""
        return render_html_content(
            report_data=report_data,
            total_titles=total_titles,
            is_daily_summary=is_daily_summary,
            mode=mode,
            update_info=update_info,
            reverse_content_order=self.config.get("REVERSE_CONTENT_ORDER", False),
            get_time_func=self.get_time,
        )

    # === Notification content rendering ===

    def render_feishu(
        self,
        report_data: Dict,
        update_info: Optional[Dict] = None,
        mode: str = "daily",
    ) -> str:
        """Render message content for the Feishu (Lark) channel."""
        return render_feishu_content(
            report_data=report_data,
            update_info=update_info,
            mode=mode,
            separator=self.config.get("FEISHU_MESSAGE_SEPARATOR", "---"),
            reverse_content_order=self.config.get("REVERSE_CONTENT_ORDER", False),
            get_time_func=self.get_time,
        )

    def render_dingtalk(
        self,
        report_data: Dict,
        update_info: Optional[Dict] = None,
        mode: str = "daily",
    ) -> str:
        """Render message content for the DingTalk channel."""
        return render_dingtalk_content(
            report_data=report_data,
            update_info=update_info,
            mode=mode,
            reverse_content_order=self.config.get("REVERSE_CONTENT_ORDER", False),
            get_time_func=self.get_time,
        )

    def split_content(
        self,
        report_data: Dict,
        format_type: str,
        update_info: Optional[Dict] = None,
        max_bytes: Optional[int] = None,
        mode: str = "daily",
    ) -> List[str]:
        """Split message content into size-limited batches per channel."""
        return split_content_into_batches(
            report_data=report_data,
            format_type=format_type,
            update_info=update_info,
            max_bytes=max_bytes,
            mode=mode,
            # Per-channel byte budgets, overridable via config.
            batch_sizes={
                "dingtalk": self.config.get("DINGTALK_BATCH_SIZE", 20000),
                "feishu": self.config.get("FEISHU_BATCH_SIZE", 29000),
                "default": self.config.get("MESSAGE_BATCH_SIZE", 4000),
            },
            feishu_separator=self.config.get("FEISHU_MESSAGE_SEPARATOR", "---"),
            reverse_content_order=self.config.get("REVERSE_CONTENT_ORDER", False),
            get_time_func=self.get_time,
        )

    # === Notification dispatch ===

    def create_notification_dispatcher(self) -> NotificationDispatcher:
        """Create a dispatcher that fans out to all configured channels."""
        return NotificationDispatcher(
            config=self.config,
            get_time_func=self.get_time,
            split_content_func=self.split_content,
        )

    def create_push_manager(self) -> PushRecordManager:
        """Create the push-record manager used for push-window gating."""
        return PushRecordManager(
            storage_backend=self.get_storage_manager(),
            get_time_func=self.get_time,
        )

    # === Resource cleanup ===

    def cleanup(self):
        """Purge expired data and release the storage manager, if created."""
        if self._storage_manager:
            self._storage_manager.cleanup_old_data()
            self._storage_manager.cleanup()
            self._storage_manager = None
+47
View File
@@ -0,0 +1,47 @@
# coding=utf-8
"""
核心模块 - 配置管理和核心工具
"""
from trendradar.core.config import (
parse_multi_account_config,
validate_paired_configs,
limit_accounts,
get_account_at_index,
)
from trendradar.core.loader import load_config
from trendradar.core.frequency import load_frequency_words, matches_word_groups
from trendradar.core.data import (
save_titles_to_file,
read_all_today_titles_from_storage,
read_all_today_titles,
detect_latest_new_titles_from_storage,
detect_latest_new_titles,
is_first_crawl_today,
)
from trendradar.core.analyzer import (
calculate_news_weight,
format_time_display,
count_word_frequency,
)
__all__ = [
"parse_multi_account_config",
"validate_paired_configs",
"limit_accounts",
"get_account_at_index",
"load_config",
"load_frequency_words",
"matches_word_groups",
# 数据处理
"save_titles_to_file",
"read_all_today_titles_from_storage",
"read_all_today_titles",
"detect_latest_new_titles_from_storage",
"detect_latest_new_titles",
"is_first_crawl_today",
# 统计分析
"calculate_news_weight",
"format_time_display",
"count_word_frequency",
]
+469
View File
@@ -0,0 +1,469 @@
# coding=utf-8
"""
统计分析模块
提供新闻统计和分析功能:
- calculate_news_weight: 计算新闻权重
- format_time_display: 格式化时间显示
- count_word_frequency: 统计词频
"""
from typing import Dict, List, Tuple, Optional, Callable
from trendradar.core.frequency import matches_word_groups
def calculate_news_weight(
    title_data: Dict,
    rank_threshold: int,
    weight_config: Dict,
) -> float:
    """Compute a composite sorting weight for a single news item.

    Args:
        title_data: Title record containing ``ranks`` and optionally ``count``.
        rank_threshold: Rank at or below which an appearance counts as "hot".
        weight_config: {RANK_WEIGHT, FREQUENCY_WEIGHT, HOTNESS_WEIGHT} multipliers.

    Returns:
        float: Weighted score; 0.0 when the item has no recorded ranks.
    """
    ranks = title_data.get("ranks", [])
    if not ranks:
        return 0.0
    appearances = title_data.get("count", len(ranks))

    # Rank component: average of (11 - min(rank, 10)) over recorded ranks,
    # so rank 1 scores 10 and anything at rank 10+ scores 1.
    rank_component = sum(11 - min(r, 10) for r in ranks) / len(ranks)

    # Frequency component: appearance count capped at 10, scaled by 10.
    freq_component = min(appearances, 10) * 10

    # Hotness component: fraction of appearances at/above the threshold,
    # expressed as a percentage.
    hot_hits = sum(1 for r in ranks if r <= rank_threshold)
    hot_component = (hot_hits / len(ranks)) * 100

    return (
        rank_component * weight_config["RANK_WEIGHT"]
        + freq_component * weight_config["FREQUENCY_WEIGHT"]
        + hot_component * weight_config["HOTNESS_WEIGHT"]
    )
def format_time_display(
    first_time: str,
    last_time: str,
    convert_time_func: Callable[[str], str],
) -> str:
    """Format a first/last-seen time pair for display (HH-MM → HH:MM).

    Args:
        first_time: Time the title first appeared.
        last_time: Time the title last appeared.
        convert_time_func: Converter from filename form to display form.

    Returns:
        str: "" when there is no first time; a single timestamp when the
        range is degenerate; otherwise "[start ~ end]".
    """
    if not first_time:
        return ""
    start = convert_time_func(first_time)
    end = convert_time_func(last_time)
    # Collapse to one timestamp when both ends coincide or the end is missing.
    if not end or start == end:
        return start
    return f"[{start} ~ {end}]"
def count_word_frequency(
    results: Dict,
    word_groups: List[Dict],
    filter_words: List[str],
    id_to_name: Dict,
    title_info: Optional[Dict] = None,
    rank_threshold: int = 3,
    new_titles: Optional[Dict] = None,
    mode: str = "daily",
    global_filters: Optional[List[str]] = None,
    weight_config: Optional[Dict] = None,
    max_news_per_keyword: int = 0,
    sort_by_position_first: bool = False,
    is_first_crawl_func: Optional[Callable[[], bool]] = None,
    convert_time_func: Optional[Callable[[str], str]] = None,
) -> Tuple[List[Dict], int]:
    """Aggregate titles per word group, honouring filters and "new" marking.

    Supports required words, normal (frequency) words, per-group filter
    words and global filter words; titles newly seen in the latest crawl
    batch are flagged as new.

    Args:
        results: crawl results ``{source_id: {title: title_data}}``.
        word_groups: word-group configuration list.
        filter_words: per-group filter words.
        id_to_name: source-id to display-name mapping.
        title_info: historical per-title statistics (optional).
        rank_threshold: rank cutoff used for hotness/weighting.
        new_titles: titles newly seen in the latest batch (optional).
        mode: report mode — ``daily`` / ``incremental`` / ``current``.
        global_filters: global filter words (optional).
        weight_config: weight factors used for sorting.
        max_news_per_keyword: global cap on titles shown per group (0 = unlimited).
        sort_by_position_first: sort groups by config position before hit count.
        is_first_crawl_func: callable telling whether this is today's first crawl.
        convert_time_func: converter from stored to display time format.

    Returns:
        Tuple[List[Dict], int]: (per-group statistics list, total title count).
    """
    # Default weight split: 40% rank, 30% frequency, 30% hotness.
    if weight_config is None:
        weight_config = {
            "RANK_WEIGHT": 0.4,
            "FREQUENCY_WEIGHT": 0.3,
            "HOTNESS_WEIGHT": 0.3,
        }
    # Default time converter: identity (display the stored format as-is).
    if convert_time_func is None:
        convert_time_func = lambda x: x
    # Default first-crawl detector: treat this run as the day's first crawl.
    if is_first_crawl_func is None:
        is_first_crawl_func = lambda: True
    # With no configured word groups, fall back to a single synthetic
    # "show everything" group and drop the filter words.
    if not word_groups:
        print("频率词配置为空,将显示所有新闻")
        word_groups = [{"required": [], "normal": [], "group_key": "全部新闻"}]
        filter_words = []  # clear filters so every title is shown
    is_first_today = is_first_crawl_func()
    # Pick the data set to scan and decide how "new" flags are assigned,
    # depending on the report mode.
    if mode == "incremental":
        if is_first_today:
            # Incremental + first crawl of the day: process everything,
            # everything counts as new.
            results_to_process = results
            all_news_are_new = True
        else:
            # Incremental + later crawl: only the newly-appeared titles.
            results_to_process = new_titles if new_titles else {}
            all_news_are_new = True
    elif mode == "current":
        # Current-list mode: only titles present in the latest time batch,
        # but their statistics come from the full day's history.
        if title_info:
            # Find the most recent last_time across all recorded titles.
            # NOTE(review): relies on lexicographic comparison of time
            # strings — assumes a sortable fixed-width format like HH-MM.
            latest_time = None
            for source_titles in title_info.values():
                for title_data in source_titles.values():
                    last_time = title_data.get("last_time", "")
                    if last_time:
                        if latest_time is None or last_time > latest_time:
                            latest_time = last_time
            # Keep only titles whose last_time equals the latest batch time.
            if latest_time:
                results_to_process = {}
                for source_id, source_titles in results.items():
                    if source_id in title_info:
                        filtered_titles = {}
                        for title, title_data in source_titles.items():
                            if title in title_info[source_id]:
                                info = title_info[source_id][title]
                                if info.get("last_time") == latest_time:
                                    filtered_titles[title] = title_data
                        if filtered_titles:
                            results_to_process[source_id] = filtered_titles
                print(
                    f"当前榜单模式:最新时间 {latest_time},筛选出 {sum(len(titles) for titles in results_to_process.values())} 条当前榜单新闻"
                )
            else:
                results_to_process = results
        else:
            results_to_process = results
        all_news_are_new = False
    else:
        # Daily summary mode: process every title.
        results_to_process = results
        all_news_are_new = False
        total_input_news = sum(len(titles) for titles in results.values())
        filter_status = (
            "全部显示"
            if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
            else "频率词过滤"
        )
        print(f"当日汇总模式:处理 {total_input_news} 条新闻,模式:{filter_status}")
    word_stats = {}
    total_titles = 0
    # Per-source set of titles already attributed to a group (each title is
    # counted for at most one group).
    processed_titles = {}
    matched_new_count = 0
    if title_info is None:
        title_info = {}
    if new_titles is None:
        new_titles = {}
    # Initialise one bucket per word group.
    for group in word_groups:
        group_key = group["group_key"]
        word_stats[group_key] = {"count": 0, "titles": {}}
    for source_id, titles_data in results_to_process.items():
        total_titles += len(titles_data)
        if source_id not in processed_titles:
            processed_titles[source_id] = {}
        for title, title_data in titles_data.items():
            if title in processed_titles.get(source_id, {}):
                continue
            # Shared matching logic (required/normal/filter/global words).
            matches_frequency_words = matches_word_groups(
                title, word_groups, filter_words, global_filters
            )
            if not matches_frequency_words:
                continue
            # Count matched new titles for incremental / first current crawl.
            if (mode == "incremental" and all_news_are_new) or (
                mode == "current" and is_first_today
            ):
                matched_new_count += 1
            source_ranks = title_data.get("ranks", [])
            source_url = title_data.get("url", "")
            source_mobile_url = title_data.get("mobileUrl", "")
            # Defensive conversion in case a non-string title slipped through.
            title_lower = str(title).lower() if not isinstance(title, str) else title.lower()
            # Attribute the title to the first word group it matches.
            for group in word_groups:
                required_words = group["required"]
                normal_words = group["normal"]
                # "Show everything" mode: all titles go to the single group.
                if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻":
                    group_key = group["group_key"]
                    word_stats[group_key]["count"] += 1
                    if source_id not in word_stats[group_key]["titles"]:
                        word_stats[group_key]["titles"][source_id] = []
                else:
                    # Regular matching: every required word must appear, and
                    # at least one normal word (when any are configured).
                    if required_words:
                        all_required_present = all(
                            req_word.lower() in title_lower
                            for req_word in required_words
                        )
                        if not all_required_present:
                            continue
                    if normal_words:
                        any_normal_present = any(
                            normal_word.lower() in title_lower
                            for normal_word in normal_words
                        )
                        if not any_normal_present:
                            continue
                    group_key = group["group_key"]
                    word_stats[group_key]["count"] += 1
                    if source_id not in word_stats[group_key]["titles"]:
                        word_stats[group_key]["titles"][source_id] = []
                # Defaults for a title with no recorded history.
                first_time = ""
                last_time = ""
                count_info = 1
                ranks = source_ranks if source_ranks else []
                url = source_url
                mobile_url = source_mobile_url
                # In current mode, pull the full-day statistics for the title.
                if (
                    mode == "current"
                    and title_info
                    and source_id in title_info
                    and title in title_info[source_id]
                ):
                    info = title_info[source_id][title]
                    first_time = info.get("first_time", "")
                    last_time = info.get("last_time", "")
                    count_info = info.get("count", 1)
                    if "ranks" in info and info["ranks"]:
                        ranks = info["ranks"]
                    url = info.get("url", source_url)
                    mobile_url = info.get("mobileUrl", source_mobile_url)
                elif (
                    title_info
                    and source_id in title_info
                    and title in title_info[source_id]
                ):
                    # Other modes: enrich from history when available.
                    info = title_info[source_id][title]
                    first_time = info.get("first_time", "")
                    last_time = info.get("last_time", "")
                    count_info = info.get("count", 1)
                    if "ranks" in info and info["ranks"]:
                        ranks = info["ranks"]
                    url = info.get("url", source_url)
                    mobile_url = info.get("mobileUrl", source_mobile_url)
                # Sentinel rank when nothing was recorded.
                if not ranks:
                    ranks = [99]
                time_display = format_time_display(first_time, last_time, convert_time_func)
                source_name = id_to_name.get(source_id, source_id)
                # Decide whether the title is flagged as new.
                is_new = False
                if all_news_are_new:
                    # Incremental mode / day's first crawl: everything is new.
                    is_new = True
                elif new_titles and source_id in new_titles:
                    # Otherwise check membership in the new-title set.
                    new_titles_for_source = new_titles[source_id]
                    is_new = title in new_titles_for_source
                word_stats[group_key]["titles"][source_id].append(
                    {
                        "title": title,
                        "source_name": source_name,
                        "first_time": first_time,
                        "last_time": last_time,
                        "time_display": time_display,
                        "count": count_info,
                        "ranks": ranks,
                        "rank_threshold": rank_threshold,
                        "url": url,
                        "mobileUrl": mobile_url,
                        "is_new": is_new,
                    }
                )
                if source_id not in processed_titles:
                    processed_titles[source_id] = {}
                processed_titles[source_id][title] = True
                # Stop after the first matching group — one group per title.
                break
    # Final mode-specific summary logging.
    if mode == "incremental":
        if is_first_today:
            total_input_news = sum(len(titles) for titles in results.values())
            filter_status = (
                "全部显示"
                if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
                else "频率词匹配"
            )
            print(
                f"增量模式:当天第一次爬取,{total_input_news} 条新闻中有 {matched_new_count} 条{filter_status}"
            )
        else:
            if new_titles:
                total_new_count = sum(len(titles) for titles in new_titles.values())
                filter_status = (
                    "全部显示"
                    if len(word_groups) == 1
                    and word_groups[0]["group_key"] == "全部新闻"
                    else "匹配频率词"
                )
                print(
                    f"增量模式:{total_new_count} 条新增新闻中,有 {matched_new_count} 条{filter_status}"
                )
                if matched_new_count == 0 and len(word_groups) > 1:
                    print("增量模式:没有新增新闻匹配频率词,将不会发送通知")
            else:
                print("增量模式:未检测到新增新闻")
    elif mode == "current":
        total_input_news = sum(len(titles) for titles in results_to_process.values())
        if is_first_today:
            filter_status = (
                "全部显示"
                if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
                else "频率词匹配"
            )
            print(
                f"当前榜单模式:当天第一次爬取,{total_input_news} 条当前榜单新闻中有 {matched_new_count} 条{filter_status}"
            )
        else:
            matched_count = sum(stat["count"] for stat in word_stats.values())
            filter_status = (
                "全部显示"
                if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
                else "频率词匹配"
            )
            print(
                f"当前榜单模式:{total_input_news} 条当前榜单新闻中有 {matched_count} 条{filter_status}"
            )
    stats = []
    # Map each group key to its configured position and per-group cap.
    group_key_to_position = {
        group["group_key"]: idx for idx, group in enumerate(word_groups)
    }
    group_key_to_max_count = {
        group["group_key"]: group.get("max_count", 0) for group in word_groups
    }
    for group_key, data in word_stats.items():
        all_titles = []
        for source_id, title_list in data["titles"].items():
            all_titles.extend(title_list)
        # Sort titles by descending weight, then best rank, then count.
        sorted_titles = sorted(
            all_titles,
            key=lambda x: (
                -calculate_news_weight(x, rank_threshold, weight_config),
                min(x["ranks"]) if x["ranks"] else 999,
                -x["count"],
            ),
        )
        # Display cap: the per-group value wins over the global one.
        group_max_count = group_key_to_max_count.get(group_key, 0)
        if group_max_count == 0:
            # Fall back to the global limit (0 means unlimited).
            group_max_count = max_news_per_keyword
        if group_max_count > 0:
            sorted_titles = sorted_titles[:group_max_count]
        stats.append(
            {
                "word": group_key,
                "count": data["count"],
                "position": group_key_to_position.get(group_key, 999),
                "titles": sorted_titles,
                "percentage": (
                    round(data["count"] / total_titles * 100, 2)
                    if total_titles > 0
                    else 0
                ),
            }
        )
    # Group ordering: config position first, or hit count first (default).
    if sort_by_position_first:
        stats.sort(key=lambda x: (x["position"], -x["count"]))
    else:
        stats.sort(key=lambda x: (-x["count"], x["position"]))
    # Log the post-filter match count (matches what the push will show).
    matched_news_count = sum(len(stat["titles"]) for stat in stats if stat["count"] > 0)
    if mode == "daily":
        print(f"频率词过滤后:{matched_news_count} 条新闻匹配(将显示在推送中)")
    return stats, total_titles
+152
View File
@@ -0,0 +1,152 @@
# coding=utf-8
"""
配置工具模块 - 多账号配置解析和验证
提供多账号推送配置的解析、验证和限制功能
"""
from typing import Dict, List, Optional, Tuple
def parse_multi_account_config(config_value: str, separator: str = ";") -> List[str]:
    """Split a multi-account configuration string into a list of accounts.

    Empty segments are kept as placeholders (``";token2"`` means the first
    account has no value), but a string consisting solely of empty segments
    yields an empty list.

    Args:
        config_value: raw configuration string, accounts joined by *separator*.
        separator: delimiter between accounts (default ``;``).

    Returns:
        List of trimmed account values, or ``[]`` when nothing is configured.

    Examples:
        >>> parse_multi_account_config("url1;url2;url3")
        ['url1', 'url2', 'url3']
        >>> parse_multi_account_config(";token2")
        ['', 'token2']
        >>> parse_multi_account_config("")
        []
    """
    if not config_value:
        return []

    parts = [piece.strip() for piece in config_value.split(separator)]

    # A string of nothing but separators / whitespace counts as unconfigured.
    return parts if any(parts) else []
def validate_paired_configs(
    configs: Dict[str, List[str]],
    channel_name: str,
    required_keys: Optional[List[str]] = None
) -> Tuple[bool, int]:
    """Check that paired multi-account configs all have the same length.

    Channels that need several config items per account (e.g. Telegram's
    token and chat_id) must supply the same number of entries for each item.

    Args:
        configs: mapping of config name -> account list.
        channel_name: channel name used in log output.
        required_keys: config names that must be non-empty for the channel
            to count as configured at all.

    Returns:
        (validation passed, number of accounts). A missing required key
        yields ``(True, 0)`` — the channel is treated as unconfigured, not
        as invalid.

    Examples:
        >>> validate_paired_configs({
        ...     "token": ["t1", "t2"],
        ...     "chat_id": ["c1", "c2"]
        ... }, "Telegram", ["token", "chat_id"])
        (True, 2)
        >>> validate_paired_configs({
        ...     "token": ["t1", "t2"],
        ...     "chat_id": ["c1"]
        ... }, "Telegram", ["token", "chat_id"])
        (False, 0)
    """
    # Only lists that actually contain something take part in validation.
    populated = {name: values for name, values in configs.items() if values}
    if not populated:
        return True, 0

    # An absent/empty required key means the channel is simply unconfigured.
    if required_keys:
        for key in required_keys:
            if not populated.get(key):
                return True, 0

    lengths = {name: len(values) for name, values in populated.items()}
    distinct = set(lengths.values())

    if len(distinct) > 1:
        print(f"{channel_name} 配置错误:配对配置数量不一致,将跳过该渠道推送")
        for key, length in lengths.items():
            print(f" - {key}: {length} 个")
        return False, 0

    return True, next(iter(distinct)) if distinct else 0
def limit_accounts(
    accounts: List[str],
    max_count: int,
    channel_name: str
) -> List[str]:
    """Cap the number of accounts used for one channel.

    When more accounts are configured than *max_count* allows, only the
    first *max_count* are kept and a warning is printed.

    Args:
        accounts: configured account list.
        max_count: maximum number of accounts allowed.
        channel_name: channel name used in log output.

    Returns:
        The (possibly truncated) account list.
    """
    total = len(accounts)
    if total <= max_count:
        return accounts

    print(f"⚠️ {channel_name} 配置了 {total} 个账号,超过最大限制 {max_count},只使用前 {max_count} 个")
    print(f" ⚠️ 警告:如果您是 fork 用户,过多账号可能导致 GitHub Actions 运行时间过长,存在账号风险")
    return accounts[:max_count]
def get_account_at_index(accounts: List[str], index: int, default: str = "") -> str:
    """Safely fetch the account value at *index*.

    Returns *default* when the index is out of range — including negative
    indices, which would otherwise silently wrap around via Python's
    negative indexing — or when the stored value is empty.

    Args:
        accounts: account list.
        index: positional account slot (0-based, non-negative).
        default: value returned for missing/empty slots.

    Returns:
        The account value, or *default*.

    Examples:
        >>> get_account_at_index(["a", "b", "c"], 1)
        'b'
        >>> get_account_at_index(["a", "", "c"], 1, "default")
        'default'
        >>> get_account_at_index(["a"], 5, "default")
        'default'
    """
    # Reject negative indices explicitly: accounts[-1] would wrap to the
    # last account, which is never the intent for positional slots.
    if 0 <= index < len(accounts):
        return accounts[index] if accounts[index] else default
    return default
+291
View File
@@ -0,0 +1,291 @@
# coding=utf-8
"""
数据处理模块
提供数据读取、保存和检测功能:
- save_titles_to_file: 保存标题到 TXT 文件
- read_all_today_titles: 从存储后端读取当天所有标题
- detect_latest_new_titles: 检测最新批次的新增标题
Author: TrendRadar Team
"""
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Callable
def save_titles_to_file(
    results: Dict,
    id_to_name: Dict,
    failed_ids: List,
    output_path: str,
    clean_title_func: Callable[[str], str],
) -> str:
    """Write crawl results to a plain-text snapshot file.

    Per source: a header line (``id | name`` when a distinct display name
    exists, otherwise just ``id``), the titles sorted by rank with optional
    ``[URL:...]`` / ``[MOBILE:...]`` suffixes, then a blank line. Failed
    source ids are listed at the end of the file.

    Args:
        results: crawl results ``{source_id: {title: title_data}}``.
        id_to_name: source-id to display-name mapping.
        failed_ids: ids whose fetch failed.
        output_path: destination file path (parents are created).
        clean_title_func: title normaliser applied before writing.

    Returns:
        str: the path the snapshot was written to.
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as fh:
        for source_id, titles in results.items():
            # Source header.
            display_name = id_to_name.get(source_id)
            if display_name and display_name != source_id:
                fh.write(f"{source_id} | {display_name}\n")
            else:
                fh.write(f"{source_id}\n")

            # Collect (rank, title, url, mobile_url) tuples, then sort by rank.
            entries = []
            for raw_title, info in titles.items():
                cleaned = clean_title_func(raw_title)
                if isinstance(info, dict):
                    ranks = info.get("ranks", [])
                    url = info.get("url", "")
                    mobile_url = info.get("mobileUrl", "")
                else:
                    # Legacy format: the value is just a list of ranks.
                    ranks = info if isinstance(info, list) else []
                    url = ""
                    mobile_url = ""
                entries.append((ranks[0] if ranks else 1, cleaned, url, mobile_url))
            entries.sort(key=lambda entry: entry[0])

            for position, cleaned, url, mobile_url in entries:
                pieces = [f"{position}. {cleaned}"]
                if url:
                    pieces.append(f"[URL:{url}]")
                if mobile_url:
                    pieces.append(f"[MOBILE:{mobile_url}]")
                fh.write(" ".join(pieces) + "\n")

            fh.write("\n")

        if failed_ids:
            fh.write("==== 以下ID请求失败 ====\n")
            for source_id in failed_ids:
                fh.write(f"{source_id}\n")

    return output_path
def read_all_today_titles_from_storage(
    storage_manager,
    current_platform_ids: Optional[List[str]] = None,
) -> Tuple[Dict, Dict, Dict]:
    """Load every title recorded today from the storage backend (SQLite data).

    Args:
        storage_manager: storage manager instance.
        current_platform_ids: platform ids currently monitored; other
            platforms are skipped (``None`` disables the filter).

    Returns:
        Tuple[Dict, Dict, Dict]: (all_results, id_to_name, title_info);
        three empty dicts when there is no data or the backend fails.
    """
    try:
        news_data = storage_manager.get_today_all_data()
        if not news_data or not news_data.items:
            return {}, {}, {}

        all_results: Dict = {}
        final_id_to_name: Dict = {}
        title_info: Dict = {}

        for source_id, news_list in news_data.items.items():
            # Skip platforms that are no longer monitored.
            if current_platform_ids is not None and source_id not in current_platform_ids:
                continue

            final_id_to_name[source_id] = news_data.id_to_name.get(source_id, source_id)
            results_bucket = all_results.setdefault(source_id, {})
            info_bucket = title_info.setdefault(source_id, {})

            for item in news_list:
                # Older records may lack the aggregate fields; fall back to
                # the single-crawl values.
                ranks = getattr(item, 'ranks', [item.rank])
                first_seen = getattr(item, 'first_time', item.crawl_time)
                last_seen = getattr(item, 'last_time', item.crawl_time)
                occurrences = getattr(item, 'count', 1)
                url = item.url or ""
                mobile_url = item.mobile_url or ""

                results_bucket[item.title] = {
                    "ranks": ranks,
                    "url": url,
                    "mobileUrl": mobile_url,
                }
                info_bucket[item.title] = {
                    "first_time": first_seen,
                    "last_time": last_seen,
                    "count": occurrences,
                    "ranks": ranks,
                    "url": url,
                    "mobileUrl": mobile_url,
                }

        return all_results, final_id_to_name, title_info
    except Exception as e:
        print(f"[存储] 从存储后端读取数据失败: {e}")
        return {}, {}, {}
def read_all_today_titles(
    storage_manager,
    current_platform_ids: Optional[List[str]] = None,
) -> Tuple[Dict, Dict, Dict]:
    """Read today's titles from the storage backend, with summary logging.

    Thin wrapper around :func:`read_all_today_titles_from_storage` that
    additionally logs how many titles were loaded (or that there are none).

    Args:
        storage_manager: storage manager instance.
        current_platform_ids: platform ids currently monitored (filter).

    Returns:
        Tuple[Dict, Dict, Dict]: (all_results, id_to_name, title_info)
    """
    results, id_to_name, title_info = read_all_today_titles_from_storage(
        storage_manager, current_platform_ids
    )
    if results:
        total = sum(len(titles) for titles in results.values())
        print(f"[存储] 已从存储后端读取 {total} 条标题")
    else:
        print("[存储] 当天暂无数据")
    return results, id_to_name, title_info
def detect_latest_new_titles_from_storage(
    storage_manager,
    current_platform_ids: Optional[List[str]] = None,
) -> Dict:
    """Detect titles that first appeared in today's latest crawl batch.

    A title is "new" when it is present in the latest batch but absent from
    all earlier batches of the day. On the day's very first crawl (no
    history yet) nothing is reported as new.

    Args:
        storage_manager: storage manager instance.
        current_platform_ids: platform ids currently monitored (filter).

    Returns:
        Dict: new titles ``{source_id: {title: title_data}}``; empty on
        error or when there is nothing new.
    """
    try:
        # Latest crawl batch.
        latest_data = storage_manager.get_latest_crawl_data()
        if not latest_data or not latest_data.items:
            return {}
        # Full history for today.
        all_data = storage_manager.get_today_all_data()
        if not all_data or not all_data.items:
            # No history at all (first crawl) — nothing counts as "new".
            return {}
        # Collect historical titles: those whose first appearance predates
        # the latest batch.
        # NOTE(review): assumes item.first_time uses exactly the same string
        # format/value as latest_data.crawl_time for batch members — confirm.
        latest_time = latest_data.crawl_time
        historical_titles = {}
        for source_id, news_list in all_data.items.items():
            if current_platform_ids is not None and source_id not in current_platform_ids:
                continue
            historical_titles[source_id] = set()
            for item in news_list:
                # Only titles first seen BEFORE the latest batch are history.
                first_time = getattr(item, 'first_time', item.crawl_time)
                if first_time != latest_time:
                    historical_titles[source_id].add(item.title)
        # If every platform's history set is empty, only one crawl batch
        # exists today, so nothing should be flagged as new.
        has_historical_data = any(len(titles) > 0 for titles in historical_titles.values())
        if not has_historical_data:
            return {}
        # Anything in the latest batch that is absent from history is new.
        new_titles = {}
        for source_id, news_list in latest_data.items.items():
            if current_platform_ids is not None and source_id not in current_platform_ids:
                continue
            historical_set = historical_titles.get(source_id, set())
            source_new_titles = {}
            for item in news_list:
                if item.title not in historical_set:
                    source_new_titles[item.title] = {
                        "ranks": [item.rank],
                        "url": item.url or "",
                        "mobileUrl": item.mobile_url or "",
                    }
            if source_new_titles:
                new_titles[source_id] = source_new_titles
        return new_titles
    except Exception as e:
        print(f"[存储] 从存储后端检测新标题失败: {e}")
        return {}
def detect_latest_new_titles(
    storage_manager,
    current_platform_ids: Optional[List[str]] = None,
) -> Dict:
    """Detect today's latest-batch new titles, with summary logging.

    Thin wrapper around :func:`detect_latest_new_titles_from_storage` that
    logs the number of new titles found (silent when there are none).

    Args:
        storage_manager: storage manager instance.
        current_platform_ids: platform ids currently monitored (filter).

    Returns:
        Dict: new titles ``{source_id: {title: title_data}}``.
    """
    found = detect_latest_new_titles_from_storage(storage_manager, current_platform_ids)
    if found:
        total_new = sum(len(titles) for titles in found.values())
        print(f"[存储] 从存储后端检测到 {total_new} 条新增标题")
    return found
def is_first_crawl_today(output_dir: str, date_folder: str) -> bool:
    """Report whether the current run is today's first crawl.

    The check counts the ``.txt`` snapshots already written for the day:
    zero or one file (the one this run just wrote) means first crawl.

    Args:
        output_dir: root output directory.
        date_folder: date folder name (one per day).

    Returns:
        bool: True when this is the day's first crawl.
    """
    snapshot_dir = Path(output_dir) / date_folder / "txt"
    if not snapshot_dir.exists():
        return True
    snapshots = [entry for entry in snapshot_dir.iterdir() if entry.suffix == ".txt"]
    return len(snapshots) <= 1
+194
View File
@@ -0,0 +1,194 @@
# coding=utf-8
"""
频率词配置加载模块
负责从配置文件加载频率词规则,支持:
- 普通词组
- 必须词(+前缀)
- 过滤词(!前缀)
- 全局过滤词([GLOBAL_FILTER] 区域)
- 最大显示数量(@前缀)
"""
import os
from pathlib import Path
from typing import Dict, List, Tuple, Optional
def load_frequency_words(
    frequency_file: Optional[str] = None,
) -> Tuple[List[Dict], List[str], List[str]]:
    """Load the frequency-word configuration file.

    File format:
        - word groups are separated by blank lines
        - a ``[GLOBAL_FILTER]`` section lists global filter words
        - a ``[WORD_GROUPS]`` section lists word groups (the default)

    Group syntax:
        - plain word: any one match qualifies the title
        - ``+word``: required word — all required words must match
        - ``!word``: filter word — a match excludes the title
        - ``@number``: maximum number of titles shown for the group

    Args:
        frequency_file: path to the config file; defaults to the
            ``FREQUENCY_WORDS_PATH`` env var or ``config/frequency_words.txt``.

    Returns:
        (word group list, per-group filter words (flattened), global filter words)

    Raises:
        FileNotFoundError: the frequency-word file does not exist.
    """
    if frequency_file is None:
        frequency_file = os.environ.get(
            "FREQUENCY_WORDS_PATH", "config/frequency_words.txt"
        )
    frequency_path = Path(frequency_file)
    if not frequency_path.exists():
        raise FileNotFoundError(f"频率词文件 {frequency_file} 不存在")
    with open(frequency_path, "r", encoding="utf-8") as f:
        content = f.read()
    # Blank-line-separated blocks; each is a section marker and/or words.
    word_groups = [group.strip() for group in content.split("\n\n") if group.strip()]
    processed_groups = []
    filter_words = []
    global_filters = []
    # Default section (backwards compatible with marker-less files).
    current_section = "WORD_GROUPS"
    for group in word_groups:
        lines = [line.strip() for line in group.split("\n") if line.strip()]
        if not lines:
            continue
        # Section marker line, e.g. [GLOBAL_FILTER]; applies from here on.
        # Unrecognised markers fall through and are treated as plain words.
        if lines[0].startswith("[") and lines[0].endswith("]"):
            section_name = lines[0][1:-1].upper()
            if section_name in ("GLOBAL_FILTER", "WORD_GROUPS"):
                current_section = section_name
                lines = lines[1:]  # drop the marker line
        # Global-filter section: plain words only.
        if current_section == "GLOBAL_FILTER":
            for line in lines:
                # Special prefixes are not supported here; skip such lines.
                if line.startswith(("!", "+", "@")):
                    continue  # 全局过滤区不支持特殊语法
                if line:
                    global_filters.append(line)
            continue
        # Word-group section: classify each line by its prefix.
        words = lines
        group_required_words = []
        group_normal_words = []
        # NOTE(review): group_filter_words is collected but never used beyond
        # the flattened filter_words list returned below.
        group_filter_words = []
        group_max_count = 0  # 0 = no per-group display limit
        for word in words:
            if word.startswith("@"):
                # Per-group display cap; only positive integers are accepted.
                # (int() raises ValueError on "" so IndexError is defensive.)
                try:
                    count = int(word[1:])
                    if count > 0:
                        group_max_count = count
                except (ValueError, IndexError):
                    pass  # ignore malformed @N entries
            elif word.startswith("!"):
                filter_words.append(word[1:])
                group_filter_words.append(word[1:])
            elif word.startswith("+"):
                group_required_words.append(word[1:])
            else:
                group_normal_words.append(word)
        # A group needs at least one matchable word; its display key prefers
        # the normal words and falls back to the required words.
        if group_required_words or group_normal_words:
            if group_normal_words:
                group_key = " ".join(group_normal_words)
            else:
                group_key = " ".join(group_required_words)
            processed_groups.append(
                {
                    "required": group_required_words,
                    "normal": group_normal_words,
                    "group_key": group_key,
                    "max_count": group_max_count,
                }
            )
    return processed_groups, filter_words, global_filters
def matches_word_groups(
    title: str,
    word_groups: List[Dict],
    filter_words: List[str],
    global_filters: Optional[List[str]] = None
) -> bool:
    """Decide whether a title passes the word-group rules.

    Evaluation order: global filters first (a hit rejects immediately),
    then the per-group filter words, then the word groups themselves.
    A group matches when all of its required words and — if any are
    configured — at least one of its normal words appear in the title,
    case-insensitively. An empty ``word_groups`` list matches everything.

    Args:
        title: title text.
        word_groups: word-group list.
        filter_words: filter-word list.
        global_filters: global filter words.

    Returns:
        bool: whether the title matches.
    """
    # Defensive: tolerate non-string titles coming from upstream data.
    if not isinstance(title, str):
        title = "" if title is None else str(title)
    if not title.strip():
        return False

    haystack = title.lower()

    # Global filters veto the title outright.
    if global_filters and any(word.lower() in haystack for word in global_filters):
        return False

    # No configured groups means "show everything".
    if not word_groups:
        return True

    # Per-group filter words.
    if any(word.lower() in haystack for word in filter_words):
        return False

    # Group matching: first satisfied group wins.
    for group in word_groups:
        required = group["required"]
        normal = group["normal"]
        if required and not all(word.lower() in haystack for word in required):
            continue
        if normal and not any(word.lower() in haystack for word in normal):
            continue
        return True

    return False
+332
View File
@@ -0,0 +1,332 @@
# coding=utf-8
"""
配置加载模块
负责从 YAML 配置文件和环境变量加载配置。
"""
import os
from pathlib import Path
from typing import Dict, Any, Optional
import yaml
from .config import parse_multi_account_config, validate_paired_configs
def _get_env_bool(key: str, default: bool = False) -> Optional[bool]:
"""从环境变量获取布尔值,如果未设置返回 None"""
value = os.environ.get(key, "").strip().lower()
if not value:
return None
return value in ("true", "1")
def _get_env_int(key: str, default: int = 0) -> int:
"""从环境变量获取整数值"""
value = os.environ.get(key, "").strip()
if not value:
return default
try:
return int(value)
except ValueError:
return default
def _get_env_str(key: str, default: str = "") -> str:
"""从环境变量获取字符串值"""
return os.environ.get(key, "").strip() or default
def _load_app_config(config_data: Dict) -> Dict:
    """Build the application section; the TIMEZONE env var wins over the file."""
    section = config_data.get("app", {})
    return {
        "VERSION_CHECK_URL": section.get("version_check_url", ""),
        "SHOW_VERSION_UPDATE": section.get("show_version_update", True),
        # Environment override for the timezone.
        "TIMEZONE": _get_env_str("TIMEZONE") or section.get("timezone", "Asia/Shanghai"),
    }
def _load_crawler_config(config_data: Dict) -> Dict:
    """Build the crawler section; the ENABLE_CRAWLER env var wins over the file."""
    section = config_data.get("crawler", {})
    # None means the env var is unset — fall back to the YAML value.
    enable_override = _get_env_bool("ENABLE_CRAWLER")
    if enable_override is None:
        enable_override = section.get("enable_crawler", True)
    return {
        "REQUEST_INTERVAL": section.get("request_interval", 100),
        "USE_PROXY": section.get("use_proxy", False),
        "DEFAULT_PROXY": section.get("default_proxy", ""),
        "ENABLE_CRAWLER": enable_override,
    }
def _load_report_config(config_data: Dict) -> Dict:
    """Build the report section; several env vars override the file values."""
    section = config_data.get("report", {})
    # Boolean overrides return None when unset; int override treats 0 as unset.
    sort_override = _get_env_bool("SORT_BY_POSITION_FIRST")
    reverse_override = _get_env_bool("REVERSE_CONTENT_ORDER")
    max_news_override = _get_env_int("MAX_NEWS_PER_KEYWORD")
    return {
        "REPORT_MODE": _get_env_str("REPORT_MODE") or section.get("mode", "daily"),
        "RANK_THRESHOLD": section.get("rank_threshold", 10),
        "SORT_BY_POSITION_FIRST": (
            section.get("sort_by_position_first", False)
            if sort_override is None
            else sort_override
        ),
        "MAX_NEWS_PER_KEYWORD": max_news_override or section.get("max_news_per_keyword", 0),
        "REVERSE_CONTENT_ORDER": (
            section.get("reverse_content_order", False)
            if reverse_override is None
            else reverse_override
        ),
    }
def _load_notification_config(config_data: Dict) -> Dict:
    """Build the notification section; ENABLE_NOTIFICATION env var wins."""
    section = config_data.get("notification", {})
    enable_override = _get_env_bool("ENABLE_NOTIFICATION")
    if enable_override is None:
        enable_override = section.get("enable_notification", True)
    return {
        "ENABLE_NOTIFICATION": enable_override,
        # Per-channel message batch sizes (characters per message).
        "MESSAGE_BATCH_SIZE": section.get("message_batch_size", 4000),
        "DINGTALK_BATCH_SIZE": section.get("dingtalk_batch_size", 20000),
        "FEISHU_BATCH_SIZE": section.get("feishu_batch_size", 29000),
        "BARK_BATCH_SIZE": section.get("bark_batch_size", 3600),
        "SLACK_BATCH_SIZE": section.get("slack_batch_size", 4000),
        "BATCH_SEND_INTERVAL": section.get("batch_send_interval", 1.0),
        "FEISHU_MESSAGE_SEPARATOR": section.get("feishu_message_separator", "---"),
        "MAX_ACCOUNTS_PER_CHANNEL": _get_env_int("MAX_ACCOUNTS_PER_CHANNEL")
        or section.get("max_accounts_per_channel", 3),
    }
def _load_push_window_config(config_data: Dict) -> Dict:
    """Build the push-window section (nested under notification.push_window)."""
    push_window = config_data.get("notification", {}).get("push_window", {})
    time_range = push_window.get("time_range", {})
    # Boolean env overrides: None means unset, fall back to the file.
    enabled_override = _get_env_bool("PUSH_WINDOW_ENABLED")
    once_override = _get_env_bool("PUSH_WINDOW_ONCE_PER_DAY")
    if enabled_override is None:
        enabled_override = push_window.get("enabled", False)
    if once_override is None:
        once_override = push_window.get("once_per_day", True)
    return {
        "ENABLED": enabled_override,
        "TIME_RANGE": {
            "START": _get_env_str("PUSH_WINDOW_START") or time_range.get("start", "08:00"),
            "END": _get_env_str("PUSH_WINDOW_END") or time_range.get("end", "22:00"),
        },
        "ONCE_PER_DAY": once_override,
    }
def _load_weight_config(config_data: Dict) -> Dict:
"""加载权重配置"""
weight = config_data.get("weight", {})
return {
"RANK_WEIGHT": weight.get("rank_weight", 1.0),
"FREQUENCY_WEIGHT": weight.get("frequency_weight", 1.0),
"HOTNESS_WEIGHT": weight.get("hotness_weight", 1.0),
}
def _load_storage_config(config_data: Dict) -> Dict:
    """Build the storage section (backend, formats, local/remote, pull)."""
    section = config_data.get("storage", {})
    formats = section.get("formats", {})
    local = section.get("local", {})
    remote = section.get("remote", {})
    pull = section.get("pull", {})
    # Boolean env overrides: None means unset, fall back to the file.
    txt_override = _get_env_bool("STORAGE_TXT_ENABLED")
    html_override = _get_env_bool("STORAGE_HTML_ENABLED")
    pull_override = _get_env_bool("PULL_ENABLED")
    if txt_override is None:
        txt_override = formats.get("txt", True)
    if html_override is None:
        html_override = formats.get("html", True)
    if pull_override is None:
        pull_override = pull.get("enabled", False)
    return {
        "BACKEND": _get_env_str("STORAGE_BACKEND") or section.get("backend", "auto"),
        "FORMATS": {
            "SQLITE": formats.get("sqlite", True),
            "TXT": txt_override,
            "HTML": html_override,
        },
        "LOCAL": {
            "DATA_DIR": local.get("data_dir", "output"),
            "RETENTION_DAYS": _get_env_int("LOCAL_RETENTION_DAYS") or local.get("retention_days", 0),
        },
        # S3-compatible remote backend credentials; env vars take precedence.
        "REMOTE": {
            "ENDPOINT_URL": _get_env_str("S3_ENDPOINT_URL") or remote.get("endpoint_url", ""),
            "BUCKET_NAME": _get_env_str("S3_BUCKET_NAME") or remote.get("bucket_name", ""),
            "ACCESS_KEY_ID": _get_env_str("S3_ACCESS_KEY_ID") or remote.get("access_key_id", ""),
            "SECRET_ACCESS_KEY": _get_env_str("S3_SECRET_ACCESS_KEY") or remote.get("secret_access_key", ""),
            "REGION": _get_env_str("S3_REGION") or remote.get("region", ""),
            "RETENTION_DAYS": _get_env_int("REMOTE_RETENTION_DAYS") or remote.get("retention_days", 0),
        },
        "PULL": {
            "ENABLED": pull_override,
            "DAYS": _get_env_int("PULL_DAYS") or pull.get("days", 7),
        },
    }
def _load_webhook_config(config_data: Dict) -> Dict:
    """Build the webhook/credential map; each key prefers its env var."""
    webhooks = config_data.get("notification", {}).get("webhooks", {})

    def pick(env_key: str, cfg_key: str, fallback: str = "") -> str:
        # Environment variable first, then the YAML value, then the fallback.
        return _get_env_str(env_key) or webhooks.get(cfg_key, fallback)

    return {
        # 飞书
        "FEISHU_WEBHOOK_URL": pick("FEISHU_WEBHOOK_URL", "feishu_url"),
        # 钉钉
        "DINGTALK_WEBHOOK_URL": pick("DINGTALK_WEBHOOK_URL", "dingtalk_url"),
        # 企业微信
        "WEWORK_WEBHOOK_URL": pick("WEWORK_WEBHOOK_URL", "wework_url"),
        "WEWORK_MSG_TYPE": pick("WEWORK_MSG_TYPE", "wework_msg_type", "markdown"),
        # Telegram
        "TELEGRAM_BOT_TOKEN": pick("TELEGRAM_BOT_TOKEN", "telegram_bot_token"),
        "TELEGRAM_CHAT_ID": pick("TELEGRAM_CHAT_ID", "telegram_chat_id"),
        # 邮件
        "EMAIL_FROM": pick("EMAIL_FROM", "email_from"),
        "EMAIL_PASSWORD": pick("EMAIL_PASSWORD", "email_password"),
        "EMAIL_TO": pick("EMAIL_TO", "email_to"),
        "EMAIL_SMTP_SERVER": pick("EMAIL_SMTP_SERVER", "email_smtp_server"),
        "EMAIL_SMTP_PORT": pick("EMAIL_SMTP_PORT", "email_smtp_port"),
        # ntfy (has a hard default server URL)
        "NTFY_SERVER_URL": pick("NTFY_SERVER_URL", "ntfy_server_url") or "https://ntfy.sh",
        "NTFY_TOPIC": pick("NTFY_TOPIC", "ntfy_topic"),
        "NTFY_TOKEN": pick("NTFY_TOKEN", "ntfy_token"),
        # Bark
        "BARK_URL": pick("BARK_URL", "bark_url"),
        # Slack
        "SLACK_WEBHOOK_URL": pick("SLACK_WEBHOOK_URL", "slack_webhook_url"),
    }
def _print_notification_sources(config: Dict) -> None:
    """Log, per notification channel, where its settings came from.

    Pure logging helper: inspects the merged ``config`` and prints one
    entry per configured channel, noting whether the value came from an
    environment variable or the config file and how many accounts will be
    used (capped at MAX_ACCOUNTS_PER_CHANNEL).
    """
    notification_sources = []
    max_accounts = config["MAX_ACCOUNTS_PER_CHANNEL"]
    # Feishu (飞书): plain multi-account webhook list.
    if config["FEISHU_WEBHOOK_URL"]:
        accounts = parse_multi_account_config(config["FEISHU_WEBHOOK_URL"])
        count = min(len(accounts), max_accounts)
        source = "环境变量" if os.environ.get("FEISHU_WEBHOOK_URL") else "配置文件"
        notification_sources.append(f"飞书({source}, {count}个账号)")
    # DingTalk (钉钉).
    if config["DINGTALK_WEBHOOK_URL"]:
        accounts = parse_multi_account_config(config["DINGTALK_WEBHOOK_URL"])
        count = min(len(accounts), max_accounts)
        source = "环境变量" if os.environ.get("DINGTALK_WEBHOOK_URL") else "配置文件"
        notification_sources.append(f"钉钉({source}, {count}个账号)")
    # WeCom (企业微信).
    if config["WEWORK_WEBHOOK_URL"]:
        accounts = parse_multi_account_config(config["WEWORK_WEBHOOK_URL"])
        count = min(len(accounts), max_accounts)
        source = "环境变量" if os.environ.get("WEWORK_WEBHOOK_URL") else "配置文件"
        notification_sources.append(f"企业微信({source}, {count}个账号)")
    # Telegram needs a paired token/chat_id list of equal lengths.
    if config["TELEGRAM_BOT_TOKEN"] and config["TELEGRAM_CHAT_ID"]:
        tokens = parse_multi_account_config(config["TELEGRAM_BOT_TOKEN"])
        chat_ids = parse_multi_account_config(config["TELEGRAM_CHAT_ID"])
        valid, count = validate_paired_configs(
            {"bot_token": tokens, "chat_id": chat_ids},
            "Telegram",
            required_keys=["bot_token", "chat_id"]
        )
        if valid and count > 0:
            count = min(count, max_accounts)
            token_source = "环境变量" if os.environ.get("TELEGRAM_BOT_TOKEN") else "配置文件"
            notification_sources.append(f"Telegram({token_source}, {count}个账号)")
    # Email: single account, all three fields must be present.
    if config["EMAIL_FROM"] and config["EMAIL_PASSWORD"] and config["EMAIL_TO"]:
        from_source = "环境变量" if os.environ.get("EMAIL_FROM") else "配置文件"
        notification_sources.append(f"邮件({from_source})")
    # ntfy: topics optionally paired with tokens.
    if config["NTFY_SERVER_URL"] and config["NTFY_TOPIC"]:
        topics = parse_multi_account_config(config["NTFY_TOPIC"])
        tokens = parse_multi_account_config(config["NTFY_TOKEN"])
        if tokens:
            # With tokens configured, topic/token counts must line up.
            valid, count = validate_paired_configs(
                {"topic": topics, "token": tokens},
                "ntfy"
            )
            if valid and count > 0:
                count = min(count, max_accounts)
                server_source = "环境变量" if os.environ.get("NTFY_SERVER_URL") else "配置文件"
                notification_sources.append(f"ntfy({server_source}, {count}个账号)")
        else:
            # Token-less topics are allowed.
            count = min(len(topics), max_accounts)
            server_source = "环境变量" if os.environ.get("NTFY_SERVER_URL") else "配置文件"
            notification_sources.append(f"ntfy({server_source}, {count}个账号)")
    # Bark.
    if config["BARK_URL"]:
        accounts = parse_multi_account_config(config["BARK_URL"])
        count = min(len(accounts), max_accounts)
        bark_source = "环境变量" if os.environ.get("BARK_URL") else "配置文件"
        notification_sources.append(f"Bark({bark_source}, {count}个账号)")
    # Slack.
    if config["SLACK_WEBHOOK_URL"]:
        accounts = parse_multi_account_config(config["SLACK_WEBHOOK_URL"])
        count = min(len(accounts), max_accounts)
        slack_source = "环境变量" if os.environ.get("SLACK_WEBHOOK_URL") else "配置文件"
        notification_sources.append(f"Slack({slack_source}, {count}个账号)")
    if notification_sources:
        print(f"通知渠道配置来源: {', '.join(notification_sources)}")
        print(f"每个渠道最大账号数: {max_accounts}")
    else:
        print("未配置任何通知渠道")
def load_config(config_path: Optional[str] = None) -> Dict[str, Any]:
    """Load and merge the application configuration.

    Args:
        config_path: Path to the YAML config file. When None, falls back to
            the CONFIG_PATH environment variable, then "config/config.yaml".

    Returns:
        A single dict merging every configuration section.

    Raises:
        FileNotFoundError: If the config file does not exist.
    """
    if config_path is None:
        config_path = os.environ.get("CONFIG_PATH", "config/config.yaml")
    cfg_file = Path(config_path)
    if not cfg_file.exists():
        raise FileNotFoundError(f"配置文件 {config_path} 不存在")
    with cfg_file.open("r", encoding="utf-8") as f:
        raw = yaml.safe_load(f)
    print(f"配置文件加载成功: {config_path}")

    config: Dict[str, Any] = {}
    # Flat sections are merged in declaration order: app, crawler, report,
    # notification (later keys would win on collision).
    for section in (
        _load_app_config(raw),
        _load_crawler_config(raw),
        _load_report_config(raw),
        _load_notification_config(raw),
    ):
        config.update(section)
    # Structured sections are stored under dedicated keys.
    config["PUSH_WINDOW"] = _load_push_window_config(raw)
    config["WEIGHT_CONFIG"] = _load_weight_config(raw)
    config["PLATFORMS"] = raw.get("platforms", [])
    config["STORAGE"] = _load_storage_config(raw)
    # Webhook settings are merged last, matching the original precedence.
    config.update(_load_webhook_config(raw))
    # Log where each notification channel's credentials came from.
    _print_notification_sources(config)
    return config
+8
View File
@@ -0,0 +1,8 @@
# coding=utf-8
"""
爬虫模块 - 数据抓取功能
"""
from trendradar.crawler.fetcher import DataFetcher
__all__ = ["DataFetcher"]
+184
View File
@@ -0,0 +1,184 @@
# coding=utf-8
"""
数据获取器模块
负责从 NewsNow API 抓取新闻数据,支持:
- 单个平台数据获取
- 批量平台数据爬取
- 自动重试机制
- 代理支持
"""
import json
import random
import time
from typing import Dict, List, Tuple, Optional, Union
import requests
class DataFetcher:
    """Fetches trending-news data from the NewsNow API.

    Capabilities:
    - single-platform fetch with retry and randomized backoff
    - sequential batch crawling with jittered request spacing
    - optional HTTP(S) proxy support
    """

    # Default API endpoint
    DEFAULT_API_URL = "https://newsnow.busiyi.world/api/s"

    # Default request headers (browser-like UA)
    DEFAULT_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Connection": "keep-alive",
        "Cache-Control": "no-cache",
    }

    def __init__(
        self,
        proxy_url: Optional[str] = None,
        api_url: Optional[str] = None,
    ):
        """Initialize the fetcher.

        Args:
            proxy_url: Optional proxy URL, applied to both http and https.
            api_url: Optional API base URL; defaults to DEFAULT_API_URL.
        """
        self.proxy_url = proxy_url
        self.api_url = api_url or self.DEFAULT_API_URL

    def fetch_data(
        self,
        id_info: Union[str, Tuple[str, str]],
        max_retries: int = 2,
        min_retry_wait: int = 3,
        max_retry_wait: int = 5,
    ) -> Tuple[Optional[str], str, str]:
        """Fetch raw data for one platform, retrying on failure.

        Args:
            id_info: Platform id, or a (platform id, alias) tuple.
            max_retries: Maximum number of retries after the first attempt.
            min_retry_wait: Lower bound of the base retry delay (seconds).
            max_retry_wait: Upper bound of the base retry delay (seconds).

        Returns:
            (response text, platform id, alias); response text is None on failure.
        """
        if isinstance(id_info, tuple):
            platform_id, alias = id_info
        else:
            platform_id, alias = id_info, id_info

        request_url = f"{self.api_url}?id={platform_id}&latest"
        proxies = (
            {"http": self.proxy_url, "https": self.proxy_url}
            if self.proxy_url
            else None
        )

        for attempt in range(max_retries + 1):
            try:
                response = requests.get(
                    request_url,
                    proxies=proxies,
                    headers=self.DEFAULT_HEADERS,
                    timeout=10,
                )
                response.raise_for_status()
                body = response.text
                payload = json.loads(body)
                status = payload.get("status", "未知")
                if status not in ["success", "cache"]:
                    raise ValueError(f"响应状态异常: {status}")
                status_info = "最新数据" if status == "success" else "缓存数据"
                print(f"获取 {platform_id} 成功({status_info}")
                return body, platform_id, alias
            except Exception as e:
                if attempt < max_retries:
                    # Base wait plus an extra component that grows per retry.
                    wait_time = random.uniform(
                        min_retry_wait, max_retry_wait
                    ) + attempt * random.uniform(1, 2)
                    print(f"请求 {platform_id} 失败: {e}. {wait_time:.2f}秒后重试...")
                    time.sleep(wait_time)
                else:
                    print(f"请求 {platform_id} 失败: {e}")
        return None, platform_id, alias

    def crawl_websites(
        self,
        ids_list: List[Union[str, Tuple[str, str]]],
        request_interval: int = 100,
    ) -> Tuple[Dict, Dict, List]:
        """Crawl multiple platforms sequentially.

        Args:
            ids_list: Platform ids; each entry is an id string or an
                (id, alias) tuple.
            request_interval: Base delay between requests, in milliseconds.

        Returns:
            (results dict, id-to-name map, list of failed platform ids).
        """
        results = {}
        id_to_name = {}
        failed_ids = []
        last_index = len(ids_list) - 1

        for position, id_info in enumerate(ids_list):
            if isinstance(id_info, tuple):
                platform_id, display_name = id_info
            else:
                platform_id = id_info
                display_name = id_info
            id_to_name[platform_id] = display_name

            raw_response, _, _ = self.fetch_data(id_info)
            if raw_response:
                try:
                    payload = json.loads(raw_response)
                    site_titles = {}
                    results[platform_id] = site_titles
                    for rank, item in enumerate(payload.get("items", []), 1):
                        title = item.get("title")
                        # Skip invalid titles (None, numeric junk, blanks).
                        if title is None or isinstance(title, float):
                            continue
                        title = str(title).strip()
                        if not title:
                            continue
                        entry = site_titles.get(title)
                        if entry is not None:
                            # Duplicate title: record the additional rank.
                            entry["ranks"].append(rank)
                        else:
                            site_titles[title] = {
                                "ranks": [rank],
                                "url": item.get("url", ""),
                                "mobileUrl": item.get("mobileUrl", ""),
                            }
                except json.JSONDecodeError:
                    print(f"解析 {platform_id} 响应失败")
                    failed_ids.append(platform_id)
                except Exception as e:
                    print(f"处理 {platform_id} 数据出错: {e}")
                    failed_ids.append(platform_id)
            else:
                failed_ids.append(platform_id)

            # Jittered pause between requests (skipped after the final one),
            # floored at 50 ms.
            if position < last_index:
                jitter = request_interval + random.randint(-10, 20)
                time.sleep(max(50, jitter) / 1000)

        print(f"成功: {list(results.keys())}, 失败: {failed_ids}")
        return results, id_to_name, failed_ids
+81
View File
@@ -0,0 +1,81 @@
# coding=utf-8
"""
通知推送模块
提供多渠道通知推送功能,包括:
- 飞书、钉钉、企业微信
- Telegram、Slack
- Email、ntfy、Bark
模块结构:
- push_manager: 推送记录管理
- formatters: 内容格式转换
- batch: 批次处理工具
- renderer: 通知内容渲染
- splitter: 消息分批拆分
- senders: 消息发送器(各渠道发送函数)
- dispatcher: 多账号通知调度器
"""
from trendradar.notification.push_manager import PushRecordManager
from trendradar.notification.formatters import (
strip_markdown,
convert_markdown_to_mrkdwn,
)
from trendradar.notification.batch import (
get_batch_header,
get_max_batch_header_size,
truncate_to_bytes,
add_batch_headers,
)
from trendradar.notification.renderer import (
render_feishu_content,
render_dingtalk_content,
)
from trendradar.notification.splitter import (
split_content_into_batches,
DEFAULT_BATCH_SIZES,
)
from trendradar.notification.senders import (
send_to_feishu,
send_to_dingtalk,
send_to_wework,
send_to_telegram,
send_to_email,
send_to_ntfy,
send_to_bark,
send_to_slack,
SMTP_CONFIGS,
)
from trendradar.notification.dispatcher import NotificationDispatcher
__all__ = [
# 推送记录管理
"PushRecordManager",
# 格式转换
"strip_markdown",
"convert_markdown_to_mrkdwn",
# 批次处理
"get_batch_header",
"get_max_batch_header_size",
"truncate_to_bytes",
"add_batch_headers",
# 内容渲染
"render_feishu_content",
"render_dingtalk_content",
# 消息分批
"split_content_into_batches",
"DEFAULT_BATCH_SIZES",
# 消息发送器
"send_to_feishu",
"send_to_dingtalk",
"send_to_wework",
"send_to_telegram",
"send_to_email",
"send_to_ntfy",
"send_to_bark",
"send_to_slack",
"SMTP_CONFIGS",
# 通知调度器
"NotificationDispatcher",
]
+115
View File
@@ -0,0 +1,115 @@
# coding=utf-8
"""
批次处理模块
提供消息分批发送的辅助函数
"""
from typing import List
def get_batch_header(format_type: str, batch_num: int, total_batches: int) -> str:
"""根据 format_type 生成对应格式的批次头部
Args:
format_type: 推送类型(telegram, slack, wework_text, bark, feishu, dingtalk, ntfy, wework
batch_num: 当前批次编号
total_batches: 总批次数
Returns:
格式化的批次头部字符串
"""
if format_type == "telegram":
return f"<b>[第 {batch_num}/{total_batches} 批次]</b>\n\n"
elif format_type == "slack":
return f"*[第 {batch_num}/{total_batches} 批次]*\n\n"
elif format_type in ("wework_text", "bark"):
# 企业微信文本模式和 Bark 使用纯文本格式
return f"[第 {batch_num}/{total_batches} 批次]\n\n"
else:
# 飞书、钉钉、ntfy、企业微信 markdown 模式
return f"**[第 {batch_num}/{total_batches} 批次]**\n\n"
def get_max_batch_header_size(format_type: str) -> int:
    """Worst-case byte size of a batch header (assumes at most 99 batches).

    Reserving this space when splitting avoids having to truncate finished
    content afterwards, which could cut a message mid-item.

    Args:
        format_type: Push-channel format.

    Returns:
        Maximum header size in UTF-8 bytes.
    """
    worst_case_header = get_batch_header(format_type, 99, 99)
    return len(worst_case_header.encode("utf-8"))
def truncate_to_bytes(text: str, max_bytes: int) -> str:
    """Truncate *text* so its UTF-8 encoding fits in max_bytes.

    Never splits a multi-byte character: any partial trailing character
    produced by the byte-level cut is dropped.

    Args:
        text: Text to truncate.
        max_bytes: Maximum allowed size in bytes.

    Returns:
        The (possibly shortened) text.
    """
    encoded = text.encode("utf-8")
    if len(encoded) <= max_bytes:
        return text
    # errors="ignore" silently discards the incomplete trailing character
    # left over by cutting at an arbitrary byte offset.
    return encoded[:max_bytes].decode("utf-8", errors="ignore")
def add_batch_headers(
    batches: List[str], format_type: str, max_bytes: int
) -> List[str]:
    """Prefix every batch with a "[第 i/N 批次]" header, respecting max_bytes.

    The allowed content size is computed per batch from the actual header
    size, so header + content never exceeds the channel limit.

    Args:
        batches: Raw batch contents.
        format_type: Push-channel format (bark, telegram, feishu, ...).
        max_bytes: Byte limit for that channel.

    Returns:
        Batches with headers prepended (single-batch input is left untouched).
    """
    if len(batches) <= 1:
        return batches

    total = len(batches)
    labelled = []
    for index, body in enumerate(batches, 1):
        header = get_batch_header(format_type, index, total)
        header_bytes = len(header.encode("utf-8"))
        # Room left for the content once the header is accounted for.
        budget = max_bytes - header_bytes
        body_bytes = len(body.encode("utf-8"))
        if body_bytes > budget:
            print(
                f"警告:{format_type}{index}/{total} 批次内容({body_bytes}字节) + 头部({header_bytes}字节) 超出限制({max_bytes}字节),截断到 {budget} 字节"
            )
            body = truncate_to_bytes(body, budget)
        labelled.append(header + body)
    return labelled
+420
View File
@@ -0,0 +1,420 @@
# coding=utf-8
"""
通知调度器模块
提供统一的通知分发接口。
支持所有通知渠道的多账号配置,使用 `;` 分隔多个账号。
使用示例:
dispatcher = NotificationDispatcher(config, get_time_func, split_content_func)
results = dispatcher.dispatch_all(report_data, report_type, ...)
"""
from typing import Any, Callable, Dict, List, Optional
from trendradar.core.config import (
get_account_at_index,
limit_accounts,
parse_multi_account_config,
validate_paired_configs,
)
from .senders import (
send_to_bark,
send_to_dingtalk,
send_to_email,
send_to_feishu,
send_to_ntfy,
send_to_slack,
send_to_telegram,
send_to_wework,
)
class NotificationDispatcher:
    """
    Unified multi-account notification dispatcher.

    Wraps per-channel multi-account sending behind a single dispatch_all()
    entry point; account parsing, per-channel account limits and
    paired-config validation are handled internally. Multiple accounts in
    one config value are separated by ';'.
    """

    def __init__(
        self,
        config: Dict[str, Any],
        get_time_func: Callable,
        split_content_func: Callable,
    ):
        """
        Initialize the dispatcher.

        Args:
            config: Full configuration dict containing every channel's settings.
            get_time_func: Callable returning the current time.
            split_content_func: Callable that splits content into size-limited batches.
        """
        self.config = config
        self.get_time_func = get_time_func
        self.split_content_func = split_content_func
        # Cap on how many accounts per channel are actually used (default 3).
        self.max_accounts = config.get("MAX_ACCOUNTS_PER_CHANNEL", 3)

    def dispatch_all(
        self,
        report_data: Dict,
        report_type: str,
        update_info: Optional[Dict] = None,
        proxy_url: Optional[str] = None,
        mode: str = "daily",
        html_file_path: Optional[str] = None,
    ) -> Dict[str, bool]:
        """
        Dispatch the report to every configured channel.

        Args:
            report_data: Report data (produced by prepare_report_data).
            report_type: Report type label (e.g. daily summary, incremental).
            update_info: Optional version-update info.
            proxy_url: Optional proxy URL.
            mode: Report mode (daily/current/incremental).
            html_file_path: Path to the HTML report (used by email only).

        Returns:
            Dict[str, bool]: per-channel send result, keyed by channel name.
        """
        results = {}
        # Feishu
        if self.config.get("FEISHU_WEBHOOK_URL"):
            results["feishu"] = self._send_feishu(
                report_data, report_type, update_info, proxy_url, mode
            )
        # DingTalk
        if self.config.get("DINGTALK_WEBHOOK_URL"):
            results["dingtalk"] = self._send_dingtalk(
                report_data, report_type, update_info, proxy_url, mode
            )
        # WeCom (WeChat Work)
        if self.config.get("WEWORK_WEBHOOK_URL"):
            results["wework"] = self._send_wework(
                report_data, report_type, update_info, proxy_url, mode
            )
        # Telegram (requires paired bot_token / chat_id validation)
        if self.config.get("TELEGRAM_BOT_TOKEN") and self.config.get("TELEGRAM_CHAT_ID"):
            results["telegram"] = self._send_telegram(
                report_data, report_type, update_info, proxy_url, mode
            )
        # ntfy (requires paired topic / token validation)
        if self.config.get("NTFY_SERVER_URL") and self.config.get("NTFY_TOPIC"):
            results["ntfy"] = self._send_ntfy(
                report_data, report_type, update_info, proxy_url, mode
            )
        # Bark
        if self.config.get("BARK_URL"):
            results["bark"] = self._send_bark(
                report_data, report_type, update_info, proxy_url, mode
            )
        # Slack
        if self.config.get("SLACK_WEBHOOK_URL"):
            results["slack"] = self._send_slack(
                report_data, report_type, update_info, proxy_url, mode
            )
        # Email (original logic kept; already supports multiple recipients)
        if (
            self.config.get("EMAIL_FROM")
            and self.config.get("EMAIL_PASSWORD")
            and self.config.get("EMAIL_TO")
        ):
            results["email"] = self._send_email(report_type, html_file_path)
        return results

    def _send_to_multi_accounts(
        self,
        channel_name: str,
        config_value: str,
        send_func: Callable[..., bool],
        **kwargs,
    ) -> bool:
        """
        Generic multi-account send loop.

        Args:
            channel_name: Channel name (used in logs and limit messages).
            config_value: Raw config value; may hold several accounts separated by ';'.
            send_func: Sender with signature (account, account_label=..., **kwargs) -> bool.
            **kwargs: Extra arguments forwarded to send_func.

        Returns:
            bool: True if at least one account was sent successfully.
        """
        accounts = parse_multi_account_config(config_value)
        if not accounts:
            return False
        accounts = limit_accounts(accounts, self.max_accounts, channel_name)
        results = []
        for i, account in enumerate(accounts):
            if account:
                # Only label accounts when more than one is configured.
                account_label = f"账号{i+1}" if len(accounts) > 1 else ""
                result = send_func(account, account_label=account_label, **kwargs)
                results.append(result)
        return any(results) if results else False

    def _send_feishu(
        self,
        report_data: Dict,
        report_type: str,
        update_info: Optional[Dict],
        proxy_url: Optional[str],
        mode: str,
    ) -> bool:
        """Send to Feishu (multi-account)."""
        return self._send_to_multi_accounts(
            channel_name="飞书",
            config_value=self.config["FEISHU_WEBHOOK_URL"],
            send_func=lambda url, account_label: send_to_feishu(
                webhook_url=url,
                report_data=report_data,
                report_type=report_type,
                update_info=update_info,
                proxy_url=proxy_url,
                mode=mode,
                account_label=account_label,
                batch_size=self.config.get("FEISHU_BATCH_SIZE", 29000),
                batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
                split_content_func=self.split_content_func,
                get_time_func=self.get_time_func,
            ),
        )

    def _send_dingtalk(
        self,
        report_data: Dict,
        report_type: str,
        update_info: Optional[Dict],
        proxy_url: Optional[str],
        mode: str,
    ) -> bool:
        """Send to DingTalk (multi-account)."""
        return self._send_to_multi_accounts(
            channel_name="钉钉",
            config_value=self.config["DINGTALK_WEBHOOK_URL"],
            send_func=lambda url, account_label: send_to_dingtalk(
                webhook_url=url,
                report_data=report_data,
                report_type=report_type,
                update_info=update_info,
                proxy_url=proxy_url,
                mode=mode,
                account_label=account_label,
                batch_size=self.config.get("DINGTALK_BATCH_SIZE", 20000),
                batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
                split_content_func=self.split_content_func,
            ),
        )

    def _send_wework(
        self,
        report_data: Dict,
        report_type: str,
        update_info: Optional[Dict],
        proxy_url: Optional[str],
        mode: str,
    ) -> bool:
        """Send to WeCom / WeChat Work (multi-account)."""
        return self._send_to_multi_accounts(
            channel_name="企业微信",
            config_value=self.config["WEWORK_WEBHOOK_URL"],
            send_func=lambda url, account_label: send_to_wework(
                webhook_url=url,
                report_data=report_data,
                report_type=report_type,
                update_info=update_info,
                proxy_url=proxy_url,
                mode=mode,
                account_label=account_label,
                batch_size=self.config.get("MESSAGE_BATCH_SIZE", 4000),
                batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
                msg_type=self.config.get("WEWORK_MSG_TYPE", "markdown"),
                split_content_func=self.split_content_func,
            ),
        )

    def _send_telegram(
        self,
        report_data: Dict,
        report_type: str,
        update_info: Optional[Dict],
        proxy_url: Optional[str],
        mode: str,
    ) -> bool:
        """Send to Telegram (multi-account; bot_token/chat_id must pair up)."""
        telegram_tokens = parse_multi_account_config(self.config["TELEGRAM_BOT_TOKEN"])
        telegram_chat_ids = parse_multi_account_config(self.config["TELEGRAM_CHAT_ID"])
        if not telegram_tokens or not telegram_chat_ids:
            return False
        # Validate that tokens and chat ids pair up one-to-one.
        valid, count = validate_paired_configs(
            {"bot_token": telegram_tokens, "chat_id": telegram_chat_ids},
            "Telegram",
            required_keys=["bot_token", "chat_id"],
        )
        if not valid or count == 0:
            return False
        # Apply the per-channel account cap, keeping pairs aligned.
        telegram_tokens = limit_accounts(telegram_tokens, self.max_accounts, "Telegram")
        telegram_chat_ids = telegram_chat_ids[: len(telegram_tokens)]
        results = []
        for i in range(len(telegram_tokens)):
            token = telegram_tokens[i]
            chat_id = telegram_chat_ids[i]
            if token and chat_id:
                account_label = f"账号{i+1}" if len(telegram_tokens) > 1 else ""
                result = send_to_telegram(
                    bot_token=token,
                    chat_id=chat_id,
                    report_data=report_data,
                    report_type=report_type,
                    update_info=update_info,
                    proxy_url=proxy_url,
                    mode=mode,
                    account_label=account_label,
                    batch_size=self.config.get("MESSAGE_BATCH_SIZE", 4000),
                    batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
                    split_content_func=self.split_content_func,
                )
                results.append(result)
        return any(results) if results else False

    def _send_ntfy(
        self,
        report_data: Dict,
        report_type: str,
        update_info: Optional[Dict],
        proxy_url: Optional[str],
        mode: str,
    ) -> bool:
        """Send to ntfy (multi-account; topic/token must pair up when tokens are set)."""
        ntfy_server_url = self.config["NTFY_SERVER_URL"]
        ntfy_topics = parse_multi_account_config(self.config["NTFY_TOPIC"])
        ntfy_tokens = parse_multi_account_config(self.config.get("NTFY_TOKEN", ""))
        if not ntfy_server_url or not ntfy_topics:
            return False
        # Tokens are optional, but if present their count must match the topics.
        if ntfy_tokens and len(ntfy_tokens) != len(ntfy_topics):
            print(
                f"❌ ntfy 配置错误:topic 数量({len(ntfy_topics)})与 token 数量({len(ntfy_tokens)})不一致,跳过 ntfy 推送"
            )
            return False
        # Apply the per-channel account cap, keeping tokens aligned with topics.
        ntfy_topics = limit_accounts(ntfy_topics, self.max_accounts, "ntfy")
        if ntfy_tokens:
            ntfy_tokens = ntfy_tokens[: len(ntfy_topics)]
        results = []
        for i, topic in enumerate(ntfy_topics):
            if topic:
                token = get_account_at_index(ntfy_tokens, i, "") if ntfy_tokens else ""
                account_label = f"账号{i+1}" if len(ntfy_topics) > 1 else ""
                result = send_to_ntfy(
                    server_url=ntfy_server_url,
                    topic=topic,
                    token=token,
                    report_data=report_data,
                    report_type=report_type,
                    update_info=update_info,
                    proxy_url=proxy_url,
                    mode=mode,
                    account_label=account_label,
                    batch_size=3800,
                    split_content_func=self.split_content_func,
                )
                results.append(result)
        return any(results) if results else False

    def _send_bark(
        self,
        report_data: Dict,
        report_type: str,
        update_info: Optional[Dict],
        proxy_url: Optional[str],
        mode: str,
    ) -> bool:
        """Send to Bark (multi-account)."""
        return self._send_to_multi_accounts(
            channel_name="Bark",
            config_value=self.config["BARK_URL"],
            send_func=lambda url, account_label: send_to_bark(
                bark_url=url,
                report_data=report_data,
                report_type=report_type,
                update_info=update_info,
                proxy_url=proxy_url,
                mode=mode,
                account_label=account_label,
                batch_size=self.config.get("BARK_BATCH_SIZE", 3600),
                batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
                split_content_func=self.split_content_func,
            ),
        )

    def _send_slack(
        self,
        report_data: Dict,
        report_type: str,
        update_info: Optional[Dict],
        proxy_url: Optional[str],
        mode: str,
    ) -> bool:
        """Send to Slack (multi-account)."""
        return self._send_to_multi_accounts(
            channel_name="Slack",
            config_value=self.config["SLACK_WEBHOOK_URL"],
            send_func=lambda url, account_label: send_to_slack(
                webhook_url=url,
                report_data=report_data,
                report_type=report_type,
                update_info=update_info,
                proxy_url=proxy_url,
                mode=mode,
                account_label=account_label,
                batch_size=self.config.get("SLACK_BATCH_SIZE", 4000),
                batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
                split_content_func=self.split_content_func,
            ),
        )

    def _send_email(
        self,
        report_type: str,
        html_file_path: Optional[str],
    ) -> bool:
        """Send the HTML report by email (original logic; supports multiple recipients)."""
        return send_to_email(
            from_email=self.config["EMAIL_FROM"],
            password=self.config["EMAIL_PASSWORD"],
            to_email=self.config["EMAIL_TO"],
            report_type=report_type,
            html_file_path=html_file_path,
            custom_smtp_server=self.config.get("EMAIL_SMTP_SERVER", ""),
            custom_smtp_port=self.config.get("EMAIL_SMTP_PORT", ""),
            get_time_func=self.get_time_func,
        )
+80
View File
@@ -0,0 +1,80 @@
# coding=utf-8
"""
通知内容格式转换模块
提供不同推送平台间的格式转换功能
"""
import re
def strip_markdown(text: str) -> str:
    """Strip markdown syntax from *text*, leaving plain text (used for
    personal-WeChat pushes, which render no markup).

    Args:
        text: Markdown-formatted text.

    Returns:
        Plain-text content; link URLs are kept, image URLs are dropped.
    """
    # Bold **text** / __text__
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
    text = re.sub(r'__(.+?)__', r'\1', text)
    # Italic *text* / _text_
    text = re.sub(r'\*(.+?)\*', r'\1', text)
    text = re.sub(r'_(.+?)_', r'\1', text)
    # Strikethrough ~~text~~
    text = re.sub(r'~~(.+?)~~', r'\1', text)
    # Images ![alt](url) -> alt. BUG FIX: this must run BEFORE link handling —
    # the link pattern also matches the "[alt](url)" tail of an image, which
    # previously left a stray "!" and kept the image URL.
    text = re.sub(r'!\[(.+?)\]\(.+?\)', r'\1', text)
    # Links [text](url) -> text url (URL preserved)
    text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1 \2', text)
    # Inline code `code`
    text = re.sub(r'`(.+?)`', r'\1', text)
    # Blockquote markers ">"
    text = re.sub(r'^>\s*', '', text, flags=re.MULTILINE)
    # Heading markers "#", "##", ...
    text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)
    # Horizontal rules --- or ***
    text = re.sub(r'^[\-\*]{3,}\s*$', '', text, flags=re.MULTILINE)
    # HTML tags: <font color='x'>text</font> -> text, then any remaining tag
    text = re.sub(r'<font[^>]*>(.+?)</font>', r'\1', text)
    text = re.sub(r'<[^>]+>', '', text)
    # Collapse runs of 3+ newlines to a single blank line
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
def convert_markdown_to_mrkdwn(content: str) -> str:
    """
    Convert standard Markdown to Slack's mrkdwn dialect.

    Rules applied:
        - [text](url) → <url|text>
        - **bold**    → *bold*
    Everything else (code blocks, lists, ...) passes through unchanged.

    Args:
        content: Markdown-formatted content.

    Returns:
        Content in Slack mrkdwn format.
    """
    # Links first, then bold, matching the original transformation order.
    link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
    bold_pattern = re.compile(r'\*\*([^*]+)\*\*')
    content = link_pattern.sub(r'<\2|\1>', content)
    return bold_pattern.sub(r'*\1*', content)
+109
View File
@@ -0,0 +1,109 @@
# coding=utf-8
"""
推送记录管理模块
管理推送记录,支持每日只推送一次和时间窗口控制
通过 storage_backend 统一存储,支持本地 SQLite 和远程云存储
"""
from datetime import datetime
from typing import Callable, Optional, Any
import pytz
class PushRecordManager:
    """
    Push-record manager: once-per-day guard and push-window checks.

    All records are persisted through the storage backend:
    - local runs: LocalStorageBackend (SQLite on disk)
    - GitHub Actions: RemoteStorageBackend (cloud storage)
    so the once_per_day feature also works on GitHub Actions.
    """

    def __init__(
        self,
        storage_backend: Any,
        get_time_func: Optional[Callable[[], datetime]] = None,
    ):
        """
        Initialize the manager.

        Args:
            storage_backend: Backend instance (LocalStorageBackend or RemoteStorageBackend).
            get_time_func: Clock function (should use the configured timezone).
        """
        self.storage_backend = storage_backend
        self.get_time = get_time_func or self._default_get_time
        print(f"[推送记录] 使用 {storage_backend.backend_name} 存储后端")

    def _default_get_time(self) -> datetime:
        """Fallback clock: current time in UTC+8 (Asia/Shanghai)."""
        return datetime.now(pytz.timezone("Asia/Shanghai"))

    def has_pushed_today(self) -> bool:
        """Return whether a push has already been recorded today."""
        return self.storage_backend.has_pushed_today()

    def record_push(self, report_type: str) -> bool:
        """Record a push of the given report type; True on success."""
        return self.storage_backend.record_push(report_type)

    def is_in_time_range(self, start_time: str, end_time: str) -> bool:
        """
        Return whether the current time lies within [start_time, end_time].

        Args:
            start_time: Window start, "HH:MM".
            end_time: Window end, "HH:MM".

        Returns:
            True when the current time is inside the window.
        """
        current = self.get_time().strftime("%H:%M")

        def normalize_time(time_str: str) -> str:
            """Normalize a time string to zero-padded HH:MM; echo input on error."""
            try:
                parts = time_str.strip().split(":")
                if len(parts) != 2:
                    raise ValueError(f"时间格式错误: {time_str}")
                hour, minute = int(parts[0]), int(parts[1])
                if not (0 <= hour <= 23 and 0 <= minute <= 59):
                    raise ValueError(f"时间范围错误: {time_str}")
                return f"{hour:02d}:{minute:02d}"
            except Exception as e:
                print(f"时间格式化错误 '{time_str}': {e}")
                return time_str

        window_start = normalize_time(start_time)
        window_end = normalize_time(end_time)
        now_hhmm = normalize_time(current)
        # Zero-padded HH:MM strings compare correctly lexicographically.
        inside = window_start <= now_hhmm <= window_end
        if not inside:
            print(f"时间窗口判断:当前 {now_hhmm},窗口 {window_start}-{window_end}")
        return inside
+260
View File
@@ -0,0 +1,260 @@
# coding=utf-8
"""
通知内容渲染模块
提供多平台通知内容渲染功能,生成格式化的推送消息
"""
from datetime import datetime
from typing import Dict, List, Optional, Callable
from trendradar.report.formatter import format_title_for_platform
def render_feishu_content(
    report_data: Dict,
    update_info: Optional[Dict] = None,
    mode: str = "daily",
    separator: str = "---",
    reverse_content_order: bool = False,
    get_time_func: Optional[Callable[[], datetime]] = None,
) -> str:
    """Render the Feishu notification body.

    Args:
        report_data: Report dict with keys stats, new_titles, failed_ids,
            total_new_count.
        update_info: Optional version-update info.
        mode: Report mode ("daily", "incremental", "current").
        separator: Section separator string.
        reverse_content_order: If True, put newly-added news before the
            keyword statistics.
        get_time_func: Optional clock function (defaults to datetime.now()).

    Returns:
        Formatted Feishu message content.
    """
    # Hot-keyword statistics section.
    stats_content = ""
    if report_data["stats"]:
        stats_content += "📊 **热点词汇统计**\n\n"
        total_count = len(report_data["stats"])
        for i, stat in enumerate(report_data["stats"]):
            word = stat["word"]
            count = stat["count"]
            sequence_display = f"<font color='grey'>[{i + 1}/{total_count}]</font>"
            # Emoji and color scale with how hot the keyword is.
            if count >= 10:
                stats_content += f"🔥 {sequence_display} **{word}** : <font color='red'>{count}</font> 条\n\n"
            elif count >= 5:
                stats_content += f"📈 {sequence_display} **{word}** : <font color='orange'>{count}</font> 条\n\n"
            else:
                stats_content += f"📌 {sequence_display} **{word}** : {count}\n\n"
            for j, title_data in enumerate(stat["titles"], 1):
                formatted_title = format_title_for_platform(
                    "feishu", title_data, show_source=True
                )
                stats_content += f" {j}. {formatted_title}\n"
                if j < len(stat["titles"]):
                    stats_content += "\n"
            if i < len(report_data["stats"]) - 1:
                stats_content += f"\n{separator}\n\n"
    # Newly-added news section.
    new_titles_content = ""
    if report_data["new_titles"]:
        new_titles_content += (
            f"🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
        )
        for source_data in report_data["new_titles"]:
            new_titles_content += (
                f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n"
            )
            for j, title_data in enumerate(source_data["titles"], 1):
                # Clear the is_new flag so titles are not re-marked as new here.
                title_data_copy = title_data.copy()
                title_data_copy["is_new"] = False
                formatted_title = format_title_for_platform(
                    "feishu", title_data_copy, show_source=False
                )
                new_titles_content += f" {j}. {formatted_title}\n"
            new_titles_content += "\n"
    # Assemble sections in the configured order.
    text_content = ""
    if reverse_content_order:
        # New items first, keyword statistics after.
        if new_titles_content:
            text_content += new_titles_content
            if stats_content:
                text_content += f"\n{separator}\n\n"
        if stats_content:
            text_content += stats_content
    else:
        # Default: keyword statistics first, new items after.
        if stats_content:
            text_content += stats_content
            if new_titles_content:
                text_content += f"\n{separator}\n\n"
        if new_titles_content:
            text_content += new_titles_content
    if not text_content:
        # Mode-specific placeholder when nothing matched.
        if mode == "incremental":
            mode_text = "增量模式下暂无新增匹配的热点词汇"
        elif mode == "current":
            mode_text = "当前榜单模式下暂无匹配的热点词汇"
        else:
            mode_text = "暂无匹配的热点词汇"
        text_content = f"📭 {mode_text}\n\n"
    if report_data["failed_ids"]:
        if text_content and "暂无匹配" not in text_content:
            text_content += f"\n{separator}\n\n"
        text_content += "⚠️ **数据获取失败的平台:**\n\n"
        for i, id_value in enumerate(report_data["failed_ids"], 1):
            text_content += f" • <font color='red'>{id_value}</font>\n"
    # Footer: timestamp and optional version-update notice.
    now = get_time_func() if get_time_func else datetime.now()
    text_content += (
        f"\n\n<font color='grey'>更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}</font>"
    )
    if update_info:
        text_content += f"\n<font color='grey'>TrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}</font>"
    return text_content
def render_dingtalk_content(
    report_data: Dict,
    update_info: Optional[Dict] = None,
    mode: str = "daily",
    reverse_content_order: bool = False,
    get_time_func: Optional[Callable[[], datetime]] = None,
) -> str:
    """Render the DingTalk notification body.

    Args:
        report_data: Report dict with keys stats, new_titles, failed_ids,
            total_new_count.
        update_info: Optional version-update info.
        mode: Report mode ("daily", "incremental", "current").
        reverse_content_order: If True, put newly-added news before the
            keyword statistics.
        get_time_func: Optional clock function (defaults to datetime.now()).

    Returns:
        Formatted DingTalk message content.
    """
    # Count only titles under keywords that actually matched.
    total_titles = sum(
        len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0
    )
    now = get_time_func() if get_time_func else datetime.now()
    # Header block with totals, timestamp and report type.
    header_content = f"**总新闻数:** {total_titles}\n\n"
    header_content += f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
    header_content += "**类型:** 热点分析报告\n\n"
    header_content += "---\n\n"
    # Hot-keyword statistics section.
    stats_content = ""
    if report_data["stats"]:
        stats_content += "📊 **热点词汇统计**\n\n"
        total_count = len(report_data["stats"])
        for i, stat in enumerate(report_data["stats"]):
            word = stat["word"]
            count = stat["count"]
            sequence_display = f"[{i + 1}/{total_count}]"
            # Emoji scales with how hot the keyword is.
            if count >= 10:
                stats_content += f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
            elif count >= 5:
                stats_content += f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
            else:
                stats_content += f"📌 {sequence_display} **{word}** : {count}\n\n"
            for j, title_data in enumerate(stat["titles"], 1):
                formatted_title = format_title_for_platform(
                    "dingtalk", title_data, show_source=True
                )
                stats_content += f" {j}. {formatted_title}\n"
                if j < len(stat["titles"]):
                    stats_content += "\n"
            if i < len(report_data["stats"]) - 1:
                stats_content += "\n---\n\n"
    # Newly-added news section.
    new_titles_content = ""
    if report_data["new_titles"]:
        new_titles_content += (
            f"🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
        )
        for source_data in report_data["new_titles"]:
            new_titles_content += f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
            for j, title_data in enumerate(source_data["titles"], 1):
                # Clear the is_new flag so titles are not re-marked as new here.
                title_data_copy = title_data.copy()
                title_data_copy["is_new"] = False
                formatted_title = format_title_for_platform(
                    "dingtalk", title_data_copy, show_source=False
                )
                new_titles_content += f" {j}. {formatted_title}\n"
            new_titles_content += "\n"
    # Assemble sections in the configured order, after the fixed header.
    text_content = header_content
    if reverse_content_order:
        # New items first, keyword statistics after.
        if new_titles_content:
            text_content += new_titles_content
            if stats_content:
                text_content += "\n---\n\n"
        if stats_content:
            text_content += stats_content
    else:
        # Default: keyword statistics first, new items after.
        if stats_content:
            text_content += stats_content
            if new_titles_content:
                text_content += "\n---\n\n"
        if new_titles_content:
            text_content += new_titles_content
    if not stats_content and not new_titles_content:
        # Mode-specific placeholder when nothing matched.
        if mode == "incremental":
            mode_text = "增量模式下暂无新增匹配的热点词汇"
        elif mode == "current":
            mode_text = "当前榜单模式下暂无匹配的热点词汇"
        else:
            mode_text = "暂无匹配的热点词汇"
        text_content += f"📭 {mode_text}\n\n"
    if report_data["failed_ids"]:
        if "暂无匹配" not in text_content:
            text_content += "\n---\n\n"
        text_content += "⚠️ **数据获取失败的平台:**\n\n"
        for i, id_value in enumerate(report_data["failed_ids"], 1):
            text_content += f" • **{id_value}**\n"
    # Footer: timestamp and optional version-update notice.
    text_content += f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
    if update_info:
        text_content += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
    return text_content
File diff suppressed because it is too large Load Diff
+580
View File
@@ -0,0 +1,580 @@
# coding=utf-8
"""
消息分批处理模块
提供消息内容分批拆分功能,确保消息大小不超过各平台限制
"""
from datetime import datetime
from typing import Dict, List, Optional, Callable
from trendradar.report.formatter import format_title_for_platform
# 默认批次大小配置
DEFAULT_BATCH_SIZES = {
"dingtalk": 20000,
"feishu": 29000,
"ntfy": 3800,
"default": 4000,
}
def split_content_into_batches(
report_data: Dict,
format_type: str,
update_info: Optional[Dict] = None,
max_bytes: Optional[int] = None,
mode: str = "daily",
batch_sizes: Optional[Dict[str, int]] = None,
feishu_separator: str = "---",
reverse_content_order: bool = False,
get_time_func: Optional[Callable[[], datetime]] = None,
) -> List[str]:
"""分批处理消息内容,确保词组标题+至少第一条新闻的完整性
Args:
report_data: 报告数据字典,包含 stats, new_titles, failed_ids, total_new_count
format_type: 格式类型 (feishu, dingtalk, wework, telegram, ntfy, bark, slack)
update_info: 版本更新信息(可选)
max_bytes: 最大字节数(可选,如果不指定则使用默认配置)
mode: 报告模式 (daily, incremental, current)
batch_sizes: 批次大小配置字典(可选)
feishu_separator: 飞书消息分隔符
reverse_content_order: 是否反转内容顺序(新增在前)
get_time_func: 获取当前时间的函数(可选)
Returns:
分批后的消息内容列表
"""
# 合并批次大小配置
sizes = {**DEFAULT_BATCH_SIZES, **(batch_sizes or {})}
if max_bytes is None:
if format_type == "dingtalk":
max_bytes = sizes.get("dingtalk", 20000)
elif format_type == "feishu":
max_bytes = sizes.get("feishu", 29000)
elif format_type == "ntfy":
max_bytes = sizes.get("ntfy", 3800)
else:
max_bytes = sizes.get("default", 4000)
batches = []
total_titles = sum(
len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0
)
now = get_time_func() if get_time_func else datetime.now()
base_header = ""
if format_type in ("wework", "bark"):
base_header = f"**总新闻数:** {total_titles}\n\n\n\n"
elif format_type == "telegram":
base_header = f"总新闻数: {total_titles}\n\n"
elif format_type == "ntfy":
base_header = f"**总新闻数:** {total_titles}\n\n"
elif format_type == "feishu":
base_header = ""
elif format_type == "dingtalk":
base_header = f"**总新闻数:** {total_titles}\n\n"
base_header += f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
base_header += f"**类型:** 热点分析报告\n\n"
base_header += "---\n\n"
elif format_type == "slack":
base_header = f"*总新闻数:* {total_titles}\n\n"
base_footer = ""
if format_type in ("wework", "bark"):
base_footer = f"\n\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
if update_info:
base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
elif format_type == "telegram":
base_footer = f"\n\n更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
if update_info:
base_footer += f"\nTrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}"
elif format_type == "ntfy":
base_footer = f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
if update_info:
base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
elif format_type == "feishu":
base_footer = f"\n\n<font color='grey'>更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}</font>"
if update_info:
base_footer += f"\n<font color='grey'>TrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}</font>"
elif format_type == "dingtalk":
base_footer = f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
if update_info:
base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
elif format_type == "slack":
base_footer = f"\n\n_更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}_"
if update_info:
base_footer += f"\n_TrendRadar 发现新版本 *{update_info['remote_version']}*,当前 *{update_info['current_version']}_"
stats_header = ""
if report_data["stats"]:
if format_type in ("wework", "bark"):
stats_header = f"📊 **热点词汇统计**\n\n"
elif format_type == "telegram":
stats_header = f"📊 热点词汇统计\n\n"
elif format_type == "ntfy":
stats_header = f"📊 **热点词汇统计**\n\n"
elif format_type == "feishu":
stats_header = f"📊 **热点词汇统计**\n\n"
elif format_type == "dingtalk":
stats_header = f"📊 **热点词汇统计**\n\n"
elif format_type == "slack":
stats_header = f"📊 *热点词汇统计*\n\n"
current_batch = base_header
current_batch_has_content = False
if (
not report_data["stats"]
and not report_data["new_titles"]
and not report_data["failed_ids"]
):
if mode == "incremental":
mode_text = "增量模式下暂无新增匹配的热点词汇"
elif mode == "current":
mode_text = "当前榜单模式下暂无匹配的热点词汇"
else:
mode_text = "暂无匹配的热点词汇"
simple_content = f"📭 {mode_text}\n\n"
final_content = base_header + simple_content + base_footer
batches.append(final_content)
return batches
# 定义处理热点词汇统计的函数
def process_stats_section(current_batch, current_batch_has_content, batches):
    """Append the keyword-frequency section to the outgoing batches.

    Enforces the byte budget: every emitted batch (plus ``base_footer``)
    must stay under ``max_bytes``. A keyword header and its first news
    line are handled atomically so a batch never ends with a dangling
    header.

    Args:
        current_batch: text accumulated for the batch being built
        current_batch_has_content: whether the batch holds real payload
        batches: finished batches (appended to in place)

    Returns:
        Updated ``(current_batch, current_batch_has_content, batches)``.
    """
    if not report_data["stats"]:
        return current_batch, current_batch_has_content, batches

    def _format_stat_title(title_data):
        # bark shares the wework renderer; unknown platforms fall back
        # to the raw title text.
        if format_type in ("wework", "bark"):
            return format_title_for_platform("wework", title_data, show_source=True)
        if format_type in ("telegram", "ntfy", "feishu", "dingtalk", "slack"):
            return format_title_for_platform(format_type, title_data, show_source=True)
        return f"{title_data['title']}"

    total_count = len(report_data["stats"])

    # Try to append the section header; flush and restart if it overflows.
    test_content = current_batch + stats_header
    if (
        len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
        < max_bytes
    ):
        current_batch = test_content
        current_batch_has_content = True
    else:
        if current_batch_has_content:
            batches.append(current_batch + base_footer)
        current_batch = base_header + stats_header
        current_batch_has_content = True

    for i, stat in enumerate(report_data["stats"]):
        word = stat["word"]
        count = stat["count"]
        sequence_display = f"[{i + 1}/{total_count}]"

        # Keyword headline; wework/bark/ntfy/dingtalk share the same
        # markdown shape, the others use platform-specific markup.
        word_header = ""
        if format_type in ("wework", "bark", "ntfy", "dingtalk"):
            if count >= 10:
                word_header = f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
            elif count >= 5:
                word_header = f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
            else:
                word_header = f"📌 {sequence_display} **{word}** : {count}\n\n"
        elif format_type == "telegram":
            if count >= 10:
                word_header = f"🔥 {sequence_display} {word} : {count}\n\n"
            elif count >= 5:
                word_header = f"📈 {sequence_display} {word} : {count}\n\n"
            else:
                word_header = f"📌 {sequence_display} {word} : {count}\n\n"
        elif format_type == "feishu":
            if count >= 10:
                word_header = f"🔥 <font color='grey'>{sequence_display}</font> **{word}** : <font color='red'>{count}</font> 条\n\n"
            elif count >= 5:
                word_header = f"📈 <font color='grey'>{sequence_display}</font> **{word}** : <font color='orange'>{count}</font> 条\n\n"
            else:
                word_header = f"📌 <font color='grey'>{sequence_display}</font> **{word}** : {count}\n\n"
        elif format_type == "slack":
            if count >= 10:
                word_header = f"🔥 {sequence_display} *{word}* : *{count}* 条\n\n"
            elif count >= 5:
                word_header = f"📈 {sequence_display} *{word}* : *{count}* 条\n\n"
            else:
                word_header = f"📌 {sequence_display} *{word}* : {count}\n\n"

        # First news line is glued to the keyword header (atomicity).
        first_news_line = ""
        if stat["titles"]:
            first_news_line = f" 1. {_format_stat_title(stat['titles'][0])}\n"
            if len(stat["titles"]) > 1:
                first_news_line += "\n"

        word_with_first_news = word_header + first_news_line
        test_content = current_batch + word_with_first_news
        if (
            len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
            >= max_bytes
        ):
            # Does not fit: flush and restart with header + first item.
            if current_batch_has_content:
                batches.append(current_batch + base_footer)
            current_batch = base_header + stats_header + word_with_first_news
        else:
            current_batch = test_content
        current_batch_has_content = True
        start_index = 1

        # Remaining news lines for this keyword.
        for j in range(start_index, len(stat["titles"])):
            news_line = f" {j + 1}. {_format_stat_title(stat['titles'][j])}\n"
            if j < len(stat["titles"]) - 1:
                news_line += "\n"
            test_content = current_batch + news_line
            if (
                len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
                >= max_bytes
            ):
                if current_batch_has_content:
                    batches.append(current_batch + base_footer)
                current_batch = base_header + stats_header + word_header + news_line
                current_batch_has_content = True
            else:
                current_batch = test_content
                current_batch_has_content = True

        # Separator between keywords; silently dropped if it would overflow.
        if i < len(report_data["stats"]) - 1:
            separator = ""
            if format_type in ("wework", "bark"):
                separator = "\n\n\n\n"
            elif format_type in ("telegram", "ntfy", "slack"):
                separator = "\n\n"
            elif format_type == "feishu":
                separator = f"\n{feishu_separator}\n\n"
            elif format_type == "dingtalk":
                separator = "\n---\n\n"
            test_content = current_batch + separator
            if (
                len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
                < max_bytes
            ):
                current_batch = test_content

    return current_batch, current_batch_has_content, batches
# 定义处理新增新闻的函数
def process_new_titles_section(current_batch, current_batch_has_content, batches):
    """Append the newly-appeared-news section to the outgoing batches.

    Same byte-budget rules as the stats section: a source header and its
    first news line are handled atomically.

    Fix: the per-title dispatch now also routes "ntfy" and "bark"
    through ``format_title_for_platform`` (previously ntfy always fell
    back to plain text, and bark did so for every item after the first),
    making this section consistent with the stats section.
    """
    if not report_data["new_titles"]:
        return current_batch, current_batch_has_content, batches

    def _format_new_title(title_data):
        # Render without the 🆕 marker (the section header already says
        # so) and without the source name (items are grouped by source).
        item = title_data.copy()
        item["is_new"] = False
        if format_type in ("wework", "bark"):
            return format_title_for_platform("wework", item, show_source=False)
        if format_type in ("telegram", "ntfy", "feishu", "dingtalk", "slack"):
            return format_title_for_platform(format_type, item, show_source=False)
        return f"{item['title']}"

    total_new = report_data["total_new_count"]
    new_header = ""
    if format_type in ("wework", "bark"):
        new_header = f"\n\n\n\n🆕 **本次新增热点新闻** (共 {total_new} 条)\n\n"
    elif format_type == "telegram":
        new_header = f"\n\n🆕 本次新增热点新闻 (共 {total_new} 条)\n\n"
    elif format_type == "ntfy":
        new_header = f"\n\n🆕 **本次新增热点新闻** (共 {total_new} 条)\n\n"
    elif format_type == "feishu":
        new_header = f"\n{feishu_separator}\n\n🆕 **本次新增热点新闻** (共 {total_new} 条)\n\n"
    elif format_type == "dingtalk":
        new_header = f"\n---\n\n🆕 **本次新增热点新闻** (共 {total_new} 条)\n\n"
    elif format_type == "slack":
        new_header = f"\n\n🆕 *本次新增热点新闻* (共 {total_new} 条)\n\n"

    # Section header, with overflow check.
    test_content = current_batch + new_header
    if (
        len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
        >= max_bytes
    ):
        if current_batch_has_content:
            batches.append(current_batch + base_footer)
        current_batch = base_header + new_header
    else:
        current_batch = test_content
    current_batch_has_content = True

    for source_data in report_data["new_titles"]:
        source_name = source_data["source_name"]
        item_count = len(source_data["titles"])
        if format_type == "telegram":
            source_header = f"{source_name} ({item_count} 条):\n\n"
        elif format_type == "slack":
            source_header = f"*{source_name}* ({item_count} 条):\n\n"
        elif format_type in ("wework", "bark", "ntfy", "feishu", "dingtalk"):
            source_header = f"**{source_name}** ({item_count} 条):\n\n"
        else:
            source_header = ""

        # First news line is glued to the source header (atomicity).
        first_news_line = ""
        if source_data["titles"]:
            first_news_line = f" 1. {_format_new_title(source_data['titles'][0])}\n"

        source_with_first_news = source_header + first_news_line
        test_content = current_batch + source_with_first_news
        if (
            len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
            >= max_bytes
        ):
            if current_batch_has_content:
                batches.append(current_batch + base_footer)
            current_batch = base_header + new_header + source_with_first_news
        else:
            current_batch = test_content
        current_batch_has_content = True
        start_index = 1

        # Remaining new titles for this source.
        for j in range(start_index, len(source_data["titles"])):
            news_line = f" {j + 1}. {_format_new_title(source_data['titles'][j])}\n"
            test_content = current_batch + news_line
            if (
                len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
                >= max_bytes
            ):
                if current_batch_has_content:
                    batches.append(current_batch + base_footer)
                current_batch = base_header + new_header + source_header + news_line
                current_batch_has_content = True
            else:
                current_batch = test_content
                current_batch_has_content = True

        current_batch += "\n"

    return current_batch, current_batch_has_content, batches
# 根据配置决定处理顺序
if reverse_content_order:
# 新增热点在前,热点词汇统计在后
current_batch, current_batch_has_content, batches = process_new_titles_section(
current_batch, current_batch_has_content, batches
)
current_batch, current_batch_has_content, batches = process_stats_section(
current_batch, current_batch_has_content, batches
)
else:
# 默认:热点词汇统计在前,新增热点在后
current_batch, current_batch_has_content, batches = process_stats_section(
current_batch, current_batch_has_content, batches
)
current_batch, current_batch_has_content, batches = process_new_titles_section(
current_batch, current_batch_has_content, batches
)
if report_data["failed_ids"]:
failed_header = ""
if format_type == "wework":
failed_header = f"\n\n\n\n⚠️ **数据获取失败的平台:**\n\n"
elif format_type == "telegram":
failed_header = f"\n\n⚠️ 数据获取失败的平台:\n\n"
elif format_type == "ntfy":
failed_header = f"\n\n⚠️ **数据获取失败的平台:**\n\n"
elif format_type == "feishu":
failed_header = f"\n{feishu_separator}\n\n⚠️ **数据获取失败的平台:**\n\n"
elif format_type == "dingtalk":
failed_header = f"\n---\n\n⚠️ **数据获取失败的平台:**\n\n"
test_content = current_batch + failed_header
if (
len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
>= max_bytes
):
if current_batch_has_content:
batches.append(current_batch + base_footer)
current_batch = base_header + failed_header
current_batch_has_content = True
else:
current_batch = test_content
current_batch_has_content = True
for i, id_value in enumerate(report_data["failed_ids"], 1):
if format_type == "feishu":
failed_line = f" • <font color='red'>{id_value}</font>\n"
elif format_type == "dingtalk":
failed_line = f" • **{id_value}**\n"
else:
failed_line = f"{id_value}\n"
test_content = current_batch + failed_line
if (
len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
>= max_bytes
):
if current_batch_has_content:
batches.append(current_batch + base_footer)
current_batch = base_header + failed_header + failed_line
current_batch_has_content = True
else:
current_batch = test_content
current_batch_has_content = True
# 完成最后批次
if current_batch_has_content:
batches.append(current_batch + base_footer)
return batches
+40
View File
@@ -0,0 +1,40 @@
# coding=utf-8
"""
报告生成模块
提供报告生成和格式化功能,包括:
- HTML 报告生成
- 标题格式化工具
模块结构:
- helpers: 报告辅助函数(清理、转义、格式化)
- formatter: 平台标题格式化
- html: HTML 报告渲染
- generator: 报告生成器
"""
from trendradar.report.helpers import (
clean_title,
html_escape,
format_rank_display,
)
from trendradar.report.formatter import format_title_for_platform
from trendradar.report.html import render_html_content
from trendradar.report.generator import (
prepare_report_data,
generate_html_report,
)
__all__ = [
# 辅助函数
"clean_title",
"html_escape",
"format_rank_display",
# 格式化函数
"format_title_for_platform",
# HTML 渲染
"render_html_content",
# 报告生成器
"prepare_report_data",
"generate_html_report",
]
+223
View File
@@ -0,0 +1,223 @@
# coding=utf-8
"""
平台标题格式化模块
提供多平台标题格式化功能
"""
from typing import Dict
from trendradar.report.helpers import clean_title, html_escape, format_rank_display
def format_title_for_platform(
    platform: str, title_data: Dict, show_source: bool = True
) -> str:
    """Format a single news title for the given target platform.

    Args:
        platform: target platform, one of:
            - "feishu": Feishu/Lark
            - "dingtalk": DingTalk
            - "wework": WeCom (Enterprise WeChat)
            - "bark": Bark (shares the WeCom markdown shape)
            - "telegram": Telegram (HTML parse mode)
            - "ntfy": ntfy
            - "slack": Slack (mrkdwn)
            - "html": HTML report
        title_data: title payload with fields:
            - title: title text
            - source_name: source display name
            - time_display: time label
            - count: number of appearances
            - ranks: rank list
            - rank_threshold: highlight threshold
            - url: desktop link
            - mobile_url: mobile link (preferred when present)
            - is_new: whether the title is newly appeared (optional)
        show_source: whether to prepend the source name

    Returns:
        The formatted title string; unknown platforms get the cleaned
        plain title.
    """
    rank_display = format_rank_display(
        title_data["ranks"], title_data["rank_threshold"], platform
    )
    # Mobile link wins over the desktop link.
    link_url = title_data["mobile_url"] or title_data["url"]
    cleaned_title = clean_title(title_data["title"])
    title_prefix = "🆕 " if title_data.get("is_new") else ""

    if platform == "feishu":
        formatted_title = (
            f"[{cleaned_title}]({link_url})" if link_url else cleaned_title
        )
        if show_source:
            result = f"<font color='grey'>[{title_data['source_name']}]</font> {title_prefix}{formatted_title}"
        else:
            result = f"{title_prefix}{formatted_title}"
        if rank_display:
            result += f" {rank_display}"
        if title_data["time_display"]:
            result += f" <font color='grey'>- {title_data['time_display']}</font>"
        if title_data["count"] > 1:
            result += f" <font color='green'>({title_data['count']}次)</font>"
        return result

    if platform in ("dingtalk", "wework", "bark"):
        # These three share an identical plain-markdown layout.
        formatted_title = (
            f"[{cleaned_title}]({link_url})" if link_url else cleaned_title
        )
        if show_source:
            result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
        else:
            result = f"{title_prefix}{formatted_title}"
        if rank_display:
            result += f" {rank_display}"
        if title_data["time_display"]:
            result += f" - {title_data['time_display']}"
        if title_data["count"] > 1:
            result += f" ({title_data['count']}次)"
        return result

    if platform == "telegram":
        if link_url:
            formatted_title = f'<a href="{link_url}">{html_escape(cleaned_title)}</a>'
        else:
            # BUGFIX: the unlinked title must be escaped too — Telegram
            # HTML parse mode chokes on raw <, >, & in the text.
            formatted_title = html_escape(cleaned_title)
        if show_source:
            result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
        else:
            result = f"{title_prefix}{formatted_title}"
        if rank_display:
            result += f" {rank_display}"
        if title_data["time_display"]:
            result += f" <code>- {title_data['time_display']}</code>"
        if title_data["count"] > 1:
            result += f" <code>({title_data['count']}次)</code>"
        return result

    if platform == "ntfy":
        formatted_title = (
            f"[{cleaned_title}]({link_url})" if link_url else cleaned_title
        )
        if show_source:
            result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
        else:
            result = f"{title_prefix}{formatted_title}"
        if rank_display:
            result += f" {rank_display}"
        if title_data["time_display"]:
            result += f" `- {title_data['time_display']}`"
        if title_data["count"] > 1:
            result += f" `({title_data['count']}次)`"
        return result

    if platform == "slack":
        # Slack mrkdwn link syntax: <url|text>.
        formatted_title = f"<{link_url}|{cleaned_title}>" if link_url else cleaned_title
        if show_source:
            result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
        else:
            result = f"{title_prefix}{formatted_title}"
        if rank_display:
            result += f" {rank_display}"
        if title_data["time_display"]:
            result += f" `- {title_data['time_display']}`"
        if title_data["count"] > 1:
            result += f" `({title_data['count']}次)`"
        return result

    if platform == "html":
        escaped_title = html_escape(cleaned_title)
        escaped_source_name = html_escape(title_data["source_name"])
        if link_url:
            escaped_url = html_escape(link_url)
            formatted_title = f'[{escaped_source_name}] <a href="{escaped_url}" target="_blank" class="news-link">{escaped_title}</a>'
        else:
            formatted_title = (
                f'[{escaped_source_name}] <span class="no-link">{escaped_title}</span>'
            )
        if rank_display:
            formatted_title += f" {rank_display}"
        if title_data["time_display"]:
            escaped_time = html_escape(title_data["time_display"])
            formatted_title += f" <font color='grey'>- {escaped_time}</font>"
        if title_data["count"] > 1:
            formatted_title += f" <font color='green'>({title_data['count']}次)</font>"
        if title_data.get("is_new"):
            formatted_title = f"<div class='new-title'>🆕 {formatted_title}</div>"
        return formatted_title

    return cleaned_title
+235
View File
@@ -0,0 +1,235 @@
# coding=utf-8
"""
报告生成模块
提供报告数据准备和 HTML 生成功能:
- prepare_report_data: 准备报告数据
- generate_html_report: 生成 HTML 报告
"""
from pathlib import Path
from typing import Dict, List, Optional, Callable
def prepare_report_data(
    stats: List[Dict],
    failed_ids: Optional[List] = None,
    new_titles: Optional[Dict] = None,
    id_to_name: Optional[Dict] = None,
    mode: str = "daily",
    rank_threshold: int = 3,
    matches_word_groups_func: Optional[Callable] = None,
    load_frequency_words_func: Optional[Callable] = None,
) -> Dict:
    """Normalize raw statistics into the structure the renderers expect.

    Args:
        stats: per-keyword statistics (entries with count <= 0 are dropped)
        failed_ids: source ids whose crawl failed
        new_titles: newly appeared titles, {source_id: {title: payload}}
        id_to_name: source id -> display name mapping
        mode: report mode (daily/incremental/current); incremental mode
            suppresses the new-titles section entirely
        rank_threshold: highlight threshold copied onto every entry
        matches_word_groups_func: optional predicate to filter new titles
        load_frequency_words_func: optional loader returning
            (word_groups, filter_words, global_filters) for the predicate

    Returns:
        Dict with keys "stats", "new_titles", "failed_ids",
        "total_new_count".
    """
    processed_new_titles = []

    # Incremental reports hide the dedicated new-titles block.
    if mode != "incremental":
        kept_new_titles = {}
        if new_titles and id_to_name:
            if matches_word_groups_func and load_frequency_words_func:
                # Filter new titles through the frequency-word matcher.
                word_groups, filter_words, global_filters = (
                    load_frequency_words_func()
                )
                for source_id, titles_data in new_titles.items():
                    kept = {
                        title: payload
                        for title, payload in titles_data.items()
                        if matches_word_groups_func(
                            title, word_groups, filter_words, global_filters
                        )
                    }
                    if kept:
                        kept_new_titles[source_id] = kept
            else:
                # No matcher supplied: keep everything.
                kept_new_titles = new_titles

        # Report the post-filter count (matches what gets pushed).
        raw_count = (
            sum(len(titles) for titles in new_titles.values()) if new_titles else 0
        )
        kept_count = (
            sum(len(titles) for titles in kept_new_titles.values())
            if kept_new_titles
            else 0
        )
        if raw_count > 0:
            print(f"频率词过滤后:{kept_count} 条新增热点匹配(原始 {raw_count} 条)")

        if kept_new_titles and id_to_name:
            for source_id, titles_data in kept_new_titles.items():
                source_name = id_to_name.get(source_id, source_id)
                entries = [
                    {
                        "title": title,
                        "source_name": source_name,
                        "time_display": "",
                        "count": 1,
                        "ranks": payload.get("ranks", []),
                        "rank_threshold": rank_threshold,
                        "url": payload.get("url", ""),
                        "mobile_url": payload.get("mobileUrl", ""),
                        "is_new": True,
                    }
                    for title, payload in titles_data.items()
                ]
                if entries:
                    processed_new_titles.append(
                        {
                            "source_id": source_id,
                            "source_name": source_name,
                            "titles": entries,
                        }
                    )

    # Keyword statistics: drop empty groups, normalize each title entry.
    processed_stats = [
        {
            "word": stat["word"],
            "count": stat["count"],
            "percentage": stat.get("percentage", 0),
            "titles": [
                {
                    "title": td["title"],
                    "source_name": td["source_name"],
                    "time_display": td["time_display"],
                    "count": td["count"],
                    "ranks": td["ranks"],
                    "rank_threshold": td["rank_threshold"],
                    "url": td.get("url", ""),
                    "mobile_url": td.get("mobileUrl", ""),
                    "is_new": td.get("is_new", False),
                }
                for td in stat["titles"]
            ],
        }
        for stat in stats
        if stat["count"] > 0
    ]

    return {
        "stats": processed_stats,
        "new_titles": processed_new_titles,
        "failed_ids": failed_ids or [],
        "total_new_count": sum(
            len(source["titles"]) for source in processed_new_titles
        ),
    }
def generate_html_report(
    stats: List[Dict],
    total_titles: int,
    failed_ids: Optional[List] = None,
    new_titles: Optional[Dict] = None,
    id_to_name: Optional[Dict] = None,
    mode: str = "daily",
    is_daily_summary: bool = False,
    update_info: Optional[Dict] = None,
    rank_threshold: int = 3,
    output_dir: str = "output",
    date_folder: str = "",
    time_filename: str = "",
    render_html_func: Optional[Callable] = None,
    matches_word_groups_func: Optional[Callable] = None,
    load_frequency_words_func: Optional[Callable] = None,
    enable_index_copy: bool = True,
) -> str:
    """Render an HTML report to disk and return its file path.

    Args:
        stats: statistics list
        total_titles: total number of titles crawled
        failed_ids: source ids whose crawl failed
        new_titles: newly appeared titles
        id_to_name: source id -> display name mapping
        mode: report mode (daily/incremental/current)
        is_daily_summary: whether this is a daily summary report
        update_info: version-update info for the footer
        rank_threshold: rank highlight threshold
        output_dir: output root directory
        date_folder: date folder name under the output root
        time_filename: time-based filename (non-summary reports)
        render_html_func: HTML renderer callable
        matches_word_groups_func: keyword matcher callable
        load_frequency_words_func: frequency-word loader callable
        enable_index_copy: also write index.html copies for summaries

    Returns:
        str: path of the generated HTML file.
    """
    # Summary reports get a fixed name per mode; others use the timestamp.
    if is_daily_summary:
        summary_names = {
            "current": "当前榜单汇总.html",
            "incremental": "当日增量.html",
        }
        filename = summary_names.get(mode, "当日汇总.html")
    else:
        filename = f"{time_filename}.html"

    html_dir = Path(output_dir) / date_folder / "html"
    html_dir.mkdir(parents=True, exist_ok=True)
    file_path = str(html_dir / filename)

    report_data = prepare_report_data(
        stats,
        failed_ids,
        new_titles,
        id_to_name,
        mode,
        rank_threshold,
        matches_word_groups_func,
        load_frequency_words_func,
    )

    if render_html_func:
        html_content = render_html_func(
            report_data, total_titles, is_daily_summary, mode, update_info
        )
    else:
        # Fallback: bare-bones dump of the prepared data.
        html_content = (
            f"<html><body><h1>Report</h1><pre>{report_data}</pre></body></html>"
        )

    Path(file_path).write_text(html_content, encoding="utf-8")

    if is_daily_summary and enable_index_copy:
        # Copy to the repo root (served by GitHub Pages)...
        Path("index.html").write_text(html_content, encoding="utf-8")
        # ...and into the output dir (for Docker volume mounts).
        Path(output_dir).mkdir(parents=True, exist_ok=True)
        (Path(output_dir) / "index.html").write_text(html_content, encoding="utf-8")

    return file_path
+125
View File
@@ -0,0 +1,125 @@
# coding=utf-8
"""
报告辅助函数模块
提供报告生成相关的通用辅助函数
"""
import re
from typing import List
def clean_title(title: str) -> str:
    """Normalize a title to a single-line, single-spaced string.

    Non-string input is coerced via ``str``. All whitespace runs
    (including newlines and carriage returns) collapse to one space and
    leading/trailing whitespace is removed.

    Args:
        title: raw title value

    Returns:
        The cleaned title string.
    """
    text = title if isinstance(title, str) else str(title)
    # split() with no argument splits on any whitespace run and drops
    # edge whitespace, so the join yields the fully collapsed form.
    return " ".join(text.split())
def html_escape(text: str) -> str:
    """Escape HTML special characters.

    Delegates to the stdlib :func:`html.escape`, which performs exactly
    the same five substitutions as the previous hand-rolled chain, with
    ``&`` replaced first to avoid double-escaping:
    & -> &amp;, < -> &lt;, > -> &gt;, " -> &quot;, ' -> &#x27;

    Args:
        text: raw text (coerced via ``str`` if not already a string)

    Returns:
        The escaped text.
    """
    import html  # function-scope import keeps module imports unchanged

    if not isinstance(text, str):
        text = str(text)
    return html.escape(text, quote=True)
def format_rank_display(ranks: List[int], rank_threshold: int, format_type: str) -> str:
    """Render a rank (or rank range) string for the given platform.

    The span is highlighted when the best (smallest) rank is within
    ``rank_threshold``; the highlight markup depends on the platform.

    Args:
        ranks: rank list (duplicates allowed)
        rank_threshold: highlight when the minimum rank is <= this value
        format_type: platform type — "html", "feishu", "telegram",
            "slack", "dingtalk", "wework", or anything else for the
            default markdown bold

    Returns:
        A string like "[1]" or "[1 - 5]", optionally wrapped in
        highlight markup; "" when ``ranks`` is empty.
    """
    if not ranks:
        return ""

    ordered = sorted(set(ranks))
    best, worst = ordered[0], ordered[-1]
    span = f"[{best}]" if best == worst else f"[{best} - {worst}]"

    if best > rank_threshold:
        return span

    # Platform-specific highlight delimiters; dingtalk/wework and any
    # unknown platform fall back to plain markdown bold.
    highlight = {
        "html": ("<font color='red'><strong>", "</strong></font>"),
        "feishu": ("<font color='red'>**", "**</font>"),
        "telegram": ("<b>", "</b>"),
        "slack": ("*", "*"),
    }
    prefix, suffix = highlight.get(format_type, ("**", "**"))
    return f"{prefix}{span}{suffix}"
File diff suppressed because it is too large Load Diff
+44
View File
@@ -0,0 +1,44 @@
# coding=utf-8
"""
存储模块 - 支持多种存储后端
支持的存储后端:
- local: 本地 SQLite + TXT/HTML 文件
- remote: 远程云存储(S3 兼容协议:R2/OSS/COS/S3 等)
- auto: 根据环境自动选择(GitHub Actions 用 remote,其他用 local
"""
from trendradar.storage.base import (
StorageBackend,
NewsItem,
NewsData,
convert_crawl_results_to_news_data,
convert_news_data_to_results,
)
from trendradar.storage.local import LocalStorageBackend
from trendradar.storage.manager import StorageManager, get_storage_manager
# 远程后端可选导入(需要 boto3)
try:
from trendradar.storage.remote import RemoteStorageBackend
HAS_REMOTE = True
except ImportError:
RemoteStorageBackend = None
HAS_REMOTE = False
__all__ = [
# 基础类
"StorageBackend",
"NewsItem",
"NewsData",
# 转换函数
"convert_crawl_results_to_news_data",
"convert_news_data_to_results",
# 后端实现
"LocalStorageBackend",
"RemoteStorageBackend",
"HAS_REMOTE",
# 管理器
"StorageManager",
"get_storage_manager",
]
+457
View File
@@ -0,0 +1,457 @@
# coding=utf-8
"""
存储后端抽象基类和数据模型
定义统一的存储接口,所有存储后端都需要实现这些方法
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from typing import Dict, List, Optional, Any
import json
@dataclass
class NewsItem:
    """A single trending-news entry.

    The keys produced by :meth:`to_dict` are exactly the attribute
    names, so serialization round-trips through :meth:`from_dict`.
    """

    title: str  # news headline
    source_id: str  # source platform id (e.g. toutiao, baidu)
    source_name: str = ""  # display name, runtime-only (not stored in the DB)
    rank: int = 0  # rank on the board
    url: str = ""  # desktop link URL
    mobile_url: str = ""  # mobile link URL
    crawl_time: str = ""  # crawl time (HH:MM)

    # Aggregated statistics used by the analysis step
    ranks: List[int] = field(default_factory=list)  # historical ranks
    first_time: str = ""  # first time seen
    last_time: str = ""  # last time seen
    count: int = 1  # number of appearances

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict (keys match attribute names)."""
        keys = (
            "title", "source_id", "source_name", "rank", "url",
            "mobile_url", "crawl_time", "ranks", "first_time",
            "last_time", "count",
        )
        return {key: getattr(self, key) for key in keys}

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "NewsItem":
        """Build an instance from a dict; missing keys use defaults."""
        # The defaults mapping is rebuilt per call, so the empty ranks
        # list is never shared between instances.
        defaults = {
            "title": "",
            "source_id": "",
            "source_name": "",
            "rank": 0,
            "url": "",
            "mobile_url": "",
            "crawl_time": "",
            "ranks": [],
            "first_time": "",
            "last_time": "",
            "count": 1,
        }
        return cls(
            **{key: data.get(key, fallback) for key, fallback in defaults.items()}
        )
@dataclass
class NewsData:
    """A collection of news items from one crawl (or a merge of crawls).

    Structure:
        - date: date string (YYYY-MM-DD)
        - crawl_time: crawl time label (HH时MM分)
        - items: news entries grouped by source id
        - id_to_name: source id -> display name mapping
        - failed_ids: source ids whose crawl failed
    """
    date: str  # date (YYYY-MM-DD)
    crawl_time: str  # crawl time label
    items: Dict[str, List[NewsItem]]  # news grouped by source id
    id_to_name: Dict[str, str] = field(default_factory=dict)  # id -> display name
    failed_ids: List[str] = field(default_factory=list)  # failed source ids
    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict; nested NewsItems serialized too."""
        items_dict = {}
        for source_id, news_list in self.items.items():
            items_dict[source_id] = [item.to_dict() for item in news_list]
        return {
            "date": self.date,
            "crawl_time": self.crawl_time,
            "items": items_dict,
            "id_to_name": self.id_to_name,
            "failed_ids": self.failed_ids,
        }
    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "NewsData":
        """Build an instance from a dict produced by :meth:`to_dict`.

        Missing keys fall back to empty defaults.
        """
        items = {}
        items_data = data.get("items", {})
        for source_id, news_list in items_data.items():
            items[source_id] = [NewsItem.from_dict(item) for item in news_list]
        return cls(
            date=data.get("date", ""),
            crawl_time=data.get("crawl_time", ""),
            items=items,
            id_to_name=data.get("id_to_name", {}),
            failed_ids=data.get("failed_ids", []),
        )
    def get_total_count(self) -> int:
        """Total number of news items across all sources."""
        return sum(len(news_list) for news_list in self.items.values())
    def merge_with(self, other: "NewsData") -> "NewsData":
        """Merge ``other`` into this data set and return the result.

        Merge rules:
            - entries with the same source_id + title merge rank history
            - last_time and count are updated
            - the earlier first_time is kept

        NOTE(review): matching ``NewsItem`` objects held by ``self.items``
        are mutated in place and shared with the returned NewsData —
        callers must not rely on ``self`` being unchanged after a merge.
        """
        merged_items = {}
        # Index current items by title; values are the same NewsItem
        # objects as in self.items (no copies are made).
        for source_id, news_list in self.items.items():
            merged_items[source_id] = {item.title: item for item in news_list}
        # Fold in the other data set.
        for source_id, news_list in other.items.items():
            if source_id not in merged_items:
                merged_items[source_id] = {}
            for item in news_list:
                if item.title in merged_items[source_id]:
                    # Merge into the existing entry (in-place mutation).
                    existing = merged_items[source_id][item.title]
                    # Union of rank histories, sorted ascending.
                    existing_ranks = set(existing.ranks) if existing.ranks else set()
                    new_ranks = set(item.ranks) if item.ranks else set()
                    merged_ranks = sorted(existing_ranks | new_ranks)
                    existing.ranks = merged_ranks
                    # Keep the earliest first_time / latest last_time
                    # (string comparison; assumes a sortable time format).
                    if item.first_time and (not existing.first_time or item.first_time < existing.first_time):
                        existing.first_time = item.first_time
                    if item.last_time and (not existing.last_time or item.last_time > existing.last_time):
                        existing.last_time = item.last_time
                    # NOTE(review): adds 1 rather than item.count — fine
                    # when merging single-crawl snapshots (count == 1);
                    # confirm before merging pre-aggregated data.
                    existing.count += 1
                    # Backfill URLs only when missing on the existing entry.
                    if not existing.url and item.url:
                        existing.url = item.url
                    if not existing.mobile_url and item.mobile_url:
                        existing.mobile_url = item.mobile_url
                else:
                    # Unseen title: adopt the item object as-is.
                    merged_items[source_id][item.title] = item
        # Convert back to the list-per-source representation.
        final_items = {}
        for source_id, items_dict in merged_items.items():
            final_items[source_id] = list(items_dict.values())
        # Later mapping (other) wins on id_to_name conflicts.
        merged_id_to_name = {**self.id_to_name, **other.id_to_name}
        # De-duplicated union of failures (order not preserved).
        merged_failed_ids = list(set(self.failed_ids + other.failed_ids))
        return NewsData(
            date=self.date or other.date,
            crawl_time=other.crawl_time,  # use the newer crawl time
            items=final_items,
            id_to_name=merged_id_to_name,
            failed_ids=merged_failed_ids,
        )
class StorageBackend(ABC):
    """
    Abstract base class for storage backends.

    Every backend must implement these methods to support:
    - saving news data
    - reading all of today's data
    - detecting newly-appeared news
    - generating report files (TXT/HTML)
    """
    @abstractmethod
    def save_news_data(self, data: NewsData) -> bool:
        """
        Save news data.

        Args:
            data: the news data

        Returns:
            True on success.
        """
        pass
    @abstractmethod
    def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
        """
        Get all news data stored for a date.

        Args:
            date: date string (YYYY-MM-DD), defaults to today

        Returns:
            The merged news data, or None when nothing is stored.
        """
        pass
    @abstractmethod
    def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
        """
        Get the data of the most recent crawl.

        Args:
            date: date string, defaults to today

        Returns:
            The latest crawl's news data.
        """
        pass
    @abstractmethod
    def detect_new_titles(self, current_data: NewsData) -> Dict[str, Dict]:
        """
        Detect newly-appeared titles.

        Args:
            current_data: the data just crawled

        Returns:
            New titles, shaped {source_id: {title: title_data}}.
        """
        pass
    @abstractmethod
    def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
        """
        Save a TXT snapshot (optional; local environments only).

        Args:
            data: the news data

        Returns:
            The saved file path, or None when unsupported.
        """
        pass
    @abstractmethod
    def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
        """
        Save an HTML report.

        Args:
            html_content: the HTML body
            filename: target file name
            is_summary: whether this is a summary report

        Returns:
            The saved file path.
        """
        pass
    @abstractmethod
    def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
        """
        Check whether this is the first crawl of the day.

        Args:
            date: date string, defaults to today

        Returns:
            True when it is the first crawl.
        """
        pass
    @abstractmethod
    def cleanup(self) -> None:
        """
        Release resources (temp files, database connections, ...).
        """
        pass
    @abstractmethod
    def cleanup_old_data(self, retention_days: int) -> int:
        """
        Delete expired data.

        Args:
            retention_days: days to keep (0 disables cleanup)

        Returns:
            Number of date directories removed.
        """
        pass
    @property
    @abstractmethod
    def backend_name(self) -> str:
        """
        Name identifying this storage backend.
        """
        pass
    @property
    @abstractmethod
    def supports_txt(self) -> bool:
        """
        Whether TXT snapshots are supported.
        """
        pass
    # === Push-record methods ===
    @abstractmethod
    def has_pushed_today(self, date: Optional[str] = None) -> bool:
        """
        Check whether a push already happened on the given date.

        Args:
            date: date string (YYYY-MM-DD), defaults to today

        Returns:
            True when already pushed.
        """
        pass
    @abstractmethod
    def record_push(self, report_type: str, date: Optional[str] = None) -> bool:
        """
        Record a push.

        Args:
            report_type: type of the report pushed
            date: date string (YYYY-MM-DD), defaults to today

        Returns:
            True when recorded.
        """
        pass
def convert_crawl_results_to_news_data(
    results: Dict[str, Dict],
    id_to_name: Dict[str, str],
    failed_ids: List[str],
    crawl_time: str,
    crawl_date: str,
) -> NewsData:
    """
    Convert raw crawler output into a NewsData object.

    Args:
        results: crawler output, {source_id: {title: {ranks: [], url: "", mobileUrl: ""}}}
        id_to_name: mapping of source id -> display name
        failed_ids: source ids whose crawl failed
        crawl_time: crawl time (HH:MM)
        crawl_date: crawl date (YYYY-MM-DD)

    Returns:
        A NewsData object
    """
    items = {}
    for source_id, titles_data in results.items():
        display_name = id_to_name.get(source_id, source_id)
        converted = []
        for title, payload in titles_data.items():
            if isinstance(payload, dict):
                ranks = payload.get("ranks", [])
                url = payload.get("url", "")
                mobile_url = payload.get("mobileUrl", "")
            else:
                # Legacy format: the payload is the rank list itself
                ranks = payload if isinstance(payload, list) else []
                url = ""
                mobile_url = ""
            converted.append(
                NewsItem(
                    title=title,
                    source_id=source_id,
                    source_name=display_name,
                    rank=ranks[0] if ranks else 99,  # 99 when no rank is known
                    url=url,
                    mobile_url=mobile_url,
                    crawl_time=crawl_time,
                    ranks=ranks,
                    first_time=crawl_time,
                    last_time=crawl_time,
                    count=1,
                )
            )
        items[source_id] = converted
    return NewsData(
        date=crawl_date,
        crawl_time=crawl_time,
        items=items,
        id_to_name=id_to_name,
        failed_ids=failed_ids,
    )
def convert_news_data_to_results(data: NewsData) -> tuple:
    """
    Convert a NewsData object back to the legacy results format
    (kept for compatibility with existing callers).

    Args:
        data: a NewsData object

    Returns:
        A (results, id_to_name, title_info) tuple.
    """
    results = {}
    title_info = {}
    for source_id, news_list in data.items.items():
        source_results = {}
        source_info = {}
        for item in news_list:
            link_fields = {
                "ranks": item.ranks,
                "url": item.url,
                "mobileUrl": item.mobile_url,
            }
            source_results[item.title] = dict(link_fields)
            source_info[item.title] = {
                "first_time": item.first_time,
                "last_time": item.last_time,
                "count": item.count,
                **link_fields,
            }
        results[source_id] = source_results
        title_info[source_id] = source_info
    return results, data.id_to_name, title_info
+869
View File
@@ -0,0 +1,869 @@
# coding=utf-8
"""
本地存储后端 - SQLite + TXT/HTML
使用 SQLite 作为主存储,支持可选的 TXT 快照和 HTML 报告
"""
import sqlite3
import os
import shutil
import pytz
import re
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Any
from trendradar.storage.base import StorageBackend, NewsItem, NewsData
from trendradar.utils.time import (
get_configured_time,
format_date_folder,
format_time_filename,
)
class LocalStorageBackend(StorageBackend):
    """
    Local storage backend (SQLite + optional TXT/HTML files).

    Uses one SQLite database per date folder and supports:
    - per-date SQLite database files
    - optional TXT snapshots (debug aid)
    - HTML report generation
    """
    def __init__(
        self,
        data_dir: str = "output",
        enable_txt: bool = True,
        enable_html: bool = True,
        timezone: str = "Asia/Shanghai",
    ):
        """
        Initialize the local storage backend.

        Args:
            data_dir: root directory for all output data
            enable_txt: whether TXT snapshots are written
            enable_html: whether HTML reports are written
            timezone: timezone name (default Asia/Shanghai)
        """
        self.data_dir = Path(data_dir)
        self.enable_txt = enable_txt
        self.enable_html = enable_html
        self.timezone = timezone
        # Cache of open connections keyed by database file path
        self._db_connections: Dict[str, sqlite3.Connection] = {}
    @property
    def backend_name(self) -> str:
        # Identifier used in logs/config to refer to this backend
        return "local"
    @property
    def supports_txt(self) -> bool:
        # TXT snapshots are only available when enabled in config
        return self.enable_txt
    def _get_configured_time(self) -> datetime:
        """Return the current time in the configured timezone."""
        return get_configured_time(self.timezone)
    def _format_date_folder(self, date: Optional[str] = None) -> str:
        """Format the date folder name (ISO: YYYY-MM-DD); defaults to today."""
        return format_date_folder(date, self.timezone)
    def _format_time_filename(self) -> str:
        """Format the time-based filename stem (HH-MM)."""
        return format_time_filename(self.timezone)
    def _get_db_path(self, date: Optional[str] = None) -> Path:
        """Return (and create the parent folder for) the per-date SQLite path."""
        date_folder = self._format_date_folder(date)
        db_dir = self.data_dir / date_folder
        db_dir.mkdir(parents=True, exist_ok=True)  # ensure output/<date>/ exists
        return db_dir / "news.db"
    def _get_connection(self, date: Optional[str] = None) -> sqlite3.Connection:
        """Return a cached sqlite3 connection for the date, creating tables on first open."""
        db_path = str(self._get_db_path(date))
        if db_path not in self._db_connections:
            conn = sqlite3.connect(db_path)
            conn.row_factory = sqlite3.Row  # allow column access by name
            self._init_tables(conn)
            self._db_connections[db_path] = conn
        return self._db_connections[db_path]
    def _get_schema_path(self) -> Path:
        """Return the path of schema.sql shipped next to this module."""
        return Path(__file__).parent / "schema.sql"
    def _init_tables(self, conn: sqlite3.Connection) -> None:
        """Create the database tables from schema.sql (idempotent).

        Raises:
            FileNotFoundError: when schema.sql is missing from the package.
        """
        schema_path = self._get_schema_path()
        if schema_path.exists():
            with open(schema_path, "r", encoding="utf-8") as f:
                schema_sql = f.read()
            conn.executescript(schema_sql)
        else:
            raise FileNotFoundError(f"Schema file not found: {schema_path}")
        conn.commit()
def save_news_data(self, data: NewsData) -> bool:
"""
保存新闻数据到 SQLite(以 URL 为唯一标识,支持标题更新检测)
Args:
data: 新闻数据
Returns:
是否保存成功
"""
try:
conn = self._get_connection(data.date)
cursor = conn.cursor()
# 获取配置时区的当前时间
now_str = self._get_configured_time().strftime("%Y-%m-%d %H:%M:%S")
# 首先同步平台信息到 platforms 表
for source_id, source_name in data.id_to_name.items():
cursor.execute("""
INSERT INTO platforms (id, name, updated_at)
VALUES (?, ?, ?)
ON CONFLICT(id) DO UPDATE SET
name = excluded.name,
updated_at = excluded.updated_at
""", (source_id, source_name, now_str))
# 统计计数器
new_count = 0
updated_count = 0
title_changed_count = 0
success_sources = []
for source_id, news_list in data.items.items():
success_sources.append(source_id)
for item in news_list:
try:
# 检查是否已存在(通过 URL + platform_id
if item.url:
cursor.execute("""
SELECT id, title FROM news_items
WHERE url = ? AND platform_id = ?
""", (item.url, source_id))
existing = cursor.fetchone()
if existing:
# 已存在,更新记录
existing_id, existing_title = existing
# 检查标题是否变化
if existing_title != item.title:
# 记录标题变更
cursor.execute("""
INSERT INTO title_changes
(news_item_id, old_title, new_title, changed_at)
VALUES (?, ?, ?, ?)
""", (existing_id, existing_title, item.title, now_str))
title_changed_count += 1
# 记录排名历史
cursor.execute("""
INSERT INTO rank_history
(news_item_id, rank, crawl_time, created_at)
VALUES (?, ?, ?, ?)
""", (existing_id, item.rank, data.crawl_time, now_str))
# 更新现有记录
cursor.execute("""
UPDATE news_items SET
title = ?,
rank = ?,
mobile_url = ?,
last_crawl_time = ?,
crawl_count = crawl_count + 1,
updated_at = ?
WHERE id = ?
""", (item.title, item.rank, item.mobile_url,
data.crawl_time, now_str, existing_id))
updated_count += 1
else:
# 不存在,插入新记录
cursor.execute("""
INSERT INTO news_items
(title, platform_id, rank, url, mobile_url,
first_crawl_time, last_crawl_time, crawl_count,
created_at, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?)
""", (item.title, source_id, item.rank, item.url,
item.mobile_url, data.crawl_time, data.crawl_time,
now_str, now_str))
new_id = cursor.lastrowid
# 记录初始排名
cursor.execute("""
INSERT INTO rank_history
(news_item_id, rank, crawl_time, created_at)
VALUES (?, ?, ?, ?)
""", (new_id, item.rank, data.crawl_time, now_str))
new_count += 1
else:
# URL 为空的情况,直接插入(不做去重)
cursor.execute("""
INSERT INTO news_items
(title, platform_id, rank, url, mobile_url,
first_crawl_time, last_crawl_time, crawl_count,
created_at, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?)
""", (item.title, source_id, item.rank, item.url,
item.mobile_url, data.crawl_time, data.crawl_time,
now_str, now_str))
new_id = cursor.lastrowid
# 记录初始排名
cursor.execute("""
INSERT INTO rank_history
(news_item_id, rank, crawl_time, created_at)
VALUES (?, ?, ?, ?)
""", (new_id, item.rank, data.crawl_time, now_str))
new_count += 1
except sqlite3.Error as e:
print(f"保存新闻条目失败 [{item.title[:30]}...]: {e}")
total_items = new_count + updated_count
# 记录抓取信息
cursor.execute("""
INSERT OR REPLACE INTO crawl_records
(crawl_time, total_items, created_at)
VALUES (?, ?, ?)
""", (data.crawl_time, total_items, now_str))
# 获取刚插入的 crawl_record 的 ID
cursor.execute("""
SELECT id FROM crawl_records WHERE crawl_time = ?
""", (data.crawl_time,))
record_row = cursor.fetchone()
if record_row:
crawl_record_id = record_row[0]
# 记录成功的来源
for source_id in success_sources:
cursor.execute("""
INSERT OR REPLACE INTO crawl_source_status
(crawl_record_id, platform_id, status)
VALUES (?, ?, 'success')
""", (crawl_record_id, source_id))
# 记录失败的来源
for failed_id in data.failed_ids:
# 确保失败的平台也在 platforms 表中
cursor.execute("""
INSERT OR IGNORE INTO platforms (id, name, updated_at)
VALUES (?, ?, ?)
""", (failed_id, failed_id, now_str))
cursor.execute("""
INSERT OR REPLACE INTO crawl_source_status
(crawl_record_id, platform_id, status)
VALUES (?, ?, 'failed')
""", (crawl_record_id, failed_id))
conn.commit()
# 输出详细的存储统计日志
log_parts = [f"[本地存储] 处理完成:新增 {new_count}"]
if updated_count > 0:
log_parts.append(f"更新 {updated_count}")
if title_changed_count > 0:
log_parts.append(f"标题变更 {title_changed_count}")
print("".join(log_parts))
return True
except Exception as e:
print(f"[本地存储] 保存失败: {e}")
return False
    def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
        """
        Load every news item stored for the given date (merged view).

        Args:
            date: date string, defaults to today

        Returns:
            The merged news data, or None when nothing is stored / on error.
        """
        try:
            db_path = self._get_db_path(date)
            if not db_path.exists():
                return None
            conn = self._get_connection(date)
            cursor = conn.cursor()
            # All news rows with platform display names; the row id is needed
            # to attach rank history below
            cursor.execute("""
                SELECT n.id, n.title, n.platform_id, p.name as platform_name,
                       n.rank, n.url, n.mobile_url,
                       n.first_crawl_time, n.last_crawl_time, n.crawl_count
                FROM news_items n
                LEFT JOIN platforms p ON n.platform_id = p.id
                ORDER BY n.platform_id, n.last_crawl_time
            """)
            rows = cursor.fetchall()
            if not rows:
                return None
            # Collect all news_item ids
            news_ids = [row[0] for row in rows]
            # Batch-load rank history in one query (de-duplicated, in crawl order)
            rank_history_map: Dict[int, List[int]] = {}
            if news_ids:
                placeholders = ",".join("?" * len(news_ids))
                cursor.execute(f"""
                    SELECT news_item_id, rank FROM rank_history
                    WHERE news_item_id IN ({placeholders})
                    ORDER BY news_item_id, crawl_time
                """, news_ids)
                for rh_row in cursor.fetchall():
                    news_id, rank = rh_row[0], rh_row[1]
                    if news_id not in rank_history_map:
                        rank_history_map[news_id] = []
                    if rank not in rank_history_map[news_id]:
                        rank_history_map[news_id].append(rank)
            # Group rows by platform_id
            items: Dict[str, List[NewsItem]] = {}
            id_to_name: Dict[str, str] = {}
            crawl_date = self._format_date_folder(date)
            for row in rows:
                news_id = row[0]
                platform_id = row[2]
                title = row[1]
                platform_name = row[3] or platform_id
                id_to_name[platform_id] = platform_name
                if platform_id not in items:
                    items[platform_id] = []
                # Fall back to the current rank when no history exists
                ranks = rank_history_map.get(news_id, [row[4]])
                items[platform_id].append(NewsItem(
                    title=title,
                    source_id=platform_id,
                    source_name=platform_name,
                    rank=row[4],
                    url=row[5] or "",
                    mobile_url=row[6] or "",
                    crawl_time=row[8],  # last_crawl_time
                    ranks=ranks,
                    first_time=row[7],  # first_crawl_time
                    last_time=row[8],  # last_crawl_time
                    count=row[9],  # crawl_count
                ))
            final_items = items
            # Sources that failed in any crawl recorded today
            cursor.execute("""
                SELECT DISTINCT css.platform_id
                FROM crawl_source_status css
                JOIN crawl_records cr ON css.crawl_record_id = cr.id
                WHERE css.status = 'failed'
            """)
            failed_ids = [row[0] for row in cursor.fetchall()]
            # Use the most recent crawl time recorded today
            cursor.execute("""
                SELECT crawl_time FROM crawl_records
                ORDER BY crawl_time DESC
                LIMIT 1
            """)
            time_row = cursor.fetchone()
            crawl_time = time_row[0] if time_row else self._format_time_filename()
            return NewsData(
                date=crawl_date,
                crawl_time=crawl_time,
                items=final_items,
                id_to_name=id_to_name,
                failed_ids=failed_ids,
            )
        except Exception as e:
            print(f"[本地存储] 读取数据失败: {e}")
            return None
    def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
        """
        Load only the most recent crawl's data for the given date.

        Args:
            date: date string, defaults to today

        Returns:
            The latest crawl's news data, or None when absent / on error.
        """
        try:
            db_path = self._get_db_path(date)
            if not db_path.exists():
                return None
            conn = self._get_connection(date)
            cursor = conn.cursor()
            # Find the most recent crawl time
            cursor.execute("""
                SELECT crawl_time FROM crawl_records
                ORDER BY crawl_time DESC
                LIMIT 1
            """)
            time_row = cursor.fetchone()
            if not time_row:
                return None
            latest_time = time_row[0]
            # Items whose last sighting was exactly that crawl (id needed for
            # the rank-history lookup below)
            cursor.execute("""
                SELECT n.id, n.title, n.platform_id, p.name as platform_name,
                       n.rank, n.url, n.mobile_url,
                       n.first_crawl_time, n.last_crawl_time, n.crawl_count
                FROM news_items n
                LEFT JOIN platforms p ON n.platform_id = p.id
                WHERE n.last_crawl_time = ?
            """, (latest_time,))
            rows = cursor.fetchall()
            if not rows:
                return None
            # Collect all news_item ids
            news_ids = [row[0] for row in rows]
            # Batch-load rank history in one query (de-duplicated, in crawl order)
            rank_history_map: Dict[int, List[int]] = {}
            if news_ids:
                placeholders = ",".join("?" * len(news_ids))
                cursor.execute(f"""
                    SELECT news_item_id, rank FROM rank_history
                    WHERE news_item_id IN ({placeholders})
                    ORDER BY news_item_id, crawl_time
                """, news_ids)
                for rh_row in cursor.fetchall():
                    news_id, rank = rh_row[0], rh_row[1]
                    if news_id not in rank_history_map:
                        rank_history_map[news_id] = []
                    if rank not in rank_history_map[news_id]:
                        rank_history_map[news_id].append(rank)
            items: Dict[str, List[NewsItem]] = {}
            id_to_name: Dict[str, str] = {}
            crawl_date = self._format_date_folder(date)
            for row in rows:
                news_id = row[0]
                platform_id = row[2]
                platform_name = row[3] or platform_id
                id_to_name[platform_id] = platform_name
                if platform_id not in items:
                    items[platform_id] = []
                # Fall back to the current rank when no history exists
                ranks = rank_history_map.get(news_id, [row[4]])
                items[platform_id].append(NewsItem(
                    title=row[1],
                    source_id=platform_id,
                    source_name=platform_name,
                    rank=row[4],
                    url=row[5] or "",
                    mobile_url=row[6] or "",
                    crawl_time=row[8],  # last_crawl_time
                    ranks=ranks,
                    first_time=row[7],  # first_crawl_time
                    last_time=row[8],  # last_crawl_time
                    count=row[9],  # crawl_count
                ))
            # Sources that failed in that specific crawl
            cursor.execute("""
                SELECT css.platform_id
                FROM crawl_source_status css
                JOIN crawl_records cr ON css.crawl_record_id = cr.id
                WHERE cr.crawl_time = ? AND css.status = 'failed'
            """, (latest_time,))
            failed_ids = [row[0] for row in cursor.fetchall()]
            return NewsData(
                date=crawl_date,
                crawl_time=latest_time,
                items=items,
                id_to_name=id_to_name,
                failed_ids=failed_ids,
            )
        except Exception as e:
            print(f"[本地存储] 获取最新数据失败: {e}")
            return None
def detect_new_titles(self, current_data: NewsData) -> Dict[str, Dict]:
"""
检测新增的标题
Args:
current_data: 当前抓取的数据
Returns:
新增的标题数据 {source_id: {title: NewsItem}}
"""
try:
# 获取历史数据
historical_data = self.get_today_all_data(current_data.date)
if not historical_data:
# 没有历史数据,所有都是新的
new_titles = {}
for source_id, news_list in current_data.items.items():
new_titles[source_id] = {item.title: item for item in news_list}
return new_titles
# 收集历史标题
historical_titles: Dict[str, set] = {}
for source_id, news_list in historical_data.items.items():
historical_titles[source_id] = {item.title for item in news_list}
# 检测新增
new_titles = {}
for source_id, news_list in current_data.items.items():
hist_set = historical_titles.get(source_id, set())
for item in news_list:
if item.title not in hist_set:
if source_id not in new_titles:
new_titles[source_id] = {}
new_titles[source_id][item.title] = item
return new_titles
except Exception as e:
print(f"[本地存储] 检测新标题失败: {e}")
return {}
def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
"""
保存 TXT 快照
Args:
data: 新闻数据
Returns:
保存的文件路径
"""
if not self.enable_txt:
return None
try:
date_folder = self._format_date_folder(data.date)
txt_dir = self.data_dir / date_folder / "txt"
txt_dir.mkdir(parents=True, exist_ok=True)
file_path = txt_dir / f"{data.crawl_time}.txt"
with open(file_path, "w", encoding="utf-8") as f:
for source_id, news_list in data.items.items():
source_name = data.id_to_name.get(source_id, source_id)
# 写入来源标题
if source_name and source_name != source_id:
f.write(f"{source_id} | {source_name}\n")
else:
f.write(f"{source_id}\n")
# 按排名排序
sorted_news = sorted(news_list, key=lambda x: x.rank)
for item in sorted_news:
line = f"{item.rank}. {item.title}"
if item.url:
line += f" [URL:{item.url}]"
if item.mobile_url:
line += f" [MOBILE:{item.mobile_url}]"
f.write(line + "\n")
f.write("\n")
# 写入失败的来源
if data.failed_ids:
f.write("==== 以下ID请求失败 ====\n")
for failed_id in data.failed_ids:
f.write(f"{failed_id}\n")
print(f"[本地存储] TXT 快照已保存: {file_path}")
return str(file_path)
except Exception as e:
print(f"[本地存储] 保存 TXT 快照失败: {e}")
return None
def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
"""
保存 HTML 报告
Args:
html_content: HTML 内容
filename: 文件名
is_summary: 是否为汇总报告
Returns:
保存的文件路径
"""
if not self.enable_html:
return None
try:
date_folder = self._format_date_folder()
html_dir = self.data_dir / date_folder / "html"
html_dir.mkdir(parents=True, exist_ok=True)
file_path = html_dir / filename
with open(file_path, "w", encoding="utf-8") as f:
f.write(html_content)
print(f"[本地存储] HTML 报告已保存: {file_path}")
return str(file_path)
except Exception as e:
print(f"[本地存储] 保存 HTML 报告失败: {e}")
return None
def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
"""
检查是否是当天第一次抓取
Args:
date: 日期字符串,默认为今天
Returns:
是否是第一次抓取
"""
try:
db_path = self._get_db_path(date)
if not db_path.exists():
return True
conn = self._get_connection(date)
cursor = conn.cursor()
cursor.execute("""
SELECT COUNT(*) as count FROM crawl_records
""")
row = cursor.fetchone()
count = row[0] if row else 0
# 如果只有一条或没有记录,视为第一次抓取
return count <= 1
except Exception as e:
print(f"[本地存储] 检查首次抓取失败: {e}")
return True
def get_crawl_times(self, date: Optional[str] = None) -> List[str]:
"""
获取指定日期的所有抓取时间列表
Args:
date: 日期字符串,默认为今天
Returns:
抓取时间列表(按时间排序)
"""
try:
db_path = self._get_db_path(date)
if not db_path.exists():
return []
conn = self._get_connection(date)
cursor = conn.cursor()
cursor.execute("""
SELECT crawl_time FROM crawl_records
ORDER BY crawl_time
""")
rows = cursor.fetchall()
return [row[0] for row in rows]
except Exception as e:
print(f"[本地存储] 获取抓取时间列表失败: {e}")
return []
def cleanup(self) -> None:
"""清理资源(关闭数据库连接)"""
for db_path, conn in self._db_connections.items():
try:
conn.close()
print(f"[本地存储] 关闭数据库连接: {db_path}")
except Exception as e:
print(f"[本地存储] 关闭连接失败 {db_path}: {e}")
self._db_connections.clear()
def cleanup_old_data(self, retention_days: int) -> int:
"""
清理过期数据
Args:
retention_days: 保留天数(0 表示不清理)
Returns:
删除的日期目录数量
"""
if retention_days <= 0:
return 0
deleted_count = 0
cutoff_date = self._get_configured_time() - timedelta(days=retention_days)
try:
if not self.data_dir.exists():
return 0
for date_folder in self.data_dir.iterdir():
if not date_folder.is_dir() or date_folder.name.startswith('.'):
continue
# 解析日期文件夹名(支持两种格式)
folder_date = None
try:
# ISO 格式: YYYY-MM-DD
date_match = re.match(r'(\d{4})-(\d{2})-(\d{2})', date_folder.name)
if date_match:
folder_date = datetime(
int(date_match.group(1)),
int(date_match.group(2)),
int(date_match.group(3)),
tzinfo=pytz.timezone("Asia/Shanghai")
)
else:
# 旧中文格式: YYYY年MM月DD日
date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_folder.name)
if date_match:
folder_date = datetime(
int(date_match.group(1)),
int(date_match.group(2)),
int(date_match.group(3)),
tzinfo=pytz.timezone("Asia/Shanghai")
)
except Exception:
continue
if folder_date and folder_date < cutoff_date:
# 先关闭该日期的数据库连接
db_path = str(self._get_db_path(date_folder.name))
if db_path in self._db_connections:
try:
self._db_connections[db_path].close()
del self._db_connections[db_path]
except Exception:
pass
# 删除整个日期目录
try:
shutil.rmtree(date_folder)
deleted_count += 1
print(f"[本地存储] 清理过期数据: {date_folder.name}")
except Exception as e:
print(f"[本地存储] 删除目录失败 {date_folder.name}: {e}")
if deleted_count > 0:
print(f"[本地存储] 共清理 {deleted_count} 个过期日期目录")
return deleted_count
except Exception as e:
print(f"[本地存储] 清理过期数据失败: {e}")
return deleted_count
def has_pushed_today(self, date: Optional[str] = None) -> bool:
"""
检查指定日期是否已推送过
Args:
date: 日期字符串(YYYY-MM-DD),默认为今天
Returns:
是否已推送
"""
try:
conn = self._get_connection(date)
cursor = conn.cursor()
target_date = self._format_date_folder(date)
cursor.execute("""
SELECT pushed FROM push_records WHERE date = ?
""", (target_date,))
row = cursor.fetchone()
if row:
return bool(row[0])
return False
except Exception as e:
print(f"[本地存储] 检查推送记录失败: {e}")
return False
def record_push(self, report_type: str, date: Optional[str] = None) -> bool:
"""
记录推送
Args:
report_type: 报告类型
date: 日期字符串(YYYY-MM-DD),默认为今天
Returns:
是否记录成功
"""
try:
conn = self._get_connection(date)
cursor = conn.cursor()
target_date = self._format_date_folder(date)
now_str = self._get_configured_time().strftime("%Y-%m-%d %H:%M:%S")
cursor.execute("""
INSERT INTO push_records (date, pushed, push_time, report_type, created_at)
VALUES (?, 1, ?, ?, ?)
ON CONFLICT(date) DO UPDATE SET
pushed = 1,
push_time = excluded.push_time,
report_type = excluded.report_type
""", (target_date, now_str, report_type, now_str))
conn.commit()
print(f"[本地存储] 推送记录已保存: {report_type} at {now_str}")
return True
except Exception as e:
print(f"[本地存储] 记录推送失败: {e}")
return False
    def __del__(self):
        """Destructor: best-effort close of any cached DB connections."""
        self.cleanup()
+316
View File
@@ -0,0 +1,316 @@
# coding=utf-8
"""
存储管理器 - 统一管理存储后端
根据环境和配置自动选择合适的存储后端
"""
import os
from typing import Optional
from trendradar.storage.base import StorageBackend, NewsData
# Module-level singleton, managed by get_storage_manager()
_storage_manager: Optional["StorageManager"] = None
class StorageManager:
    """
    Storage manager.

    Responsibilities:
    - detect the runtime environment (GitHub Actions / Docker / local)
    - pick a storage backend from config (local / remote / auto)
    - expose a unified storage API
    - support pulling data from remote storage to local
    """
    def __init__(
        self,
        backend_type: str = "auto",
        data_dir: str = "output",
        enable_txt: bool = True,
        enable_html: bool = True,
        remote_config: Optional[dict] = None,
        local_retention_days: int = 0,
        remote_retention_days: int = 0,
        pull_enabled: bool = False,
        pull_days: int = 0,
        timezone: str = "Asia/Shanghai",
    ):
        """
        Initialize the storage manager.

        Args:
            backend_type: backend type (local / remote / auto)
            data_dir: local data directory
            enable_txt: whether TXT snapshots are enabled
            enable_html: whether HTML reports are enabled
            remote_config: remote-storage config (endpoint_url, bucket_name, access_key_id, ...)
            local_retention_days: days to keep local data (0 = unlimited)
            remote_retention_days: days to keep remote data (0 = unlimited)
            pull_enabled: whether to auto-pull on startup
            pull_days: pull the most recent N days of data
            timezone: timezone name (default Asia/Shanghai)
        """
        self.backend_type = backend_type
        self.data_dir = data_dir
        self.enable_txt = enable_txt
        self.enable_html = enable_html
        self.remote_config = remote_config or {}
        self.local_retention_days = local_retention_days
        self.remote_retention_days = remote_retention_days
        self.pull_enabled = pull_enabled
        self.pull_days = pull_days
        self.timezone = timezone
        # Backends are created lazily (see get_backend / pull_from_remote)
        self._backend: Optional[StorageBackend] = None
        self._remote_backend: Optional[StorageBackend] = None
    @staticmethod
    def is_github_actions() -> bool:
        """Detect whether we are running inside GitHub Actions."""
        return os.environ.get("GITHUB_ACTIONS") == "true"
    @staticmethod
    def is_docker() -> bool:
        """Detect whether we are running inside a Docker container."""
        # Heuristic 1: the /.dockerenv marker file
        if os.path.exists("/.dockerenv"):
            return True
        # Heuristic 2: cgroup of PID 1 (Linux only)
        try:
            with open("/proc/1/cgroup", "r") as f:
                return "docker" in f.read()
        except (FileNotFoundError, PermissionError):
            pass
        # Heuristic 3: explicit environment variable
        return os.environ.get("DOCKER_CONTAINER") == "true"
def _resolve_backend_type(self) -> str:
"""解析实际使用的后端类型"""
if self.backend_type == "auto":
if self.is_github_actions():
# GitHub Actions 环境,检查是否配置了远程存储
if self._has_remote_config():
return "remote"
else:
print("[存储管理器] GitHub Actions 环境但未配置远程存储,使用本地存储")
return "local"
else:
return "local"
return self.backend_type
def _has_remote_config(self) -> bool:
"""检查是否有有效的远程存储配置"""
# 检查配置或环境变量
bucket_name = self.remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME")
access_key = self.remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID")
secret_key = self.remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY")
endpoint = self.remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL")
# 调试日志
has_config = bool(bucket_name and access_key and secret_key and endpoint)
if not has_config:
print(f"[存储管理器] 远程存储配置检查失败:")
print(f" - bucket_name: {'已配置' if bucket_name else '未配置'}")
print(f" - access_key_id: {'已配置' if access_key else '未配置'}")
print(f" - secret_access_key: {'已配置' if secret_key else '未配置'}")
print(f" - endpoint_url: {'已配置' if endpoint else '未配置'}")
return has_config
    def _create_remote_backend(self) -> Optional[StorageBackend]:
        """Create the S3-compatible remote backend; return None when unavailable."""
        try:
            from trendradar.storage.remote import RemoteStorageBackend
            # Config values win over environment variables
            return RemoteStorageBackend(
                bucket_name=self.remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME", ""),
                access_key_id=self.remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID", ""),
                secret_access_key=self.remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY", ""),
                endpoint_url=self.remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL", ""),
                region=self.remote_config.get("region") or os.environ.get("S3_REGION", ""),
                enable_txt=self.enable_txt,
                enable_html=self.enable_html,
                timezone=self.timezone,
            )
        except ImportError as e:
            # boto3 (or the remote module) is not installed
            print(f"[存储管理器] 远程后端导入失败: {e}")
            print("[存储管理器] 请确保已安装 boto3: pip install boto3")
            return None
        except Exception as e:
            print(f"[存储管理器] 远程后端初始化失败: {e}")
            return None
    def get_backend(self) -> StorageBackend:
        """Return the active backend, creating it on first call (remote falls back to local)."""
        if self._backend is None:
            resolved_type = self._resolve_backend_type()
            if resolved_type == "remote":
                self._backend = self._create_remote_backend()
                if self._backend:
                    print(f"[存储管理器] 使用远程存储后端")
                else:
                    # Remote creation failed: degrade gracefully to local
                    print("[存储管理器] 回退到本地存储")
                    resolved_type = "local"
            if resolved_type == "local" or self._backend is None:
                from trendradar.storage.local import LocalStorageBackend
                self._backend = LocalStorageBackend(
                    data_dir=self.data_dir,
                    enable_txt=self.enable_txt,
                    enable_html=self.enable_html,
                    timezone=self.timezone,
                )
                print(f"[存储管理器] 使用本地存储后端 (数据目录: {self.data_dir})")
        return self._backend
def pull_from_remote(self) -> int:
"""
从远程拉取数据到本地
Returns:
成功拉取的文件数量
"""
if not self.pull_enabled or self.pull_days <= 0:
return 0
if not self._has_remote_config():
print("[存储管理器] 未配置远程存储,无法拉取")
return 0
# 创建远程后端(如果还没有)
if self._remote_backend is None:
self._remote_backend = self._create_remote_backend()
if self._remote_backend is None:
print("[存储管理器] 无法创建远程后端,拉取失败")
return 0
# 调用拉取方法
return self._remote_backend.pull_recent_days(self.pull_days, self.data_dir)
    def save_news_data(self, data: NewsData) -> bool:
        """Delegate: persist one crawl of news data."""
        return self.get_backend().save_news_data(data)
    def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
        """Delegate: fetch the merged data for a date."""
        return self.get_backend().get_today_all_data(date)
    def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
        """Delegate: fetch the most recent crawl's data."""
        return self.get_backend().get_latest_crawl_data(date)
    def detect_new_titles(self, current_data: NewsData) -> dict:
        """Delegate: find titles not seen earlier today."""
        return self.get_backend().detect_new_titles(current_data)
    def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
        """Delegate: write a TXT snapshot (when supported)."""
        return self.get_backend().save_txt_snapshot(data)
    def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
        """Delegate: write an HTML report."""
        return self.get_backend().save_html_report(html_content, filename, is_summary)
    def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
        """Delegate: whether this is the first crawl of the day."""
        return self.get_backend().is_first_crawl_today(date)
def cleanup(self) -> None:
"""清理资源"""
if self._backend:
self._backend.cleanup()
if self._remote_backend:
self._remote_backend.cleanup()
def cleanup_old_data(self) -> int:
"""
清理过期数据
Returns:
删除的日期目录数量
"""
total_deleted = 0
# 清理本地数据
if self.local_retention_days > 0:
total_deleted += self.get_backend().cleanup_old_data(self.local_retention_days)
# 清理远程数据(如果配置了)
if self.remote_retention_days > 0 and self._has_remote_config():
if self._remote_backend is None:
self._remote_backend = self._create_remote_backend()
if self._remote_backend:
total_deleted += self._remote_backend.cleanup_old_data(self.remote_retention_days)
return total_deleted
    @property
    def backend_name(self) -> str:
        """Name of the backend actually in use (resolves "auto")."""
        return self.get_backend().backend_name
    @property
    def supports_txt(self) -> bool:
        """Whether the active backend can write TXT snapshots."""
        return self.get_backend().supports_txt
def get_storage_manager(
    backend_type: str = "auto",
    data_dir: str = "output",
    enable_txt: bool = True,
    enable_html: bool = True,
    remote_config: Optional[dict] = None,
    local_retention_days: int = 0,
    remote_retention_days: int = 0,
    pull_enabled: bool = False,
    pull_days: int = 0,
    timezone: str = "Asia/Shanghai",
    force_new: bool = False,
) -> StorageManager:
    """
    Return the process-wide StorageManager singleton.

    Note: unless ``force_new`` is True, the arguments are only honoured on
    the first call — later calls return the cached instance unchanged.

    Args:
        backend_type: backend type (local / remote / auto)
        data_dir: local data directory
        enable_txt: whether TXT snapshots are enabled
        enable_html: whether HTML reports are enabled
        remote_config: remote-storage configuration
        local_retention_days: days to keep local data (0 = unlimited)
        remote_retention_days: days to keep remote data (0 = unlimited)
        pull_enabled: whether to auto-pull on startup
        pull_days: pull the most recent N days of data
        timezone: timezone name (default Asia/Shanghai)
        force_new: create a fresh instance, replacing the cached one

    Returns:
        The shared StorageManager instance.
    """
    global _storage_manager
    if force_new or _storage_manager is None:
        _storage_manager = StorageManager(
            backend_type=backend_type,
            data_dir=data_dir,
            enable_txt=enable_txt,
            enable_html=enable_html,
            remote_config=remote_config,
            local_retention_days=local_retention_days,
            remote_retention_days=remote_retention_days,
            pull_enabled=pull_enabled,
            pull_days=pull_days,
            timezone=timezone,
        )
    return _storage_manager
File diff suppressed because it is too large Load Diff
+117
View File
@@ -0,0 +1,117 @@
-- TrendRadar database schema
-- ============================================
-- Platform table
-- Key invariant: id never changes, name may change
-- ============================================
CREATE TABLE IF NOT EXISTS platforms (
    id TEXT PRIMARY KEY,
    name TEXT NOT NULL,
    is_active INTEGER DEFAULT 1,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- ============================================
-- News items table
-- Uniquely identified by URL + platform_id, so repeated
-- crawls of the same story are deduplicated
-- ============================================
CREATE TABLE IF NOT EXISTS news_items (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    title TEXT NOT NULL,
    platform_id TEXT NOT NULL,
    rank INTEGER NOT NULL,
    url TEXT DEFAULT '',
    mobile_url TEXT DEFAULT '',
    first_crawl_time TEXT NOT NULL, -- time the item was first crawled
    last_crawl_time TEXT NOT NULL, -- time the item was most recently crawled
    crawl_count INTEGER DEFAULT 1, -- number of crawls that saw this item
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (platform_id) REFERENCES platforms(id)
);

-- ============================================
-- Title change history
-- Records how the title of a given URL changed over time
-- ============================================
CREATE TABLE IF NOT EXISTS title_changes (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    news_item_id INTEGER NOT NULL,
    old_title TEXT NOT NULL,
    new_title TEXT NOT NULL,
    changed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (news_item_id) REFERENCES news_items(id)
);

-- ============================================
-- Rank history
-- Records the item's rank at each crawl
-- ============================================
CREATE TABLE IF NOT EXISTS rank_history (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    news_item_id INTEGER NOT NULL,
    rank INTEGER NOT NULL,
    crawl_time TEXT NOT NULL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (news_item_id) REFERENCES news_items(id)
);

-- ============================================
-- Crawl records
-- One row per crawl run: its time and item count
-- ============================================
CREATE TABLE IF NOT EXISTS crawl_records (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    crawl_time TEXT NOT NULL UNIQUE,
    total_items INTEGER DEFAULT 0,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- ============================================
-- Crawl source status
-- Per-platform success/failure status for each crawl run
-- ============================================
CREATE TABLE IF NOT EXISTS crawl_source_status (
    crawl_record_id INTEGER NOT NULL,
    platform_id TEXT NOT NULL,
    status TEXT NOT NULL CHECK(status IN ('success', 'failed')),
    PRIMARY KEY (crawl_record_id, platform_id),
    FOREIGN KEY (crawl_record_id) REFERENCES crawl_records(id),
    FOREIGN KEY (platform_id) REFERENCES platforms(id)
);

-- ============================================
-- Push records
-- Backs the push_window "once_per_day" feature
-- ============================================
CREATE TABLE IF NOT EXISTS push_records (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    date TEXT NOT NULL UNIQUE,
    pushed INTEGER DEFAULT 0,
    push_time TEXT,
    report_type TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- ============================================
-- Index definitions
-- ============================================
-- Platform lookup
CREATE INDEX IF NOT EXISTS idx_news_platform ON news_items(platform_id);
-- Time lookup (for fetching the latest data)
CREATE INDEX IF NOT EXISTS idx_news_crawl_time ON news_items(last_crawl_time);
-- Title lookup (for title search)
CREATE INDEX IF NOT EXISTS idx_news_title ON news_items(title);
-- Unique URL + platform_id index (non-empty URLs only; enforces dedup)
CREATE UNIQUE INDEX IF NOT EXISTS idx_news_url_platform
ON news_items(url, platform_id) WHERE url != '';
-- Crawl status lookup
CREATE INDEX IF NOT EXISTS idx_crawl_status_record ON crawl_source_status(crawl_record_id);
-- Rank history lookup
CREATE INDEX IF NOT EXISTS idx_rank_history_news ON rank_history(news_item_id);
+20
View File
@@ -0,0 +1,20 @@
# coding=utf-8
"""
Utility package - shared helper functions.

Re-exports the time helpers so callers can import them
directly from ``trendradar.utils``.
"""

from trendradar.utils.time import (
    get_configured_time,
    format_date_folder,
    format_time_filename,
    get_current_time_display,
    convert_time_for_display,
)

# Public API of this package.
__all__ = [
    "get_configured_time",
    "format_date_folder",
    "format_time_filename",
    "get_current_time_display",
    "convert_time_for_display",
]
+91
View File
@@ -0,0 +1,91 @@
# coding=utf-8
"""
Time utilities - unified time-handling helpers.

All helpers accept an IANA timezone name and fall back to
``DEFAULT_TIMEZONE`` when none is given.
"""

from datetime import datetime
from typing import Optional

import pytz

# Default timezone used when callers do not specify one.
DEFAULT_TIMEZONE = "Asia/Shanghai"
def get_configured_time(timezone: str = DEFAULT_TIMEZONE) -> datetime:
    """
    Return the current time in the configured timezone.

    Args:
        timezone: IANA timezone name, e.g. 'Asia/Shanghai', 'America/Los_Angeles'.

    Returns:
        Timezone-aware current datetime.
    """
    try:
        tzinfo = pytz.timezone(timezone)
    except pytz.UnknownTimeZoneError:
        # Warn and fall back to the default timezone instead of failing.
        print(f"[警告] 未知时区 '{timezone}',使用默认时区 {DEFAULT_TIMEZONE}")
        tzinfo = pytz.timezone(DEFAULT_TIMEZONE)

    return datetime.now(tzinfo)
def format_date_folder(
    date: Optional[str] = None, timezone: str = DEFAULT_TIMEZONE
) -> str:
    """
    Build a date folder name in ISO format (YYYY-MM-DD).

    Args:
        date: Explicit date string; when falsy, the current date is used.
        timezone: IANA timezone name.

    Returns:
        Date string such as '2025-12-09'.
    """
    # An explicitly supplied date wins over "today".
    if date:
        return date

    now = get_configured_time(timezone)
    return now.strftime("%Y-%m-%d")
def format_time_filename(timezone: str = DEFAULT_TIMEZONE) -> str:
    """
    Build a time string for filenames (format: HH-MM).

    Windows forbids colons in filenames, hence the hyphen separator.

    Args:
        timezone: IANA timezone name.

    Returns:
        Time string such as '15-30'.
    """
    now = get_configured_time(timezone)
    return now.strftime("%H-%M")
def get_current_time_display(timezone: str = DEFAULT_TIMEZONE) -> str:
    """
    Build the current time for display (format: HH:MM).

    Args:
        timezone: IANA timezone name.

    Returns:
        Time string such as '15:30'.
    """
    now = get_configured_time(timezone)
    return now.strftime("%H:%M")
def convert_time_for_display(time_str: str) -> str:
    """
    Convert an HH-MM string into HH:MM for display.

    Args:
        time_str: Input time string, e.g. '15-30'.

    Returns:
        Converted string, e.g. '15:30'; anything that is not a
        5-character hyphenated time is returned unchanged.
    """
    looks_like_hh_mm = bool(time_str) and len(time_str) == 5 and "-" in time_str
    return time_str.replace("-", ":") if looks_like_hh_mm else time_str