# coding=utf-8
"""
Report generation module.

Provides report-data preparation and HTML generation:
- prepare_report_data: build the report data structure
- generate_html_report: render the report and write it to disk
"""

import html
from pathlib import Path
from typing import Dict, List, Optional, Callable


def _filter_new_titles(
    new_titles: Dict,
    matches_word_groups_func: Optional[Callable],
    load_frequency_words_func: Optional[Callable],
) -> Dict:
    """Return the subset of ``new_titles`` accepted by the word-group matcher.

    When either callable is missing, ``new_titles`` is returned unchanged
    (the same object, not a copy). Sources left with no matching title are
    dropped from the result.
    """
    if not (matches_word_groups_func and load_frequency_words_func):
        # No matcher available: use everything.
        return new_titles

    word_groups, filter_words, global_filters = load_frequency_words_func()
    filtered = {}
    for source_id, titles_data in new_titles.items():
        kept = {
            title: title_data
            for title, title_data in titles_data.items()
            if matches_word_groups_func(
                title, word_groups, filter_words, global_filters
            )
        }
        if kept:
            filtered[source_id] = kept
    return filtered


def _build_new_title_sections(
    filtered_new_titles: Dict,
    id_to_name: Dict,
    rank_threshold: int,
) -> List[Dict]:
    """Convert ``{source_id: {title: title_data}}`` into per-source sections."""
    sections = []
    for source_id, titles_data in filtered_new_titles.items():
        # Fall back to the raw ID when the source has no display name.
        source_name = id_to_name.get(source_id, source_id)
        source_titles = [
            {
                "title": title,
                "source_name": source_name,
                "time_display": "",
                "count": 1,
                "ranks": title_data.get("ranks", []),
                "rank_threshold": rank_threshold,
                "url": title_data.get("url", ""),
                "mobile_url": title_data.get("mobileUrl", ""),
                "is_new": True,
            }
            for title, title_data in titles_data.items()
        ]
        if source_titles:
            sections.append(
                {
                    "source_id": source_id,
                    "source_name": source_name,
                    "titles": source_titles,
                }
            )
    return sections


def prepare_report_data(
    stats: List[Dict],
    failed_ids: Optional[List] = None,
    new_titles: Optional[Dict] = None,
    id_to_name: Optional[Dict] = None,
    mode: str = "daily",
    rank_threshold: int = 3,
    matches_word_groups_func: Optional[Callable] = None,
    load_frequency_words_func: Optional[Callable] = None,
) -> Dict:
    """
    Prepare the report data.

    Args:
        stats: List of statistics entries. Each entry is read via the keys
            "word", "count", "titles" and, optionally, "percentage".
        failed_ids: List of failed IDs.
        new_titles: Newly seen titles as ``{source_id: {title: title_data}}``.
        id_to_name: Mapping from source ID to display name.
        mode: Report mode (daily/incremental/current). In "incremental" mode
            the new-titles section is left empty.
        rank_threshold: Rank threshold copied onto every new-title entry.
        matches_word_groups_func: Word-group matcher, called as
            ``func(title, word_groups, filter_words, global_filters)``.
        load_frequency_words_func: Loader returning the tuple
            ``(word_groups, filter_words, global_filters)``.

    Returns:
        Dict with the keys "stats", "new_titles", "failed_ids" and
        "total_new_count".
    """
    processed_new_titles = []

    # The new-titles section is hidden in incremental mode, and needs both
    # the titles and the ID-to-name mapping to be built at all.
    hide_new_section = mode == "incremental"
    if not hide_new_section and new_titles and id_to_name:
        filtered_new_titles = _filter_new_titles(
            new_titles, matches_word_groups_func, load_frequency_words_func
        )

        # Report how many new titles survived the filter.
        original_new_count = sum(len(titles) for titles in new_titles.values())
        filtered_new_count = sum(
            len(titles) for titles in filtered_new_titles.values()
        )
        if original_new_count > 0:
            print(f"频率词过滤后:{filtered_new_count} 条新增热点匹配(原始 {original_new_count} 条)")

        processed_new_titles = _build_new_title_sections(
            filtered_new_titles, id_to_name, rank_threshold
        )

    processed_stats = []
    for stat in stats:
        # Entries without any hit are left out of the report.
        if stat["count"] <= 0:
            continue

        processed_titles = [
            {
                "title": title_data["title"],
                "source_name": title_data["source_name"],
                "time_display": title_data["time_display"],
                "count": title_data["count"],
                "ranks": title_data["ranks"],
                "rank_threshold": title_data["rank_threshold"],
                "url": title_data.get("url", ""),
                "mobile_url": title_data.get("mobileUrl", ""),
                "is_new": title_data.get("is_new", False),
            }
            for title_data in stat["titles"]
        ]

        processed_stats.append(
            {
                "word": stat["word"],
                "count": stat["count"],
                "percentage": stat.get("percentage", 0),
                "titles": processed_titles,
            }
        )

    return {
        "stats": processed_stats,
        "new_titles": processed_new_titles,
        "failed_ids": failed_ids or [],
        "total_new_count": sum(
            len(source["titles"]) for source in processed_new_titles
        ),
    }


def generate_html_report(
    stats: List[Dict],
    total_titles: int,
    failed_ids: Optional[List] = None,
    new_titles: Optional[Dict] = None,
    id_to_name: Optional[Dict] = None,
    mode: str = "daily",
    is_daily_summary: bool = False,
    update_info: Optional[Dict] = None,
    rank_threshold: int = 3,
    output_dir: str = "output",
    date_folder: str = "",
    time_filename: str = "",
    render_html_func: Optional[Callable] = None,
    matches_word_groups_func: Optional[Callable] = None,
    load_frequency_words_func: Optional[Callable] = None,
    enable_index_copy: bool = True,
) -> str:
    """
    Generate the HTML report and write it to disk.

    Args:
        stats: List of statistics entries.
        total_titles: Total number of titles.
        failed_ids: List of failed IDs.
        new_titles: Newly seen titles.
        id_to_name: Mapping from source ID to display name.
        mode: Report mode (daily/incremental/current).
        is_daily_summary: Whether this is the daily summary; selects a fixed
            file name per mode instead of ``time_filename``.
        update_info: Update information, passed through to the renderer.
        rank_threshold: Rank threshold.
        output_dir: Output directory.
        date_folder: Name of the date sub-folder.
        time_filename: File name (without extension) for non-summary reports.
        render_html_func: HTML renderer, called as
            ``func(report_data, total_titles, is_daily_summary, mode,
            update_info)``. When omitted, a minimal fallback page is written.
        matches_word_groups_func: Word-group matcher.
        load_frequency_words_func: Frequency-word loader.
        enable_index_copy: Whether a daily summary is also written to
            ``index.html`` in the working directory and in ``output_dir``.

    Returns:
        str: Path of the generated HTML file.
    """
    if is_daily_summary:
        if mode == "current":
            filename = "当前榜单汇总.html"
        elif mode == "incremental":
            filename = "当日增量.html"
        else:
            filename = "当日汇总.html"
    else:
        filename = f"{time_filename}.html"

    # Build the output location: <output_dir>/<date_folder>/html/<filename>
    output_path = Path(output_dir) / date_folder / "html"
    output_path.mkdir(parents=True, exist_ok=True)
    report_file = output_path / filename

    report_data = prepare_report_data(
        stats,
        failed_ids,
        new_titles,
        id_to_name,
        mode,
        rank_threshold,
        matches_word_groups_func,
        load_frequency_words_func,
    )

    if render_html_func:
        html_content = render_html_func(
            report_data, total_titles, is_daily_summary, mode, update_info
        )
    else:
        # Minimal fallback page. The data holds externally sourced titles, so
        # it is escaped before being embedded in markup.
        html_content = (
            '<!DOCTYPE html><html><head><meta charset="utf-8">'
            "<title>Report</title></head><body><h1>Report</h1>"
            f"<pre>{html.escape(str(report_data))}</pre>"
            "</body></html>"
        )

    report_file.write_text(html_content, encoding="utf-8")

    if is_daily_summary and enable_index_copy:
        # Copy to the working directory root (for GitHub Pages access).
        Path("index.html").write_text(html_content, encoding="utf-8")

        # Also copy into the output directory (for Docker volume mounts).
        output_root = Path(output_dir)
        output_root.mkdir(parents=True, exist_ok=True)
        (output_root / "index.html").write_text(html_content, encoding="utf-8")

    return str(report_file)