TrendRadar/trendradar/core/data.py
2025-12-13 13:44:35 +08:00

292 lines
9.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding=utf-8
"""
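Data processing module.
Provides helpers for reading, saving and detecting crawled titles:
- save_titles_to_file: save titles to a TXT file
- read_all_today_titles: read all of today's titles from the storage backend
- detect_latest_new_titles: detect titles that are new in the latest crawl batch
- is_first_crawl_today: check whether this is the first crawl of the day
Author: TrendRadar Team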
"""
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Callable
def save_titles_to_file(
    results: Dict,
    id_to_name: Dict,
    failed_ids: List,
    output_path: str,
    clean_title_func: Callable[[str], str],
) -> str:
    """
    Write crawled titles to a plain-text file.

    Each source gets a header line (``id | name``, or just ``id`` when no
    distinct name is known), followed by its titles ordered by rank and a
    blank separator line. Failed source IDs, if any, are listed at the end.

    Args:
        results: Crawl results shaped as {source_id: {title: title_data}}.
            ``title_data`` may be a dict (keys "ranks", "url", "mobileUrl")
            or a bare list of ranks; anything else is treated as no ranks.
        id_to_name: Mapping from source ID to display name.
        failed_ids: Source IDs whose request failed.
        output_path: Destination file path; parent directories are created.
        clean_title_func: Callable applied to every title before writing.

    Returns:
        str: The path that was written (``output_path`` unchanged).
    """
    # Create the parent directory tree up front so open() cannot fail on it.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as out:
        for source_id, titles in results.items():
            # Header: "id | name" only when a distinct name exists.
            display_name = id_to_name.get(source_id)
            if not display_name or display_name == source_id:
                out.write(f"{source_id}\n")
            else:
                out.write(f"{source_id} | {display_name}\n")

            # Collect (rank, title, url, mobile_url) tuples for this source.
            entries = []
            for raw_title, info in titles.items():
                cleaned = clean_title_func(raw_title)
                if isinstance(info, dict):
                    ranks = info.get("ranks", [])
                    url = info.get("url", "")
                    mobile_url = info.get("mobileUrl", "")
                elif isinstance(info, list):
                    ranks, url, mobile_url = info, "", ""
                else:
                    ranks, url, mobile_url = [], "", ""
                # Titles without any rank fall back to rank 1.
                first_rank = ranks[0] if ranks else 1
                entries.append((first_rank, cleaned, url, mobile_url))

            # Stable sort: titles with equal rank keep their input order.
            for rank, cleaned, url, mobile_url in sorted(entries, key=lambda e: e[0]):
                pieces = [f"{rank}. {cleaned}"]
                if url:
                    pieces.append(f" [URL:{url}]")
                if mobile_url:
                    pieces.append(f" [MOBILE:{mobile_url}]")
                out.write("".join(pieces) + "\n")

            # Blank line separates consecutive sources.
            out.write("\n")

        if failed_ids:
            out.write("==== 以下ID请求失败 ====\n")
            out.writelines(f"{failed_id}\n" for failed_id in failed_ids)

    return output_path
def read_all_today_titles_from_storage(
    storage_manager,
    current_platform_ids: Optional[List[str]] = None,
) -> Tuple[Dict, Dict, Dict]:
    """
    Read all of today's titles from the storage backend.

    Args:
        storage_manager: Storage manager instance; must provide
            ``get_today_all_data()``.
        current_platform_ids: IDs of the platforms currently monitored.
            When given, sources not in this list are skipped.

    Returns:
        Tuple[Dict, Dict, Dict]: (all_results, id_to_name, title_info).
        ``all_results`` maps source_id -> title -> {"ranks", "url",
        "mobileUrl"}; ``title_info`` additionally carries "first_time",
        "last_time" and "count". Three empty dicts are returned when there
        is no data or when any exception is raised while reading.
    """
    try:
        today = storage_manager.get_today_all_data()
        if not today or not today.items:
            return {}, {}, {}

        all_results: Dict = {}
        names: Dict = {}
        title_info: Dict = {}

        for source_id, news_list in today.items.items():
            # Skip platforms that are not currently monitored.
            if current_platform_ids is not None and source_id not in current_platform_ids:
                continue

            # Fall back to the raw ID when no display name is recorded.
            names[source_id] = today.id_to_name.get(source_id, source_id)

            source_results = all_results.setdefault(source_id, {})
            source_info = title_info.setdefault(source_id, {})

            for item in news_list:
                title = item.title
                # Optional attributes default to the single-crawl values.
                ranks = getattr(item, 'ranks', [item.rank])
                first_seen = getattr(item, 'first_time', item.crawl_time)
                last_seen = getattr(item, 'last_time', item.crawl_time)
                seen_count = getattr(item, 'count', 1)

                link_fields = {
                    "ranks": ranks,
                    "url": item.url or "",
                    "mobileUrl": item.mobile_url or "",
                }
                source_results[title] = dict(link_fields)
                source_info[title] = {
                    "first_time": first_seen,
                    "last_time": last_seen,
                    "count": seen_count,
                    **link_fields,
                }

        return all_results, names, title_info
    except Exception as e:
        # Best-effort read: report the failure and behave as "no data".
        print(f"[存储] 从存储后端读取数据失败: {e}")
        return {}, {}, {}
def read_all_today_titles(
    storage_manager,
    current_platform_ids: Optional[List[str]] = None,
) -> Tuple[Dict, Dict, Dict]:
    """
    Read all of today's titles from the storage backend and log a summary.

    Delegates to ``read_all_today_titles_from_storage`` and prints either
    the number of titles loaded or a "no data" notice.

    Args:
        storage_manager: Storage manager instance.
        current_platform_ids: IDs of the platforms currently monitored
            (used for filtering).

    Returns:
        Tuple[Dict, Dict, Dict]: (all_results, id_to_name, title_info)
    """
    loaded = read_all_today_titles_from_storage(storage_manager, current_platform_ids)
    all_results, final_id_to_name, title_info = loaded

    if not all_results:
        print("[存储] 当天暂无数据")
    else:
        # Count titles across every source for the summary line.
        total_count = sum(map(len, all_results.values()))
        print(f"[存储] 已从存储后端读取 {total_count} 条标题")

    return all_results, final_id_to_name, title_info
def detect_latest_new_titles_from_storage(
    storage_manager,
    current_platform_ids: Optional[List[str]] = None,
) -> Dict:
    """
    Detect titles that are new in the latest crawl batch.

    A title counts as new when it appears in the latest batch but not among
    the titles first seen at any other crawl time today.

    Args:
        storage_manager: Storage manager instance; must provide
            ``get_latest_crawl_data()`` and ``get_today_all_data()``.
        current_platform_ids: IDs of the platforms currently monitored.
            When given, sources not in this list are skipped.

    Returns:
        Dict: New titles shaped as {source_id: {title: title_data}}. Empty
        when there is no data, when no earlier batch exists for any
        monitored source, or when any exception is raised.
    """
    try:
        latest = storage_manager.get_latest_crawl_data()
        if not latest or not latest.items:
            return {}

        today = storage_manager.get_today_all_data()
        if not today or not today.items:
            # No stored data at all: nothing can be called "new".
            return {}

        latest_time = latest.crawl_time

        def is_monitored(source_id) -> bool:
            return current_platform_ids is None or source_id in current_platform_ids

        # Titles per source that were first seen before the latest batch.
        seen_before = {}
        for source_id, news_list in today.items.items():
            if not is_monitored(source_id):
                continue
            seen_before[source_id] = {
                item.title
                for item in news_list
                if getattr(item, 'first_time', item.crawl_time) != latest_time
            }

        # Every set empty means only one batch exists today (first crawl),
        # so reporting all titles as "new" would be misleading.
        if not any(seen_before.values()):
            return {}

        new_titles = {}
        for source_id, news_list in latest.items.items():
            if not is_monitored(source_id):
                continue
            known = seen_before.get(source_id, set())
            fresh = {
                item.title: {
                    "ranks": [item.rank],
                    "url": item.url or "",
                    "mobileUrl": item.mobile_url or "",
                }
                for item in news_list
                if item.title not in known
            }
            if fresh:
                new_titles[source_id] = fresh

        return new_titles
    except Exception as e:
        # Best-effort detection: report the failure and return nothing new.
        print(f"[存储] 从存储后端检测新标题失败: {e}")
        return {}
def detect_latest_new_titles(
    storage_manager,
    current_platform_ids: Optional[List[str]] = None,
) -> Dict:
    """
    Detect today's newly added titles in the latest batch and log the count.

    Delegates to ``detect_latest_new_titles_from_storage``; prints a summary
    line only when at least one source has new titles.

    Args:
        storage_manager: Storage manager instance.
        current_platform_ids: IDs of the platforms currently monitored
            (used for filtering).

    Returns:
        Dict: New titles shaped as {source_id: {title: title_data}}.
    """
    new_titles = detect_latest_new_titles_from_storage(storage_manager, current_platform_ids)
    if not new_titles:
        return new_titles

    # Count new titles across every source for the summary line.
    total_new = sum(map(len, new_titles.values()))
    print(f"[存储] 从存储后端检测到 {total_new} 条新增标题")
    return new_titles
def is_first_crawl_today(output_dir: str, date_folder: str) -> bool:
    """
    Check whether this is the first crawl of the day.

    Looks at ``<output_dir>/<date_folder>/txt`` and counts the entries whose
    suffix is ``.txt``.

    Args:
        output_dir: Output directory.
        date_folder: Name of the date folder.

    Returns:
        bool: True when the txt directory does not exist or holds at most
        one ``.txt`` file; False otherwise.
    """
    txt_dir = Path(output_dir, date_folder, "txt")
    if txt_dir.exists():
        # At most one snapshot file means no earlier crawl exists yet.
        txt_count = sum(1 for entry in txt_dir.iterdir() if entry.suffix == ".txt")
        return txt_count <= 1
    return True