Mirror of https://gitee.com/houhuan/TrendRadar.git
Synced 2026-05-01 01:12:42 +08:00

v4.0.0 Major Update
trendradar/core/__init__.py
@@ -0,0 +1,47 @@
# coding=utf-8
"""
Core module - configuration management and core utilities
"""

from trendradar.core.config import (
    parse_multi_account_config,
    validate_paired_configs,
    limit_accounts,
    get_account_at_index,
)
from trendradar.core.loader import load_config
from trendradar.core.frequency import load_frequency_words, matches_word_groups
from trendradar.core.data import (
    save_titles_to_file,
    read_all_today_titles_from_storage,
    read_all_today_titles,
    detect_latest_new_titles_from_storage,
    detect_latest_new_titles,
    is_first_crawl_today,
)
from trendradar.core.analyzer import (
    calculate_news_weight,
    format_time_display,
    count_word_frequency,
)

__all__ = [
    "parse_multi_account_config",
    "validate_paired_configs",
    "limit_accounts",
    "get_account_at_index",
    "load_config",
    "load_frequency_words",
    "matches_word_groups",
    # Data processing
    "save_titles_to_file",
    "read_all_today_titles_from_storage",
    "read_all_today_titles",
    "detect_latest_new_titles_from_storage",
    "detect_latest_new_titles",
    "is_first_crawl_today",
    # Statistical analysis
    "calculate_news_weight",
    "format_time_display",
    "count_word_frequency",
]
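A consumer would then import everything from the package root, for example (hypothetical usage, assuming the package layout above):

    from trendradar.core import load_config, load_frequency_words, count_word_frequency

    config = load_config()
    word_groups, filter_words, global_filters = load_frequency_words()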
trendradar/core/analyzer.py
@@ -0,0 +1,469 @@
# coding=utf-8
"""
Statistical analysis module

Provides news statistics and analysis:
- calculate_news_weight: compute a news item's weight
- format_time_display: format the time display
- count_word_frequency: count keyword frequency
"""

from typing import Dict, List, Tuple, Optional, Callable

from trendradar.core.frequency import matches_word_groups


def calculate_news_weight(
    title_data: Dict,
    rank_threshold: int,
    weight_config: Dict,
) -> float:
    """
    Compute a news item's weight, used for sorting.

    Args:
        title_data: title data containing ranks and count
        rank_threshold: rank threshold
        weight_config: weight config {RANK_WEIGHT, FREQUENCY_WEIGHT, HOTNESS_WEIGHT}

    Returns:
        float: the computed weight
    """
    ranks = title_data.get("ranks", [])
    if not ranks:
        return 0.0

    count = title_data.get("count", len(ranks))

    # Rank weight: sum(11 - min(rank, 10)) / number of appearances
    rank_scores = []
    for rank in ranks:
        score = 11 - min(rank, 10)
        rank_scores.append(score)

    rank_weight = sum(rank_scores) / len(ranks) if ranks else 0

    # Frequency weight: min(appearances, 10) * 10
    frequency_weight = min(count, 10) * 10

    # Hotness bonus: high-rank appearances / total appearances * 100
    high_rank_count = sum(1 for rank in ranks if rank <= rank_threshold)
    hotness_ratio = high_rank_count / len(ranks) if ranks else 0
    hotness_weight = hotness_ratio * 100

    total_weight = (
        rank_weight * weight_config["RANK_WEIGHT"]
        + frequency_weight * weight_config["FREQUENCY_WEIGHT"]
        + hotness_weight * weight_config["HOTNESS_WEIGHT"]
    )

    return total_weight
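# A quick worked example (hypothetical values; the 0.4/0.3/0.3 split mirrors
# the defaults used by count_word_frequency below):
#
#     calculate_news_weight(
#         {"ranks": [2, 5, 12], "count": 3},
#         rank_threshold=3,
#         weight_config={"RANK_WEIGHT": 0.4, "FREQUENCY_WEIGHT": 0.3, "HOTNESS_WEIGHT": 0.3},
#     )
#     # rank:      ((11-2) + (11-5) + (11-10)) / 3 = 16/3 ≈ 5.33
#     # frequency: min(3, 10) * 10                 = 30
#     # hotness:   (1 of 3 ranks <= 3) * 100       ≈ 33.33
#     # total:     5.33*0.4 + 30*0.3 + 33.33*0.3   ≈ 21.13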
def format_time_display(
    first_time: str,
    last_time: str,
    convert_time_func: Callable[[str], str],
) -> str:
    """
    Format the time display (convert HH-MM to HH:MM).

    Args:
        first_time: time of first appearance
        last_time: time of last appearance
        convert_time_func: time-format conversion function

    Returns:
        str: the formatted time display string
    """
    if not first_time:
        return ""
    # Convert to display format
    first_display = convert_time_func(first_time)
    last_display = convert_time_func(last_time)
    if first_display == last_display or not last_display:
        return first_display
    else:
        return f"[{first_display} ~ {last_display}]"
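# For instance, with a hypothetical "HH-MM" -> "HH:MM" converter:
#
#     fmt = lambda t: t.replace("-", ":")
#     format_time_display("08-00", "09-30", fmt)  # -> "[08:00 ~ 09:30]"
#     format_time_display("08-00", "08-00", fmt)  # -> "08:00"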
def count_word_frequency(
    results: Dict,
    word_groups: List[Dict],
    filter_words: List[str],
    id_to_name: Dict,
    title_info: Optional[Dict] = None,
    rank_threshold: int = 3,
    new_titles: Optional[Dict] = None,
    mode: str = "daily",
    global_filters: Optional[List[str]] = None,
    weight_config: Optional[Dict] = None,
    max_news_per_keyword: int = 0,
    sort_by_position_first: bool = False,
    is_first_crawl_func: Optional[Callable[[], bool]] = None,
    convert_time_func: Optional[Callable[[str], str]] = None,
) -> Tuple[List[Dict], int]:
    """
    Count keyword frequency, with support for required words, frequency words,
    filter words and global filter words, and flag newly added titles.

    Args:
        results: crawl results {source_id: {title: title_data}}
        word_groups: list of word-group configs
        filter_words: list of filter words
        id_to_name: mapping from ID to name
        title_info: title statistics (optional)
        rank_threshold: rank threshold
        new_titles: newly added titles (optional)
        mode: report mode (daily/incremental/current)
        global_filters: global filter words (optional)
        weight_config: weight config
        max_news_per_keyword: max items displayed per keyword
        sort_by_position_first: sort by config position first
        is_first_crawl_func: function detecting the first crawl of the day
        convert_time_func: time-format conversion function

    Returns:
        Tuple[List[Dict], int]: (list of statistics, total title count)
    """
    # Default weight config
    if weight_config is None:
        weight_config = {
            "RANK_WEIGHT": 0.4,
            "FREQUENCY_WEIGHT": 0.3,
            "HOTNESS_WEIGHT": 0.3,
        }

    # Default time conversion function
    if convert_time_func is None:
        convert_time_func = lambda x: x

    # Default first-crawl detection function
    if is_first_crawl_func is None:
        is_first_crawl_func = lambda: True

    # With no word groups configured, create a virtual "全部新闻" (all news) group
    if not word_groups:
        print("Frequency-word config is empty; all news will be shown")
        word_groups = [{"required": [], "normal": [], "group_key": "全部新闻"}]
        filter_words = []  # clear filter words so all news is shown

    is_first_today = is_first_crawl_func()

    # Decide which data to process and how new items are flagged
    if mode == "incremental":
        if is_first_today:
            # Incremental + first crawl of the day: process everything, flag all as new
            results_to_process = results
            all_news_are_new = True
        else:
            # Incremental + later crawls: only process the newly added news
            results_to_process = new_titles if new_titles else {}
            all_news_are_new = True
    elif mode == "current":
        # current mode: only process the latest crawl batch, while statistics come from the full history
        if title_info:
            latest_time = None
            for source_titles in title_info.values():
                for title_data in source_titles.values():
                    last_time = title_data.get("last_time", "")
                    if last_time:
                        if latest_time is None or last_time > latest_time:
                            latest_time = last_time

            # Only process news whose last_time equals the latest time
            if latest_time:
                results_to_process = {}
                for source_id, source_titles in results.items():
                    if source_id in title_info:
                        filtered_titles = {}
                        for title, title_data in source_titles.items():
                            if title in title_info[source_id]:
                                info = title_info[source_id][title]
                                if info.get("last_time") == latest_time:
                                    filtered_titles[title] = title_data
                        if filtered_titles:
                            results_to_process[source_id] = filtered_titles

                print(
                    f"Current-list mode: latest time {latest_time}, selected {sum(len(titles) for titles in results_to_process.values())} current-list items"
                )
            else:
                results_to_process = results
        else:
            results_to_process = results
        all_news_are_new = False
    else:
        # Daily summary mode: process all news
        results_to_process = results
        all_news_are_new = False
        total_input_news = sum(len(titles) for titles in results.values())
        filter_status = (
            "showing all"
            if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
            else "frequency-word filtering"
        )
        print(f"Daily summary mode: processing {total_input_news} items ({filter_status})")

    word_stats = {}
    total_titles = 0
    processed_titles = {}
    matched_new_count = 0

    if title_info is None:
        title_info = {}
    if new_titles is None:
        new_titles = {}

    for group in word_groups:
        group_key = group["group_key"]
        word_stats[group_key] = {"count": 0, "titles": {}}

    for source_id, titles_data in results_to_process.items():
        total_titles += len(titles_data)

        if source_id not in processed_titles:
            processed_titles[source_id] = {}

        for title, title_data in titles_data.items():
            if title in processed_titles.get(source_id, {}):
                continue

            # Use the shared matching logic
            matches_frequency_words = matches_word_groups(
                title, word_groups, filter_words, global_filters
            )

            if not matches_frequency_words:
                continue

            # In incremental mode, or current mode on the first crawl, count matched new items
            if (mode == "incremental" and all_news_are_new) or (
                mode == "current" and is_first_today
            ):
                matched_new_count += 1

            source_ranks = title_data.get("ranks", [])
            source_url = title_data.get("url", "")
            source_mobile_url = title_data.get("mobileUrl", "")

            # Find the matching word group (defensive conversion for type safety)
            title_lower = str(title).lower() if not isinstance(title, str) else title.lower()
            for group in word_groups:
                required_words = group["required"]
                normal_words = group["normal"]

                # In the "全部新闻" (all-news) case, every title matches the single virtual group
                if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻":
                    group_key = group["group_key"]
                    word_stats[group_key]["count"] += 1
                    if source_id not in word_stats[group_key]["titles"]:
                        word_stats[group_key]["titles"][source_id] = []
                else:
                    # Regular matching logic
                    if required_words:
                        all_required_present = all(
                            req_word.lower() in title_lower
                            for req_word in required_words
                        )
                        if not all_required_present:
                            continue

                    if normal_words:
                        any_normal_present = any(
                            normal_word.lower() in title_lower
                            for normal_word in normal_words
                        )
                        if not any_normal_present:
                            continue

                    group_key = group["group_key"]
                    word_stats[group_key]["count"] += 1
                    if source_id not in word_stats[group_key]["titles"]:
                        word_stats[group_key]["titles"][source_id] = []

                first_time = ""
                last_time = ""
                count_info = 1
                ranks = source_ranks if source_ranks else []
                url = source_url
                mobile_url = source_mobile_url

                # Pull complete data from the historical statistics when available
                # (applies to current mode and all other modes alike)
                if (
                    title_info
                    and source_id in title_info
                    and title in title_info[source_id]
                ):
                    info = title_info[source_id][title]
                    first_time = info.get("first_time", "")
                    last_time = info.get("last_time", "")
                    count_info = info.get("count", 1)
                    if "ranks" in info and info["ranks"]:
                        ranks = info["ranks"]
                    url = info.get("url", source_url)
                    mobile_url = info.get("mobileUrl", source_mobile_url)

                if not ranks:
                    ranks = [99]

                time_display = format_time_display(first_time, last_time, convert_time_func)

                source_name = id_to_name.get(source_id, source_id)

                # Decide whether the item is new
                is_new = False
                if all_news_are_new:
                    # In incremental mode every processed item is new; likewise on the first crawl of the day
                    is_new = True
                elif new_titles and source_id in new_titles:
                    # Check whether the title is in the new-title list
                    new_titles_for_source = new_titles[source_id]
                    is_new = title in new_titles_for_source

                word_stats[group_key]["titles"][source_id].append(
                    {
                        "title": title,
                        "source_name": source_name,
                        "first_time": first_time,
                        "last_time": last_time,
                        "time_display": time_display,
                        "count": count_info,
                        "ranks": ranks,
                        "rank_threshold": rank_threshold,
                        "url": url,
                        "mobileUrl": mobile_url,
                        "is_new": is_new,
                    }
                )

                if source_id not in processed_titles:
                    processed_titles[source_id] = {}
                processed_titles[source_id][title] = True

                break

    # Finally, print the summary
    if mode == "incremental":
        if is_first_today:
            total_input_news = sum(len(titles) for titles in results.values())
            filter_status = (
                "are shown (no filtering)"
                if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
                else "matched the frequency words"
            )
            print(
                f"Incremental mode: first crawl of the day, {matched_new_count} of {total_input_news} items {filter_status}"
            )
        else:
            if new_titles:
                total_new_count = sum(len(titles) for titles in new_titles.values())
                filter_status = (
                    "are shown (no filtering)"
                    if len(word_groups) == 1
                    and word_groups[0]["group_key"] == "全部新闻"
                    else "matched the frequency words"
                )
                print(
                    f"Incremental mode: {matched_new_count} of {total_new_count} new items {filter_status}"
                )
                if matched_new_count == 0 and len(word_groups) > 1:
                    print("Incremental mode: no new items matched the frequency words; no notification will be sent")
            else:
                print("Incremental mode: no new items detected")
    elif mode == "current":
        total_input_news = sum(len(titles) for titles in results_to_process.values())
        if is_first_today:
            filter_status = (
                "are shown (no filtering)"
                if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
                else "matched the frequency words"
            )
            print(
                f"Current-list mode: first crawl of the day, {matched_new_count} of {total_input_news} current-list items {filter_status}"
            )
        else:
            matched_count = sum(stat["count"] for stat in word_stats.values())
            filter_status = (
                "are shown (no filtering)"
                if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
                else "matched the frequency words"
            )
            print(
                f"Current-list mode: {matched_count} of {total_input_news} current-list items {filter_status}"
            )

    stats = []
    # Map group_key to its config position and max display count
    group_key_to_position = {
        group["group_key"]: idx for idx, group in enumerate(word_groups)
    }
    group_key_to_max_count = {
        group["group_key"]: group.get("max_count", 0) for group in word_groups
    }

    for group_key, data in word_stats.items():
        all_titles = []
        for source_id, title_list in data["titles"].items():
            all_titles.extend(title_list)

        # Sort by weight
        sorted_titles = sorted(
            all_titles,
            key=lambda x: (
                -calculate_news_weight(x, rank_threshold, weight_config),
                min(x["ranks"]) if x["ranks"] else 999,
                -x["count"],
            ),
        )

        # Apply the max display count (per-group config takes precedence over the global one)
        group_max_count = group_key_to_max_count.get(group_key, 0)
        if group_max_count == 0:
            # Fall back to the global config
            group_max_count = max_news_per_keyword

        if group_max_count > 0:
            sorted_titles = sorted_titles[:group_max_count]

        stats.append(
            {
                "word": group_key,
                "count": data["count"],
                "position": group_key_to_position.get(group_key, 999),
                "titles": sorted_titles,
                "percentage": (
                    round(data["count"] / total_titles * 100, 2)
                    if total_titles > 0
                    else 0
                ),
            }
        )

    # Choose the sorting priority based on config
    if sort_by_position_first:
        # Config position first, then hot-item count
        stats.sort(key=lambda x: (x["position"], -x["count"]))
    else:
        # Hot-item count first, then config position (original behavior)
        stats.sort(key=lambda x: (-x["count"], x["position"]))

    # Print the post-filter match count (consistent with what is pushed)
    matched_news_count = sum(len(stat["titles"]) for stat in stats if stat["count"] > 0)
    if mode == "daily":
        print(f"After frequency-word filtering: {matched_news_count} items matched (these will appear in the push)")

    return stats, total_titles
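A minimal end-to-end call of count_word_frequency (hypothetical crawl data; the default weight config and helper callables fill in automatically):

    results = {"zhihu": {"AI 芯片竞争加剧": {"ranks": [1, 3], "url": "https://example.com/1"}}}
    word_groups = [{"required": [], "normal": ["AI"], "group_key": "AI"}]
    stats, total = count_word_frequency(
        results,
        word_groups,
        filter_words=[],
        id_to_name={"zhihu": "知乎"},
    )
    # stats[0]["word"] == "AI", stats[0]["count"] == 1, total == 1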
trendradar/core/config.py
@@ -0,0 +1,152 @@
# coding=utf-8
"""
Config utility module - multi-account config parsing and validation

Provides parsing, validation and limiting of multi-account push configs.
"""

from typing import Dict, List, Optional, Tuple


def parse_multi_account_config(config_value: str, separator: str = ";") -> List[str]:
    """
    Parse a multi-account config string into a list of accounts.

    Args:
        config_value: config string; multiple accounts separated by the separator
        separator: separator, defaults to ;

    Returns:
        List of accounts; empty strings are kept as placeholders

    Examples:
        >>> parse_multi_account_config("url1;url2;url3")
        ['url1', 'url2', 'url3']
        >>> parse_multi_account_config(";token2")  # first account has no token
        ['', 'token2']
        >>> parse_multi_account_config("")
        []
    """
    if not config_value:
        return []
    # Keep empty strings as placeholders (e.g. ";token2" means the first account has no token)
    accounts = [acc.strip() for acc in config_value.split(separator)]
    # Treat an all-empty result as no config at all
    if all(not acc for acc in accounts):
        return []
    return accounts


def validate_paired_configs(
    configs: Dict[str, List[str]],
    channel_name: str,
    required_keys: Optional[List[str]] = None
) -> Tuple[bool, int]:
    """
    Validate that paired configs have matching account counts.

    For channels whose config items must be paired (e.g. Telegram's token and
    chat_id), verify that every config item has the same number of accounts.

    Args:
        configs: config dict; key is the config name, value is the account list
        channel_name: channel name, used in log output
        required_keys: config items that must be non-empty

    Returns:
        (whether validation passed, account count)

    Examples:
        >>> validate_paired_configs({
        ...     "token": ["t1", "t2"],
        ...     "chat_id": ["c1", "c2"]
        ... }, "Telegram", ["token", "chat_id"])
        (True, 2)

        >>> validate_paired_configs({
        ...     "token": ["t1", "t2"],
        ...     "chat_id": ["c1"]  # counts do not match
        ... }, "Telegram", ["token", "chat_id"])
        (False, 0)
    """
    # Drop empty lists
    non_empty_configs = {k: v for k, v in configs.items() if v}

    if not non_empty_configs:
        return True, 0

    # Check required items
    if required_keys:
        for key in required_keys:
            if key not in non_empty_configs or not non_empty_configs[key]:
                return True, 0  # a required item is empty: treat the channel as not configured

    # Collect the lengths of all non-empty configs
    lengths = {k: len(v) for k, v in non_empty_configs.items()}
    unique_lengths = set(lengths.values())

    if len(unique_lengths) > 1:
        print(f"❌ {channel_name} config error: paired config counts do not match; this channel will be skipped")
        for key, length in lengths.items():
            print(f"   - {key}: {length} account(s)")
        return False, 0

    return True, list(unique_lengths)[0] if unique_lengths else 0


def limit_accounts(
    accounts: List[str],
    max_count: int,
    channel_name: str
) -> List[str]:
    """
    Limit the number of accounts.

    When more accounts are configured than the maximum allows, only the first
    N accounts are used and a warning is printed.

    Args:
        accounts: account list
        max_count: maximum number of accounts
        channel_name: channel name, used in log output

    Returns:
        The limited account list

    Examples:
        >>> limit_accounts(["a1", "a2", "a3"], 2, "Feishu")
        ⚠️ Feishu has 3 accounts configured, exceeding the limit of 2; only the first 2 will be used
        ['a1', 'a2']
    """
    if len(accounts) > max_count:
        print(f"⚠️ {channel_name} has {len(accounts)} accounts configured, exceeding the limit of {max_count}; only the first {max_count} will be used")
        print("   ⚠️ Warning: for fork users, too many accounts can make GitHub Actions runs very long and put the account at risk")
        return accounts[:max_count]
    return accounts


def get_account_at_index(accounts: List[str], index: int, default: str = "") -> str:
    """
    Safely get the account value at a given index.

    Returns the default when the index is out of range or the value is empty.

    Args:
        accounts: account list
        index: index
        default: default value

    Returns:
        The account value, or the default

    Examples:
        >>> get_account_at_index(["a", "b", "c"], 1)
        'b'
        >>> get_account_at_index(["a", "", "c"], 1, "default")
        'default'
        >>> get_account_at_index(["a"], 5, "default")
        'default'
    """
    if index < len(accounts):
        return accounts[index] if accounts[index] else default
    return default
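Taken together, the four helpers compose like this for a paired channel (a sketch with hypothetical values):

    tokens = parse_multi_account_config("botA;botB")
    chat_ids = parse_multi_account_config("chatA;chatB")
    ok, count = validate_paired_configs(
        {"token": tokens, "chat_id": chat_ids}, "Telegram", ["token", "chat_id"]
    )
    if ok and count > 0:
        tokens = limit_accounts(tokens, 3, "Telegram")
        for i in range(min(count, 3)):
            token = get_account_at_index(tokens, i)
            chat_id = get_account_at_index(chat_ids, i)
            # ... push with (token, chat_id)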
trendradar/core/data.py
@@ -0,0 +1,291 @@
# coding=utf-8
"""
Data handling module

Provides data reading, saving and detection:
- save_titles_to_file: save titles to a TXT file
- read_all_today_titles: read all of today's titles from the storage backend
- detect_latest_new_titles: detect titles newly added in the latest batch

Author: TrendRadar Team
"""

from pathlib import Path
from typing import Dict, List, Tuple, Optional, Callable


def save_titles_to_file(
    results: Dict,
    id_to_name: Dict,
    failed_ids: List,
    output_path: str,
    clean_title_func: Callable[[str], str],
) -> str:
    """
    Save titles to a TXT file.

    Args:
        results: crawl results {source_id: {title: title_data}}
        id_to_name: mapping from ID to name
        failed_ids: list of failed IDs
        output_path: output file path
        clean_title_func: title-cleaning function

    Returns:
        str: the path the file was saved to
    """
    # Make sure the directory exists
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        for id_value, title_data in results.items():
            # Header line: "id | name", or just "id" when no distinct name exists
            name = id_to_name.get(id_value)
            if name and name != id_value:
                f.write(f"{id_value} | {name}\n")
            else:
                f.write(f"{id_value}\n")

            # Sort titles by rank
            sorted_titles = []
            for title, info in title_data.items():
                cleaned_title = clean_title_func(title)
                if isinstance(info, dict):
                    ranks = info.get("ranks", [])
                    url = info.get("url", "")
                    mobile_url = info.get("mobileUrl", "")
                else:
                    ranks = info if isinstance(info, list) else []
                    url = ""
                    mobile_url = ""

                rank = ranks[0] if ranks else 1
                sorted_titles.append((rank, cleaned_title, url, mobile_url))

            sorted_titles.sort(key=lambda x: x[0])

            for rank, cleaned_title, url, mobile_url in sorted_titles:
                line = f"{rank}. {cleaned_title}"

                if url:
                    line += f" [URL:{url}]"
                if mobile_url:
                    line += f" [MOBILE:{mobile_url}]"
                f.write(line + "\n")

            f.write("\n")

        if failed_ids:
            # Marker kept verbatim ("the following IDs failed"), since the TXT format may be parsed elsewhere
            f.write("==== 以下ID请求失败 ====\n")
            for id_value in failed_ids:
                f.write(f"{id_value}\n")

    return output_path
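# The resulting TXT looks roughly like this (hypothetical data):
#
#     zhihu | 知乎
#     1. 第一条标题 [URL:https://example.com/a] [MOBILE:https://m.example.com/a]
#     2. 第二条标题
#
#     ==== 以下ID请求失败 ====
#     weibo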
def read_all_today_titles_from_storage(
    storage_manager,
    current_platform_ids: Optional[List[str]] = None,
) -> Tuple[Dict, Dict, Dict]:
    """
    Read all of today's titles from the storage backend (SQLite data).

    Args:
        storage_manager: storage manager instance
        current_platform_ids: IDs of currently monitored platforms (used as a filter)

    Returns:
        Tuple[Dict, Dict, Dict]: (all_results, id_to_name, title_info)
    """
    try:
        news_data = storage_manager.get_today_all_data()

        if not news_data or not news_data.items:
            return {}, {}, {}

        all_results = {}
        final_id_to_name = {}
        title_info = {}

        for source_id, news_list in news_data.items.items():
            # Filter by platform
            if current_platform_ids is not None and source_id not in current_platform_ids:
                continue

            # Resolve the source name
            source_name = news_data.id_to_name.get(source_id, source_id)
            final_id_to_name[source_id] = source_name

            if source_id not in all_results:
                all_results[source_id] = {}
                title_info[source_id] = {}

            for item in news_list:
                title = item.title
                ranks = getattr(item, 'ranks', [item.rank])
                first_time = getattr(item, 'first_time', item.crawl_time)
                last_time = getattr(item, 'last_time', item.crawl_time)
                count = getattr(item, 'count', 1)

                all_results[source_id][title] = {
                    "ranks": ranks,
                    "url": item.url or "",
                    "mobileUrl": item.mobile_url or "",
                }

                title_info[source_id][title] = {
                    "first_time": first_time,
                    "last_time": last_time,
                    "count": count,
                    "ranks": ranks,
                    "url": item.url or "",
                    "mobileUrl": item.mobile_url or "",
                }

        return all_results, final_id_to_name, title_info

    except Exception as e:
        print(f"[storage] Failed to read data from the storage backend: {e}")
        return {}, {}, {}


def read_all_today_titles(
    storage_manager,
    current_platform_ids: Optional[List[str]] = None,
) -> Tuple[Dict, Dict, Dict]:
    """
    Read all of today's titles (from the storage backend).

    Args:
        storage_manager: storage manager instance
        current_platform_ids: IDs of currently monitored platforms (used as a filter)

    Returns:
        Tuple[Dict, Dict, Dict]: (all_results, id_to_name, title_info)
    """
    all_results, final_id_to_name, title_info = read_all_today_titles_from_storage(
        storage_manager, current_platform_ids
    )

    if all_results:
        total_count = sum(len(titles) for titles in all_results.values())
        print(f"[storage] Read {total_count} titles from the storage backend")
    else:
        print("[storage] No data yet for today")

    return all_results, final_id_to_name, title_info


def detect_latest_new_titles_from_storage(
    storage_manager,
    current_platform_ids: Optional[List[str]] = None,
) -> Dict:
    """
    Detect titles newly added in the latest batch, using the storage backend.

    Args:
        storage_manager: storage manager instance
        current_platform_ids: IDs of currently monitored platforms (used as a filter)

    Returns:
        Dict: new titles {source_id: {title: title_data}}
    """
    try:
        # Fetch the latest crawl batch
        latest_data = storage_manager.get_latest_crawl_data()
        if not latest_data or not latest_data.items:
            return {}

        # Fetch all of today's data
        all_data = storage_manager.get_today_all_data()
        if not all_data or not all_data.items:
            # No historical data (first crawl): nothing should count as "new"
            return {}

        # Collect historical titles (excluding the latest batch's crawl time)
        latest_time = latest_data.crawl_time
        historical_titles = {}

        for source_id, news_list in all_data.items.items():
            if current_platform_ids is not None and source_id not in current_platform_ids:
                continue

            historical_titles[source_id] = set()
            for item in news_list:
                # Only count titles that are not from the latest batch
                first_time = getattr(item, 'first_time', item.crawl_time)
                if first_time != latest_time:
                    historical_titles[source_id].add(item.title)

        # Check whether this is the first crawl of the day (no historical titles at all).
        # If every platform's historical set is empty, there is only one crawl batch,
        # so nothing should count as "new".
        has_historical_data = any(len(titles) > 0 for titles in historical_titles.values())
        if not has_historical_data:
            return {}

        # Find the new titles
        new_titles = {}
        for source_id, news_list in latest_data.items.items():
            if current_platform_ids is not None and source_id not in current_platform_ids:
                continue

            historical_set = historical_titles.get(source_id, set())
            source_new_titles = {}

            for item in news_list:
                if item.title not in historical_set:
                    source_new_titles[item.title] = {
                        "ranks": [item.rank],
                        "url": item.url or "",
                        "mobileUrl": item.mobile_url or "",
                    }

            if source_new_titles:
                new_titles[source_id] = source_new_titles

        return new_titles

    except Exception as e:
        print(f"[storage] Failed to detect new titles from the storage backend: {e}")
        return {}


def detect_latest_new_titles(
    storage_manager,
    current_platform_ids: Optional[List[str]] = None,
) -> Dict:
    """
    Detect titles newly added in today's latest batch (from the storage backend).

    Args:
        storage_manager: storage manager instance
        current_platform_ids: IDs of currently monitored platforms (used as a filter)

    Returns:
        Dict: new titles {source_id: {title: title_data}}
    """
    new_titles = detect_latest_new_titles_from_storage(storage_manager, current_platform_ids)
    if new_titles:
        total_new = sum(len(titles) for titles in new_titles.values())
        print(f"[storage] Detected {total_new} new titles from the storage backend")
    return new_titles


def is_first_crawl_today(output_dir: str, date_folder: str) -> bool:
    """
    Detect whether this is the first crawl of the day.

    Args:
        output_dir: output directory
        date_folder: name of the date folder

    Returns:
        bool: whether this is the first crawl of the day
    """
    txt_dir = Path(output_dir) / date_folder / "txt"

    if not txt_dir.exists():
        return True

    files = sorted([f for f in txt_dir.iterdir() if f.suffix == ".txt"])
    return len(files) <= 1
trendradar/core/frequency.py
@@ -0,0 +1,194 @@
# coding=utf-8
"""
Frequency-word config loading module

Loads frequency-word rules from the config file, supporting:
- plain word groups
- required words (+ prefix)
- filter words (! prefix)
- global filter words ([GLOBAL_FILTER] section)
- max display count (@ prefix)
"""

import os
from pathlib import Path
from typing import Dict, List, Tuple, Optional


def load_frequency_words(
    frequency_file: Optional[str] = None,
) -> Tuple[List[Dict], List[str], List[str]]:
    """
    Load the frequency-word config.

    Config file format:
    - word groups are separated by blank lines
    - the [GLOBAL_FILTER] section defines global filter words
    - the [WORD_GROUPS] section defines word groups (the default)

    Word-group syntax:
    - plain word: listed as-is; any single match counts
    - +word: required word; all required words must match
    - !word: filter word; a match excludes the title
    - @number: the maximum number of items shown for the group

    Args:
        frequency_file: path to the frequency-word config; defaults to the
            FREQUENCY_WORDS_PATH environment variable or config/frequency_words.txt

    Returns:
        (word groups, per-group filter words, global filter words)

    Raises:
        FileNotFoundError: the frequency-word file does not exist
    """
    if frequency_file is None:
        frequency_file = os.environ.get(
            "FREQUENCY_WORDS_PATH", "config/frequency_words.txt"
        )

    frequency_path = Path(frequency_file)
    if not frequency_path.exists():
        raise FileNotFoundError(f"Frequency-word file {frequency_file} does not exist")

    with open(frequency_path, "r", encoding="utf-8") as f:
        content = f.read()

    word_groups = [group.strip() for group in content.split("\n\n") if group.strip()]

    processed_groups = []
    filter_words = []
    global_filters = []

    # Default section (backwards compatible)
    current_section = "WORD_GROUPS"

    for group in word_groups:
        lines = [line.strip() for line in group.split("\n") if line.strip()]

        if not lines:
            continue

        # Check for a section marker
        if lines[0].startswith("[") and lines[0].endswith("]"):
            section_name = lines[0][1:-1].upper()
            if section_name in ("GLOBAL_FILTER", "WORD_GROUPS"):
                current_section = section_name
                lines = lines[1:]  # drop the marker line

        # Handle the global filter section
        if current_section == "GLOBAL_FILTER":
            # Add every non-empty line to the global filter list
            for line in lines:
                # Skip special-syntax prefixes; only plain text is accepted
                if line.startswith(("!", "+", "@")):
                    continue  # special syntax is not supported in the global filter section
                if line:
                    global_filters.append(line)
            continue

        # Handle the word-group section
        words = lines

        group_required_words = []
        group_normal_words = []
        group_filter_words = []
        group_max_count = 0  # 0 means unlimited

        for word in words:
            if word.startswith("@"):
                # Parse the max display count (positive integers only)
                try:
                    count = int(word[1:])
                    if count > 0:
                        group_max_count = count
                except (ValueError, IndexError):
                    pass  # ignore malformed @number entries
            elif word.startswith("!"):
                filter_words.append(word[1:])
                group_filter_words.append(word[1:])
            elif word.startswith("+"):
                group_required_words.append(word[1:])
            else:
                group_normal_words.append(word)

        if group_required_words or group_normal_words:
            if group_normal_words:
                group_key = " ".join(group_normal_words)
            else:
                group_key = " ".join(group_required_words)

            processed_groups.append(
                {
                    "required": group_required_words,
                    "normal": group_normal_words,
                    "group_key": group_key,
                    "max_count": group_max_count,
                }
            )

    return processed_groups, filter_words, global_filters
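# An illustrative frequency_words.txt exercising the syntax above
# (hypothetical content):
#
#     [GLOBAL_FILTER]
#     广告
#
#     [WORD_GROUPS]
#     AI
#     +芯片
#     !招聘
#     @5
#
# This parses to one group {"required": ["芯片"], "normal": ["AI"],
# "group_key": "AI", "max_count": 5}, with "招聘" as a filter word and
# "广告" filtering globally.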
def matches_word_groups(
    title: str,
    word_groups: List[Dict],
    filter_words: List[str],
    global_filters: Optional[List[str]] = None
) -> bool:
    """
    Check whether a title matches the word-group rules.

    Args:
        title: the title text
        word_groups: list of word groups
        filter_words: list of filter words
        global_filters: list of global filter words

    Returns:
        Whether the title matches
    """
    # Defensive type check: make sure title is a usable string
    if not isinstance(title, str):
        title = str(title) if title is not None else ""
    if not title.strip():
        return False

    title_lower = title.lower()

    # Global filter check (highest priority)
    if global_filters:
        if any(global_word.lower() in title_lower for global_word in global_filters):
            return False

    # With no word groups configured, match every title (supports showing all news)
    if not word_groups:
        return True

    # Filter-word check
    if any(filter_word.lower() in title_lower for filter_word in filter_words):
        return False

    # Word-group matching
    for group in word_groups:
        required_words = group["required"]
        normal_words = group["normal"]

        # Required-word check
        if required_words:
            all_required_present = all(
                req_word.lower() in title_lower for req_word in required_words
            )
            if not all_required_present:
                continue

        # Plain-word check
        if normal_words:
            any_normal_present = any(
                normal_word.lower() in title_lower for normal_word in normal_words
            )
            if not any_normal_present:
                continue

        return True

    return False
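Against the sample config sketched above, matching behaves as follows (hypothetical titles):

    groups, filters, global_filters = load_frequency_words()
    matches_word_groups("AI 芯片新品发布", groups, filters, global_filters)   # True
    matches_word_groups("AI 芯片公司招聘", groups, filters, global_filters)  # False: filter word hit
    matches_word_groups("AI 大模型进展", groups, filters, global_filters)    # False: required word missing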
trendradar/core/loader.py
@@ -0,0 +1,332 @@
# coding=utf-8
"""
Config loading module

Loads configuration from the YAML config file and environment variables.
"""

import os
from pathlib import Path
from typing import Dict, Any, Optional

import yaml

from .config import parse_multi_account_config, validate_paired_configs


def _get_env_bool(key: str) -> Optional[bool]:
    """Read a boolean from an environment variable; return None when unset."""
    value = os.environ.get(key, "").strip().lower()
    if not value:
        return None
    return value in ("true", "1")


def _get_env_int(key: str, default: int = 0) -> int:
    """Read an integer from an environment variable."""
    value = os.environ.get(key, "").strip()
    if not value:
        return default
    try:
        return int(value)
    except ValueError:
        return default


def _get_env_str(key: str, default: str = "") -> str:
    """Read a string from an environment variable."""
    return os.environ.get(key, "").strip() or default


def _load_app_config(config_data: Dict) -> Dict:
    """Load the app config."""
    app_config = config_data.get("app", {})
    return {
        "VERSION_CHECK_URL": app_config.get("version_check_url", ""),
        "SHOW_VERSION_UPDATE": app_config.get("show_version_update", True),
        "TIMEZONE": _get_env_str("TIMEZONE") or app_config.get("timezone", "Asia/Shanghai"),
    }


def _load_crawler_config(config_data: Dict) -> Dict:
    """Load the crawler config."""
    crawler_config = config_data.get("crawler", {})
    enable_crawler_env = _get_env_bool("ENABLE_CRAWLER")
    return {
        "REQUEST_INTERVAL": crawler_config.get("request_interval", 100),
        "USE_PROXY": crawler_config.get("use_proxy", False),
        "DEFAULT_PROXY": crawler_config.get("default_proxy", ""),
        "ENABLE_CRAWLER": enable_crawler_env if enable_crawler_env is not None else crawler_config.get("enable_crawler", True),
    }


def _load_report_config(config_data: Dict) -> Dict:
    """Load the report config."""
    report_config = config_data.get("report", {})

    # Environment-variable overrides
    sort_by_position_env = _get_env_bool("SORT_BY_POSITION_FIRST")
    reverse_content_env = _get_env_bool("REVERSE_CONTENT_ORDER")
    max_news_env = _get_env_int("MAX_NEWS_PER_KEYWORD")

    return {
        "REPORT_MODE": _get_env_str("REPORT_MODE") or report_config.get("mode", "daily"),
        "RANK_THRESHOLD": report_config.get("rank_threshold", 10),
        "SORT_BY_POSITION_FIRST": sort_by_position_env if sort_by_position_env is not None else report_config.get("sort_by_position_first", False),
        # Note: an env value of 0 is indistinguishable from "unset" and falls through to the file value
        "MAX_NEWS_PER_KEYWORD": max_news_env or report_config.get("max_news_per_keyword", 0),
        "REVERSE_CONTENT_ORDER": reverse_content_env if reverse_content_env is not None else report_config.get("reverse_content_order", False),
    }


def _load_notification_config(config_data: Dict) -> Dict:
    """Load the notification config."""
    notification = config_data.get("notification", {})
    enable_notification_env = _get_env_bool("ENABLE_NOTIFICATION")

    return {
        "ENABLE_NOTIFICATION": enable_notification_env if enable_notification_env is not None else notification.get("enable_notification", True),
        "MESSAGE_BATCH_SIZE": notification.get("message_batch_size", 4000),
        "DINGTALK_BATCH_SIZE": notification.get("dingtalk_batch_size", 20000),
        "FEISHU_BATCH_SIZE": notification.get("feishu_batch_size", 29000),
        "BARK_BATCH_SIZE": notification.get("bark_batch_size", 3600),
        "SLACK_BATCH_SIZE": notification.get("slack_batch_size", 4000),
        "BATCH_SEND_INTERVAL": notification.get("batch_send_interval", 1.0),
        "FEISHU_MESSAGE_SEPARATOR": notification.get("feishu_message_separator", "---"),
        "MAX_ACCOUNTS_PER_CHANNEL": _get_env_int("MAX_ACCOUNTS_PER_CHANNEL") or notification.get("max_accounts_per_channel", 3),
    }


def _load_push_window_config(config_data: Dict) -> Dict:
    """Load the push-window config."""
    notification = config_data.get("notification", {})
    push_window = notification.get("push_window", {})
    time_range = push_window.get("time_range", {})

    enabled_env = _get_env_bool("PUSH_WINDOW_ENABLED")
    once_per_day_env = _get_env_bool("PUSH_WINDOW_ONCE_PER_DAY")

    return {
        "ENABLED": enabled_env if enabled_env is not None else push_window.get("enabled", False),
        "TIME_RANGE": {
            "START": _get_env_str("PUSH_WINDOW_START") or time_range.get("start", "08:00"),
            "END": _get_env_str("PUSH_WINDOW_END") or time_range.get("end", "22:00"),
        },
        "ONCE_PER_DAY": once_per_day_env if once_per_day_env is not None else push_window.get("once_per_day", True),
    }


def _load_weight_config(config_data: Dict) -> Dict:
    """Load the weight config."""
    weight = config_data.get("weight", {})
    return {
        "RANK_WEIGHT": weight.get("rank_weight", 1.0),
        "FREQUENCY_WEIGHT": weight.get("frequency_weight", 1.0),
        "HOTNESS_WEIGHT": weight.get("hotness_weight", 1.0),
    }


def _load_storage_config(config_data: Dict) -> Dict:
    """Load the storage config."""
    storage = config_data.get("storage", {})
    formats = storage.get("formats", {})
    local = storage.get("local", {})
    remote = storage.get("remote", {})
    pull = storage.get("pull", {})

    txt_enabled_env = _get_env_bool("STORAGE_TXT_ENABLED")
    html_enabled_env = _get_env_bool("STORAGE_HTML_ENABLED")
    pull_enabled_env = _get_env_bool("PULL_ENABLED")

    return {
        "BACKEND": _get_env_str("STORAGE_BACKEND") or storage.get("backend", "auto"),
        "FORMATS": {
            "SQLITE": formats.get("sqlite", True),
            "TXT": txt_enabled_env if txt_enabled_env is not None else formats.get("txt", True),
            "HTML": html_enabled_env if html_enabled_env is not None else formats.get("html", True),
        },
        "LOCAL": {
            "DATA_DIR": local.get("data_dir", "output"),
            "RETENTION_DAYS": _get_env_int("LOCAL_RETENTION_DAYS") or local.get("retention_days", 0),
        },
        "REMOTE": {
            "ENDPOINT_URL": _get_env_str("S3_ENDPOINT_URL") or remote.get("endpoint_url", ""),
            "BUCKET_NAME": _get_env_str("S3_BUCKET_NAME") or remote.get("bucket_name", ""),
            "ACCESS_KEY_ID": _get_env_str("S3_ACCESS_KEY_ID") or remote.get("access_key_id", ""),
            "SECRET_ACCESS_KEY": _get_env_str("S3_SECRET_ACCESS_KEY") or remote.get("secret_access_key", ""),
            "REGION": _get_env_str("S3_REGION") or remote.get("region", ""),
            "RETENTION_DAYS": _get_env_int("REMOTE_RETENTION_DAYS") or remote.get("retention_days", 0),
        },
        "PULL": {
            "ENABLED": pull_enabled_env if pull_enabled_env is not None else pull.get("enabled", False),
            "DAYS": _get_env_int("PULL_DAYS") or pull.get("days", 7),
        },
    }


def _load_webhook_config(config_data: Dict) -> Dict:
    """Load the webhook config."""
    notification = config_data.get("notification", {})
    webhooks = notification.get("webhooks", {})

    return {
        # Feishu
        "FEISHU_WEBHOOK_URL": _get_env_str("FEISHU_WEBHOOK_URL") or webhooks.get("feishu_url", ""),
        # DingTalk
        "DINGTALK_WEBHOOK_URL": _get_env_str("DINGTALK_WEBHOOK_URL") or webhooks.get("dingtalk_url", ""),
        # WeCom (WeChat Work)
        "WEWORK_WEBHOOK_URL": _get_env_str("WEWORK_WEBHOOK_URL") or webhooks.get("wework_url", ""),
        "WEWORK_MSG_TYPE": _get_env_str("WEWORK_MSG_TYPE") or webhooks.get("wework_msg_type", "markdown"),
        # Telegram
        "TELEGRAM_BOT_TOKEN": _get_env_str("TELEGRAM_BOT_TOKEN") or webhooks.get("telegram_bot_token", ""),
        "TELEGRAM_CHAT_ID": _get_env_str("TELEGRAM_CHAT_ID") or webhooks.get("telegram_chat_id", ""),
        # Email
        "EMAIL_FROM": _get_env_str("EMAIL_FROM") or webhooks.get("email_from", ""),
        "EMAIL_PASSWORD": _get_env_str("EMAIL_PASSWORD") or webhooks.get("email_password", ""),
        "EMAIL_TO": _get_env_str("EMAIL_TO") or webhooks.get("email_to", ""),
        "EMAIL_SMTP_SERVER": _get_env_str("EMAIL_SMTP_SERVER") or webhooks.get("email_smtp_server", ""),
        "EMAIL_SMTP_PORT": _get_env_str("EMAIL_SMTP_PORT") or webhooks.get("email_smtp_port", ""),
        # ntfy
        "NTFY_SERVER_URL": _get_env_str("NTFY_SERVER_URL") or webhooks.get("ntfy_server_url") or "https://ntfy.sh",
        "NTFY_TOPIC": _get_env_str("NTFY_TOPIC") or webhooks.get("ntfy_topic", ""),
        "NTFY_TOKEN": _get_env_str("NTFY_TOKEN") or webhooks.get("ntfy_token", ""),
        # Bark
        "BARK_URL": _get_env_str("BARK_URL") or webhooks.get("bark_url", ""),
        # Slack
        "SLACK_WEBHOOK_URL": _get_env_str("SLACK_WEBHOOK_URL") or webhooks.get("slack_webhook_url", ""),
    }


def _print_notification_sources(config: Dict) -> None:
    """Print where each notification channel's config came from."""
    notification_sources = []
    max_accounts = config["MAX_ACCOUNTS_PER_CHANNEL"]

    if config["FEISHU_WEBHOOK_URL"]:
        accounts = parse_multi_account_config(config["FEISHU_WEBHOOK_URL"])
        count = min(len(accounts), max_accounts)
        source = "env var" if os.environ.get("FEISHU_WEBHOOK_URL") else "config file"
        notification_sources.append(f"Feishu ({source}, {count} account(s))")

    if config["DINGTALK_WEBHOOK_URL"]:
        accounts = parse_multi_account_config(config["DINGTALK_WEBHOOK_URL"])
        count = min(len(accounts), max_accounts)
        source = "env var" if os.environ.get("DINGTALK_WEBHOOK_URL") else "config file"
        notification_sources.append(f"DingTalk ({source}, {count} account(s))")

    if config["WEWORK_WEBHOOK_URL"]:
        accounts = parse_multi_account_config(config["WEWORK_WEBHOOK_URL"])
        count = min(len(accounts), max_accounts)
        source = "env var" if os.environ.get("WEWORK_WEBHOOK_URL") else "config file"
        notification_sources.append(f"WeCom ({source}, {count} account(s))")

    if config["TELEGRAM_BOT_TOKEN"] and config["TELEGRAM_CHAT_ID"]:
        tokens = parse_multi_account_config(config["TELEGRAM_BOT_TOKEN"])
        chat_ids = parse_multi_account_config(config["TELEGRAM_CHAT_ID"])
        valid, count = validate_paired_configs(
            {"bot_token": tokens, "chat_id": chat_ids},
            "Telegram",
            required_keys=["bot_token", "chat_id"]
        )
        if valid and count > 0:
            count = min(count, max_accounts)
            token_source = "env var" if os.environ.get("TELEGRAM_BOT_TOKEN") else "config file"
            notification_sources.append(f"Telegram ({token_source}, {count} account(s))")

    if config["EMAIL_FROM"] and config["EMAIL_PASSWORD"] and config["EMAIL_TO"]:
        from_source = "env var" if os.environ.get("EMAIL_FROM") else "config file"
        notification_sources.append(f"Email ({from_source})")

    if config["NTFY_SERVER_URL"] and config["NTFY_TOPIC"]:
        topics = parse_multi_account_config(config["NTFY_TOPIC"])
        tokens = parse_multi_account_config(config["NTFY_TOKEN"])
        if tokens:
            valid, count = validate_paired_configs(
                {"topic": topics, "token": tokens},
                "ntfy"
            )
            if valid and count > 0:
                count = min(count, max_accounts)
                server_source = "env var" if os.environ.get("NTFY_SERVER_URL") else "config file"
                notification_sources.append(f"ntfy ({server_source}, {count} account(s))")
        else:
            count = min(len(topics), max_accounts)
            server_source = "env var" if os.environ.get("NTFY_SERVER_URL") else "config file"
            notification_sources.append(f"ntfy ({server_source}, {count} account(s))")

    if config["BARK_URL"]:
        accounts = parse_multi_account_config(config["BARK_URL"])
        count = min(len(accounts), max_accounts)
        bark_source = "env var" if os.environ.get("BARK_URL") else "config file"
        notification_sources.append(f"Bark ({bark_source}, {count} account(s))")

    if config["SLACK_WEBHOOK_URL"]:
        accounts = parse_multi_account_config(config["SLACK_WEBHOOK_URL"])
        count = min(len(accounts), max_accounts)
        slack_source = "env var" if os.environ.get("SLACK_WEBHOOK_URL") else "config file"
        notification_sources.append(f"Slack ({slack_source}, {count} account(s))")

    if notification_sources:
        print(f"Notification channel sources: {', '.join(notification_sources)}")
        print(f"Max accounts per channel: {max_accounts}")
    else:
        print("No notification channels configured")


def load_config(config_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Load the config file.

    Args:
        config_path: config file path; defaults to the CONFIG_PATH environment
            variable or config/config.yaml

    Returns:
        A dict containing all configuration

    Raises:
        FileNotFoundError: the config file does not exist
    """
    if config_path is None:
        config_path = os.environ.get("CONFIG_PATH", "config/config.yaml")

    if not Path(config_path).exists():
        raise FileNotFoundError(f"Config file {config_path} does not exist")

    with open(config_path, "r", encoding="utf-8") as f:
        config_data = yaml.safe_load(f)

    print(f"Config file loaded: {config_path}")

    # Merge all configs
    config = {}

    # App config
    config.update(_load_app_config(config_data))

    # Crawler config
    config.update(_load_crawler_config(config_data))

    # Report config
    config.update(_load_report_config(config_data))

    # Notification config
    config.update(_load_notification_config(config_data))

    # Push-window config
    config["PUSH_WINDOW"] = _load_push_window_config(config_data)

    # Weight config
    config["WEIGHT_CONFIG"] = _load_weight_config(config_data)

    # Platform config
    config["PLATFORMS"] = config_data.get("platforms", [])

    # Storage config
    config["STORAGE"] = _load_storage_config(config_data)

    # Webhook config
    config.update(_load_webhook_config(config_data))

    # Print where each notification channel's config came from
    _print_notification_sources(config)

    return config
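For reference, a minimal config.yaml accepted by this loader might look like the following (hypothetical values; every key below is one the loader actually reads, and the noted environment variables take precedence):

    app:
      timezone: "Asia/Shanghai"   # TIMEZONE env var overrides
    crawler:
      request_interval: 100
      enable_crawler: true        # ENABLE_CRAWLER env var overrides
    report:
      mode: "daily"               # daily / incremental / current; REPORT_MODE overrides
      rank_threshold: 10
    notification:
      enable_notification: true
      max_accounts_per_channel: 3
      webhooks:
        feishu_url: ""            # FEISHU_WEBHOOK_URL env var overrides
    weight:
      rank_weight: 1.0
      frequency_weight: 1.0
      hotness_weight: 1.0
    storage:
      backend: "auto"             # STORAGE_BACKEND env var overrides
      local:
        data_dir: "output"
    platforms: []                 # platform list, passed through to config["PLATFORMS"] as-is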