Mirror of https://gitee.com/houhuan/TrendRadar.git
Synced 2026-05-01 01:12:42 +08:00

v4.0.0 Major Update
trendradar/core/__init__.py
@@ -0,0 +1,47 @@
# coding=utf-8
"""
Core module - configuration management and core utilities
"""

from trendradar.core.config import (
    parse_multi_account_config,
    validate_paired_configs,
    limit_accounts,
    get_account_at_index,
)
from trendradar.core.loader import load_config
from trendradar.core.frequency import load_frequency_words, matches_word_groups
from trendradar.core.data import (
    save_titles_to_file,
    read_all_today_titles_from_storage,
    read_all_today_titles,
    detect_latest_new_titles_from_storage,
    detect_latest_new_titles,
    is_first_crawl_today,
)
from trendradar.core.analyzer import (
    calculate_news_weight,
    format_time_display,
    count_word_frequency,
)

__all__ = [
    "parse_multi_account_config",
    "validate_paired_configs",
    "limit_accounts",
    "get_account_at_index",
    "load_config",
    "load_frequency_words",
    "matches_word_groups",
    # Data processing
    "save_titles_to_file",
    "read_all_today_titles_from_storage",
    "read_all_today_titles",
    "detect_latest_new_titles_from_storage",
    "detect_latest_new_titles",
    "is_first_crawl_today",
    # Statistical analysis
    "calculate_news_weight",
    "format_time_display",
    "count_word_frequency",
]
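A consumer would then import everything from the package root, for example (hypothetical usage, assuming the package layout above):

    from trendradar.core import load_config, load_frequency_words, count_word_frequency

    config = load_config()
    word_groups, filter_words, global_filters = load_frequency_words()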
trendradar/core/analyzer.py
@@ -0,0 +1,469 @@
# coding=utf-8
"""
Statistical analysis module

Provides news statistics and analysis:
- calculate_news_weight: compute a news item's weight
- format_time_display: format the time display
- count_word_frequency: count keyword frequency
"""

from typing import Dict, List, Tuple, Optional, Callable

from trendradar.core.frequency import matches_word_groups


def calculate_news_weight(
    title_data: Dict,
    rank_threshold: int,
    weight_config: Dict,
) -> float:
    """
    Compute a news item's weight, used for sorting.

    Args:
        title_data: title data containing ranks and count
        rank_threshold: rank threshold
        weight_config: weight config {RANK_WEIGHT, FREQUENCY_WEIGHT, HOTNESS_WEIGHT}

    Returns:
        float: the computed weight
    """
    ranks = title_data.get("ranks", [])
    if not ranks:
        return 0.0

    count = title_data.get("count", len(ranks))

    # Rank weight: sum(11 - min(rank, 10)) / number of appearances
    rank_scores = []
    for rank in ranks:
        score = 11 - min(rank, 10)
        rank_scores.append(score)

    rank_weight = sum(rank_scores) / len(ranks) if ranks else 0

    # Frequency weight: min(appearances, 10) * 10
    frequency_weight = min(count, 10) * 10

    # Hotness bonus: high-rank appearances / total appearances * 100
    high_rank_count = sum(1 for rank in ranks if rank <= rank_threshold)
    hotness_ratio = high_rank_count / len(ranks) if ranks else 0
    hotness_weight = hotness_ratio * 100

    total_weight = (
        rank_weight * weight_config["RANK_WEIGHT"]
        + frequency_weight * weight_config["FREQUENCY_WEIGHT"]
        + hotness_weight * weight_config["HOTNESS_WEIGHT"]
    )

    return total_weight
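# A quick worked example (hypothetical values; the 0.4/0.3/0.3 split mirrors
# the defaults used by count_word_frequency below):
#
#     calculate_news_weight(
#         {"ranks": [2, 5, 12], "count": 3},
#         rank_threshold=3,
#         weight_config={"RANK_WEIGHT": 0.4, "FREQUENCY_WEIGHT": 0.3, "HOTNESS_WEIGHT": 0.3},
#     )
#     # rank:      ((11-2) + (11-5) + (11-10)) / 3 = 16/3 ≈ 5.33
#     # frequency: min(3, 10) * 10                 = 30
#     # hotness:   (1 of 3 ranks <= 3) * 100       ≈ 33.33
#     # total:     5.33*0.4 + 30*0.3 + 33.33*0.3   ≈ 21.13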
def format_time_display(
    first_time: str,
    last_time: str,
    convert_time_func: Callable[[str], str],
) -> str:
    """
    Format the time display (convert HH-MM to HH:MM).

    Args:
        first_time: time of first appearance
        last_time: time of last appearance
        convert_time_func: time-format conversion function

    Returns:
        str: the formatted time display string
    """
    if not first_time:
        return ""
    # Convert to display format
    first_display = convert_time_func(first_time)
    last_display = convert_time_func(last_time)
    if first_display == last_display or not last_display:
        return first_display
    else:
        return f"[{first_display} ~ {last_display}]"
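# For instance, with a hypothetical "HH-MM" -> "HH:MM" converter:
#
#     fmt = lambda t: t.replace("-", ":")
#     format_time_display("08-00", "09-30", fmt)  # -> "[08:00 ~ 09:30]"
#     format_time_display("08-00", "08-00", fmt)  # -> "08:00"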
def count_word_frequency(
    results: Dict,
    word_groups: List[Dict],
    filter_words: List[str],
    id_to_name: Dict,
    title_info: Optional[Dict] = None,
    rank_threshold: int = 3,
    new_titles: Optional[Dict] = None,
    mode: str = "daily",
    global_filters: Optional[List[str]] = None,
    weight_config: Optional[Dict] = None,
    max_news_per_keyword: int = 0,
    sort_by_position_first: bool = False,
    is_first_crawl_func: Optional[Callable[[], bool]] = None,
    convert_time_func: Optional[Callable[[str], str]] = None,
) -> Tuple[List[Dict], int]:
    """
    Count keyword frequency, with support for required words, frequency words,
    filter words and global filter words, and flag newly added titles.

    Args:
        results: crawl results {source_id: {title: title_data}}
        word_groups: list of word-group configs
        filter_words: list of filter words
        id_to_name: mapping from ID to name
        title_info: title statistics (optional)
        rank_threshold: rank threshold
        new_titles: newly added titles (optional)
        mode: report mode (daily/incremental/current)
        global_filters: global filter words (optional)
        weight_config: weight config
        max_news_per_keyword: max items displayed per keyword
        sort_by_position_first: sort by config position first
        is_first_crawl_func: function detecting the first crawl of the day
        convert_time_func: time-format conversion function

    Returns:
        Tuple[List[Dict], int]: (list of statistics, total title count)
    """
    # Default weight config
    if weight_config is None:
        weight_config = {
            "RANK_WEIGHT": 0.4,
            "FREQUENCY_WEIGHT": 0.3,
            "HOTNESS_WEIGHT": 0.3,
        }

    # Default time conversion function
    if convert_time_func is None:
        convert_time_func = lambda x: x

    # Default first-crawl detection function
    if is_first_crawl_func is None:
        is_first_crawl_func = lambda: True

    # With no word groups configured, create a virtual "全部新闻" (all news) group
    if not word_groups:
        print("Frequency-word config is empty; all news will be shown")
        word_groups = [{"required": [], "normal": [], "group_key": "全部新闻"}]
        filter_words = []  # clear filter words so all news is shown

    is_first_today = is_first_crawl_func()

    # Decide which data to process and how new items are flagged
    if mode == "incremental":
        if is_first_today:
            # Incremental + first crawl of the day: process everything, flag all as new
            results_to_process = results
            all_news_are_new = True
        else:
            # Incremental + later crawls: only process the newly added news
            results_to_process = new_titles if new_titles else {}
            all_news_are_new = True
    elif mode == "current":
        # current mode: only process the latest crawl batch, while statistics come from the full history
        if title_info:
            latest_time = None
            for source_titles in title_info.values():
                for title_data in source_titles.values():
                    last_time = title_data.get("last_time", "")
                    if last_time:
                        if latest_time is None or last_time > latest_time:
                            latest_time = last_time

            # Only process news whose last_time equals the latest time
            if latest_time:
                results_to_process = {}
                for source_id, source_titles in results.items():
                    if source_id in title_info:
                        filtered_titles = {}
                        for title, title_data in source_titles.items():
                            if title in title_info[source_id]:
                                info = title_info[source_id][title]
                                if info.get("last_time") == latest_time:
                                    filtered_titles[title] = title_data
                        if filtered_titles:
                            results_to_process[source_id] = filtered_titles

                print(
                    f"Current-list mode: latest time {latest_time}, selected {sum(len(titles) for titles in results_to_process.values())} current-list items"
                )
            else:
                results_to_process = results
        else:
            results_to_process = results
        all_news_are_new = False
    else:
        # Daily summary mode: process all news
        results_to_process = results
        all_news_are_new = False
        total_input_news = sum(len(titles) for titles in results.values())
        filter_status = (
            "showing all"
            if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
            else "frequency-word filtering"
        )
        print(f"Daily summary mode: processing {total_input_news} items ({filter_status})")

    word_stats = {}
    total_titles = 0
    processed_titles = {}
    matched_new_count = 0

    if title_info is None:
        title_info = {}
    if new_titles is None:
        new_titles = {}

    for group in word_groups:
        group_key = group["group_key"]
        word_stats[group_key] = {"count": 0, "titles": {}}

    for source_id, titles_data in results_to_process.items():
        total_titles += len(titles_data)

        if source_id not in processed_titles:
            processed_titles[source_id] = {}

        for title, title_data in titles_data.items():
            if title in processed_titles.get(source_id, {}):
                continue

            # Use the shared matching logic
            matches_frequency_words = matches_word_groups(
                title, word_groups, filter_words, global_filters
            )

            if not matches_frequency_words:
                continue

            # In incremental mode, or current mode on the first crawl, count matched new items
            if (mode == "incremental" and all_news_are_new) or (
                mode == "current" and is_first_today
            ):
                matched_new_count += 1

            source_ranks = title_data.get("ranks", [])
            source_url = title_data.get("url", "")
            source_mobile_url = title_data.get("mobileUrl", "")

            # Find the matching word group (defensive conversion for type safety)
            title_lower = str(title).lower() if not isinstance(title, str) else title.lower()
            for group in word_groups:
                required_words = group["required"]
                normal_words = group["normal"]

                # In the "全部新闻" (all-news) case, every title matches the single virtual group
                if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻":
                    group_key = group["group_key"]
                    word_stats[group_key]["count"] += 1
                    if source_id not in word_stats[group_key]["titles"]:
                        word_stats[group_key]["titles"][source_id] = []
                else:
                    # Regular matching logic
                    if required_words:
                        all_required_present = all(
                            req_word.lower() in title_lower
                            for req_word in required_words
                        )
                        if not all_required_present:
                            continue

                    if normal_words:
                        any_normal_present = any(
                            normal_word.lower() in title_lower
                            for normal_word in normal_words
                        )
                        if not any_normal_present:
                            continue

                    group_key = group["group_key"]
                    word_stats[group_key]["count"] += 1
                    if source_id not in word_stats[group_key]["titles"]:
                        word_stats[group_key]["titles"][source_id] = []

                first_time = ""
                last_time = ""
                count_info = 1
                ranks = source_ranks if source_ranks else []
                url = source_url
                mobile_url = source_mobile_url

                # Pull complete data from the historical statistics when available
                # (applies to current mode and all other modes alike)
                if (
                    title_info
                    and source_id in title_info
                    and title in title_info[source_id]
                ):
                    info = title_info[source_id][title]
                    first_time = info.get("first_time", "")
                    last_time = info.get("last_time", "")
                    count_info = info.get("count", 1)
                    if "ranks" in info and info["ranks"]:
                        ranks = info["ranks"]
                    url = info.get("url", source_url)
                    mobile_url = info.get("mobileUrl", source_mobile_url)

                if not ranks:
                    ranks = [99]

                time_display = format_time_display(first_time, last_time, convert_time_func)

                source_name = id_to_name.get(source_id, source_id)

                # Decide whether the item is new
                is_new = False
                if all_news_are_new:
                    # In incremental mode every processed item is new; likewise on the first crawl of the day
                    is_new = True
                elif new_titles and source_id in new_titles:
                    # Check whether the title is in the new-title list
                    new_titles_for_source = new_titles[source_id]
                    is_new = title in new_titles_for_source

                word_stats[group_key]["titles"][source_id].append(
                    {
                        "title": title,
                        "source_name": source_name,
                        "first_time": first_time,
                        "last_time": last_time,
                        "time_display": time_display,
                        "count": count_info,
                        "ranks": ranks,
                        "rank_threshold": rank_threshold,
                        "url": url,
                        "mobileUrl": mobile_url,
                        "is_new": is_new,
                    }
                )

                if source_id not in processed_titles:
                    processed_titles[source_id] = {}
                processed_titles[source_id][title] = True

                break

    # Finally, print the summary
    if mode == "incremental":
        if is_first_today:
            total_input_news = sum(len(titles) for titles in results.values())
            filter_status = (
                "are shown (no filtering)"
                if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
                else "matched the frequency words"
            )
            print(
                f"Incremental mode: first crawl of the day, {matched_new_count} of {total_input_news} items {filter_status}"
            )
        else:
            if new_titles:
                total_new_count = sum(len(titles) for titles in new_titles.values())
                filter_status = (
                    "are shown (no filtering)"
                    if len(word_groups) == 1
                    and word_groups[0]["group_key"] == "全部新闻"
                    else "matched the frequency words"
                )
                print(
                    f"Incremental mode: {matched_new_count} of {total_new_count} new items {filter_status}"
                )
                if matched_new_count == 0 and len(word_groups) > 1:
                    print("Incremental mode: no new items matched the frequency words; no notification will be sent")
            else:
                print("Incremental mode: no new items detected")
    elif mode == "current":
        total_input_news = sum(len(titles) for titles in results_to_process.values())
        if is_first_today:
            filter_status = (
                "are shown (no filtering)"
                if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
                else "matched the frequency words"
            )
            print(
                f"Current-list mode: first crawl of the day, {matched_new_count} of {total_input_news} current-list items {filter_status}"
            )
        else:
            matched_count = sum(stat["count"] for stat in word_stats.values())
            filter_status = (
                "are shown (no filtering)"
                if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
                else "matched the frequency words"
            )
            print(
                f"Current-list mode: {matched_count} of {total_input_news} current-list items {filter_status}"
            )

    stats = []
    # Map group_key to its config position and max display count
    group_key_to_position = {
        group["group_key"]: idx for idx, group in enumerate(word_groups)
    }
    group_key_to_max_count = {
        group["group_key"]: group.get("max_count", 0) for group in word_groups
    }

    for group_key, data in word_stats.items():
        all_titles = []
        for source_id, title_list in data["titles"].items():
            all_titles.extend(title_list)

        # Sort by weight
        sorted_titles = sorted(
            all_titles,
            key=lambda x: (
                -calculate_news_weight(x, rank_threshold, weight_config),
                min(x["ranks"]) if x["ranks"] else 999,
                -x["count"],
            ),
        )

        # Apply the max display count (per-group config takes precedence over the global one)
        group_max_count = group_key_to_max_count.get(group_key, 0)
        if group_max_count == 0:
            # Fall back to the global config
            group_max_count = max_news_per_keyword

        if group_max_count > 0:
            sorted_titles = sorted_titles[:group_max_count]

        stats.append(
            {
                "word": group_key,
                "count": data["count"],
                "position": group_key_to_position.get(group_key, 999),
                "titles": sorted_titles,
                "percentage": (
                    round(data["count"] / total_titles * 100, 2)
                    if total_titles > 0
                    else 0
                ),
            }
        )

    # Choose the sorting priority based on config
    if sort_by_position_first:
        # Config position first, then hot-item count
        stats.sort(key=lambda x: (x["position"], -x["count"]))
    else:
        # Hot-item count first, then config position (original behavior)
        stats.sort(key=lambda x: (-x["count"], x["position"]))

    # Print the post-filter match count (consistent with what is pushed)
    matched_news_count = sum(len(stat["titles"]) for stat in stats if stat["count"] > 0)
    if mode == "daily":
        print(f"After frequency-word filtering: {matched_news_count} items matched (these will appear in the push)")

    return stats, total_titles
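A minimal end-to-end call of count_word_frequency (hypothetical crawl data; the default weight config and helper callables fill in automatically):

    results = {"zhihu": {"AI 芯片竞争加剧": {"ranks": [1, 3], "url": "https://example.com/1"}}}
    word_groups = [{"required": [], "normal": ["AI"], "group_key": "AI"}]
    stats, total = count_word_frequency(
        results,
        word_groups,
        filter_words=[],
        id_to_name={"zhihu": "知乎"},
    )
    # stats[0]["word"] == "AI", stats[0]["count"] == 1, total == 1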
trendradar/core/config.py
@@ -0,0 +1,152 @@
# coding=utf-8
"""
Config utility module - multi-account config parsing and validation

Provides parsing, validation and limiting of multi-account push configs.
"""

from typing import Dict, List, Optional, Tuple


def parse_multi_account_config(config_value: str, separator: str = ";") -> List[str]:
    """
    Parse a multi-account config string into a list of accounts.

    Args:
        config_value: config string; multiple accounts separated by the separator
        separator: separator, defaults to ;

    Returns:
        List of accounts; empty strings are kept as placeholders

    Examples:
        >>> parse_multi_account_config("url1;url2;url3")
        ['url1', 'url2', 'url3']
        >>> parse_multi_account_config(";token2")  # first account has no token
        ['', 'token2']
        >>> parse_multi_account_config("")
        []
    """
    if not config_value:
        return []
    # Keep empty strings as placeholders (e.g. ";token2" means the first account has no token)
    accounts = [acc.strip() for acc in config_value.split(separator)]
    # Treat an all-empty result as no config at all
    if all(not acc for acc in accounts):
        return []
    return accounts


def validate_paired_configs(
    configs: Dict[str, List[str]],
    channel_name: str,
    required_keys: Optional[List[str]] = None
) -> Tuple[bool, int]:
    """
    Validate that paired configs have matching account counts.

    For channels whose config items must be paired (e.g. Telegram's token and
    chat_id), verify that every config item has the same number of accounts.

    Args:
        configs: config dict; key is the config name, value is the account list
        channel_name: channel name, used in log output
        required_keys: config items that must be non-empty

    Returns:
        (whether validation passed, account count)

    Examples:
        >>> validate_paired_configs({
        ...     "token": ["t1", "t2"],
        ...     "chat_id": ["c1", "c2"]
        ... }, "Telegram", ["token", "chat_id"])
        (True, 2)

        >>> validate_paired_configs({
        ...     "token": ["t1", "t2"],
        ...     "chat_id": ["c1"]  # counts do not match
        ... }, "Telegram", ["token", "chat_id"])
        (False, 0)
    """
    # Drop empty lists
    non_empty_configs = {k: v for k, v in configs.items() if v}

    if not non_empty_configs:
        return True, 0

    # Check required items
    if required_keys:
        for key in required_keys:
            if key not in non_empty_configs or not non_empty_configs[key]:
                return True, 0  # a required item is empty: treat the channel as not configured

    # Collect the lengths of all non-empty configs
    lengths = {k: len(v) for k, v in non_empty_configs.items()}
    unique_lengths = set(lengths.values())

    if len(unique_lengths) > 1:
        print(f"❌ {channel_name} config error: paired config counts do not match; this channel will be skipped")
        for key, length in lengths.items():
            print(f"   - {key}: {length} account(s)")
        return False, 0

    return True, list(unique_lengths)[0] if unique_lengths else 0


def limit_accounts(
    accounts: List[str],
    max_count: int,
    channel_name: str
) -> List[str]:
    """
    Limit the number of accounts.

    When more accounts are configured than the maximum allows, only the first
    N accounts are used and a warning is printed.

    Args:
        accounts: account list
        max_count: maximum number of accounts
        channel_name: channel name, used in log output

    Returns:
        The limited account list

    Examples:
        >>> limit_accounts(["a1", "a2", "a3"], 2, "Feishu")
        ⚠️ Feishu has 3 accounts configured, exceeding the limit of 2; only the first 2 will be used
        ['a1', 'a2']
    """
    if len(accounts) > max_count:
        print(f"⚠️ {channel_name} has {len(accounts)} accounts configured, exceeding the limit of {max_count}; only the first {max_count} will be used")
        print("   ⚠️ Warning: for fork users, too many accounts can make GitHub Actions runs very long and put the account at risk")
        return accounts[:max_count]
    return accounts


def get_account_at_index(accounts: List[str], index: int, default: str = "") -> str:
    """
    Safely get the account value at a given index.

    Returns the default when the index is out of range or the value is empty.

    Args:
        accounts: account list
        index: index
        default: default value

    Returns:
        The account value, or the default

    Examples:
        >>> get_account_at_index(["a", "b", "c"], 1)
        'b'
        >>> get_account_at_index(["a", "", "c"], 1, "default")
        'default'
        >>> get_account_at_index(["a"], 5, "default")
        'default'
    """
    if index < len(accounts):
        return accounts[index] if accounts[index] else default
    return default
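Taken together, the four helpers compose like this for a paired channel (a sketch with hypothetical values):

    tokens = parse_multi_account_config("botA;botB")
    chat_ids = parse_multi_account_config("chatA;chatB")
    ok, count = validate_paired_configs(
        {"token": tokens, "chat_id": chat_ids}, "Telegram", ["token", "chat_id"]
    )
    if ok and count > 0:
        tokens = limit_accounts(tokens, 3, "Telegram")
        for i in range(min(count, 3)):
            token = get_account_at_index(tokens, i)
            chat_id = get_account_at_index(chat_ids, i)
            # ... push with (token, chat_id)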
trendradar/core/data.py
@@ -0,0 +1,291 @@
# coding=utf-8
"""
Data handling module

Provides data reading, saving and detection:
- save_titles_to_file: save titles to a TXT file
- read_all_today_titles: read all of today's titles from the storage backend
- detect_latest_new_titles: detect titles newly added in the latest batch

Author: TrendRadar Team
"""

from pathlib import Path
from typing import Dict, List, Tuple, Optional, Callable


def save_titles_to_file(
    results: Dict,
    id_to_name: Dict,
    failed_ids: List,
    output_path: str,
    clean_title_func: Callable[[str], str],
) -> str:
    """
    Save titles to a TXT file.

    Args:
        results: crawl results {source_id: {title: title_data}}
        id_to_name: mapping from ID to name
        failed_ids: list of failed IDs
        output_path: output file path
        clean_title_func: title-cleaning function

    Returns:
        str: the path the file was saved to
    """
    # Make sure the directory exists
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        for id_value, title_data in results.items():
            # Header line: "id | name", or just "id" when no distinct name exists
            name = id_to_name.get(id_value)
            if name and name != id_value:
                f.write(f"{id_value} | {name}\n")
            else:
                f.write(f"{id_value}\n")

            # Sort titles by rank
            sorted_titles = []
            for title, info in title_data.items():
                cleaned_title = clean_title_func(title)
                if isinstance(info, dict):
                    ranks = info.get("ranks", [])
                    url = info.get("url", "")
                    mobile_url = info.get("mobileUrl", "")
                else:
                    ranks = info if isinstance(info, list) else []
                    url = ""
                    mobile_url = ""

                rank = ranks[0] if ranks else 1
                sorted_titles.append((rank, cleaned_title, url, mobile_url))

            sorted_titles.sort(key=lambda x: x[0])

            for rank, cleaned_title, url, mobile_url in sorted_titles:
                line = f"{rank}. {cleaned_title}"

                if url:
                    line += f" [URL:{url}]"
                if mobile_url:
                    line += f" [MOBILE:{mobile_url}]"
                f.write(line + "\n")

            f.write("\n")

        if failed_ids:
            # Marker kept verbatim ("the following IDs failed"), since the TXT format may be parsed elsewhere
            f.write("==== 以下ID请求失败 ====\n")
            for id_value in failed_ids:
                f.write(f"{id_value}\n")

    return output_path
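# The resulting TXT looks roughly like this (hypothetical data):
#
#     zhihu | 知乎
#     1. 第一条标题 [URL:https://example.com/a] [MOBILE:https://m.example.com/a]
#     2. 第二条标题
#
#     ==== 以下ID请求失败 ====
#     weibo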
def read_all_today_titles_from_storage(
    storage_manager,
    current_platform_ids: Optional[List[str]] = None,
) -> Tuple[Dict, Dict, Dict]:
    """
    Read all of today's titles from the storage backend (SQLite data).

    Args:
        storage_manager: storage manager instance
        current_platform_ids: IDs of currently monitored platforms (used as a filter)

    Returns:
        Tuple[Dict, Dict, Dict]: (all_results, id_to_name, title_info)
    """
    try:
        news_data = storage_manager.get_today_all_data()

        if not news_data or not news_data.items:
            return {}, {}, {}

        all_results = {}
        final_id_to_name = {}
        title_info = {}

        for source_id, news_list in news_data.items.items():
            # Filter by platform
            if current_platform_ids is not None and source_id not in current_platform_ids:
                continue

            # Resolve the source name
            source_name = news_data.id_to_name.get(source_id, source_id)
            final_id_to_name[source_id] = source_name

            if source_id not in all_results:
                all_results[source_id] = {}
                title_info[source_id] = {}

            for item in news_list:
                title = item.title
                ranks = getattr(item, 'ranks', [item.rank])
                first_time = getattr(item, 'first_time', item.crawl_time)
                last_time = getattr(item, 'last_time', item.crawl_time)
                count = getattr(item, 'count', 1)

                all_results[source_id][title] = {
                    "ranks": ranks,
                    "url": item.url or "",
                    "mobileUrl": item.mobile_url or "",
                }

                title_info[source_id][title] = {
                    "first_time": first_time,
                    "last_time": last_time,
                    "count": count,
                    "ranks": ranks,
                    "url": item.url or "",
                    "mobileUrl": item.mobile_url or "",
                }

        return all_results, final_id_to_name, title_info

    except Exception as e:
        print(f"[storage] Failed to read data from the storage backend: {e}")
        return {}, {}, {}


def read_all_today_titles(
    storage_manager,
    current_platform_ids: Optional[List[str]] = None,
) -> Tuple[Dict, Dict, Dict]:
    """
    Read all of today's titles (from the storage backend).

    Args:
        storage_manager: storage manager instance
        current_platform_ids: IDs of currently monitored platforms (used as a filter)

    Returns:
        Tuple[Dict, Dict, Dict]: (all_results, id_to_name, title_info)
    """
    all_results, final_id_to_name, title_info = read_all_today_titles_from_storage(
        storage_manager, current_platform_ids
    )

    if all_results:
        total_count = sum(len(titles) for titles in all_results.values())
        print(f"[storage] Read {total_count} titles from the storage backend")
    else:
        print("[storage] No data yet for today")

    return all_results, final_id_to_name, title_info


def detect_latest_new_titles_from_storage(
    storage_manager,
    current_platform_ids: Optional[List[str]] = None,
) -> Dict:
    """
    Detect titles newly added in the latest batch, using the storage backend.

    Args:
        storage_manager: storage manager instance
        current_platform_ids: IDs of currently monitored platforms (used as a filter)

    Returns:
        Dict: new titles {source_id: {title: title_data}}
    """
    try:
        # Fetch the latest crawl batch
        latest_data = storage_manager.get_latest_crawl_data()
        if not latest_data or not latest_data.items:
            return {}

        # Fetch all of today's data
        all_data = storage_manager.get_today_all_data()
        if not all_data or not all_data.items:
            # No historical data (first crawl): nothing should count as "new"
            return {}

        # Collect historical titles (excluding the latest batch's crawl time)
        latest_time = latest_data.crawl_time
        historical_titles = {}

        for source_id, news_list in all_data.items.items():
            if current_platform_ids is not None and source_id not in current_platform_ids:
                continue

            historical_titles[source_id] = set()
            for item in news_list:
                # Only count titles that are not from the latest batch
                first_time = getattr(item, 'first_time', item.crawl_time)
                if first_time != latest_time:
                    historical_titles[source_id].add(item.title)

        # Check whether this is the first crawl of the day (no historical titles at all).
        # If every platform's historical set is empty, there is only one crawl batch,
        # so nothing should count as "new".
        has_historical_data = any(len(titles) > 0 for titles in historical_titles.values())
        if not has_historical_data:
            return {}

        # Find the new titles
        new_titles = {}
        for source_id, news_list in latest_data.items.items():
            if current_platform_ids is not None and source_id not in current_platform_ids:
                continue

            historical_set = historical_titles.get(source_id, set())
            source_new_titles = {}

            for item in news_list:
                if item.title not in historical_set:
                    source_new_titles[item.title] = {
                        "ranks": [item.rank],
                        "url": item.url or "",
                        "mobileUrl": item.mobile_url or "",
                    }

            if source_new_titles:
                new_titles[source_id] = source_new_titles

        return new_titles

    except Exception as e:
        print(f"[storage] Failed to detect new titles from the storage backend: {e}")
        return {}


def detect_latest_new_titles(
    storage_manager,
    current_platform_ids: Optional[List[str]] = None,
) -> Dict:
    """
    Detect titles newly added in today's latest batch (from the storage backend).

    Args:
        storage_manager: storage manager instance
        current_platform_ids: IDs of currently monitored platforms (used as a filter)

    Returns:
        Dict: new titles {source_id: {title: title_data}}
    """
    new_titles = detect_latest_new_titles_from_storage(storage_manager, current_platform_ids)
    if new_titles:
        total_new = sum(len(titles) for titles in new_titles.values())
        print(f"[storage] Detected {total_new} new titles from the storage backend")
    return new_titles


def is_first_crawl_today(output_dir: str, date_folder: str) -> bool:
    """
    Detect whether this is the first crawl of the day.

    Args:
        output_dir: output directory
        date_folder: name of the date folder

    Returns:
        bool: whether this is the first crawl of the day
    """
    txt_dir = Path(output_dir) / date_folder / "txt"

    if not txt_dir.exists():
        return True

    files = sorted([f for f in txt_dir.iterdir() if f.suffix == ".txt"])
    return len(files) <= 1
trendradar/core/frequency.py
@@ -0,0 +1,194 @@
# coding=utf-8
"""
Frequency-word config loading module

Loads frequency-word rules from the config file, supporting:
- plain word groups
- required words (+ prefix)
- filter words (! prefix)
- global filter words ([GLOBAL_FILTER] section)
- max display count (@ prefix)
"""

import os
from pathlib import Path
from typing import Dict, List, Tuple, Optional


def load_frequency_words(
    frequency_file: Optional[str] = None,
) -> Tuple[List[Dict], List[str], List[str]]:
    """
    Load the frequency-word config.

    Config file format:
    - word groups are separated by blank lines
    - the [GLOBAL_FILTER] section defines global filter words
    - the [WORD_GROUPS] section defines word groups (the default)

    Word-group syntax:
    - plain word: listed as-is; any single match counts
    - +word: required word; all required words must match
    - !word: filter word; a match excludes the title
    - @number: the maximum number of items shown for the group

    Args:
        frequency_file: path to the frequency-word config; defaults to the
            FREQUENCY_WORDS_PATH environment variable or config/frequency_words.txt

    Returns:
        (word groups, per-group filter words, global filter words)

    Raises:
        FileNotFoundError: the frequency-word file does not exist
    """
    if frequency_file is None:
        frequency_file = os.environ.get(
            "FREQUENCY_WORDS_PATH", "config/frequency_words.txt"
        )

    frequency_path = Path(frequency_file)
    if not frequency_path.exists():
        raise FileNotFoundError(f"Frequency-word file {frequency_file} does not exist")

    with open(frequency_path, "r", encoding="utf-8") as f:
        content = f.read()

    word_groups = [group.strip() for group in content.split("\n\n") if group.strip()]

    processed_groups = []
    filter_words = []
    global_filters = []

    # Default section (backwards compatible)
    current_section = "WORD_GROUPS"

    for group in word_groups:
        lines = [line.strip() for line in group.split("\n") if line.strip()]

        if not lines:
            continue

        # Check for a section marker
        if lines[0].startswith("[") and lines[0].endswith("]"):
            section_name = lines[0][1:-1].upper()
            if section_name in ("GLOBAL_FILTER", "WORD_GROUPS"):
                current_section = section_name
                lines = lines[1:]  # drop the marker line

        # Handle the global filter section
        if current_section == "GLOBAL_FILTER":
            # Add every non-empty line to the global filter list
            for line in lines:
                # Skip special-syntax prefixes; only plain text is accepted
                if line.startswith(("!", "+", "@")):
                    continue  # special syntax is not supported in the global filter section
                if line:
                    global_filters.append(line)
            continue

        # Handle the word-group section
        words = lines

        group_required_words = []
        group_normal_words = []
        group_filter_words = []
        group_max_count = 0  # 0 means unlimited

        for word in words:
            if word.startswith("@"):
                # Parse the max display count (positive integers only)
                try:
                    count = int(word[1:])
                    if count > 0:
                        group_max_count = count
                except (ValueError, IndexError):
                    pass  # ignore malformed @number entries
            elif word.startswith("!"):
                filter_words.append(word[1:])
                group_filter_words.append(word[1:])
            elif word.startswith("+"):
                group_required_words.append(word[1:])
            else:
                group_normal_words.append(word)

        if group_required_words or group_normal_words:
            if group_normal_words:
                group_key = " ".join(group_normal_words)
            else:
                group_key = " ".join(group_required_words)

            processed_groups.append(
                {
                    "required": group_required_words,
                    "normal": group_normal_words,
                    "group_key": group_key,
                    "max_count": group_max_count,
                }
            )

    return processed_groups, filter_words, global_filters
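# An illustrative frequency_words.txt exercising the syntax above
# (hypothetical content):
#
#     [GLOBAL_FILTER]
#     广告
#
#     [WORD_GROUPS]
#     AI
#     +芯片
#     !招聘
#     @5
#
# This parses to one group {"required": ["芯片"], "normal": ["AI"],
# "group_key": "AI", "max_count": 5}, with "招聘" as a filter word and
# "广告" filtering globally.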
def matches_word_groups(
    title: str,
    word_groups: List[Dict],
    filter_words: List[str],
    global_filters: Optional[List[str]] = None
) -> bool:
    """
    Check whether a title matches the word-group rules.

    Args:
        title: the title text
        word_groups: list of word groups
        filter_words: list of filter words
        global_filters: list of global filter words

    Returns:
        Whether the title matches
    """
    # Defensive type check: make sure title is a usable string
    if not isinstance(title, str):
        title = str(title) if title is not None else ""
    if not title.strip():
        return False

    title_lower = title.lower()

    # Global filter check (highest priority)
    if global_filters:
        if any(global_word.lower() in title_lower for global_word in global_filters):
            return False

    # With no word groups configured, match every title (supports showing all news)
    if not word_groups:
        return True

    # Filter-word check
    if any(filter_word.lower() in title_lower for filter_word in filter_words):
        return False

    # Word-group matching
    for group in word_groups:
        required_words = group["required"]
        normal_words = group["normal"]

        # Required-word check
        if required_words:
            all_required_present = all(
                req_word.lower() in title_lower for req_word in required_words
            )
            if not all_required_present:
                continue

        # Plain-word check
        if normal_words:
            any_normal_present = any(
                normal_word.lower() in title_lower for normal_word in normal_words
            )
            if not any_normal_present:
                continue

        return True

    return False
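Against the sample config sketched above, matching behaves as follows (hypothetical titles):

    groups, filters, global_filters = load_frequency_words()
    matches_word_groups("AI 芯片新品发布", groups, filters, global_filters)   # True
    matches_word_groups("AI 芯片公司招聘", groups, filters, global_filters)  # False: filter word hit
    matches_word_groups("AI 大模型进展", groups, filters, global_filters)    # False: required word missing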
trendradar/core/loader.py
@@ -0,0 +1,332 @@
# coding=utf-8
"""
Config loading module

Loads configuration from the YAML config file and environment variables.
"""

import os
from pathlib import Path
from typing import Dict, Any, Optional

import yaml

from .config import parse_multi_account_config, validate_paired_configs


def _get_env_bool(key: str) -> Optional[bool]:
    """Read a boolean from an environment variable; return None when unset."""
    value = os.environ.get(key, "").strip().lower()
    if not value:
        return None
    return value in ("true", "1")


def _get_env_int(key: str, default: int = 0) -> int:
    """Read an integer from an environment variable."""
    value = os.environ.get(key, "").strip()
    if not value:
        return default
    try:
        return int(value)
    except ValueError:
        return default


def _get_env_str(key: str, default: str = "") -> str:
    """Read a string from an environment variable."""
    return os.environ.get(key, "").strip() or default


def _load_app_config(config_data: Dict) -> Dict:
    """Load the app config."""
    app_config = config_data.get("app", {})
    return {
        "VERSION_CHECK_URL": app_config.get("version_check_url", ""),
        "SHOW_VERSION_UPDATE": app_config.get("show_version_update", True),
        "TIMEZONE": _get_env_str("TIMEZONE") or app_config.get("timezone", "Asia/Shanghai"),
    }


def _load_crawler_config(config_data: Dict) -> Dict:
    """Load the crawler config."""
    crawler_config = config_data.get("crawler", {})
    enable_crawler_env = _get_env_bool("ENABLE_CRAWLER")
    return {
        "REQUEST_INTERVAL": crawler_config.get("request_interval", 100),
        "USE_PROXY": crawler_config.get("use_proxy", False),
        "DEFAULT_PROXY": crawler_config.get("default_proxy", ""),
        "ENABLE_CRAWLER": enable_crawler_env if enable_crawler_env is not None else crawler_config.get("enable_crawler", True),
    }


def _load_report_config(config_data: Dict) -> Dict:
    """Load the report config."""
    report_config = config_data.get("report", {})

    # Environment-variable overrides
    sort_by_position_env = _get_env_bool("SORT_BY_POSITION_FIRST")
    reverse_content_env = _get_env_bool("REVERSE_CONTENT_ORDER")
    max_news_env = _get_env_int("MAX_NEWS_PER_KEYWORD")

    return {
        "REPORT_MODE": _get_env_str("REPORT_MODE") or report_config.get("mode", "daily"),
        "RANK_THRESHOLD": report_config.get("rank_threshold", 10),
        "SORT_BY_POSITION_FIRST": sort_by_position_env if sort_by_position_env is not None else report_config.get("sort_by_position_first", False),
        # Note: an env value of 0 is indistinguishable from "unset" and falls through to the file value
        "MAX_NEWS_PER_KEYWORD": max_news_env or report_config.get("max_news_per_keyword", 0),
        "REVERSE_CONTENT_ORDER": reverse_content_env if reverse_content_env is not None else report_config.get("reverse_content_order", False),
    }


def _load_notification_config(config_data: Dict) -> Dict:
    """Load the notification config."""
    notification = config_data.get("notification", {})
    enable_notification_env = _get_env_bool("ENABLE_NOTIFICATION")

    return {
        "ENABLE_NOTIFICATION": enable_notification_env if enable_notification_env is not None else notification.get("enable_notification", True),
        "MESSAGE_BATCH_SIZE": notification.get("message_batch_size", 4000),
        "DINGTALK_BATCH_SIZE": notification.get("dingtalk_batch_size", 20000),
        "FEISHU_BATCH_SIZE": notification.get("feishu_batch_size", 29000),
        "BARK_BATCH_SIZE": notification.get("bark_batch_size", 3600),
        "SLACK_BATCH_SIZE": notification.get("slack_batch_size", 4000),
        "BATCH_SEND_INTERVAL": notification.get("batch_send_interval", 1.0),
        "FEISHU_MESSAGE_SEPARATOR": notification.get("feishu_message_separator", "---"),
        "MAX_ACCOUNTS_PER_CHANNEL": _get_env_int("MAX_ACCOUNTS_PER_CHANNEL") or notification.get("max_accounts_per_channel", 3),
    }


def _load_push_window_config(config_data: Dict) -> Dict:
    """Load the push-window config."""
    notification = config_data.get("notification", {})
    push_window = notification.get("push_window", {})
    time_range = push_window.get("time_range", {})

    enabled_env = _get_env_bool("PUSH_WINDOW_ENABLED")
    once_per_day_env = _get_env_bool("PUSH_WINDOW_ONCE_PER_DAY")

    return {
        "ENABLED": enabled_env if enabled_env is not None else push_window.get("enabled", False),
        "TIME_RANGE": {
            "START": _get_env_str("PUSH_WINDOW_START") or time_range.get("start", "08:00"),
            "END": _get_env_str("PUSH_WINDOW_END") or time_range.get("end", "22:00"),
        },
        "ONCE_PER_DAY": once_per_day_env if once_per_day_env is not None else push_window.get("once_per_day", True),
    }


def _load_weight_config(config_data: Dict) -> Dict:
    """Load the weight config."""
    weight = config_data.get("weight", {})
    return {
        "RANK_WEIGHT": weight.get("rank_weight", 1.0),
        "FREQUENCY_WEIGHT": weight.get("frequency_weight", 1.0),
        "HOTNESS_WEIGHT": weight.get("hotness_weight", 1.0),
    }


def _load_storage_config(config_data: Dict) -> Dict:
    """Load the storage config."""
    storage = config_data.get("storage", {})
    formats = storage.get("formats", {})
    local = storage.get("local", {})
    remote = storage.get("remote", {})
    pull = storage.get("pull", {})

    txt_enabled_env = _get_env_bool("STORAGE_TXT_ENABLED")
    html_enabled_env = _get_env_bool("STORAGE_HTML_ENABLED")
    pull_enabled_env = _get_env_bool("PULL_ENABLED")

    return {
        "BACKEND": _get_env_str("STORAGE_BACKEND") or storage.get("backend", "auto"),
        "FORMATS": {
            "SQLITE": formats.get("sqlite", True),
            "TXT": txt_enabled_env if txt_enabled_env is not None else formats.get("txt", True),
            "HTML": html_enabled_env if html_enabled_env is not None else formats.get("html", True),
        },
        "LOCAL": {
            "DATA_DIR": local.get("data_dir", "output"),
            "RETENTION_DAYS": _get_env_int("LOCAL_RETENTION_DAYS") or local.get("retention_days", 0),
        },
        "REMOTE": {
            "ENDPOINT_URL": _get_env_str("S3_ENDPOINT_URL") or remote.get("endpoint_url", ""),
            "BUCKET_NAME": _get_env_str("S3_BUCKET_NAME") or remote.get("bucket_name", ""),
            "ACCESS_KEY_ID": _get_env_str("S3_ACCESS_KEY_ID") or remote.get("access_key_id", ""),
            "SECRET_ACCESS_KEY": _get_env_str("S3_SECRET_ACCESS_KEY") or remote.get("secret_access_key", ""),
            "REGION": _get_env_str("S3_REGION") or remote.get("region", ""),
            "RETENTION_DAYS": _get_env_int("REMOTE_RETENTION_DAYS") or remote.get("retention_days", 0),
        },
        "PULL": {
            "ENABLED": pull_enabled_env if pull_enabled_env is not None else pull.get("enabled", False),
            "DAYS": _get_env_int("PULL_DAYS") or pull.get("days", 7),
        },
    }


def _load_webhook_config(config_data: Dict) -> Dict:
    """Load the webhook config."""
    notification = config_data.get("notification", {})
    webhooks = notification.get("webhooks", {})

    return {
        # Feishu
        "FEISHU_WEBHOOK_URL": _get_env_str("FEISHU_WEBHOOK_URL") or webhooks.get("feishu_url", ""),
        # DingTalk
        "DINGTALK_WEBHOOK_URL": _get_env_str("DINGTALK_WEBHOOK_URL") or webhooks.get("dingtalk_url", ""),
        # WeCom (WeChat Work)
        "WEWORK_WEBHOOK_URL": _get_env_str("WEWORK_WEBHOOK_URL") or webhooks.get("wework_url", ""),
        "WEWORK_MSG_TYPE": _get_env_str("WEWORK_MSG_TYPE") or webhooks.get("wework_msg_type", "markdown"),
        # Telegram
        "TELEGRAM_BOT_TOKEN": _get_env_str("TELEGRAM_BOT_TOKEN") or webhooks.get("telegram_bot_token", ""),
        "TELEGRAM_CHAT_ID": _get_env_str("TELEGRAM_CHAT_ID") or webhooks.get("telegram_chat_id", ""),
        # Email
        "EMAIL_FROM": _get_env_str("EMAIL_FROM") or webhooks.get("email_from", ""),
        "EMAIL_PASSWORD": _get_env_str("EMAIL_PASSWORD") or webhooks.get("email_password", ""),
        "EMAIL_TO": _get_env_str("EMAIL_TO") or webhooks.get("email_to", ""),
        "EMAIL_SMTP_SERVER": _get_env_str("EMAIL_SMTP_SERVER") or webhooks.get("email_smtp_server", ""),
        "EMAIL_SMTP_PORT": _get_env_str("EMAIL_SMTP_PORT") or webhooks.get("email_smtp_port", ""),
        # ntfy
        "NTFY_SERVER_URL": _get_env_str("NTFY_SERVER_URL") or webhooks.get("ntfy_server_url") or "https://ntfy.sh",
        "NTFY_TOPIC": _get_env_str("NTFY_TOPIC") or webhooks.get("ntfy_topic", ""),
        "NTFY_TOKEN": _get_env_str("NTFY_TOKEN") or webhooks.get("ntfy_token", ""),
        # Bark
        "BARK_URL": _get_env_str("BARK_URL") or webhooks.get("bark_url", ""),
        # Slack
        "SLACK_WEBHOOK_URL": _get_env_str("SLACK_WEBHOOK_URL") or webhooks.get("slack_webhook_url", ""),
    }


def _print_notification_sources(config: Dict) -> None:
    """Print where each notification channel's config came from."""
    notification_sources = []
    max_accounts = config["MAX_ACCOUNTS_PER_CHANNEL"]

    if config["FEISHU_WEBHOOK_URL"]:
        accounts = parse_multi_account_config(config["FEISHU_WEBHOOK_URL"])
        count = min(len(accounts), max_accounts)
        source = "env var" if os.environ.get("FEISHU_WEBHOOK_URL") else "config file"
        notification_sources.append(f"Feishu ({source}, {count} account(s))")

    if config["DINGTALK_WEBHOOK_URL"]:
        accounts = parse_multi_account_config(config["DINGTALK_WEBHOOK_URL"])
        count = min(len(accounts), max_accounts)
        source = "env var" if os.environ.get("DINGTALK_WEBHOOK_URL") else "config file"
        notification_sources.append(f"DingTalk ({source}, {count} account(s))")

    if config["WEWORK_WEBHOOK_URL"]:
        accounts = parse_multi_account_config(config["WEWORK_WEBHOOK_URL"])
        count = min(len(accounts), max_accounts)
        source = "env var" if os.environ.get("WEWORK_WEBHOOK_URL") else "config file"
        notification_sources.append(f"WeCom ({source}, {count} account(s))")

    if config["TELEGRAM_BOT_TOKEN"] and config["TELEGRAM_CHAT_ID"]:
        tokens = parse_multi_account_config(config["TELEGRAM_BOT_TOKEN"])
        chat_ids = parse_multi_account_config(config["TELEGRAM_CHAT_ID"])
        valid, count = validate_paired_configs(
            {"bot_token": tokens, "chat_id": chat_ids},
            "Telegram",
            required_keys=["bot_token", "chat_id"]
        )
        if valid and count > 0:
            count = min(count, max_accounts)
            token_source = "env var" if os.environ.get("TELEGRAM_BOT_TOKEN") else "config file"
            notification_sources.append(f"Telegram ({token_source}, {count} account(s))")

    if config["EMAIL_FROM"] and config["EMAIL_PASSWORD"] and config["EMAIL_TO"]:
        from_source = "env var" if os.environ.get("EMAIL_FROM") else "config file"
        notification_sources.append(f"Email ({from_source})")

    if config["NTFY_SERVER_URL"] and config["NTFY_TOPIC"]:
        topics = parse_multi_account_config(config["NTFY_TOPIC"])
        tokens = parse_multi_account_config(config["NTFY_TOKEN"])
        if tokens:
            valid, count = validate_paired_configs(
                {"topic": topics, "token": tokens},
                "ntfy"
            )
            if valid and count > 0:
                count = min(count, max_accounts)
                server_source = "env var" if os.environ.get("NTFY_SERVER_URL") else "config file"
                notification_sources.append(f"ntfy ({server_source}, {count} account(s))")
        else:
            count = min(len(topics), max_accounts)
            server_source = "env var" if os.environ.get("NTFY_SERVER_URL") else "config file"
            notification_sources.append(f"ntfy ({server_source}, {count} account(s))")

    if config["BARK_URL"]:
        accounts = parse_multi_account_config(config["BARK_URL"])
        count = min(len(accounts), max_accounts)
        bark_source = "env var" if os.environ.get("BARK_URL") else "config file"
        notification_sources.append(f"Bark ({bark_source}, {count} account(s))")

    if config["SLACK_WEBHOOK_URL"]:
        accounts = parse_multi_account_config(config["SLACK_WEBHOOK_URL"])
        count = min(len(accounts), max_accounts)
        slack_source = "env var" if os.environ.get("SLACK_WEBHOOK_URL") else "config file"
        notification_sources.append(f"Slack ({slack_source}, {count} account(s))")

    if notification_sources:
        print(f"Notification channel sources: {', '.join(notification_sources)}")
        print(f"Max accounts per channel: {max_accounts}")
    else:
        print("No notification channels configured")


def load_config(config_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Load the config file.

    Args:
        config_path: config file path; defaults to the CONFIG_PATH environment
            variable or config/config.yaml

    Returns:
        A dict containing all configuration

    Raises:
        FileNotFoundError: the config file does not exist
    """
    if config_path is None:
        config_path = os.environ.get("CONFIG_PATH", "config/config.yaml")

    if not Path(config_path).exists():
        raise FileNotFoundError(f"Config file {config_path} does not exist")

    with open(config_path, "r", encoding="utf-8") as f:
        config_data = yaml.safe_load(f)

    print(f"Config file loaded: {config_path}")

    # Merge all configs
    config = {}

    # App config
    config.update(_load_app_config(config_data))

    # Crawler config
    config.update(_load_crawler_config(config_data))

    # Report config
    config.update(_load_report_config(config_data))

    # Notification config
    config.update(_load_notification_config(config_data))

    # Push-window config
    config["PUSH_WINDOW"] = _load_push_window_config(config_data)

    # Weight config
    config["WEIGHT_CONFIG"] = _load_weight_config(config_data)

    # Platform config
    config["PLATFORMS"] = config_data.get("platforms", [])

    # Storage config
    config["STORAGE"] = _load_storage_config(config_data)

    # Webhook config
    config.update(_load_webhook_config(config_data))

    # Print where each notification channel's config came from
    _print_notification_sources(config)

    return config
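For reference, a minimal config.yaml accepted by this loader might look like the following (hypothetical values; every key below is one the loader actually reads, and the noted environment variables take precedence):

    app:
      timezone: "Asia/Shanghai"   # TIMEZONE env var overrides
    crawler:
      request_interval: 100
      enable_crawler: true        # ENABLE_CRAWLER env var overrides
    report:
      mode: "daily"               # daily / incremental / current; REPORT_MODE overrides
      rank_threshold: 10
    notification:
      enable_notification: true
      max_accounts_per_channel: 3
      webhooks:
        feishu_url: ""            # FEISHU_WEBHOOK_URL env var overrides
    weight:
      rank_weight: 1.0
      frequency_weight: 1.0
      hotness_weight: 1.0
    storage:
      backend: "auto"             # STORAGE_BACKEND env var overrides
      local:
        data_dir: "output"
    platforms: []                 # platform list, passed through to config["PLATFORMS"] as-is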