v4.0.0 Major Update

This commit is contained in:
sansan 2025-12-13 13:44:35 +08:00
parent 97c05aa33c
commit c7bacdfff7
61 changed files with 12407 additions and 5889 deletions


@ -4,8 +4,6 @@ name: 🐛 遇到问题了
description: 程序运行不正常或出现错误
title: "[问题] "
labels: ["bug"]
assignees:
- sansan0
body:
- type: markdown
attributes:


@ -4,8 +4,6 @@ name: 💡 我有个想法
description: 建议新功能或改进现有功能
title: "[建议] "
labels: ["enhancement"]
assignees:
- sansan0
body:
- type: markdown
attributes:


@ -4,8 +4,6 @@ name: ⚙️ 设置遇到困难
description: 配置相关的问题或需要帮助
title: "[设置] "
labels: ["配置", "帮助"]
assignees:
- sansan0
body:
- type: markdown
attributes:

28
.github/workflows/clean-crawler.yml vendored Normal file

@ -0,0 +1,28 @@
name: Check In
# ✅ 签到续期:运行此 workflow 可重置 7 天计时,保持 "Get Hot News" 正常运行
# ✅ Renewal: Run this workflow to reset the 7-day timer and keep "Get Hot News" active
#
# 📌 操作方法 / How to use:
# 1. 点击 "Run workflow" 按钮 / Click "Run workflow" button
# 2. 每 7 天内至少运行一次 / Run at least once every 7 days
on:
workflow_dispatch:
jobs:
del_runs:
runs-on: ubuntu-latest
permissions:
actions: write
contents: read
steps:
- name: Delete all workflow runs
uses: Mattraks/delete-workflow-runs@v2
with:
token: ${{ github.token }}
repository: ${{ github.repository }}
retain_days: 0
keep_minimum_runs: 0
delete_workflow_by_state_pattern: "ALL"
delete_run_by_conclusion_pattern: "ALL"

163
.github/workflows/crawler.yml vendored Normal file

@ -0,0 +1,163 @@
name: Get Hot News
on:
schedule:
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# ⚠️ 试用版说明 / Trial Mode
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
#
# 🔄 运行机制 / How it works:
# - 每个周期为 7 天,届时自动停止
# - 运行 "Check In" 会重置周期(重新开始 7 天倒计时,而非累加)
# - Each cycle is 7 days, then auto-stops
# - "Check In" resets the cycle (restarts 7-day countdown, not cumulative)
#
# 💡 设计初衷 / Why this design:
# 如果 7 天都忘了签到,或许这些资讯对你来说并非刚需
# 适时的暂停,能帮你从信息流中抽离,给大脑留出喘息的空间
# If you forget for 7 days, maybe you don't really need it
# A timely pause helps you detach from the stream and gives your mind space
#
# 🙏 珍惜资源 / Respect shared resources:
# GitHub Actions 是平台提供的公共资源,每次运行都会消耗算力
# 签到机制确保资源分配给真正需要的用户,感谢你的理解与配合
# GitHub Actions is a shared public resource provided by the platform
# Check-in ensures resources go to those who truly need it — thank you
#
# 🚀 长期使用请部署 Docker 版本 / For long-term use, deploy Docker version
#
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
#
# 📝 修改运行时间只改第一个数字0-59表示每小时第几分钟运行
# 📝 Change time: Only modify the first number (0-59) = minute of each hour
#
# 示例 / Examples:
# "15 * * * *" → 每小时第15分钟 / minute 15 every hour
# "30 0-14 * * *" → 北京时间 8:00-22:00 每小时第30分钟 / Beijing 8am-10pm
#
- cron: "33 * * * *"
workflow_dispatch:
concurrency:
group: crawler-${{ github.ref_name }}
cancel-in-progress: true
permissions:
contents: read
actions: write
jobs:
crawl:
runs-on: ubuntu-latest
timeout-minutes: 15
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
fetch-depth: 1
clean: true
- name: Check Expiration
env:
GH_TOKEN: ${{ github.token }}
run: |
WORKFLOW_FILE="crawler.yml"
API_URL="repos/${{ github.repository }}/actions/workflows/$WORKFLOW_FILE/runs"
TOTAL=$(gh api "$API_URL" --jq '.total_count')
if [ -z "$TOTAL" ] || [ "$TOTAL" -eq 0 ]; then
echo "No previous runs found, skipping expiration check"
exit 0
fi
LAST_PAGE=$(( (TOTAL + 99) / 100 ))
FIRST_RUN_DATE=$(gh api "$API_URL?per_page=100&page=$LAST_PAGE" --jq '.workflow_runs[-1].created_at')
if [ -n "$FIRST_RUN_DATE" ]; then
CURRENT_TIMESTAMP=$(date +%s)
FIRST_RUN_TIMESTAMP=$(date -d "$FIRST_RUN_DATE" +%s)
DIFF_SECONDS=$((CURRENT_TIMESTAMP - FIRST_RUN_TIMESTAMP))
LIMIT_SECONDS=604800
if [ $DIFF_SECONDS -gt $LIMIT_SECONDS ]; then
echo "⚠️ 试用期已结束,请运行 'Check In' 签到续期"
echo "⚠️ Trial expired. Run 'Check In' to renew."
gh workflow disable "$WORKFLOW_FILE"
exit 1
else
DAYS_LEFT=$(( (LIMIT_SECONDS - DIFF_SECONDS) / 86400 ))
echo "✅ 试用期剩余 ${DAYS_LEFT} 天,到期前请运行 'Check In' 签到续期"
echo "✅ Trial: ${DAYS_LEFT} days left. Run 'Check In' before expiry to renew."
fi
fi
# --------------------------------------------------------------------------------
# 🚦 TRAFFIC CONTROL / 流量控制
# --------------------------------------------------------------------------------
# EN: Generates a random delay between 1 and 300 seconds (5 minutes).
# Critical for load balancing.
#
# CN: 生成 1 到 300 秒5分钟之间的随机延迟。
# 这对负载均衡至关重要。
- name: Random Delay (Traffic Control)
if: success()
run: |
echo "🎲 Traffic Control: Generating random delay..."
DELAY=$(( ( RANDOM % 300 ) + 1 ))
echo "⏸️ Sleeping for ${DELAY} seconds to spread the load..."
sleep ${DELAY}s
echo "▶️ Delay finished. Starting crawler..."
- name: Set up Python
if: success()
uses: actions/setup-python@v5
with:
python-version: "3.10"
cache: "pip"
- name: Install dependencies
if: success()
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Verify required files
if: success()
run: |
if [ ! -f config/config.yaml ]; then
echo "Error: Config missing"
exit 1
fi
- name: Run crawler
if: success()
env:
FEISHU_WEBHOOK_URL: ${{ secrets.FEISHU_WEBHOOK_URL }}
TELEGRAM_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
TELEGRAM_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }}
DINGTALK_WEBHOOK_URL: ${{ secrets.DINGTALK_WEBHOOK_URL }}
WEWORK_WEBHOOK_URL: ${{ secrets.WEWORK_WEBHOOK_URL }}
WEWORK_MSG_TYPE: ${{ secrets.WEWORK_MSG_TYPE }}
EMAIL_FROM: ${{ secrets.EMAIL_FROM }}
EMAIL_PASSWORD: ${{ secrets.EMAIL_PASSWORD }}
EMAIL_TO: ${{ secrets.EMAIL_TO }}
EMAIL_SMTP_SERVER: ${{ secrets.EMAIL_SMTP_SERVER }}
EMAIL_SMTP_PORT: ${{ secrets.EMAIL_SMTP_PORT }}
NTFY_TOPIC: ${{ secrets.NTFY_TOPIC }}
NTFY_SERVER_URL: ${{ secrets.NTFY_SERVER_URL }}
NTFY_TOKEN: ${{ secrets.NTFY_TOKEN }}
BARK_URL: ${{ secrets.BARK_URL }}
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
STORAGE_BACKEND: auto
LOCAL_RETENTION_DAYS: ${{ secrets.LOCAL_RETENTION_DAYS }}
REMOTE_RETENTION_DAYS: ${{ secrets.REMOTE_RETENTION_DAYS }}
S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }}
S3_ACCESS_KEY_ID: ${{ secrets.S3_ACCESS_KEY_ID }}
S3_SECRET_ACCESS_KEY: ${{ secrets.S3_SECRET_ACCESS_KEY }}
S3_ENDPOINT_URL: ${{ secrets.S3_ENDPOINT_URL }}
S3_REGION: ${{ secrets.S3_REGION }}
GITHUB_ACTIONS: true
run: python -m trendradar


@ -1,6 +1,6 @@
<div align="center" id="trendradar"> <div align="center" id="trendradar">
> **📢 Announcement:** After communicating with GitHub officials, "One-Click Fork Deployment" will be restored after compliance adjustments are completed. Please stay tuned for **v4.0.0** update > **📢 Announcement:** **v4.0.0** has been released! Including storage architecture refactoring, database optimization, modularization improvements, and more major updates
<a href="https://github.com/sansan0/TrendRadar" title="TrendRadar"> <a href="https://github.com/sansan0/TrendRadar" title="TrendRadar">
<img src="/_image/banner.webp" alt="TrendRadar Banner" width="80%"> <img src="/_image/banner.webp" alt="TrendRadar Banner" width="80%">
@ -16,8 +16,8 @@
[![GitHub Stars](https://img.shields.io/github/stars/sansan0/TrendRadar?style=flat-square&logo=github&color=yellow)](https://github.com/sansan0/TrendRadar/stargazers) [![GitHub Stars](https://img.shields.io/github/stars/sansan0/TrendRadar?style=flat-square&logo=github&color=yellow)](https://github.com/sansan0/TrendRadar/stargazers)
[![GitHub Forks](https://img.shields.io/github/forks/sansan0/TrendRadar?style=flat-square&logo=github&color=blue)](https://github.com/sansan0/TrendRadar/network/members) [![GitHub Forks](https://img.shields.io/github/forks/sansan0/TrendRadar?style=flat-square&logo=github&color=blue)](https://github.com/sansan0/TrendRadar/network/members)
[![License](https://img.shields.io/badge/license-GPL--3.0-blue.svg?style=flat-square)](LICENSE) [![License](https://img.shields.io/badge/license-GPL--3.0-blue.svg?style=flat-square)](LICENSE)
[![Version](https://img.shields.io/badge/version-v3.5.0-blue.svg)](https://github.com/sansan0/TrendRadar) [![Version](https://img.shields.io/badge/version-v4.0.0-blue.svg)](https://github.com/sansan0/TrendRadar)
[![MCP](https://img.shields.io/badge/MCP-v1.0.3-green.svg)](https://github.com/sansan0/TrendRadar) [![MCP](https://img.shields.io/badge/MCP-v1.1.0-green.svg)](https://github.com/sansan0/TrendRadar)
[![WeWork](https://img.shields.io/badge/WeWork-Notification-00D4AA?style=flat-square)](https://work.weixin.qq.com/) [![WeWork](https://img.shields.io/badge/WeWork-Notification-00D4AA?style=flat-square)](https://work.weixin.qq.com/)
[![WeChat](https://img.shields.io/badge/WeChat-Notification-00D4AA?style=flat-square)](https://weixin.qq.com/) [![WeChat](https://img.shields.io/badge/WeChat-Notification-00D4AA?style=flat-square)](https://weixin.qq.com/)
@ -48,62 +48,61 @@
<br> <br>
<details> <details>
<summary>🚨 <strong>【MUST READ】Important Announcement: The Correct Way to Deploy This Project</strong></summary> <summary>🚨 <strong>【Must Read】Important Announcement: v4.0.0 Deployment & Storage Architecture Changes</strong></summary>
<br> <br>
> **⚠️ December 2025 Urgent Notice** ### 🛠️ Choose the Deployment Method That Fits You
>
> Due to a surge in Fork numbers causing excessive load on GitHub servers, **GitHub Actions and GitHub Pages deployments are currently restricted**. Please read the following instructions carefully to ensure successful deployment.
### 1. ✅ Only Recommended Deployment Method: Docker #### 🅰️ Option 1: Docker Deployment (Recommended 🔥)
**This is currently the most stable solution, free from GitHub restrictions.** Data is stored locally and won't be affected by GitHub policy changes. * **Features**: Most stable and simplest. Data is stored in **local SQLite**, fully under your control.
* **Best for**: Users with their own server, NAS, or an always-on PC.
* 👉 [Jump to Docker Deployment Tutorial](#6-docker-deployment) * 👉 [Jump to Docker Deployment Tutorial](#6-docker-deployment)
--- ---
### 2. If You Were Planning to Fork This Project... #### 🅱️ Option 2: GitHub Actions Deployment (Restored ✅)
To reduce pressure on GitHub servers, **please DO NOT directly click the "Fork" button!** * **Features**: Data is no longer committed directly to the repo. Instead, it is stored in **Remote Cloud Storage** (supports S3-compatible protocols: Cloudflare R2, Alibaba Cloud OSS, Tencent Cloud COS, etc.).
Please use the **"Use this template"** feature instead of Fork: * **Requirement**: You **must** configure an S3-compatible object storage service (Cloudflare R2 recommended, it's free).
> **⚠️ Note**: If you choose this option, you must complete the following two configuration steps:
#### 1. 🚀 Recommended Start: Use this template
To keep the repository clean and avoid inheriting redundant history, I **recommend** using Template mode:
1. **Click** the green **[Use this template]** button at the top right of the original repository page.
1. **Click** the green **[Use this template]** button in the top right corner of the original repository page.
2. **Select** "Create a new repository". 2. **Select** "Create a new repository".
**Why do this?** > **💡 Why do this?**
* **❌ Fork**: Copies complete history records. Many forks running simultaneously will trigger GitHub risk control. > * **Use this template**: Creates a brand new, clean repository with no historical baggage.
* **✅ Use this template**: Creates a completely new independent repository without historical baggage, more server-friendly. > * **Fork**: Retains the complete commit history and relationships, consuming more GitHub resources.
--- #### 2. ☁️ About the Mandatory Remote Storage for GitHub Actions
### 3. About New Data Storage If you choose **Option 2 (GitHub Actions)**, you must configure an S3-compatible object storage service.
The new version will use **Cloudflare R2** to store news data, ensuring data persistence. **Supported Storage Services:**
- **Cloudflare R2** (Recommended, generous free tier)
- Other S3-compatible services
**⚠️ Configuration Prerequisites:** **⚠️ Configuration Prerequisites (Using Cloudflare R2 as Example):**
According to Cloudflare platform rules, activating R2 requires binding a payment method. According to Cloudflare platform rules, enabling R2 requires binding a payment method.
- **Purpose:** Identity verification only (Verify Only), no charges will be incurred. * **Purpose**: Identity verification only (Verify Only). **No charges will be incurred**.
- **Payment:** Supports credit cards or PayPal (China region).
- **Usage:** R2's free tier is sufficient to cover this project's daily operation, no payment required.
--- * **Payment**: Supports international credit cards or PayPal.
### 4. 📅 Future Plans & Documentation Reading Notes * **Usage**: The R2 free tier (10GB storage/month) is sufficient to cover the daily operation of this project. No need to worry about costs.
> **Future Plans:** 👉 **[Click to View Detailed Configuration Tutorial](#-quick-start)**
> - Exploring new approach: keep Actions for fetching and pushing, but no longer save data to repository, use external storage instead.
**⚠️ Reading Note:**
Given that the above plans mean **Fork deployment mode may return in a new form in the future**, and the workload to fully revise documentation is massive, we have temporarily retained the old descriptions.
**At the current stage, if "Fork" related expressions still appear in subsequent tutorials, please ignore them or understand them as "Use this template"**.
👉 **[Click here to view TrendRadar's latest official documentation](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)**
</details> </details>
@ -287,10 +286,32 @@ Supports **WeWork** (+ WeChat push solution), **Feishu**, **DingTalk**, **Telegr
- ⚠️ **Paired Configuration**: Telegram and ntfy require paired parameter quantities to match (e.g., token and chat_id both have 2 values) - ⚠️ **Paired Configuration**: Telegram and ntfy require paired parameter quantities to match (e.g., token and chat_id both have 2 values)
- ⚠️ **Quantity Limit**: Default maximum 3 accounts per channel, exceeded values will be truncated - ⚠️ **Quantity Limit**: Default maximum 3 accounts per channel, exceeded values will be truncated
### **Multi-Platform Support** ### **Flexible Storage Architecture (v4.0.0 Major Update)**
- **GitHub Pages**: Auto-generate beautiful web reports, PC/mobile adapted
**Multi-Backend Support**:
- ☁️ **Remote Cloud Storage**: GitHub Actions environment default, supports S3-compatible protocols (R2/OSS/COS, etc.), data stored in cloud, keeping repository clean
- 💾 **Local SQLite**: Traditional SQLite database, stable and efficient (Docker/local deployment)
- 🔀 **Auto Selection**: Auto-selects appropriate backend based on runtime environment
**Data Format Hierarchy**:
| Format | Role | Description |
|--------|------|-------------|
| **SQLite** | Primary storage | Complete data with statistics information |
| **TXT** | Human-readable backup | Optional text records for manual viewing |
| **HTML** | Web report | Beautiful visual report (GitHub Pages) |
**Data Management Features**:
- Auto data cleanup (configurable retention period)
- Timezone support (configurable IANA time zone)
- Cloud/local seamless switching
> 💡 For storage configuration details, see [Configuration Details - Storage Configuration](#11-storage-configuration-v400-new)
### **Multi-Platform Deployment**
- **GitHub Actions**: Cloud automated operations (7-day check-in cycle + remote cloud storage)
- **Docker Deployment**: Supports multi-architecture containerized operation - **Docker Deployment**: Supports multi-architecture containerized operation
- **Data Persistence**: HTML/TXT multi-format history saving - **Local Running**: Python environment direct execution
### **AI Smart Analysis (v3.0.0 New)** ### **AI Smart Analysis (v3.0.0 New)**
@ -341,10 +362,32 @@ Transform from "algorithm recommendation captivity" to "actively getting the inf
>**Upgrade Instructions**: >**Upgrade Instructions**:
- **📌 Check Latest Updates**: **[Original Repository Changelog](https://github.com/sansan0/TrendRadar?tab=readme-ov-file#-changelog)** - **📌 Check Latest Updates**: **[Original Repository Changelog](https://github.com/sansan0/TrendRadar?tab=readme-ov-file#-changelog)**
- **Tip**: Do NOT update this project via **Sync fork**. Check [Changelog] to understand specific [Upgrade Methods] and [Features] - **Tip**: Do NOT update this project via **Sync fork**. Check [Changelog] to understand specific [Upgrade Methods] and [Features]
- **Minor Version Update**: Upgrading from v2.x to v2.y, replace `main.py` in your forked repo with the latest version
- **Major Version Upgrade**: Upgrading from v1.x to v2.y, recommend deleting existing fork and re-forking to save effort and avoid config conflicts - **Major Version Upgrade**: Upgrading from v1.x to v2.y, recommend deleting existing fork and re-forking to save effort and avoid config conflicts
### 2025/12/13 - v4.0.0
**🎉 Major Update: Comprehensive Refactoring of Storage and Core Architecture**
- **Multi-Storage Backend Support**: Introduced a brand new storage module supporting local SQLite and remote cloud storage (S3-compatible protocols, Cloudflare R2 recommended for free tier), adaptable to GitHub Actions, Docker, and local environments.
- **Database Structure Optimization**: Refactored SQLite database table structures to improve data efficiency and query performance.
- **Enhanced Features**: Implemented date format standardization, data retention policies, timezone configuration support, and optimized time display. Fixed remote storage data persistence issues to ensure accurate data merging.
- **Cleanup and Compatibility**: Removed most legacy compatibility code and unified data storage and retrieval methods.
### 2025/12/13 - mcp-v1.1.0
**MCP Module Update:**
- Adapted for v4.0.0, while maintaining compatibility with v3.x data.
- Added storage sync tools:
- `sync_from_remote`: Pull data from remote storage to local
- `get_storage_status`: Get storage configuration and status
- `list_available_dates`: List available dates in local/remote storage
<details>
<summary>👉 Click to expand: <strong>Historical Updates</strong></summary>
### 2025/12/03 - v3.5.0 ### 2025/12/03 - v3.5.0
**🎉 Core Feature Enhancements** **🎉 Core Feature Enhancements**
@ -397,7 +440,7 @@ Transform from "algorithm recommendation captivity" to "actively getting the inf
**🔧 Upgrade Instructions**: **🔧 Upgrade Instructions**:
- **GitHub Fork Users**: Update `main.py`, `config/config.yaml` (Added multi-account push support, existing single-account configuration unaffected) - **GitHub Fork Users**: Update `main.py`, `config/config.yaml` (Added multi-account push support, existing single-account configuration unaffected)
- **Docker Users**: Update `.env`, `docker compose.yml` or set environment variables `REVERSE_CONTENT_ORDER`, `MAX_ACCOUNTS_PER_CHANNEL` - **Docker Users**: Update `.env`, `docker-compose.yml` or set environment variables `REVERSE_CONTENT_ORDER`, `MAX_ACCOUNTS_PER_CHANNEL`
- **Multi-Account Push**: New feature, disabled by default, existing single-account configuration unaffected - **Multi-Account Push**: New feature, disabled by default, existing single-account configuration unaffected
@ -431,10 +474,6 @@ Transform from "algorithm recommendation captivity" to "actively getting the inf
- Tool count increased from 13 to 14 - Tool count increased from 13 to 14
<details>
<summary>👉 Click to expand: <strong>Historical Updates</strong></summary>
### 2025/11/25 - v3.4.0 ### 2025/11/25 - v3.4.0
**🎉 Added Slack Push Support** **🎉 Added Slack Push Support**
@ -819,11 +858,44 @@ frequency_words.txt file added **required word** feature, using + sign
> **📖 Reminder**: Fork users should first **[check the latest official documentation](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)** to ensure the configuration steps are up to date. > **📖 Reminder**: Fork users should first **[check the latest official documentation](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)** to ensure the configuration steps are up to date.
### ⚠️ GitHub Actions Usage Instructions
**v4.0.0 Important Change**: Introduced "Activity Detection" mechanism—GitHub Actions now requires periodic check-in to maintain operation.
#### 🔄 Check-In Renewal Mechanism
- **Running Cycle**: Valid for **7 days**—service will automatically suspend when countdown ends.
- **Renewal Method**: Manually trigger the "Check In" workflow on the Actions page to reset the 7-day validity period.
- **Operation Path**: `Actions``Check In``Run workflow`
- **Design Philosophy**:
- If you forget for 7 days, maybe you don't really need it. Letting it stop is a digital detox, freeing you from the constant impact.
- GitHub Actions is a valuable public computing resource. The check-in mechanism aims to prevent wasted computing cycles, ensuring resources are allocated to truly active users who need them. Thank you for your understanding and support.
#### 📦 Data Storage (Required Configuration)
In GitHub Actions environment, data is stored in **Remote Cloud Storage** (supports S3-compatible protocols, Cloudflare R2 recommended for free tier), keeping your repository clean (see **Required Configuration: Remote Cloud Storage** below).
#### 🚀 Recommended: Docker Deployment
For long-term stable operation, we recommend [Docker Deployment](#6-docker-deployment), with data stored locally and no check-in required—though it does require purchasing a cloud server.
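For reference, this is roughly the arithmetic the "Check Expiration" step in `crawler.yml` performs (the shipped workflow does it in shell with `gh api`). The sketch below is an illustrative Python version, assuming a token with Actions read access; the helper name is made up:

```python
# Sketch: measure the age of the current trial cycle from the GitHub REST API.
# Mirrors the "Check Expiration" step in crawler.yml; names are illustrative.
import requests
from datetime import datetime, timezone

def days_since_oldest_run(repo: str, token: str, workflow: str = "crawler.yml") -> float:
    url = f"https://api.github.com/repos/{repo}/actions/workflows/{workflow}/runs"
    headers = {"Authorization": f"Bearer {token}", "Accept": "application/vnd.github+json"}
    total = requests.get(url, headers=headers, params={"per_page": 1}).json()["total_count"]
    if total == 0:
        return 0.0  # no history yet, nothing to expire
    last_page = (total + 99) // 100
    runs = requests.get(url, headers=headers, params={"per_page": 100, "page": last_page}).json()
    oldest = runs["workflow_runs"][-1]["created_at"]               # e.g. "2025-12-13T05:44:35Z"
    started = datetime.fromisoformat(oldest.replace("Z", "+00:00"))
    return (datetime.now(timezone.utc) - started).total_seconds() / 86400
```

Because the age is measured from the oldest retained run, the "Check In" workflow (which deletes all stored runs) effectively restarts the 7-day countdown.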
<br>
> 🎉 **Now Supported: Multi-Cloud Storage Options**
>
> This project now supports S3-compatible protocols. You can choose:
> - **Cloudflare R2** (Recommended, generous free tier)
> - Other S3-compatible storage services
>
> Simply configure the corresponding `S3_ENDPOINT_URL`, `S3_BUCKET_NAME` and other environment variables to switch.
---
1. **Fork this project** to your GitHub account 1. **Fork this project** to your GitHub account
- Click the "Fork" button at the top right of this page - Click the "Fork" button at the top right of this page
2. **Setup GitHub Secrets (Choose your needed platforms)**: 2. **Setup GitHub Secrets (Required + Optional Platforms)**:
In your forked repo, go to `Settings` > `Secrets and variables` > `Actions` > `New repository secret` In your forked repo, go to `Settings` > `Secrets and variables` > `Actions` > `New repository secret`
@ -862,6 +934,35 @@ frequency_words.txt file added **required word** feature, using + sign
<br> <br>
<details>
<summary>⚠️ <strong>Required Configuration: Remote Cloud Storage</strong> (Required for GitHub Actions Environment, Cloudflare R2 Recommended)</summary>
<br>
**GitHub Secret Configuration (⚠️ All 4 configuration items below are required):**
| Name | Secret (Value) Description |
|------|----------------------------|
| `S3_BUCKET_NAME` | Bucket name (e.g., `trendradar-data`) |
| `S3_ACCESS_KEY_ID` | Access key ID |
| `S3_SECRET_ACCESS_KEY` | Access key |
| `S3_ENDPOINT_URL` | S3 API endpoint (e.g., R2: `https://<account-id>.r2.cloudflarestorage.com`) |
<br>
**How to Get Credentials (Using Cloudflare R2 as Example):**
1. Visit [Cloudflare Dashboard](https://dash.cloudflare.com/) and log in
2. Select `R2` in left menu → Click `Create Bucket` → Enter name (e.g., `trendradar-data`)
3. Click `Manage R2 API Tokens` at top right → `Create API Token`
4. Select `Object Read & Write` permission → After creation, it will display `Access Key ID` and `Secret Access Key`
5. Endpoint URL can be found in bucket details page (format: `https://<account-id>.r2.cloudflarestorage.com`)
**Notes**:
- R2 free tier: 10GB storage + 1 million reads per month, sufficient for this project
- Activation requires binding a payment method (identity verification only, no charges)
- Data stored in cloud, keeps GitHub repository clean
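If you want to confirm the four secrets before handing them to Actions, a minimal local check is enough. A sketch assuming `boto3` is installed and the same variable names are exported in your shell:

```python
# Sketch: verify the S3/R2 credentials that the workflow will use.
import os
import boto3

s3 = boto3.client(
    "s3",
    endpoint_url=os.environ["S3_ENDPOINT_URL"],         # e.g. https://<account-id>.r2.cloudflarestorage.com
    aws_access_key_id=os.environ["S3_ACCESS_KEY_ID"],
    aws_secret_access_key=os.environ["S3_SECRET_ACCESS_KEY"],
)
s3.head_bucket(Bucket=os.environ["S3_BUCKET_NAME"])     # raises ClientError if the bucket or keys are wrong
print("credentials OK")
```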
</details>
<details> <details>
<summary> <strong>👉 Click to expand: WeWork Bot</strong> (Simplest and fastest configuration)</summary> <summary> <strong>👉 Click to expand: WeWork Bot</strong> (Simplest and fastest configuration)</summary>
@ -2041,7 +2142,7 @@ TrendRadar provides two independent Docker images, deploy according to your need
# Download docker compose config # Download docker compose config
wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/.env -P docker/ wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/.env -P docker/
wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker compose.yml -P docker/ wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker-compose.yml -P docker/
``` ```
> 💡 **Note**: Key directory structure required for Docker deployment: > 💡 **Note**: Key directory structure required for Docker deployment:
@ -2052,7 +2153,7 @@ current directory/
│ └── frequency_words.txt │ └── frequency_words.txt
└── docker/ └── docker/
├── .env ├── .env
└── docker compose.yml └── docker-compose.yml
``` ```
2. **Config File Description**: 2. **Config File Description**:
@ -2146,7 +2247,7 @@ vim config/frequency_words.txt
# Use build version docker compose # Use build version docker compose
cd docker cd docker
cp docker compose-build.yml docker compose.yml cp docker-compose-build.yml docker-compose.yml
``` ```
**Build and Start Services**: **Build and Start Services**:
@ -2232,7 +2333,7 @@ docker rm trend-radar
> 💡 **Web Server Notes**: > 💡 **Web Server Notes**:
> - After starting, access latest report at `http://localhost:8080` > - After starting, access latest report at `http://localhost:8080`
> - Access historical reports via directory navigation (e.g., `http://localhost:8080/2025年xx月xx日/`) > - Access historical reports via directory navigation (e.g., `http://localhost:8080/2025-xx-xx/`)
> - Port can be configured in `.env` file with `WEBSERVER_PORT` parameter > - Port can be configured in `.env` file with `WEBSERVER_PORT` parameter
> - Auto-start: Set `ENABLE_WEBSERVER=true` in `.env` > - Auto-start: Set `ENABLE_WEBSERVER=true` in `.env`
> - Security: Static files only, limited to output directory, localhost binding only > - Security: Static files only, limited to output directory, localhost binding only
@ -2249,7 +2350,7 @@ TrendRadar generates daily summary HTML reports to two locations simultaneously:
|--------------|---------------|----------| |--------------|---------------|----------|
| `output/index.html` | Direct host access | **Docker Deployment** (via Volume mount, visible on host) | | `output/index.html` | Direct host access | **Docker Deployment** (via Volume mount, visible on host) |
| `index.html` | Root directory access | **GitHub Pages** (repository root, auto-detected by Pages) | | `index.html` | Root directory access | **GitHub Pages** (repository root, auto-detected by Pages) |
| `output/YYYY年MM月DD日/html/当日汇总.html` | Historical reports | All environments (archived by date) | | `output/YYYY-MM-DD/html/当日汇总.html` | Historical reports | All environments (archived by date) |
**Local Access Examples**: **Local Access Examples**:
```bash ```bash
@ -2258,8 +2359,8 @@ TrendRadar generates daily summary HTML reports to two locations simultaneously:
docker exec -it trend-radar python manage.py start_webserver docker exec -it trend-radar python manage.py start_webserver
# 2. Access in browser # 2. Access in browser
http://localhost:8080 # Access latest report (default index.html) http://localhost:8080 # Access latest report (default index.html)
http://localhost:8080/2025年xx月xx日/ # Access reports for specific date http://localhost:8080/2025-xx-xx/ # Access reports for specific date
http://localhost:8080/2025年xx月xx日/html/ # Browse all HTML files for that date http://localhost:8080/2025-xx-xx/html/ # Browse all HTML files for that date
# Method 2: Direct file access (local environment) # Method 2: Direct file access (local environment)
open ./output/index.html # macOS open ./output/index.html # macOS
@ -2267,7 +2368,7 @@ start ./output/index.html # Windows
xdg-open ./output/index.html # Linux xdg-open ./output/index.html # Linux
# Method 3: Access historical archives # Method 3: Access historical archives
open ./output/2025年xx月xx日/html/当日汇总.html open ./output/2025-xx-xx/html/当日汇总.html
``` ```
**Why two index.html files?** **Why two index.html files?**
@ -2324,10 +2425,20 @@ flowchart TB
Use docker compose to start both news push and MCP services: Use docker compose to start both news push and MCP services:
```bash ```bash
# Download latest docker compose.yml (includes MCP service config) # Method 1: Clone project (Recommended)
wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker compose.yml git clone https://github.com/sansan0/TrendRadar.git
cd TrendRadar/docker
docker compose up -d
# Start all services # Method 2: Download docker-compose.yml separately
mkdir trendradar && cd trendradar
wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker-compose.yml
wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/.env
mkdir -p config output
# Download config files
wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/config/config.yaml -P config/
wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/config/frequency_words.txt -P config/
# Modify volume paths in docker-compose.yml: ../config -> ./config, ../output -> ./output
docker compose up -d docker compose up -d
# Check running status # Check running status
@ -2337,18 +2448,29 @@ docker ps | grep trend-radar
**Start MCP Service Separately**: **Start MCP Service Separately**:
```bash ```bash
# Linux/Mac
docker run -d --name trend-radar-mcp \ docker run -d --name trend-radar-mcp \
-p 127.0.0.1:3333:3333 \ -p 127.0.0.1:3333:3333 \
-v ./config:/app/config:ro \ -v $(pwd)/config:/app/config:ro \
-v ./output:/app/output:ro \ -v $(pwd)/output:/app/output:ro \
-e TZ=Asia/Shanghai \ -e TZ=Asia/Shanghai \
wantcat/trendradar-mcp:latest wantcat/trendradar-mcp:latest
# Windows PowerShell
docker run -d --name trend-radar-mcp `
-p 127.0.0.1:3333:3333 `
-v ${PWD}/config:/app/config:ro `
-v ${PWD}/output:/app/output:ro `
-e TZ=Asia/Shanghai `
wantcat/trendradar-mcp:latest
``` ```
> ⚠️ **Note**: Ensure `config/` and `output/` folders exist in current directory with config files and news data before running.
**Verify Service**: **Verify Service**:
```bash ```bash
# Check if MCP service is running properly # Check MCP service health
curl http://127.0.0.1:3333/mcp curl http://127.0.0.1:3333/mcp
# View MCP service logs # View MCP service logs
@ -2357,14 +2479,20 @@ docker logs -f trend-radar-mcp
**Configure in AI Clients**: **Configure in AI Clients**:
After MCP service starts, configure in Claude Desktop, Cherry Studio, Cursor, etc.: After MCP service starts, configure based on your client:
**Cherry Studio** (Recommended, GUI config):
- Settings → MCP Server → Add
- Type: `streamableHttp`
- URL: `http://127.0.0.1:3333/mcp`
**Claude Desktop / Cline** (JSON config):
```json ```json
{ {
"mcpServers": { "mcpServers": {
"trendradar": { "trendradar": {
"url": "http://127.0.0.1:3333/mcp", "url": "http://127.0.0.1:3333/mcp",
"description": "TrendRadar News Trending Analysis" "type": "streamableHttp"
} }
} }
} }
@ -2452,7 +2580,6 @@ notification:
start: "20:00" # Start time (Beijing time) start: "20:00" # Start time (Beijing time)
end: "22:00" # End time (Beijing time) end: "22:00" # End time (Beijing time)
once_per_day: true # Push only once per day once_per_day: true # Push only once per day
push_record_retention_days: 7 # Push record retention days
``` ```
#### Configuration Details #### Configuration Details
@ -2463,7 +2590,6 @@ notification:
| `time_range.start` | string | `"20:00"` | Push window start time (Beijing time, HH:MM format) | | `time_range.start` | string | `"20:00"` | Push window start time (Beijing time, HH:MM format) |
| `time_range.end` | string | `"22:00"` | Push window end time (Beijing time, HH:MM format) | | `time_range.end` | string | `"22:00"` | Push window end time (Beijing time, HH:MM format) |
| `once_per_day` | bool | `true` | `true`=push only once per day within window, `false`=push every execution within window | | `once_per_day` | bool | `true` | `true`=push only once per day within window, `false`=push every execution within window |
| `push_record_retention_days` | int | `7` | Push record retention days (used to determine if already pushed) |
#### Use Cases #### Use Cases
@ -2487,7 +2613,6 @@ PUSH_WINDOW_ENABLED=true
PUSH_WINDOW_START=09:00 PUSH_WINDOW_START=09:00
PUSH_WINDOW_END=18:00 PUSH_WINDOW_END=18:00
PUSH_WINDOW_ONCE_PER_DAY=false PUSH_WINDOW_ONCE_PER_DAY=false
PUSH_WINDOW_RETENTION_DAYS=7
``` ```
#### Complete Configuration Examples #### Complete Configuration Examples
@ -2502,7 +2627,6 @@ notification:
start: "20:00" start: "20:00"
end: "22:00" end: "22:00"
once_per_day: true once_per_day: true
push_record_retention_days: 7
``` ```
**Scenario: Push every hour during working hours** **Scenario: Push every hour during working hours**
@ -2515,7 +2639,6 @@ notification:
start: "09:00" start: "09:00"
end: "18:00" end: "18:00"
once_per_day: false once_per_day: false
push_record_retention_days: 7
``` ```
</details> </details>
@ -2811,6 +2934,207 @@ notification:
<br> <br>
### 11. Storage Configuration (v4.0.0 New)
<details>
<summary>👉 Click to expand: <strong>Storage Configuration Guide</strong></summary>
<br>
#### Storage Backend Selection
TrendRadar v4.0.0 introduces **multi-backend storage architecture**, supporting automatic backend selection or manual specification:
| Configuration Value | Description | Applicable Scenarios |
|---------------------|-------------|---------------------|
| `auto` (default) | Auto-select backend: GitHub Actions→R2, other environments→Local | Most users (recommended) |
| `local` | Force use of local SQLite | Docker/local deployment |
| `r2` | Force use of Cloudflare R2 | Cloud storage required |
**Configuration Location**:
- GitHub Actions: Set `STORAGE_BACKEND` environment variable in GitHub Secrets
- Docker: Configure `STORAGE_BACKEND=local` in `.env` file
- Local: Add `STORAGE_BACKEND` in environment variables or use auto mode
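As a rough illustration of what `auto` means in practice (the function below is illustrative, not the project's internal API): GitHub Actions runners set `GITHUB_ACTIONS=true`, so the remote backend is picked there and local SQLite everywhere else.

```python
# Sketch of STORAGE_BACKEND=auto resolution; illustrative only.
import os

def resolve_backend() -> str:
    explicit = os.environ.get("STORAGE_BACKEND", "auto").lower()
    if explicit in ("local", "r2"):
        return explicit                                   # manual override wins
    # auto: GitHub Actions -> remote (R2), anything else -> local SQLite
    return "r2" if os.environ.get("GITHUB_ACTIONS") == "true" else "local"
```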
---
#### Database Structure Optimization (v4.0.0)
v4.0.0 made significant optimizations to database structure, removing redundant fields and improving data normalization:
##### 1. Removed Redundant Fields
Removed the following redundant fields from the `news` table:
| Field Name | Removal Reason | Alternative |
|------------|----------------|------------|
| `source_name` | Duplicate with platform name | Get via `platforms` table JOIN query |
| `crawl_date` | Duplicate with file path date | Infer from file path timestamp |
**Migration Notes**: Old databases are incompatible; see the [Breaking Changes](#breaking-changes-v400) section
##### 2. New Platforms Table
Added `platforms` table for unified management of platform information:
```sql
CREATE TABLE IF NOT EXISTS platforms (
id TEXT PRIMARY KEY, -- Platform ID (immutable, e.g., 'zhihu', 'weibo')
name TEXT NOT NULL, -- Platform display name (mutable, e.g., 'Zhihu', 'Weibo')
enabled INTEGER DEFAULT 1 -- Whether enabled (1=enabled, 0=disabled)
);
```
**Design Advantages**:
- `id` field is immutable, maintains data consistency
- `name` field is mutable, supports internationalization and customization
- Historical data remains valid when modifying platform names
##### 3. Crawl Source Status Normalization
Replaced the original `successful_sources` field (a comma-separated string) with a normalized `crawl_source_status` table:
```sql
CREATE TABLE IF NOT EXISTS crawl_source_status (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_path TEXT NOT NULL, -- File path (e.g., 'output/2025-12-09/news.db')
platform_id TEXT NOT NULL, -- Platform ID (foreign key to platforms.id)
success INTEGER NOT NULL, -- Whether crawl succeeded (1=success, 0=failed)
crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (platform_id) REFERENCES platforms(id)
);
```
**Design Advantages**:
- Supports efficient SQL queries (e.g., calculate success rate by platform)
- Easy statistics and analysis (no string splitting required)
- Normalized structure, avoids data redundancy
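For example, the "success rate by platform" query mentioned above becomes a single statement against a day's `news.db`; a sketch using the schema shown here (the file path follows the format described in the next subsection):

```python
# Sketch: crawl success rate per platform from the normalized status table.
import sqlite3

conn = sqlite3.connect("output/2025-12-09/news.db")
rows = conn.execute(
    """
    SELECT p.name,
           ROUND(100.0 * AVG(s.success), 1) AS success_rate_pct,
           COUNT(*)                         AS attempts
    FROM crawl_source_status AS s
    JOIN platforms           AS p ON p.id = s.platform_id
    GROUP BY p.id
    ORDER BY success_rate_pct DESC
    """
).fetchall()
for name, rate, attempts in rows:
    print(f"{name}: {rate}% over {attempts} crawls")
conn.close()
```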
##### 4. File Path Format Standardization
**Old Format**: `output/2025年12月09日/news_14-30.txt`
**New Format**: `output/2025-12-09/news.db`
**Changes**:
- Date format: Chinese format → ISO 8601 standard format
- Filename: Multiple time-stamped TXT files → single SQLite database file
- Extension: `.txt``.db`
**Advantages**:
- Cross-platform compatibility (avoids Chinese path issues)
- Easier programmatic parsing
- International standard, better maintainability
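Because the new layout is plain ISO 8601, paths round-trip through the standard library with no locale handling; a minimal sketch:

```python
# Sketch: build and parse the v4.0.0 data path.
from datetime import date
from pathlib import Path

db_path = Path("output") / date.today().isoformat() / "news.db"   # e.g. output/2025-12-09/news.db
day = date.fromisoformat(db_path.parent.name)                      # parses back without any Chinese-date handling
```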
---
#### Remote Cloud Storage Configuration
When using remote cloud storage (required for GitHub Actions environment), configure the following environment variables:
| Environment Variable | Description | Required | Example Value |
|----------------------|-------------|----------|--------------|
| `S3_BUCKET_NAME` | Bucket name | ✅ Yes | `trendradar-data` |
| `S3_ACCESS_KEY_ID` | Access key ID | ✅ Yes | `abc123...` |
| `S3_SECRET_ACCESS_KEY` | Access key | ✅ Yes | `xyz789...` |
| `S3_ENDPOINT_URL` | S3 API endpoint | ✅ Yes | `https://<account-id>.r2.cloudflarestorage.com` |
| `S3_REGION` | Region (optional) | ❌ No | `auto` |
**Configuration Method**:
- GitHub Actions: Configure in GitHub Secrets (see [Quick Start - Remote Storage Configuration](#2-setup-github-secrets-required--optional-platforms))
- Docker/Local: Configure in `.env` file (remote storage is optional)
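With those variables in place, moving a day's database between the runner and the bucket is a couple of calls. A sketch assuming `boto3`; the `news/<date>.db` object key matches the cleanup example below, the rest is illustrative:

```python
# Sketch: push/pull one day's SQLite file to the configured bucket.
import os
import boto3

s3 = boto3.client(
    "s3",
    endpoint_url=os.environ["S3_ENDPOINT_URL"],
    aws_access_key_id=os.environ["S3_ACCESS_KEY_ID"],
    aws_secret_access_key=os.environ["S3_SECRET_ACCESS_KEY"],
)
bucket = os.environ["S3_BUCKET_NAME"]

s3.upload_file("output/2025-12-09/news.db", bucket, "news/2025-12-09.db")    # after a crawl
s3.download_file(bucket, "news/2025-12-09.db", "output/2025-12-09/news.db")  # before local analysis
```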
---
#### Data Cleanup Strategy
v4.0.0 added an automatic data cleanup feature, supporting scheduled cleanup of old data:
**Configuration Items**: `LOCAL_RETENTION_DAYS` and `REMOTE_RETENTION_DAYS`
| Configuration Value | Description |
|---------------------|-------------|
| `0` (default) | Disable cleanup, keep all data |
| Positive integer (e.g., `30`) | Only keep recent N days of data, auto-delete old data |
**Configuration Method**:
```bash
# GitHub Actions: Configure in GitHub Secrets
LOCAL_RETENTION_DAYS=30
REMOTE_RETENTION_DAYS=30
# Docker: Configure in .env file
LOCAL_RETENTION_DAYS=30
REMOTE_RETENTION_DAYS=30
# Local: Add to environment variables
export LOCAL_RETENTION_DAYS=30
```
**Cleanup Rules**:
- Cleanup executes during each crawl task
- Local: Deletes `output/YYYY-MM-DD/` directories older than N days
- Remote: Deletes cloud objects older than N days (e.g., `news/2025-11-10.db`)
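The local half of that rule boils down to deleting day-directories older than the cutoff; a sketch (illustrative, not the project's internal cleanup code; the remote half would list and delete `news/<date>.db` objects the same way):

```python
# Sketch: delete local output/YYYY-MM-DD/ directories older than N days.
import os
import shutil
from datetime import date, timedelta
from pathlib import Path

def cleanup_local(retention_days: int, root: Path = Path("output")) -> None:
    if retention_days <= 0 or not root.exists():
        return                                            # 0 = keep everything
    cutoff = date.today() - timedelta(days=retention_days)
    for day_dir in root.iterdir():
        try:
            day = date.fromisoformat(day_dir.name)        # e.g. output/2025-11-10/
        except ValueError:
            continue                                      # skip index.html and friends
        if day < cutoff and day_dir.is_dir():
            shutil.rmtree(day_dir)

cleanup_local(int(os.environ.get("LOCAL_RETENTION_DAYS", "0")))
```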
---
#### Timezone Configuration
v4.0.0 added timezone configuration support, using IANA standard time zone names:
**Configuration Item**: `TIMEZONE`
| Configuration Value | Description | Example |
|---------------------|-------------|---------|
| Not set (default) | Use UTC+0 | - |
| IANA time zone name | Specify time zone | `Asia/Shanghai`, `America/New_York`, `Europe/London` |
**Configuration Method**:
```bash
# GitHub Actions: Configure in GitHub Secrets
TIMEZONE=Asia/Shanghai
# Docker: Configure in .env file
TIMEZONE=Asia/Shanghai
# Local: Add to environment variables
export TIMEZONE=Asia/Shanghai
```
**Common IANA Time Zones**:
- China: `Asia/Shanghai`
- United States East: `America/New_York`
- United States West: `America/Los_Angeles`
- United Kingdom: `Europe/London`
- Japan: `Asia/Tokyo`
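Since these are IANA names, the Python standard library resolves them directly; a small sketch of how the setting affects displayed times:

```python
# Sketch: render the current time in the configured IANA time zone.
import os
from datetime import datetime, timezone
from zoneinfo import ZoneInfo

tz_name = os.environ.get("TIMEZONE")                      # unset -> UTC+0, as documented above
tz = ZoneInfo(tz_name) if tz_name else timezone.utc
print(datetime.now(tz).strftime("%Y-%m-%d %H:%M %Z"))     # e.g. "2025-12-13 13:44 CST" for Asia/Shanghai
```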
---
#### Breaking Changes (v4.0.0)
**⚠️ Important Notice**: v4.0.0 made breaking changes to the database structure; **old databases are incompatible**
**Impact**:
- Cannot directly read v3.x version data
- Need to re-crawl data to build new database
- **No automatic migration tool provided**
**Recommendations**:
1. **Fresh Start**: Starting from scratch and letting new data accumulate is the recommended path
2. **Keep Historical Data**: If you need to preserve v3.x historical data, rename the old `output/` directory (e.g., to `output_v3_backup/`) before running the new version
**Data Format Comparison**:
| Item | v3.x | v4.0.0 |
|------|------|--------|
| File path format | `output/2025年12月09日/` | `output/2025-12-09/` |
| Data file | Multiple `news_HH-MM.txt` files | Single `news.db` file |
| Database fields | Contains `source_name`, `crawl_date` | Removed redundant fields |
| Platform management | No independent table | Added `platforms` table |
| Crawl status | Comma-separated string | Normalized `crawl_source_status` table |
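If you go the second route, parking the old data is a one-liner; the backup directory name below is just the example suggested above:

```python
# Sketch: move v3.x data aside before the first v4.0.0 run.
from pathlib import Path

old = Path("output")
if old.exists():
    old.rename("output_v3_backup")        # v4.0.0 recreates output/ on the next crawl
```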
</details>
<br>
## 🤖 AI Analysis ## 🤖 AI Analysis
TrendRadar v3.0.0 added **MCP (Model Context Protocol)** based AI analysis feature, allowing natural language conversations with news data for deep analysis. TrendRadar v3.0.0 added **MCP (Model Context Protocol)** based AI analysis feature, allowing natural language conversations with news data for deep analysis.


@ -450,7 +450,89 @@ AI: (date_range={"start": "2024-12-01", "end": "2024-12-31"})
--- ---
### Q14: How to parse natural language date expressions? (Recommended to use first) ## Storage Sync
### Q14: How to sync data from remote storage to local?
**You can ask like this:**
- "Sync last 7 days data from remote"
- "Pull data from remote storage to local"
- "Sync last 30 days of news data"
**Tool called:** `sync_from_remote`
**Use cases:**
- Crawler deployed in the cloud (e.g., GitHub Actions), data stored remotely (e.g., Cloudflare R2)
- MCP Server deployed locally, needs to pull data from remote for analysis
**Return information:**
- synced_files: Number of successfully synced files
- synced_dates: List of successfully synced dates
- skipped_dates: Skipped dates (already exist locally)
- failed_dates: Failed dates and error information
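Put together, a successful sync report carries roughly the following shape (an illustration built from the fields listed above, shown as a Python dict; exact formatting may differ):

```python
# Illustrative shape of a sync_from_remote result; field names from the list above.
result = {
    "synced_files": 5,
    "synced_dates": ["2025-12-09", "2025-12-10", "2025-12-11", "2025-12-12", "2025-12-13"],
    "skipped_dates": ["2025-12-08"],      # already present locally
    "failed_dates": [],                   # dates that could not be synced, with error details
}
```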
**Prerequisites:**
Need to configure remote storage in `config/config.yaml` or set environment variables:
- `S3_ENDPOINT_URL`: Service endpoint
- `S3_BUCKET_NAME`: Bucket name
- `S3_ACCESS_KEY_ID`: Access key ID
- `S3_SECRET_ACCESS_KEY`: Secret access key
---
### Q15: How to view storage status?
**You can ask like this:**
- "View current storage status"
- "What's the storage configuration"
- "How much data is stored locally"
- "Is remote storage configured"
**Tool called:** `get_storage_status`
**Return information:**
| Category | Information |
|----------|-------------|
| **Local Storage** | Data directory, total size, date count, date range |
| **Remote Storage** | Whether configured, endpoint URL, bucket name, date count |
| **Pull Config** | Whether auto-pull enabled, pull days |
---
### Q16: How to view available data dates?
**You can ask like this:**
- "What dates are available locally"
- "What dates are in remote storage"
- "Compare local and remote data dates"
- "Which dates only exist remotely"
**Tool called:** `list_available_dates`
**Three query modes:**
| Mode | Description | Example Question |
|------|-------------|------------------|
| **local** | View local only | "What dates are available locally" |
| **remote** | View remote only | "What dates are in remote" |
| **both** | Compare both (default) | "Compare local and remote data" |
**Return information (both mode):**
- only_local: Dates only existing locally
- only_remote: Dates only existing remotely (useful for deciding which dates to sync)
- both: Dates existing in both places
---
### Q17: How to parse natural language date expressions? (Recommended to use first)
**You can ask like this:** **You can ask like this:**


@ -450,7 +450,89 @@ AIdate_range={"start": "2024-12-01", "end": "2024-12-31"}
--- ---
### Q14: 如何解析自然语言日期表达式?(推荐优先使用) ## 存储同步
### Q14: 如何从远程存储同步数据到本地?
**你可以这样问:**
- "从远程同步最近 7 天的数据"
- "拉取远程存储的数据到本地"
- "同步最近 30 天的新闻数据"
**调用的工具:** `sync_from_remote`
**使用场景:**
- 爬虫部署在云端(如 GitHub Actions数据存储到远程如 Cloudflare R2
- MCP Server 部署在本地,需要从远程拉取数据进行分析
**返回信息:**
- synced_files: 成功同步的文件数量
- synced_dates: 成功同步的日期列表
- skipped_dates: 跳过的日期(本地已存在)
- failed_dates: 失败的日期及错误信息
**前提条件:**
需要在 `config/config.yaml` 中配置远程存储或设置环境变量:
- `S3_ENDPOINT_URL`: 服务端点
- `S3_BUCKET_NAME`: 存储桶名称
- `S3_ACCESS_KEY_ID`: 访问密钥 ID
- `S3_SECRET_ACCESS_KEY`: 访问密钥
---
### Q15: 如何查看存储状态?
**你可以这样问:**
- "查看当前存储状态"
- "存储配置是什么"
- "本地有多少数据"
- "远程存储配置了吗"
**调用的工具:** `get_storage_status`
**返回信息:**
| 类别 | 信息 |
|------|------|
| **本地存储** | 数据目录、总大小、日期数量、日期范围 |
| **远程存储** | 是否配置、端点地址、存储桶名称、日期数量 |
| **拉取配置** | 是否启用自动拉取、拉取天数 |
---
### Q16: 如何查看可用的数据日期?
**你可以这样问:**
- "本地有哪些日期的数据"
- "远程存储有哪些日期"
- "对比本地和远程的数据日期"
- "哪些日期只在远程有"
**调用的工具:** `list_available_dates`
**三种查询模式:**
| 模式 | 说明 | 示例问法 |
|------|------|---------|
| **local** | 仅查看本地 | "本地有哪些日期" |
| **remote** | 仅查看远程 | "远程有哪些日期" |
| **both** | 对比两者(默认) | "对比本地和远程的数据" |
**返回信息both 模式):**
- only_local: 仅本地存在的日期
- only_remote: 仅远程存在的日期(可用于决定同步哪些日期)
- both: 两边都存在的日期
---
### Q17: 如何解析自然语言日期表达式?(推荐优先使用)
**你可以这样问:** **你可以这样问:**

394
README.md

@ -1,6 +1,6 @@
<div align="center" id="trendradar"> <div align="center" id="trendradar">
> **📢 公告:** 经与 GitHub 官方沟通,完成合规调整后将恢复"一键 Fork 部署",请关注 **v4.0.0** 版本的更新 > **📢 公告:** **v4.0.0** 版本已发布!包含存储架构重构、数据库优化、模块化改进等重大更新
<a href="https://github.com/sansan0/TrendRadar" title="TrendRadar"> <a href="https://github.com/sansan0/TrendRadar" title="TrendRadar">
<img src="/_image/banner.webp" alt="TrendRadar Banner" width="80%"> <img src="/_image/banner.webp" alt="TrendRadar Banner" width="80%">
@ -16,8 +16,8 @@
[![GitHub Stars](https://img.shields.io/github/stars/sansan0/TrendRadar?style=flat-square&logo=github&color=yellow)](https://github.com/sansan0/TrendRadar/stargazers) [![GitHub Stars](https://img.shields.io/github/stars/sansan0/TrendRadar?style=flat-square&logo=github&color=yellow)](https://github.com/sansan0/TrendRadar/stargazers)
[![GitHub Forks](https://img.shields.io/github/forks/sansan0/TrendRadar?style=flat-square&logo=github&color=blue)](https://github.com/sansan0/TrendRadar/network/members) [![GitHub Forks](https://img.shields.io/github/forks/sansan0/TrendRadar?style=flat-square&logo=github&color=blue)](https://github.com/sansan0/TrendRadar/network/members)
[![License](https://img.shields.io/badge/license-GPL--3.0-blue.svg?style=flat-square)](LICENSE) [![License](https://img.shields.io/badge/license-GPL--3.0-blue.svg?style=flat-square)](LICENSE)
[![Version](https://img.shields.io/badge/version-v3.5.0-blue.svg)](https://github.com/sansan0/TrendRadar) [![Version](https://img.shields.io/badge/version-v4.0.0-blue.svg)](https://github.com/sansan0/TrendRadar)
[![MCP](https://img.shields.io/badge/MCP-v1.0.3-green.svg)](https://github.com/sansan0/TrendRadar) [![MCP](https://img.shields.io/badge/MCP-v1.1.0-green.svg)](https://github.com/sansan0/TrendRadar)
[![企业微信通知](https://img.shields.io/badge/企业微信-通知-00D4AA?style=flat-square)](https://work.weixin.qq.com/) [![企业微信通知](https://img.shields.io/badge/企业微信-通知-00D4AA?style=flat-square)](https://work.weixin.qq.com/)
[![个人微信通知](https://img.shields.io/badge/个人微信-通知-00D4AA?style=flat-square)](https://weixin.qq.com/) [![个人微信通知](https://img.shields.io/badge/个人微信-通知-00D4AA?style=flat-square)](https://weixin.qq.com/)
@ -48,62 +48,61 @@
<br> <br>
<details> <details>
<summary>🚨 <strong>【必读】重要公告:本项目的正确部署姿势</strong></summary> <summary>🚨 <strong>【必读】重要公告:v4.0.0 部署方式与存储架构变更</strong></summary>
<br> <br>
> **⚠️ 2025年12月紧急通知** ### 🛠️ 请选择适合你的部署方式
>
> 由于 Fork 数量激增导致 GitHub 服务器压力过大,**GitHub Actions 及 GitHub Pages 部署目前已受限**。为确保顺利部署,请务必阅读以下说明。
### 1. ✅ 唯一推荐部署方式Docker #### 🅰️ 方案一Docker 部署(推荐 🔥)
**这是目前最稳定、不受 GitHub 限制的方案。** 数据存储在本地,不会因为 GitHub 策略调整而失效。 * **特点**:最稳定、最简单,数据存储在 **本地 SQLite**,完全自主可控。
* **适用**有自己的服务器、NAS 或长期运行的电脑。
* 👉 [跳转到 Docker 部署教程](#6-docker-部署) * 👉 [跳转到 Docker 部署教程](#6-docker-部署)
--- ---
### 2. 如果你本打算 Fork 本项目... #### 🅱️ 方案二GitHub Actions 部署(已恢复 ✅)
为了减少对 GitHub 服务器的压力,**请千万不要直接点击 "Fork" 按钮!** * **特点**数据不再直接写入仓库Git Commit而是存储在 **远程云存储**(支持 S3 兼容协议Cloudflare R2、阿里云 OSS、腾讯云 COS 等)。
请务必使用 **"Use this template"** 功能来替代 Fork * **门槛****必须**配置一个 S3 兼容的对象存储服务(推荐免费的 Cloudflare R2
> **⚠️ 注意**:选择此方案,请务必执行以下两步配置:
#### 1. 🚀 推荐的开始方式Use this template
为了保持仓库整洁,避免继承冗余的历史记录,我**建议**你使用 Template 模式:
1. **点击**原仓库页面右上角的绿色 **[Use this template]** 按钮。
1. **点击**原仓库页面右上角的绿色的 **[Use this template]** 按钮。
2. **选择** "Create a new repository"。 2. **选择** "Create a new repository"。
**为什么要这样做?** > **💡 为什么要这样做?**
* **❌ Fork**:复制完整历史记录,大量 Fork 同时运行会触发 GitHub 风控。 > * **Use this template**:创建一个全新的、干净的仓库,没有历史包袱
* **✅ Use this template**:创建的是一个全新的独立仓库,没有历史包袱,对服务器更友好。 > * **Fork**:会保留完整的提交历史和关联关系,占用 GitHub 更多资源
--- #### 2. ☁️ 关于 GitHub Actions 必配的远程存储
### 3. 关于新版数据存储的说明 如果你选择 **方案二 (GitHub Actions)**,则必须配置一个 S3 兼容的对象存储服务。
新版将使用 **Cloudflare R2** 存储新闻数据,以保证持久化。 **支持的存储服务:**
- **Cloudflare R2**(推荐,免费额度充足)
- 其他 S3 兼容服务
**⚠️ 配置前置条件:** **⚠️ 以 Cloudflare R2 为例的配置前置条件:**
根据 Cloudflare 平台规则,开通 R2 需绑定支付方式。 根据 Cloudflare 平台规则,开通 R2 需绑定支付方式。
- **目的:** 仅作身份验证Verify Only不产生扣费。 * **目的**仅作身份验证Verify Only**不产生扣费**。
- **支付:** 支持信用卡或国区 PayPal。
- **用量:** R2 的免费额度足以覆盖本项目日常运行,无需付费。
--- * **支付**:支持双币信用卡或国区 PayPal。
### 4. 📅 后续计划与文档阅读说明 * **用量**R2 的免费额度10GB存储/月)足以覆盖本项目日常运行,无需担心付费。
> **后续计划:** 👉 **[点击查看详细配置教程](#-快速开始)**
> - 探索新方案:保留 Actions 用于抓取和推送,但不再将数据保存到仓库,改用外部存储。
**⚠️ 阅读注意:**
鉴于上述计划意味着 **Fork 部署模式未来可能会以新形式回归**,且当前全面修改文档工作量巨大,我们暂时保留了旧版描述。
**在当前阶段,若后续教程中仍出现 "Fork" 相关表述,请一律忽略或将其理解为 "Use this template"**。
👉 **[点击此处查看 TrendRadar 最新官方文档](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)**
</details> </details>
@ -335,10 +334,30 @@
- ⚠️ **配对配置**Telegram 和 ntfy 需要保证配对参数数量一致(如 token 和 chat_id 都是 2 个) - ⚠️ **配对配置**Telegram 和 ntfy 需要保证配对参数数量一致(如 token 和 chat_id 都是 2 个)
- ⚠️ **数量限制**:默认每个渠道最多 3 个账号,超出会被截断 - ⚠️ **数量限制**:默认每个渠道最多 3 个账号,超出会被截断
### **多端适配** ### **灵活存储架构**v4.0.0 重大更新)
- **GitHub Pages**自动生成精美网页报告PC/移动端适配
- **Docker部署**:支持多架构容器化运行 **多存储后端支持**
- **数据持久化**HTML/TXT多格式历史记录保存 - ☁️ **远程云存储**GitHub Actions 环境默认,支持 S3 兼容协议R2/OSS/COS 等),数据存储在云端,不污染仓库
- 💾 **本地 SQLite 数据库**Docker/本地环境默认,数据完全可控
- 🔄 **自动后端选择**:根据运行环境智能切换存储方式
**数据格式**
| 格式 | 用途 | 说明 |
|------|------|------|
| **SQLite** | 主存储 | 单文件数据库,查询快速,支持 MCP AI 分析 |
| **TXT** | 可选快照 | 可读文本格式,方便直接查看 |
| **HTML** | 报告展示 | 精美可视化页面PC/移动端适配 |
**数据管理**
- ✅ 自动清理过期数据(可配置保留天数)
- ✅ 时区配置支持(全球时区)
> 💡 详细说明见 [配置详解 - 存储配置](#9-存储配置)
### **多端部署**
- **GitHub Actions**:定时自动爬取 + 远程云存储(需签到续期)
- **Docker 部署**:支持多架构容器化运行,数据本地存储
- **本地运行**Windows/Mac/Linux 直接运行
### **AI 智能分析v3.0.0 新增)** ### **AI 智能分析v3.0.0 新增)**
@ -389,10 +408,34 @@ GitHub 一键 Fork 即可使用,无需编程基础。
>**升级说明** >**升级说明**
- **📌 查看最新更新****[原仓库更新日志](https://github.com/sansan0/TrendRadar?tab=readme-ov-file#-更新日志)** - **📌 查看最新更新****[原仓库更新日志](https://github.com/sansan0/TrendRadar?tab=readme-ov-file#-更新日志)**
- **提示**:不要通过 **Sync fork** 更新本项目,建议查看【历史更新】,明确具体的【升级方式】和【功能内容】 - **提示**:不要通过 **Sync fork** 更新本项目,建议查看【历史更新】,明确具体的【升级方式】和【功能内容】
- **小版本更新**:从 v2.x 升级到 v2.y用本项目的 `main.py` 代码替换你 fork 仓库中的对应文件
- **大版本升级**:从 v1.x 升级到 v2.y建议删除现有 fork 后重新 fork这样更省力且避免配置冲突 - **大版本升级**:从 v1.x 升级到 v2.y建议删除现有 fork 后重新 fork这样更省力且避免配置冲突
### 2025/12/13 - v4.0.0
**🎉 重大更新:全面重构存储和核心架构**
- **多存储后端支持**:引入全新的存储模块,支持本地 SQLite 和远程云存储S3 兼容协议,推荐免费的 Cloudflare R2适应 GitHub Actions、Docker 和本地环境。
- **数据库结构优化**:重构 SQLite 数据库表结构,提升数据效率和查询能力。
- **核心代码模块化**:将主程序逻辑拆分为 trendradar 包的多个模块,显著提升代码可维护性。
- **增强功能**:实现日期格式标准化、数据保留策略、时区配置支持、时间显示优化,并修复远程存储数据持久化问题,确保数据合并的准确性。
- **清理和兼容**:移除了大部分历史兼容代码,统一了数据存储和读取方式。
### 2025/12/13 - mcp-v1.1.0
**MCP 模块更新:**
- 适配 v4.0.0,同时也兼容 v3.x 的数据
- 新增存储同步工具:
- `sync_from_remote`: 从远程存储拉取数据到本地
- `get_storage_status`: 获取存储配置和状态
- `list_available_dates`: 列出本地/远程可用日期范围
<details>
<summary>👉 点击展开:<strong>历史更新</strong></summary>
### 2025/12/03 - v3.5.0 ### 2025/12/03 - v3.5.0
**🎉 核心功能增强** **🎉 核心功能增强**
@ -456,10 +499,6 @@ GitHub 一键 Fork 即可使用,无需编程基础。
- 工具总数从 13 个增加到 14 个 - 工具总数从 13 个增加到 14 个
<details>
<summary>👉 点击展开:<strong>历史更新</strong></summary>
### 2025/11/28 - v3.4.1 ### 2025/11/28 - v3.4.1
**🔧 格式优化** **🔧 格式优化**
@ -857,11 +896,44 @@ frequency_words.txt 文件增加了一个【必须词】功能,使用 + 号
> **📖 提醒**Fork 用户建议先 **[查看最新官方文档](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)**,确保配置步骤是最新的。 > **📖 提醒**Fork 用户建议先 **[查看最新官方文档](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)**,确保配置步骤是最新的。
### ⚠️ GitHub Actions 使用说明
**v4.0.0 重要变更**引入「活跃度检测」机制GitHub Actions 需定期签到以维持运行。
#### 🔄 签到续期机制
- **运行周期**:有效期为 **7 天**,倒计时结束后服务将自动挂起。
- **续期方式**:在 Actions 页面手动触发 "Check In" workflow即可重置 7 天有效期。
- **操作路径**`Actions` → `Check In``Run workflow`
- **设计理念**
- 如果 7 天都忘了签到,或许这些资讯对你来说并非刚需。适时的暂停,能帮你从信息流中抽离,给大脑留出喘息的空间。
- GitHub Actions 是宝贵的公共计算资源。引入签到机制旨在避免算力的无效空转,确保资源能分配给真正活跃且需要的用户。感谢你的理解与支持。
#### 📦 数据存储(必需配置)
GitHub Actions 环境下,数据存储在 **远程云存储**(支持 S3 兼容协议,推荐免费的 Cloudflare R2不会污染仓库见下方 **必需配置:远程云存储**
#### 🚀 推荐Docker 部署
如需长期稳定运行,建议使用 [Docker 部署](#6-docker-部署),数据存储在本地,无需签到,不过需要额外付费购买云服务器。
<br>
> 🎉 **已支持:多云存储方案**
>
> 本项目现已支持 S3 兼容协议,你可以选择:
> - **Cloudflare R2**(推荐,免费额度充足)
> - 其他 S3 兼容存储服务
>
> 只需配置对应的 `S3_ENDPOINT_URL`、`S3_BUCKET_NAME` 等环境变量即可切换。
---
1. **Fork 本项目**到你的 GitHub 账户 1. **Fork 本项目**到你的 GitHub 账户
- 点击本页面右上角的"Fork"按钮 - 点击本页面右上角的"Fork"按钮
2. **设置 GitHub Secrets选择你需要的平台**: 2. **设置 GitHub Secrets必需 + 可选平台)**:
在你 Fork 后的仓库中,进入 `Settings` > `Secrets and variables` > `Actions` > `New repository secret` 在你 Fork 后的仓库中,进入 `Settings` > `Secrets and variables` > `Actions` > `New repository secret`
@ -900,6 +972,53 @@ frequency_words.txt 文件增加了一个【必须词】功能,使用 + 号
<br> <br>
<details>
<summary>⚠️ <strong>必需配置:远程云存储</strong>GitHub Actions 环境必需,推荐 Cloudflare R2</summary>
<br>
**GitHub Secret 配置(⚠️ 以下 4 个配置项都是必需的):**
| Name名称 | Secret说明 |
|-------------|-----------------|
| `S3_BUCKET_NAME` | 存储桶名称(如 `trendradar-data` |
| `S3_ACCESS_KEY_ID` | 访问密钥 IDAccess Key ID |
| `S3_SECRET_ACCESS_KEY` | 访问密钥Secret Access Key |
| `S3_ENDPOINT_URL` | S3 API 端点(如 R2`https://<account-id>.r2.cloudflarestorage.com` |
<br>
**如何获取凭据(以 Cloudflare R2 为例):**
1. **进入 R2 概览**
- 登录 [Cloudflare Dashboard](https://dash.cloudflare.com/)。
- 在左侧侧边栏找到并点击 `R2对象存储`
<br>
2. **创建存储桶**
- 点击`概述`
- 点击右上角的 `创建存储桶` (Create bucket)。
- 输入名称(例如 `trendradar-data`),点击 `创建存储桶`
<br>
3. **创建 API 令牌**
- 回到 **概述**页面。
- 点击**右下角** `Account Details `找到并点击 `Manage` (Manage R2 API Tokens)。
- 同时你会看到 `S3 API``https://<account-id>.r2.cloudflarestorage.com`(这就是 S3_ENDPOINT_URL)
- 点击 `创建 Account APl 令牌`
- **⚠️ 关键设置**
- **令牌名称**:随意填写(如 `github-action-write`)。
- **权限**:选择 `管理员读和写`
- **指定存储桶**:为了安全,建议选择 `仅适用于指定存储桶` 并选中你的桶(如 `trendradar-data`)。
- 点击 `创建 API 令牌`**立即复制** 显示的 `Access Key ID``Secret Access Key`(只显示一次!)。
<br>
- **R2 免费额度**:每月 10GB 存储 + 100万次读取对本项目来说非常充足。
- **支付验证**:开通 R2 即使是免费额度Cloudflare 也要求绑定 PayPal 或信用卡进行身份验证(不会实际扣费,除非超过额度)。
</details>
<details> <details>
<summary>👉 点击展开:<strong>企业微信机器人</strong>(配置最简单最迅速)</summary> <summary>👉 点击展开:<strong>企业微信机器人</strong>(配置最简单最迅速)</summary>
@ -1489,10 +1608,11 @@ frequency_words.txt 文件增加了一个【必须词】功能,使用 + 号
**测试步骤** **测试步骤**
1. 进入你项目的 Actions 页面 1. 进入你项目的 Actions 页面
2. 找到 **"Hot News Crawler"** 点进去 2. 找到 **"Get Hot News"**(必须得是这个字)点进去,点击右侧的 **"Run workflow"** 按钮运行
- 如果看不到该字样,参照 [#109](https://github.com/sansan0/TrendRadar/issues/109) 解决 - 如果看不到该字样,参照 [#109](https://github.com/sansan0/TrendRadar/issues/109) 解决
3. 点击右侧的 **"Run workflow"** 按钮运行 3. 3 分钟左右,消息会推送到你配置的平台
4. 等待 1 分钟左右,消息会推送到你配置的平台
<br>
> ⏱️ **测试提示** > ⏱️ **测试提示**
> - 手动测试不要太频繁,避免触发 GitHub Actions 限制 > - 手动测试不要太频繁,避免触发 GitHub Actions 限制
@ -2069,7 +2189,7 @@ TrendRadar 提供两个独立的 Docker 镜像,可根据需求选择部署:
# 下载 docker compose 配置 # 下载 docker compose 配置
wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/.env -P docker/ wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/.env -P docker/
wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker compose.yml -P docker/ wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker-compose.yml -P docker/
``` ```
> 💡 **说明**Docker 部署需要的关键目录结构如下: > 💡 **说明**Docker 部署需要的关键目录结构如下:
@ -2080,7 +2200,7 @@ TrendRadar 提供两个独立的 Docker 镜像,可根据需求选择部署:
│ └── frequency_words.txt │ └── frequency_words.txt
└── docker/ └── docker/
├── .env ├── .env
└── docker compose.yml └── docker-compose.yml
``` ```
2. **配置文件说明**: 2. **配置文件说明**:
@ -2174,7 +2294,7 @@ vim config/frequency_words.txt
# 使用构建版本的 docker compose # 使用构建版本的 docker compose
cd docker cd docker
cp docker compose-build.yml docker compose.yml cp docker-compose-build.yml docker-compose.yml
``` ```
**构建并启动服务** **构建并启动服务**
@ -2260,7 +2380,7 @@ docker rm trend-radar
> 💡 **Web 服务器说明** > 💡 **Web 服务器说明**
> - 启动后可通过浏览器访问 `http://localhost:8080` 查看最新报告 > - 启动后可通过浏览器访问 `http://localhost:8080` 查看最新报告
> - 通过目录导航访问历史报告(如:`http://localhost:8080/2025年xx月xx日/` > - 通过目录导航访问历史报告(如:`http://localhost:8080/2025-xx-xx/`
> - 端口可在 `.env` 文件中配置 `WEBSERVER_PORT` 参数 > - 端口可在 `.env` 文件中配置 `WEBSERVER_PORT` 参数
> - 自动启动:在 `.env` 中设置 `ENABLE_WEBSERVER=true` > - 自动启动:在 `.env` 中设置 `ENABLE_WEBSERVER=true`
> - 安全提示:仅提供静态文件访问,限制在 output 目录,只绑定本地访问 > - 安全提示:仅提供静态文件访问,限制在 output 目录,只绑定本地访问
@ -2277,7 +2397,7 @@ TrendRadar 生成的当日汇总 HTML 报告会同时保存到两个位置:
|---------|---------|---------| |---------|---------|---------|
| `output/index.html` | 宿主机直接访问 | **Docker 部署**(通过 Volume 挂载,宿主机可见) | | `output/index.html` | 宿主机直接访问 | **Docker 部署**(通过 Volume 挂载,宿主机可见) |
| `index.html` | 根目录访问 | **GitHub Pages**仓库根目录Pages 自动识别) | | `index.html` | 根目录访问 | **GitHub Pages**仓库根目录Pages 自动识别) |
| `output/YYYY年MM月DD日/html/当日汇总.html` | 历史报告访问 | 所有环境(按日期归档) | | `output/YYYY-MM-DD/html/当日汇总.html` | 历史报告访问 | 所有环境(按日期归档) |
**本地访问示例** **本地访问示例**
```bash ```bash
@ -2286,8 +2406,8 @@ TrendRadar 生成的当日汇总 HTML 报告会同时保存到两个位置:
docker exec -it trend-radar python manage.py start_webserver docker exec -it trend-radar python manage.py start_webserver
# 2. 在浏览器访问 # 2. 在浏览器访问
http://localhost:8080 # 访问最新报告(默认 index.html http://localhost:8080 # 访问最新报告(默认 index.html
http://localhost:8080/2025年xx月xx日/ # 访问指定日期的报告 http://localhost:8080/2025-xx-xx/ # 访问指定日期的报告
http://localhost:8080/2025年xx月xx日/html/ # 浏览该日期下的所有 HTML 文件 http://localhost:8080/2025-xx-xx/html/ # 浏览该日期下的所有 HTML 文件
# 方式 2直接打开文件本地环境 # 方式 2直接打开文件本地环境
open ./output/index.html # macOS open ./output/index.html # macOS
@ -2295,7 +2415,7 @@ start ./output/index.html # Windows
xdg-open ./output/index.html # Linux xdg-open ./output/index.html # Linux
# 方式 3访问历史归档 # 方式 3访问历史归档
open ./output/2025年xx月xx日/html/当日汇总.html open ./output/2025-xx-xx/html/当日汇总.html
``` ```
**为什么有两个 index.html** **为什么有两个 index.html**
@ -2349,34 +2469,42 @@ flowchart TB
**快速启动** **快速启动**
使用 docker compose 同时启动新闻推送和 MCP 服务: 如果已按照 [方式一:使用 docker compose](#方式一使用-docker-compose推荐) 完成部署,只需启动 MCP 服务:
```bash ```bash
# 下载最新的 docker compose.yml已包含 MCP 服务配置) cd TrendRadar/docker
wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker compose.yml docker compose up -d trend-radar-mcp
# 启动所有服务
docker compose up -d
# 查看运行状态 # 查看运行状态
docker ps | grep trend-radar docker ps | grep trend-radar-mcp
``` ```
**单独启动 MCP 服务** **单独启动 MCP 服务**(不使用 docker compose
```bash ```bash
# Linux/Mac
docker run -d --name trend-radar-mcp \ docker run -d --name trend-radar-mcp \
-p 127.0.0.1:3333:3333 \ -p 127.0.0.1:3333:3333 \
-v ./config:/app/config:ro \ -v $(pwd)/config:/app/config:ro \
-v ./output:/app/output:ro \ -v $(pwd)/output:/app/output:ro \
-e TZ=Asia/Shanghai \ -e TZ=Asia/Shanghai \
wantcat/trendradar-mcp:latest wantcat/trendradar-mcp:latest
# Windows PowerShell
docker run -d --name trend-radar-mcp `
-p 127.0.0.1:3333:3333 `
-v ${PWD}/config:/app/config:ro `
-v ${PWD}/output:/app/output:ro `
-e TZ=Asia/Shanghai `
wantcat/trendradar-mcp:latest
``` ```
> ⚠️ **注意**:单独运行时,确保当前目录下有 `config/``output/` 文件夹,且包含配置文件和新闻数据。
**验证服务** **验证服务**
```bash ```bash
# 检查 MCP 服务是否正常运行 # 检查 MCP 服务健康状态
curl http://127.0.0.1:3333/mcp curl http://127.0.0.1:3333/mcp
# 查看 MCP 服务日志 # 查看 MCP 服务日志
@ -2385,14 +2513,20 @@ docker logs -f trend-radar-mcp
**在 AI 客户端中配置** **在 AI 客户端中配置**
MCP 服务启动后,在 Claude Desktop、Cherry Studio、Cursor 等客户端中配置: MCP 服务启动后,根据不同客户端进行配置:
**Cherry Studio**推荐GUI 配置):
- 设置 → MCP 服务器 → 添加
- 类型:`streamableHttp`
- URL`http://127.0.0.1:3333/mcp`
**Claude Desktop / Cline**JSON 配置):
```json ```json
{ {
"mcpServers": { "mcpServers": {
"trendradar": { "trendradar": {
"url": "http://127.0.0.1:3333/mcp", "url": "http://127.0.0.1:3333/mcp",
"description": "TrendRadar 新闻热点分析" "type": "streamableHttp"
} }
} }
} }
@ -2480,7 +2614,6 @@ notification:
start: "20:00" # 开始时间(北京时间) start: "20:00" # 开始时间(北京时间)
end: "22:00" # 结束时间(北京时间) end: "22:00" # 结束时间(北京时间)
once_per_day: true # 每天只推送一次 once_per_day: true # 每天只推送一次
push_record_retention_days: 7 # 推送记录保留天数
``` ```
#### 配置项详解 #### 配置项详解
@ -2491,7 +2624,6 @@ notification:
| `time_range.start` | string | `"20:00"` | 推送时间窗口开始时间北京时间HH:MM 格式) | | `time_range.start` | string | `"20:00"` | 推送时间窗口开始时间北京时间HH:MM 格式) |
| `time_range.end` | string | `"22:00"` | 推送时间窗口结束时间北京时间HH:MM 格式) | | `time_range.end` | string | `"22:00"` | 推送时间窗口结束时间北京时间HH:MM 格式) |
| `once_per_day` | bool | `true` | `true`=每天在窗口内只推送一次,`false`=窗口内每次执行都推送 | | `once_per_day` | bool | `true` | `true`=每天在窗口内只推送一次,`false`=窗口内每次执行都推送 |
| `push_record_retention_days` | int | `7` | 推送记录保留天数(用于判断是否已推送) |
#### 使用场景 #### 使用场景
@ -2515,7 +2647,6 @@ PUSH_WINDOW_ENABLED=true
PUSH_WINDOW_START=09:00 PUSH_WINDOW_START=09:00
PUSH_WINDOW_END=18:00 PUSH_WINDOW_END=18:00
PUSH_WINDOW_ONCE_PER_DAY=false PUSH_WINDOW_ONCE_PER_DAY=false
PUSH_WINDOW_RETENTION_DAYS=7
``` ```
#### 完整配置示例 #### 完整配置示例
@ -2530,7 +2661,6 @@ notification:
start: "20:00" start: "20:00"
end: "22:00" end: "22:00"
once_per_day: true once_per_day: true
push_record_retention_days: 7
``` ```
**场景:工作时间内每小时推送** **场景:工作时间内每小时推送**
@ -2543,7 +2673,6 @@ notification:
start: "09:00" start: "09:00"
end: "18:00" end: "18:00"
once_per_day: false once_per_day: false
push_record_retention_days: 7
``` ```
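
推送窗口的核心判断可以概括为:当前时间是否落在 `start` 与 `end` 之间,且在 `once_per_day` 开启时当天尚未推送过。下面是一个简化的判断示意(非项目源码,未处理跨天窗口等细节):

```python
# 示意:判断当前时间是否在推送窗口内
from datetime import datetime, time

def in_push_window(now: datetime, start: str = "20:00", end: str = "22:00") -> bool:
    """start/end 为 HH:MM 字符串,对应 time_range.start / time_range.end"""
    start_t = time(*map(int, start.split(":")))
    end_t = time(*map(int, end.split(":")))
    return start_t <= now.time() <= end_t  # 注意:此示意未处理跨越零点的窗口

# 配合 once_per_day:窗口内首次推送后记录状态,当天后续执行直接跳过
already_pushed_today = False  # 实际实现需要持久化该状态
if in_push_window(datetime.now()) and not already_pushed_today:
    print("在推送窗口内,且今天尚未推送 → 执行推送")
```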
</details> </details>
@ -2829,6 +2958,123 @@ notification:
</details> </details>
### 11. 存储配置
<details id="storage-config">
<summary>👉 点击展开:<strong>存储架构配置详解</strong></summary>
<br>
#### 存储后端选择
**配置位置**`config/config.yaml` 的 `storage` 部分
v4.0.0 版本重构了存储架构,支持多种存储后端:
```yaml
storage:
backend: auto # 存储后端auto自动选择/ local本地SQLite/ remote远程云存储
formats:
sqlite: true # 是否启用SQLite存储
txt: true # 是否生成TXT快照
html: true # 是否生成HTML报告
local:
data_dir: "output" # 本地存储目录
retention_days: 0 # 本地数据保留天数0表示永久保留
remote:
endpoint_url: "" # S3 API 端点
bucket_name: "" # 存储桶名称
access_key_id: "" # 访问密钥ID
secret_access_key: "" # 访问密钥
region: "" # 区域(可选)
retention_days: 0 # 远程数据保留天数0表示永久保留
pull:
enabled: false # 是否启用启动时从远程拉取数据
days: 7 # 拉取最近N天的数据
```
```

#### 后端选择策略
| backend 值 | 说明 | 适用场景 |
|-----------|------|---------|
| `auto` | **自动选择**(推荐) | 根据运行环境智能选择:<br>• GitHub Actions → Remote<br>• Docker/本地 → Local |
| `local` | 本地 SQLite 数据库 | Docker 部署、本地开发 |
| `remote` | 远程云存储S3 兼容,如 Cloudflare R2 | GitHub Actions、多机器同步 |
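
`auto` 模式的选择策略大致等价于下面几行 Python(示意代码,非项目源码;"是否已配置远程存储"这里用桶名等关键项是否齐全做简化判断):

```python
# 示意:auto 模式下的存储后端选择策略
import os

def resolve_backend(backend: str, remote_configured: bool) -> str:
    if backend in ("local", "remote"):
        return backend
    # auto:GitHub Actions 环境且配置了远程存储 → remote,否则 → local
    in_github_actions = os.environ.get("GITHUB_ACTIONS") == "true"
    return "remote" if (in_github_actions and remote_configured) else "local"

remote_ok = bool(os.environ.get("S3_BUCKET_NAME"))  # 简化判断,仅作演示
print(resolve_backend("auto", remote_configured=remote_ok))
```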
#### 远程云存储配置
**环境变量**(推荐方式):
```bash
# GitHub Actions / Docker 环境变量
STORAGE_BACKEND=remote # 或 auto
# 本地/远程数据保留天数0 表示永久保留)
LOCAL_RETENTION_DAYS=0
REMOTE_RETENTION_DAYS=0
# S3 兼容存储配置(以 Cloudflare R2 为例)
S3_BUCKET_NAME=your-bucket-name
S3_ACCESS_KEY_ID=your-access-key-id
S3_SECRET_ACCESS_KEY=your-secret-access-key
S3_ENDPOINT_URL=https://<account-id>.r2.cloudflarestorage.com
S3_REGION=auto
# 数据拉取配置(可选,从远程同步到本地)
PULL_ENABLED=false
PULL_DAYS=7
```
```

**获取凭据**:参见 [快速开始 - 远程存储配置](#-快速开始)
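
程序读取这些配置时,常见做法是把 `config.yaml` 与环境变量合并。下面的示意与仓库中 MCP 同步工具的写法一致(配置文件值优先、为空时回退到环境变量;主程序的具体优先级以源码为准):

```python
# 示意:合并 config.yaml 与环境变量中的远程存储配置
import os

def merge_remote_config(remote_cfg: dict) -> dict:
    return {
        "endpoint_url": remote_cfg.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL", ""),
        "bucket_name": remote_cfg.get("bucket_name") or os.environ.get("S3_BUCKET_NAME", ""),
        "access_key_id": remote_cfg.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID", ""),
        "secret_access_key": remote_cfg.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY", ""),
        "region": remote_cfg.get("region") or os.environ.get("S3_REGION", ""),
    }

print(merge_remote_config({"bucket_name": "trendradar-data"}))  # 桶名仅为示例
```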
#### 数据清理策略
**自动清理**:每次运行结束时检查并删除超过保留天数的数据。
```yaml
storage:
local:
retention_days: 30 # 本地保留最近30天数据
remote:
retention_days: 30 # 远程保留最近30天数据
```
**清理逻辑**
- 本地存储:删除过期日期的文件夹(如 `output/2025-11-10/`
- 远程存储:批量删除过期的云端对象(如 `news/2025-11-10.db`
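
以本地存储为例,清理动作可以概括为"遍历数据目录下的日期文件夹,删除早于保留期限的目录"。下面是一个示意实现(非项目源码,目录名按 v4.0.0 的 ISO 日期格式假设):

```python
# 示意:按 retention_days 清理本地过期日期目录(如 output/2025-11-10/)
import shutil
from datetime import datetime, timedelta
from pathlib import Path

def cleanup_local(data_dir: str = "output", retention_days: int = 30) -> None:
    if retention_days <= 0:  # 0 表示永久保留,不执行清理
        return
    cutoff = datetime.now() - timedelta(days=retention_days)  # 实际实现会使用配置时区的当前时间
    for folder in Path(data_dir).iterdir():
        if not folder.is_dir():
            continue
        try:
            folder_date = datetime.strptime(folder.name, "%Y-%m-%d")
        except ValueError:
            continue  # 非日期目录,跳过
        if folder_date < cutoff:
            shutil.rmtree(folder)
            print(f"已删除过期目录: {folder}")

cleanup_local("output", retention_days=30)
```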
#### 时区配置v4.0.0 新增)
**全球时区支持**:时区配置影响所有时间显示、推送窗口判断与按日期归档的数据存储,解决非中国时区用户推送时间不准的问题。
```yaml
app:
timezone: "Asia/Shanghai" # 默认中国时区
# 其他示例:
# timezone: "America/Los_Angeles" # 美西时间
# timezone: "Europe/London" # 英国时间
```
**支持所有 IANA 时区名称**[时区列表](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones)
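
程序在判断推送窗口、生成日期目录等场景时,会先把"当前时间"换算到配置的时区。下面用 pytz(项目依赖之一)给出一个示意,函数名仅为演示:

```python
# 示意:按配置的 IANA 时区获取"当前时间"
from datetime import datetime
import pytz

def get_configured_now(tz_name: str = "Asia/Shanghai") -> datetime:
    return datetime.now(pytz.timezone(tz_name))

now = get_configured_now("America/Los_Angeles")
print(now.strftime("%Y-%m-%d %H:%M:%S %Z"))  # 日期目录即取 %Y-%m-%d 部分
```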
#### 不兼容变更
⚠️ **v4.0.0 不兼容 v3.x 数据**
1. 数据库结构完全重构,无法读取旧数据
2. 文件路径格式变更:日期目录由 `YYYY年MM月DD日` 改为 ISO 格式 `YYYY-MM-DD`
**迁移建议**
- 从 v4.0.0 开始重新收集数据
- 旧数据如需保留,请手动将目录重命名为 ISO 日期格式(不推荐;如确需迁移,可参考下面的示意脚本)
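
如果确实想保留 v3.x 的历史目录,可以参考下面的批量重命名示意脚本,把 `YYYY年MM月DD日` 目录改名为 `YYYY-MM-DD`。注意这只是示意:即便改名,v4.0.0 也无法读取旧的数据库内容,操作前请自行备份:

```python
# 示意:把旧的中文日期目录重命名为 ISO 格式(仅处理目录名,不转换数据)
import re
from pathlib import Path

pattern = re.compile(r"^(\d{4})年(\d{2})月(\d{2})日$")
for folder in Path("output").iterdir():
    m = pattern.match(folder.name)
    if folder.is_dir() and m:
        new_name = f"{m.group(1)}-{m.group(2)}-{m.group(3)}"
        folder.rename(folder.parent / new_name)
        print(f"{folder.name} -> {new_name}")
```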
</details>
<br> <br>
## 🤖 AI 智能分析 ## 🤖 AI 智能分析
@ -2846,7 +3092,7 @@ AI 分析功能**不是**直接查询网络实时数据,而是分析你**本
#### 使用说明: #### 使用说明:
1. **项目自带测试数据**`output` 目录默认包含 **2025年11月1日11月15日** 的新闻数据,可用于快速体验 AI 功能 1. **项目自带测试数据**`output` 目录默认包含 **2025-11-012025-11-15** 的新闻数据,可用于快速体验 AI 功能
2. **查询限制** 2. **查询限制**
- ✅ 只能查询已有日期范围内的数据11月1-15日 - ✅ 只能查询已有日期范围内的数据11月1-15日

View File

@ -1,12 +1,60 @@
app: app:
version_check_url: "https://raw.githubusercontent.com/sansan0/TrendRadar/refs/heads/master/version" version_check_url: "https://raw.githubusercontent.com/sansan0/TrendRadar/refs/heads/master/version"
show_version_update: true # 控制显示版本更新提示,如果 false则不接受新版本提示 show_version_update: true # 控制显示版本更新提示,如果 false则不接受新版本提示
# 时区配置(影响所有时间显示、推送窗口判断、数据存储)
# 常用时区:
# - Asia/Shanghai (北京时间 UTC+8)
# - America/New_York (美东时间 UTC-5/-4)
# - Europe/London (伦敦时间 UTC+0/+1)
# 完整时区列表: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
timezone: "Asia/Shanghai"
# 存储配置
storage:
# 存储后端选择: local / remote / auto
# - local: 本地 SQLite + TXT/HTML 文件
# - remote: 远程云存储S3 兼容协议,支持 R2/OSS/COS 等)
# - auto: 自动选择GitHub Actions 环境且配置了远程存储则用 remote否则用 local
backend: "auto"
# 数据格式选项
formats:
sqlite: true # 主存储(必须启用)
txt: false # 是否生成 TXT 快照
html: false # 是否生成 HTML 报告
# 本地存储配置
local:
data_dir: "output" # 数据目录
retention_days: 0 # 本地数据保留天数0 = 不清理)
# 远程存储配置S3 兼容协议)
# 支持: Cloudflare R2, 阿里云 OSS, 腾讯云 COS, AWS S3, MinIO 等
# 建议将敏感信息配置在 GitHub Secrets 或环境变量中
remote:
# 数据保留天数0 = 不清理远程数据)
retention_days: 0
# S3 兼容配置
endpoint_url: "" # 服务端点(或环境变量 S3_ENDPOINT_URL
# Cloudflare R2: https://<account_id>.r2.cloudflarestorage.com
# 阿里云 OSS: https://oss-cn-hangzhou.aliyuncs.com
# 腾讯云 COS: https://cos.ap-guangzhou.myqcloud.com
bucket_name: "" # 存储桶名称(或环境变量 S3_BUCKET_NAME
access_key_id: "" # 访问密钥 ID或环境变量 S3_ACCESS_KEY_ID
secret_access_key: "" # 访问密钥(或环境变量 S3_SECRET_ACCESS_KEY
region: "" # 区域(可选,部分服务商需要,或环境变量 S3_REGION
# 数据拉取配置(从远程同步到本地)
# 用于 MCP Server 等场景爬虫存到远程MCP 拉取到本地分析
pull:
enabled: false # 是否启用启动时自动拉取
days: 7 # 拉取最近 N 天的数据0 = 不拉取)
crawler: crawler:
request_interval: 1000 # 请求间隔(毫秒) request_interval: 1000 # 请求间隔(毫秒)
enable_crawler: true # 是否启用爬取新闻功能,如果 false则直接停止程序 enable_crawler: true # 是否启用爬取新闻功能,如果 false则直接停止程序
use_proxy: false # 是否启用代理false 时为关闭 use_proxy: false # 是否启用代理false 时为关闭
default_proxy: "http://127.0.0.1:10086" default_proxy: "http://127.0.0.1:10801"
# 🔸 daily当日汇总模式 # 🔸 daily当日汇总模式
# • 推送时机:按时推送(默认每小时推送一次) # • 推送时机:按时推送(默认每小时推送一次)
@ -55,7 +103,6 @@ notification:
start: "20:00" # 推送时间窗口开始(北京时间) start: "20:00" # 推送时间窗口开始(北京时间)
end: "22:00" # 推送时间窗口结束(北京时间) end: "22:00" # 推送时间窗口结束(北京时间)
once_per_day: true # 每天在时间窗口内只推送一次,如果 false则窗口内每次执行都推送 once_per_day: true # 每天在时间窗口内只推送一次,如果 false则窗口内每次执行都推送
push_record_retention_days: 7 # 推送记录保留天数
# ⚠️⚠️⚠️ 重要安全警告 / IMPORTANT SECURITY WARNING ⚠️⚠️⚠️ # ⚠️⚠️⚠️ 重要安全警告 / IMPORTANT SECURITY WARNING ⚠️⚠️⚠️
# #

View File

@ -40,8 +40,6 @@ PUSH_WINDOW_START=
PUSH_WINDOW_END= PUSH_WINDOW_END=
# 每天只推送一次 (true/false) # 每天只推送一次 (true/false)
PUSH_WINDOW_ONCE_PER_DAY= PUSH_WINDOW_ONCE_PER_DAY=
# 推送记录保留天数 (数字,如 7)
PUSH_WINDOW_RETENTION_DAYS=
# ============================================ # ============================================
# 多账号配置 # 多账号配置
@ -87,6 +85,39 @@ BARK_URL=
# Slack 推送配置(多账号用 ; 分隔) # Slack 推送配置(多账号用 ; 分隔)
SLACK_WEBHOOK_URL= SLACK_WEBHOOK_URL=
# ============================================
# 存储配置
# ============================================
# 存储后端选择 (local/remote/auto)
# - local: 本地 SQLite + TXT/HTML 文件
# - remote: 远程云存储S3 兼容协议)
# - auto: 自动选择GitHub Actions 用 remote其他用 local
STORAGE_BACKEND=auto
# 本地数据保留天数0 = 无限制,不清理历史数据)
LOCAL_RETENTION_DAYS=0
# 远程数据保留天数0 = 无限制,不清理历史数据)
REMOTE_RETENTION_DAYS=0
# 是否生成 TXT 快照 (true/false)
STORAGE_TXT_ENABLED=
# 是否生成 HTML 报告 (true/false)
STORAGE_HTML_ENABLED=
# 远程存储配置S3 兼容协议,支持 R2/OSS/COS/S3 等)
S3_ENDPOINT_URL=
S3_BUCKET_NAME=
S3_ACCESS_KEY_ID=
S3_SECRET_ACCESS_KEY=
S3_REGION=
# 数据拉取配置(从远程同步到本地)
PULL_ENABLED=false
PULL_DAYS=7
# ============================================ # ============================================
# 运行配置 # 运行配置
# ============================================ # ============================================

View File

@ -53,8 +53,8 @@ RUN set -ex && \
COPY requirements.txt . COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt RUN pip install --no-cache-dir -r requirements.txt
COPY main.py .
COPY docker/manage.py . COPY docker/manage.py .
COPY trendradar/ ./trendradar/
# 复制 entrypoint.sh 并强制转换为 LF 格式 # 复制 entrypoint.sh 并强制转换为 LF 格式
COPY docker/entrypoint.sh /entrypoint.sh.tmp COPY docker/entrypoint.sh /entrypoint.sh.tmp

View File

@ -8,6 +8,8 @@ RUN pip install --no-cache-dir -r requirements.txt
# 复制 MCP 服务器代码 # 复制 MCP 服务器代码
COPY mcp_server/ ./mcp_server/ COPY mcp_server/ ./mcp_server/
# 复制 trendradar 模块MCP 服务需要读取 SQLite 数据)
COPY trendradar/ ./trendradar/
# 创建必要目录 # 创建必要目录
RUN mkdir -p /app/config /app/output RUN mkdir -p /app/config /app/output

View File

@ -32,7 +32,6 @@ services:
- PUSH_WINDOW_START=${PUSH_WINDOW_START:-} - PUSH_WINDOW_START=${PUSH_WINDOW_START:-}
- PUSH_WINDOW_END=${PUSH_WINDOW_END:-} - PUSH_WINDOW_END=${PUSH_WINDOW_END:-}
- PUSH_WINDOW_ONCE_PER_DAY=${PUSH_WINDOW_ONCE_PER_DAY:-} - PUSH_WINDOW_ONCE_PER_DAY=${PUSH_WINDOW_ONCE_PER_DAY:-}
- PUSH_WINDOW_RETENTION_DAYS=${PUSH_WINDOW_RETENTION_DAYS:-}
# 通知渠道 # 通知渠道
- FEISHU_WEBHOOK_URL=${FEISHU_WEBHOOK_URL:-} - FEISHU_WEBHOOK_URL=${FEISHU_WEBHOOK_URL:-}
- TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN:-} - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN:-}
@ -54,6 +53,21 @@ services:
- BARK_URL=${BARK_URL:-} - BARK_URL=${BARK_URL:-}
# Slack配置 # Slack配置
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL:-} - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL:-}
# 存储配置
- STORAGE_BACKEND=${STORAGE_BACKEND:-auto}
- LOCAL_RETENTION_DAYS=${LOCAL_RETENTION_DAYS:-0}
- REMOTE_RETENTION_DAYS=${REMOTE_RETENTION_DAYS:-0}
- STORAGE_TXT_ENABLED=${STORAGE_TXT_ENABLED:-true}
- STORAGE_HTML_ENABLED=${STORAGE_HTML_ENABLED:-true}
# 远程存储配置S3 兼容协议)
- S3_ENDPOINT_URL=${S3_ENDPOINT_URL:-}
- S3_BUCKET_NAME=${S3_BUCKET_NAME:-}
- S3_ACCESS_KEY_ID=${S3_ACCESS_KEY_ID:-}
- S3_SECRET_ACCESS_KEY=${S3_SECRET_ACCESS_KEY:-}
- S3_REGION=${S3_REGION:-}
# 数据拉取配置
- PULL_ENABLED=${PULL_ENABLED:-false}
- PULL_DAYS=${PULL_DAYS:-7}
# 运行模式 # 运行模式
- CRON_SCHEDULE=${CRON_SCHEDULE:-*/5 * * * *} - CRON_SCHEDULE=${CRON_SCHEDULE:-*/5 * * * *}
- RUN_MODE=${RUN_MODE:-cron} - RUN_MODE=${RUN_MODE:-cron}
@ -71,7 +85,7 @@ services:
volumes: volumes:
- ../config:/app/config:ro - ../config:/app/config:ro
- ../output:/app/output:ro - ../output:/app/output
environment: environment:
- TZ=Asia/Shanghai - TZ=Asia/Shanghai

View File

@ -30,7 +30,6 @@ services:
- PUSH_WINDOW_START=${PUSH_WINDOW_START:-} - PUSH_WINDOW_START=${PUSH_WINDOW_START:-}
- PUSH_WINDOW_END=${PUSH_WINDOW_END:-} - PUSH_WINDOW_END=${PUSH_WINDOW_END:-}
- PUSH_WINDOW_ONCE_PER_DAY=${PUSH_WINDOW_ONCE_PER_DAY:-} - PUSH_WINDOW_ONCE_PER_DAY=${PUSH_WINDOW_ONCE_PER_DAY:-}
- PUSH_WINDOW_RETENTION_DAYS=${PUSH_WINDOW_RETENTION_DAYS:-}
# 通知渠道 # 通知渠道
- FEISHU_WEBHOOK_URL=${FEISHU_WEBHOOK_URL:-} - FEISHU_WEBHOOK_URL=${FEISHU_WEBHOOK_URL:-}
- TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN:-} - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN:-}
@ -52,6 +51,21 @@ services:
- BARK_URL=${BARK_URL:-} - BARK_URL=${BARK_URL:-}
# Slack配置 # Slack配置
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL:-} - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL:-}
# 存储配置
- STORAGE_BACKEND=${STORAGE_BACKEND:-auto}
- LOCAL_RETENTION_DAYS=${LOCAL_RETENTION_DAYS:-0}
- REMOTE_RETENTION_DAYS=${REMOTE_RETENTION_DAYS:-0}
- STORAGE_TXT_ENABLED=${STORAGE_TXT_ENABLED:-true}
- STORAGE_HTML_ENABLED=${STORAGE_HTML_ENABLED:-true}
# 远程存储配置S3 兼容协议)
- S3_ENDPOINT_URL=${S3_ENDPOINT_URL:-}
- S3_BUCKET_NAME=${S3_BUCKET_NAME:-}
- S3_ACCESS_KEY_ID=${S3_ACCESS_KEY_ID:-}
- S3_SECRET_ACCESS_KEY=${S3_SECRET_ACCESS_KEY:-}
- S3_REGION=${S3_REGION:-}
# 数据拉取配置
- PULL_ENABLED=${PULL_ENABLED:-false}
- PULL_DAYS=${PULL_DAYS:-7}
# 运行模式 # 运行模式
- CRON_SCHEDULE=${CRON_SCHEDULE:-*/5 * * * *} - CRON_SCHEDULE=${CRON_SCHEDULE:-*/5 * * * *}
- RUN_MODE=${RUN_MODE:-cron} - RUN_MODE=${RUN_MODE:-cron}
@ -67,7 +81,7 @@ services:
volumes: volumes:
- ../config:/app/config:ro - ../config:/app/config:ro
- ../output:/app/output:ro - ../output:/app/output
environment: environment:
- TZ=Asia/Shanghai - TZ=Asia/Shanghai

View File

@ -13,11 +13,11 @@ env >> /etc/environment
case "${RUN_MODE:-cron}" in case "${RUN_MODE:-cron}" in
"once") "once")
echo "🔄 单次执行" echo "🔄 单次执行"
exec /usr/local/bin/python main.py exec /usr/local/bin/python -m trendradar
;; ;;
"cron") "cron")
# 生成 crontab # 生成 crontab
echo "${CRON_SCHEDULE:-*/30 * * * *} cd /app && /usr/local/bin/python main.py" > /tmp/crontab echo "${CRON_SCHEDULE:-*/30 * * * *} cd /app && /usr/local/bin/python -m trendradar" > /tmp/crontab
echo "📅 生成的crontab内容:" echo "📅 生成的crontab内容:"
cat /tmp/crontab cat /tmp/crontab
@ -30,7 +30,7 @@ case "${RUN_MODE:-cron}" in
# 立即执行一次(如果配置了) # 立即执行一次(如果配置了)
if [ "${IMMEDIATE_RUN:-false}" = "true" ]; then if [ "${IMMEDIATE_RUN:-false}" = "true" ]; then
echo "▶️ 立即执行一次" echo "▶️ 立即执行一次"
/usr/local/bin/python main.py /usr/local/bin/python -m trendradar
fi fi
# 启动 Web 服务器(如果配置了) # 启动 Web 服务器(如果配置了)

View File

@ -33,7 +33,7 @@ def manual_run():
print("🔄 手动执行爬虫...") print("🔄 手动执行爬虫...")
try: try:
result = subprocess.run( result = subprocess.run(
["python", "main.py"], cwd="/app", capture_output=False, text=True ["python", "-m", "trendradar"], cwd="/app", capture_output=False, text=True
) )
if result.returncode == 0: if result.returncode == 0:
print("✅ 执行完成") print("✅ 执行完成")
@ -285,12 +285,24 @@ def show_config():
"TELEGRAM_CHAT_ID", "TELEGRAM_CHAT_ID",
"CONFIG_PATH", "CONFIG_PATH",
"FREQUENCY_WORDS_PATH", "FREQUENCY_WORDS_PATH",
# 存储配置
"STORAGE_BACKEND",
"LOCAL_RETENTION_DAYS",
"REMOTE_RETENTION_DAYS",
"STORAGE_TXT_ENABLED",
"STORAGE_HTML_ENABLED",
"S3_BUCKET_NAME",
"S3_ACCESS_KEY_ID",
"S3_ENDPOINT_URL",
"S3_REGION",
"PULL_ENABLED",
"PULL_DAYS",
] ]
for var in env_vars: for var in env_vars:
value = os.environ.get(var, "未设置") value = os.environ.get(var, "未设置")
# 隐藏敏感信息 # 隐藏敏感信息
if any(sensitive in var for sensitive in ["WEBHOOK", "TOKEN", "KEY"]): if any(sensitive in var for sensitive in ["WEBHOOK", "TOKEN", "KEY", "SECRET"]):
if value and value != "未设置": if value and value != "未设置":
masked_value = value[:10] + "***" if len(value) > 10 else "***" masked_value = value[:10] + "***" if len(value) > 10 else "***"
print(f" {var}: {masked_value}") print(f" {var}: {masked_value}")
@ -331,6 +343,17 @@ def show_files():
# 显示最近2天的文件 # 显示最近2天的文件
for date_dir in date_dirs[:2]: for date_dir in date_dirs[:2]:
print(f" 📅 {date_dir.name}:") print(f" 📅 {date_dir.name}:")
# 检查 SQLite 数据库文件
db_files = list(date_dir.glob("*.db"))
if db_files:
print(f" 💾 SQLite: {len(db_files)} 个数据库")
for db_file in db_files[:3]:
mtime = time.ctime(db_file.stat().st_mtime)
size_kb = db_file.stat().st_size // 1024
print(f" 📀 {db_file.name} ({size_kb}KB, {mtime.split()[3][:5]})")
# 检查子目录html, txt
for subdir in ["html", "txt"]: for subdir in ["html", "txt"]:
sub_path = date_dir / subdir sub_path = date_dir / subdir
if sub_path.exists(): if sub_path.exists():

5431
main.py

File diff suppressed because it is too large

View File

@ -4,4 +4,4 @@ TrendRadar MCP Server
提供基于MCP协议的新闻聚合数据查询和系统管理接口 提供基于MCP协议的新闻聚合数据查询和系统管理接口
""" """
__version__ = "1.0.0" __version__ = "1.1.0"

View File

@ -15,6 +15,7 @@ from .tools.analytics import AnalyticsTools
from .tools.search_tools import SearchTools from .tools.search_tools import SearchTools
from .tools.config_mgmt import ConfigManagementTools from .tools.config_mgmt import ConfigManagementTools
from .tools.system import SystemManagementTools from .tools.system import SystemManagementTools
from .tools.storage_sync import StorageSyncTools
from .utils.date_parser import DateParser from .utils.date_parser import DateParser
from .utils.errors import MCPError from .utils.errors import MCPError
@ -34,6 +35,7 @@ def _get_tools(project_root: Optional[str] = None):
_tools_instances['search'] = SearchTools(project_root) _tools_instances['search'] = SearchTools(project_root)
_tools_instances['config'] = ConfigManagementTools(project_root) _tools_instances['config'] = ConfigManagementTools(project_root)
_tools_instances['system'] = SystemManagementTools(project_root) _tools_instances['system'] = SystemManagementTools(project_root)
_tools_instances['storage'] = StorageSyncTools(project_root)
return _tools_instances return _tools_instances
@ -657,6 +659,127 @@ async def trigger_crawl(
return json.dumps(result, ensure_ascii=False, indent=2) return json.dumps(result, ensure_ascii=False, indent=2)
# ==================== 存储同步工具 ====================
@mcp.tool
async def sync_from_remote(
days: int = 7
) -> str:
"""
从远程存储拉取数据到本地
用于 MCP Server 等场景爬虫存到远程云存储 Cloudflare R2
MCP Server 拉取到本地进行分析查询
Args:
days: 拉取最近 N 天的数据默认 7
- 0: 不拉取
- 7: 拉取最近一周的数据
- 30: 拉取最近一个月的数据
Returns:
JSON格式的同步结果包含
- success: 是否成功
- synced_files: 成功同步的文件数量
- synced_dates: 成功同步的日期列表
- skipped_dates: 跳过的日期本地已存在
- failed_dates: 失败的日期及错误信息
- message: 操作结果描述
Examples:
- sync_from_remote() # 拉取最近7天
- sync_from_remote(days=30) # 拉取最近30天
Note:
需要在 config/config.yaml 中配置远程存储storage.remote或设置环境变量
- S3_ENDPOINT_URL: 服务端点
- S3_BUCKET_NAME: 存储桶名称
- S3_ACCESS_KEY_ID: 访问密钥 ID
- S3_SECRET_ACCESS_KEY: 访问密钥
"""
tools = _get_tools()
result = tools['storage'].sync_from_remote(days=days)
return json.dumps(result, ensure_ascii=False, indent=2)
@mcp.tool
async def get_storage_status() -> str:
"""
获取存储配置和状态
查看当前存储后端配置本地和远程存储的状态信息
Returns:
JSON格式的存储状态信息包含
- backend: 当前使用的后端类型local/remote/auto
- local: 本地存储状态
- data_dir: 数据目录
- retention_days: 保留天数
- total_size: 总大小
- date_count: 日期数量
- earliest_date: 最早日期
- latest_date: 最新日期
- remote: 远程存储状态
- configured: 是否已配置
- endpoint_url: 服务端点
- bucket_name: 存储桶名称
- date_count: 远程日期数量
- pull: 拉取配置
- enabled: 是否启用自动拉取
- days: 自动拉取天数
Examples:
- get_storage_status() # 查看所有存储状态
"""
tools = _get_tools()
result = tools['storage'].get_storage_status()
return json.dumps(result, ensure_ascii=False, indent=2)
@mcp.tool
async def list_available_dates(
source: str = "both"
) -> str:
"""
列出本地/远程可用的日期范围
查看本地和远程存储中有哪些日期的数据可用
帮助了解数据覆盖范围和同步状态
Args:
source: 数据来源可选值
- "local": 仅列出本地可用日期
- "remote": 仅列出远程可用日期
- "both": 同时列出两者并进行对比默认
Returns:
JSON格式的日期列表包含
- local: 本地日期信息如果 source 包含 local
- dates: 日期列表按时间倒序
- count: 日期数量
- earliest: 最早日期
- latest: 最新日期
- remote: 远程日期信息如果 source 包含 remote
- configured: 是否已配置远程存储
- dates: 日期列表
- count: 日期数量
- earliest: 最早日期
- latest: 最新日期
- comparison: 对比结果仅当 source="both"
- only_local: 仅本地存在的日期
- only_remote: 仅远程存在的日期
- both: 两边都存在的日期
Examples:
- list_available_dates() # 查看本地和远程的对比
- list_available_dates(source="local") # 仅查看本地
- list_available_dates(source="remote") # 仅查看远程
"""
tools = _get_tools()
result = tools['storage'].list_available_dates(source=source)
return json.dumps(result, ensure_ascii=False, indent=2)
# ==================== 启动入口 ==================== # ==================== 启动入口 ====================
def run_server( def run_server(
@ -721,6 +844,11 @@ def run_server(
print(" 11. get_current_config - 获取当前系统配置") print(" 11. get_current_config - 获取当前系统配置")
print(" 12. get_system_status - 获取系统运行状态") print(" 12. get_system_status - 获取系统运行状态")
print(" 13. trigger_crawl - 手动触发爬取任务") print(" 13. trigger_crawl - 手动触发爬取任务")
print()
print(" === 存储同步工具 ===")
print(" 14. sync_from_remote - 从远程存储拉取数据到本地")
print(" 15. get_storage_status - 获取存储配置和状态")
print(" 16. list_available_dates - 列出本地/远程可用日期")
print("=" * 60) print("=" * 60)
print() print()

View File

@ -517,24 +517,55 @@ class DataService:
# 遍历日期文件夹 # 遍历日期文件夹
for date_folder in output_dir.iterdir(): for date_folder in output_dir.iterdir():
if date_folder.is_dir() and not date_folder.name.startswith('.'): if date_folder.is_dir() and not date_folder.name.startswith('.'):
# 解析日期(格式: YYYY年MM月DD日 folder_date = self._parse_date_folder_name(date_folder.name)
try: if folder_date:
date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_folder.name)
if date_match:
folder_date = datetime(
int(date_match.group(1)),
int(date_match.group(2)),
int(date_match.group(3))
)
available_dates.append(folder_date) available_dates.append(folder_date)
except Exception:
pass
if not available_dates: if not available_dates:
return (None, None) return (None, None)
return (min(available_dates), max(available_dates)) return (min(available_dates), max(available_dates))
def _parse_date_folder_name(self, folder_name: str) -> Optional[datetime]:
"""
解析日期文件夹名称兼容中文和ISO格式
支持两种格式
- 中文格式YYYY年MM月DD日
- ISO格式YYYY-MM-DD
Args:
folder_name: 文件夹名称
Returns:
datetime 对象解析失败返回 None
"""
# 尝试中文格式YYYY年MM月DD日
chinese_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', folder_name)
if chinese_match:
try:
return datetime(
int(chinese_match.group(1)),
int(chinese_match.group(2)),
int(chinese_match.group(3))
)
except ValueError:
pass
# 尝试 ISO 格式YYYY-MM-DD
iso_match = re.match(r'(\d{4})-(\d{2})-(\d{2})', folder_name)
if iso_match:
try:
return datetime(
int(iso_match.group(1)),
int(iso_match.group(2)),
int(iso_match.group(3))
)
except ValueError:
pass
return None
def get_system_status(self) -> Dict: def get_system_status(self) -> Dict:
""" """
获取系统运行状态 获取系统运行状态
@ -553,27 +584,15 @@ class DataService:
if output_dir.exists(): if output_dir.exists():
# 遍历日期文件夹 # 遍历日期文件夹
for date_folder in output_dir.iterdir(): for date_folder in output_dir.iterdir():
if date_folder.is_dir(): if date_folder.is_dir() and not date_folder.name.startswith('.'):
# 解析日期 # 解析日期兼容中文和ISO格式
try: folder_date = self._parse_date_folder_name(date_folder.name)
date_str = date_folder.name if folder_date:
# 格式: YYYY年MM月DD日
date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_str)
if date_match:
folder_date = datetime(
int(date_match.group(1)),
int(date_match.group(2)),
int(date_match.group(3))
)
if oldest_record is None or folder_date < oldest_record: if oldest_record is None or folder_date < oldest_record:
oldest_record = folder_date oldest_record = folder_date
if latest_record is None or folder_date > latest_record: if latest_record is None or folder_date > latest_record:
latest_record = folder_date latest_record = folder_date
except:
pass
# 计算存储大小 # 计算存储大小
for item in date_folder.rglob("*"): for item in date_folder.rglob("*"):
if item.is_file(): if item.is_file():

View File

@ -2,9 +2,12 @@
文件解析服务 文件解析服务
提供txt格式新闻数据和YAML配置文件的解析功能 提供txt格式新闻数据和YAML配置文件的解析功能
支持从 SQLite 数据库和 TXT 文件两种数据源读取
""" """
import json
import re import re
import sqlite3
from pathlib import Path from pathlib import Path
from typing import Dict, List, Tuple, Optional from typing import Dict, List, Tuple, Optional
from datetime import datetime from datetime import datetime
@ -145,17 +148,310 @@ class ParserService:
def get_date_folder_name(self, date: datetime = None) -> str: def get_date_folder_name(self, date: datetime = None) -> str:
""" """
获取日期文件夹名称 获取日期文件夹名称兼容中文和ISO格式
Args: Args:
date: 日期对象默认为今天 date: 日期对象默认为今天
Returns: Returns:
文件夹名称格式: YYYY年MM月DD日 实际存在的文件夹名称优先返回中文格式YYYY年MM月DD日
若不存在则返回 ISO 格式YYYY-MM-DD
""" """
if date is None: if date is None:
date = datetime.now() date = datetime.now()
return date.strftime("%Y年%m月%d") return self._find_date_folder(date)
def _get_date_folder_name(self, date: datetime = None) -> str:
"""
获取日期文件夹名称兼容中文和ISO格式
Args:
date: 日期对象默认为今天
Returns:
实际存在的文件夹名称优先返回中文格式YYYY年MM月DD日
若不存在则返回 ISO 格式YYYY-MM-DD
"""
if date is None:
date = datetime.now()
return self._find_date_folder(date)
def _find_date_folder(self, date: datetime) -> str:
"""
查找实际存在的日期文件夹
支持两种格式
- 中文格式YYYY年MM月DD日优先
- ISO格式YYYY-MM-DD
Args:
date: 日期对象
Returns:
实际存在的文件夹名称若都不存在则返回中文格式
"""
output_dir = self.project_root / "output"
# 中文格式YYYY年MM月DD日
chinese_format = date.strftime("%Y年%m月%d")
# ISO格式YYYY-MM-DD
iso_format = date.strftime("%Y-%m-%d")
# 优先检查中文格式
if (output_dir / chinese_format).exists():
return chinese_format
# 其次检查 ISO 格式
if (output_dir / iso_format).exists():
return iso_format
# 都不存在,返回中文格式(与项目现有风格一致)
return chinese_format
def _get_sqlite_db_path(self, date: datetime = None) -> Optional[Path]:
"""
获取 SQLite 数据库文件路径
Args:
date: 日期对象默认为今天
Returns:
数据库文件路径如果不存在则返回 None
"""
date_folder = self._get_date_folder_name(date)
db_path = self.project_root / "output" / date_folder / "news.db"
if db_path.exists():
return db_path
return None
def _get_txt_folder_path(self, date: datetime = None) -> Optional[Path]:
"""
获取 TXT 文件夹路径
Args:
date: 日期对象默认为今天
Returns:
TXT 文件夹路径如果不存在则返回 None
"""
date_folder = self._get_date_folder_name(date)
txt_path = self.project_root / "output" / date_folder / "txt"
if txt_path.exists() and txt_path.is_dir():
return txt_path
return None
def _read_from_txt(
self,
date: datetime = None,
platform_ids: Optional[List[str]] = None
) -> Optional[Tuple[Dict, Dict, Dict]]:
"""
TXT 文件夹读取新闻数据
Args:
date: 日期对象默认为今天
platform_ids: 平台ID列表None表示所有平台
Returns:
(all_titles, id_to_name, all_timestamps) 元组如果不存在返回 None
"""
txt_folder = self._get_txt_folder_path(date)
if txt_folder is None:
return None
# 获取所有 TXT 文件并按时间排序
txt_files = sorted(txt_folder.glob("*.txt"))
if not txt_files:
return None
all_titles = {}
id_to_name = {}
all_timestamps = {}
for txt_file in txt_files:
try:
titles_by_id, file_id_to_name = self.parse_txt_file(txt_file)
# 记录时间戳
all_timestamps[txt_file.name] = txt_file.stat().st_mtime
# 合并 id_to_name
id_to_name.update(file_id_to_name)
# 合并标题数据
for source_id, titles in titles_by_id.items():
# 如果指定了 platform_ids过滤
if platform_ids and source_id not in platform_ids:
continue
if source_id not in all_titles:
all_titles[source_id] = {}
for title, data in titles.items():
if title not in all_titles[source_id]:
# 新标题
all_titles[source_id][title] = {
"ranks": data.get("ranks", []),
"url": data.get("url", ""),
"mobileUrl": data.get("mobileUrl", ""),
"first_time": txt_file.stem, # 使用文件名作为时间
"last_time": txt_file.stem,
"count": 1,
}
else:
# 合并已存在的标题
existing = all_titles[source_id][title]
# 合并排名
for rank in data.get("ranks", []):
if rank not in existing["ranks"]:
existing["ranks"].append(rank)
# 更新 last_time
existing["last_time"] = txt_file.stem
existing["count"] += 1
# 保留 URL
if not existing["url"] and data.get("url"):
existing["url"] = data["url"]
if not existing["mobileUrl"] and data.get("mobileUrl"):
existing["mobileUrl"] = data["mobileUrl"]
except Exception as e:
print(f"Warning: 解析 TXT 文件失败 {txt_file}: {e}")
continue
if not all_titles:
return None
return (all_titles, id_to_name, all_timestamps)
def _read_from_sqlite(
self,
date: datetime = None,
platform_ids: Optional[List[str]] = None
) -> Optional[Tuple[Dict, Dict, Dict]]:
"""
SQLite 数据库读取新闻数据
新表结构数据已按 URL 去重包含
- first_crawl_time: 首次抓取时间
- last_crawl_time: 最后抓取时间
- crawl_count: 抓取次数
Args:
date: 日期对象默认为今天
platform_ids: 平台ID列表None表示所有平台
Returns:
(all_titles, id_to_name, all_timestamps) 元组如果数据库不存在返回 None
"""
db_path = self._get_sqlite_db_path(date)
if db_path is None:
return None
all_titles = {}
id_to_name = {}
all_timestamps = {}
try:
conn = sqlite3.connect(str(db_path))
conn.row_factory = sqlite3.Row
cursor = conn.cursor()
# 检查表是否存在
cursor.execute("""
SELECT name FROM sqlite_master
WHERE type='table' AND name='news_items'
""")
if not cursor.fetchone():
conn.close()
return None
# 构建查询
if platform_ids:
placeholders = ','.join(['?' for _ in platform_ids])
query = f"""
SELECT n.id, n.platform_id, p.name as platform_name, n.title,
n.rank, n.url, n.mobile_url,
n.first_crawl_time, n.last_crawl_time, n.crawl_count
FROM news_items n
LEFT JOIN platforms p ON n.platform_id = p.id
WHERE n.platform_id IN ({placeholders})
"""
cursor.execute(query, platform_ids)
else:
cursor.execute("""
SELECT n.id, n.platform_id, p.name as platform_name, n.title,
n.rank, n.url, n.mobile_url,
n.first_crawl_time, n.last_crawl_time, n.crawl_count
FROM news_items n
LEFT JOIN platforms p ON n.platform_id = p.id
""")
rows = cursor.fetchall()
# 收集所有 news_item_id 用于查询历史排名
news_ids = [row['id'] for row in rows]
rank_history_map = {}
if news_ids:
placeholders = ",".join("?" * len(news_ids))
cursor.execute(f"""
SELECT news_item_id, rank FROM rank_history
WHERE news_item_id IN ({placeholders})
ORDER BY news_item_id, crawl_time
""", news_ids)
for rh_row in cursor.fetchall():
news_id = rh_row['news_item_id']
rank = rh_row['rank']
if news_id not in rank_history_map:
rank_history_map[news_id] = []
rank_history_map[news_id].append(rank)
for row in rows:
news_id = row['id']
platform_id = row['platform_id']
platform_name = row['platform_name'] or platform_id
title = row['title']
# 更新 id_to_name
if platform_id not in id_to_name:
id_to_name[platform_id] = platform_name
# 初始化平台字典
if platform_id not in all_titles:
all_titles[platform_id] = {}
# 获取排名历史,如果为空则使用当前排名
ranks = rank_history_map.get(news_id, [row['rank']])
# 直接使用数据(已去重)
all_titles[platform_id][title] = {
"ranks": ranks,
"url": row['url'] or "",
"mobileUrl": row['mobile_url'] or "",
"first_time": row['first_crawl_time'] or "",
"last_time": row['last_crawl_time'] or "",
"count": row['crawl_count'] or 1,
}
# 获取抓取时间作为 timestamps
cursor.execute("""
SELECT crawl_time FROM crawl_records
ORDER BY crawl_time
""")
for row in cursor.fetchall():
crawl_time = row['crawl_time']
all_timestamps[f"{crawl_time}.db"] = 0 # 用虚拟时间戳
conn.close()
if not all_titles:
return None
return (all_titles, id_to_name, all_timestamps)
except Exception as e:
print(f"Warning: 从 SQLite 读取数据失败: {e}")
return None
def read_all_titles_for_date( def read_all_titles_for_date(
self, self,
@ -163,7 +459,7 @@ class ParserService:
platform_ids: Optional[List[str]] = None platform_ids: Optional[List[str]] = None
) -> Tuple[Dict, Dict, Dict]: ) -> Tuple[Dict, Dict, Dict]:
""" """
读取指定日期的所有标题文件带缓存 读取指定日期的所有标题带缓存
Args: Args:
date: 日期对象默认为今天 date: 日期对象默认为今天
@ -193,72 +489,24 @@ class ParserService:
if cached: if cached:
return cached return cached
# 缓存未命中,读取文件 # 优先从 SQLite 读取
date_folder = self.get_date_folder_name(date) sqlite_result = self._read_from_sqlite(date, platform_ids)
txt_dir = self.project_root / "output" / date_folder / "txt" if sqlite_result:
self.cache.set(cache_key, sqlite_result)
return sqlite_result
if not txt_dir.exists(): # SQLite 不存在,尝试从 TXT 读取
txt_result = self._read_from_txt(date, platform_ids)
if txt_result:
self.cache.set(cache_key, txt_result)
return txt_result
# 两种数据源都不存在
raise DataNotFoundError( raise DataNotFoundError(
f"未找到 {date_folder} 的数据目录", f"未找到 {date_str} 的数据",
suggestion="请先运行爬虫或检查日期是否正确" suggestion="请先运行爬虫或检查日期是否正确"
) )
all_titles = {}
id_to_name = {}
all_timestamps = {}
# 读取所有txt文件
txt_files = sorted(txt_dir.glob("*.txt"))
if not txt_files:
raise DataNotFoundError(
f"{date_folder} 没有数据文件",
suggestion="请等待爬虫任务完成"
)
for txt_file in txt_files:
try:
titles_by_id, file_id_to_name = self.parse_txt_file(txt_file)
# 更新id_to_name
id_to_name.update(file_id_to_name)
# 合并标题数据
for platform_id, titles in titles_by_id.items():
# 如果指定了平台过滤
if platform_ids and platform_id not in platform_ids:
continue
if platform_id not in all_titles:
all_titles[platform_id] = {}
for title, info in titles.items():
if title in all_titles[platform_id]:
# 合并排名
all_titles[platform_id][title]["ranks"].extend(info["ranks"])
else:
all_titles[platform_id][title] = info.copy()
# 记录文件时间戳
all_timestamps[txt_file.name] = txt_file.stat().st_mtime
except Exception as e:
# 忽略单个文件的解析错误,继续处理其他文件
print(f"Warning: 解析文件 {txt_file} 失败: {e}")
continue
if not all_titles:
raise DataNotFoundError(
f"{date_folder} 没有有效的数据",
suggestion="请检查数据文件格式或重新运行爬虫"
)
# 缓存结果
result = (all_titles, id_to_name, all_timestamps)
self.cache.set(cache_key, result)
return result
def parse_yaml_config(self, config_path: str = None) -> dict: def parse_yaml_config(self, config_path: str = None) -> dict:
""" """
解析YAML配置文件 解析YAML配置文件

View File

@ -25,7 +25,6 @@ def calculate_news_weight(news_data: Dict, rank_threshold: int = 5) -> float:
""" """
计算新闻权重用于排序 计算新闻权重用于排序
基于 main.py 的权重算法实现综合考虑
- 排名权重 (60%)新闻在榜单中的排名 - 排名权重 (60%)新闻在榜单中的排名
- 频次权重 (30%)新闻出现的次数 - 频次权重 (30%)新闻出现的次数
- 热度权重 (10%)高排名出现的比例 - 热度权重 (10%)高排名出现的比例

View File

@ -0,0 +1,468 @@
# coding=utf-8
"""
存储同步工具
实现从远程存储拉取数据到本地获取存储状态列出可用日期等功能
"""
import os
import re
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Optional
import yaml
from ..utils.errors import MCPError
class StorageSyncTools:
"""存储同步工具类"""
def __init__(self, project_root: str = None):
"""
初始化存储同步工具
Args:
project_root: 项目根目录
"""
if project_root:
self.project_root = Path(project_root)
else:
current_file = Path(__file__)
self.project_root = current_file.parent.parent.parent
self._config = None
self._remote_backend = None
def _load_config(self) -> dict:
"""加载配置文件"""
if self._config is None:
config_path = self.project_root / "config" / "config.yaml"
if config_path.exists():
with open(config_path, "r", encoding="utf-8") as f:
self._config = yaml.safe_load(f)
else:
self._config = {}
return self._config
def _get_storage_config(self) -> dict:
"""获取存储配置"""
config = self._load_config()
return config.get("storage", {})
def _get_remote_config(self) -> dict:
"""
获取远程存储配置合并配置文件和环境变量
"""
storage_config = self._get_storage_config()
remote_config = storage_config.get("remote", {})
return {
"endpoint_url": remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL", ""),
"bucket_name": remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME", ""),
"access_key_id": remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID", ""),
"secret_access_key": remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY", ""),
"region": remote_config.get("region") or os.environ.get("S3_REGION", ""),
}
def _has_remote_config(self) -> bool:
"""检查是否有有效的远程存储配置"""
config = self._get_remote_config()
return bool(
config.get("bucket_name") and
config.get("access_key_id") and
config.get("secret_access_key") and
config.get("endpoint_url")
)
def _get_remote_backend(self):
"""获取远程存储后端实例"""
if self._remote_backend is not None:
return self._remote_backend
if not self._has_remote_config():
return None
try:
from trendradar.storage.remote import RemoteStorageBackend
remote_config = self._get_remote_config()
config = self._load_config()
timezone = config.get("app", {}).get("timezone", "Asia/Shanghai")
self._remote_backend = RemoteStorageBackend(
bucket_name=remote_config["bucket_name"],
access_key_id=remote_config["access_key_id"],
secret_access_key=remote_config["secret_access_key"],
endpoint_url=remote_config["endpoint_url"],
region=remote_config.get("region", ""),
timezone=timezone,
)
return self._remote_backend
except ImportError:
print("[存储同步] 远程存储后端需要安装 boto3: pip install boto3")
return None
except Exception as e:
print(f"[存储同步] 创建远程后端失败: {e}")
return None
def _get_local_data_dir(self) -> Path:
"""获取本地数据目录"""
storage_config = self._get_storage_config()
local_config = storage_config.get("local", {})
data_dir = local_config.get("data_dir", "output")
return self.project_root / data_dir
def _parse_date_folder_name(self, folder_name: str) -> Optional[datetime]:
"""
解析日期文件夹名称兼容中文和 ISO 格式
支持两种格式
- 中文格式YYYY年MM月DD日
- ISO 格式YYYY-MM-DD
"""
# 尝试 ISO 格式
iso_match = re.match(r'(\d{4})-(\d{2})-(\d{2})', folder_name)
if iso_match:
try:
return datetime(
int(iso_match.group(1)),
int(iso_match.group(2)),
int(iso_match.group(3))
)
except ValueError:
pass
# 尝试中文格式
chinese_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', folder_name)
if chinese_match:
try:
return datetime(
int(chinese_match.group(1)),
int(chinese_match.group(2)),
int(chinese_match.group(3))
)
except ValueError:
pass
return None
def _get_local_dates(self) -> List[str]:
"""获取本地可用的日期列表"""
local_dir = self._get_local_data_dir()
dates = []
if not local_dir.exists():
return dates
for item in local_dir.iterdir():
if item.is_dir() and not item.name.startswith('.'):
folder_date = self._parse_date_folder_name(item.name)
if folder_date:
dates.append(folder_date.strftime("%Y-%m-%d"))
return sorted(dates, reverse=True)
def _calculate_dir_size(self, path: Path) -> int:
"""计算目录大小(字节)"""
total_size = 0
if path.exists():
for item in path.rglob("*"):
if item.is_file():
total_size += item.stat().st_size
return total_size
def sync_from_remote(self, days: int = 7) -> Dict:
"""
从远程存储拉取数据到本地
Args:
days: 拉取最近 N 天的数据默认 7
Returns:
同步结果字典
"""
try:
# 检查远程配置
if not self._has_remote_config():
return {
"success": False,
"error": {
"code": "REMOTE_NOT_CONFIGURED",
"message": "未配置远程存储",
"suggestion": "请在 config/config.yaml 中配置 storage.remote 或设置环境变量"
}
}
# 获取远程后端
remote_backend = self._get_remote_backend()
if remote_backend is None:
return {
"success": False,
"error": {
"code": "REMOTE_BACKEND_FAILED",
"message": "无法创建远程存储后端",
"suggestion": "请检查远程存储配置和 boto3 是否已安装"
}
}
# 获取本地数据目录
local_dir = self._get_local_data_dir()
local_dir.mkdir(parents=True, exist_ok=True)
# 获取远程可用日期
remote_dates = remote_backend.list_remote_dates()
# 获取本地已有日期
local_dates = set(self._get_local_dates())
# 计算需要拉取的日期(最近 N 天)
from trendradar.utils.time import get_configured_time
config = self._load_config()
timezone = config.get("app", {}).get("timezone", "Asia/Shanghai")
now = get_configured_time(timezone)
target_dates = []
for i in range(days):
date = now - timedelta(days=i)
date_str = date.strftime("%Y-%m-%d")
if date_str in remote_dates:
target_dates.append(date_str)
# 执行拉取
synced_dates = []
skipped_dates = []
failed_dates = []
for date_str in target_dates:
# 检查本地是否已存在
if date_str in local_dates:
skipped_dates.append(date_str)
continue
# 拉取单个日期
try:
local_date_dir = local_dir / date_str
local_db_path = local_date_dir / "news.db"
remote_key = f"news/{date_str}.db"
local_date_dir.mkdir(parents=True, exist_ok=True)
remote_backend.s3_client.download_file(
remote_backend.bucket_name,
remote_key,
str(local_db_path)
)
synced_dates.append(date_str)
print(f"[存储同步] 已拉取: {date_str}")
except Exception as e:
failed_dates.append({"date": date_str, "error": str(e)})
print(f"[存储同步] 拉取失败 ({date_str}): {e}")
return {
"success": True,
"synced_files": len(synced_dates),
"synced_dates": synced_dates,
"skipped_dates": skipped_dates,
"failed_dates": failed_dates,
"message": f"成功同步 {len(synced_dates)} 天数据" + (
f",跳过 {len(skipped_dates)} 天(本地已存在)" if skipped_dates else ""
) + (
f",失败 {len(failed_dates)}" if failed_dates else ""
)
}
except MCPError as e:
return {
"success": False,
"error": e.to_dict()
}
except Exception as e:
return {
"success": False,
"error": {
"code": "INTERNAL_ERROR",
"message": str(e)
}
}
def get_storage_status(self) -> Dict:
"""
获取存储配置和状态
Returns:
存储状态字典
"""
try:
storage_config = self._get_storage_config()
config = self._load_config()
# 本地存储状态
local_config = storage_config.get("local", {})
local_dir = self._get_local_data_dir()
local_size = self._calculate_dir_size(local_dir)
local_dates = self._get_local_dates()
local_status = {
"data_dir": local_config.get("data_dir", "output"),
"retention_days": local_config.get("retention_days", 0),
"total_size": f"{local_size / 1024 / 1024:.2f} MB",
"total_size_bytes": local_size,
"date_count": len(local_dates),
"earliest_date": local_dates[-1] if local_dates else None,
"latest_date": local_dates[0] if local_dates else None,
}
# 远程存储状态
remote_config = storage_config.get("remote", {})
has_remote = self._has_remote_config()
remote_status = {
"configured": has_remote,
"retention_days": remote_config.get("retention_days", 0),
}
if has_remote:
merged_config = self._get_remote_config()
# 脱敏显示
endpoint = merged_config.get("endpoint_url", "")
bucket = merged_config.get("bucket_name", "")
remote_status["endpoint_url"] = endpoint
remote_status["bucket_name"] = bucket
# 尝试获取远程日期列表
remote_backend = self._get_remote_backend()
if remote_backend:
try:
remote_dates = remote_backend.list_remote_dates()
remote_status["date_count"] = len(remote_dates)
remote_status["earliest_date"] = remote_dates[-1] if remote_dates else None
remote_status["latest_date"] = remote_dates[0] if remote_dates else None
except Exception as e:
remote_status["error"] = str(e)
# 拉取配置状态
pull_config = storage_config.get("pull", {})
pull_status = {
"enabled": pull_config.get("enabled", False),
"days": pull_config.get("days", 7),
}
return {
"success": True,
"backend": storage_config.get("backend", "auto"),
"local": local_status,
"remote": remote_status,
"pull": pull_status,
}
except MCPError as e:
return {
"success": False,
"error": e.to_dict()
}
except Exception as e:
return {
"success": False,
"error": {
"code": "INTERNAL_ERROR",
"message": str(e)
}
}
def list_available_dates(self, source: str = "both") -> Dict:
"""
列出可用的日期范围
Args:
source: 数据来源
- "local": 仅本地
- "remote": 仅远程
- "both": 两者都列出默认
Returns:
日期列表字典
"""
try:
result = {
"success": True,
}
# 本地日期
if source in ("local", "both"):
local_dates = self._get_local_dates()
result["local"] = {
"dates": local_dates,
"count": len(local_dates),
"earliest": local_dates[-1] if local_dates else None,
"latest": local_dates[0] if local_dates else None,
}
# 远程日期
if source in ("remote", "both"):
if not self._has_remote_config():
result["remote"] = {
"configured": False,
"dates": [],
"count": 0,
"earliest": None,
"latest": None,
"error": "未配置远程存储"
}
else:
remote_backend = self._get_remote_backend()
if remote_backend:
try:
remote_dates = remote_backend.list_remote_dates()
result["remote"] = {
"configured": True,
"dates": remote_dates,
"count": len(remote_dates),
"earliest": remote_dates[-1] if remote_dates else None,
"latest": remote_dates[0] if remote_dates else None,
}
except Exception as e:
result["remote"] = {
"configured": True,
"dates": [],
"count": 0,
"earliest": None,
"latest": None,
"error": str(e)
}
else:
result["remote"] = {
"configured": True,
"dates": [],
"count": 0,
"earliest": None,
"latest": None,
"error": "无法创建远程存储后端"
}
# 如果同时查询两者,计算差异
if source == "both" and "local" in result and "remote" in result:
local_set = set(result["local"]["dates"])
remote_set = set(result["remote"].get("dates", []))
result["comparison"] = {
"only_local": sorted(list(local_set - remote_set), reverse=True),
"only_remote": sorted(list(remote_set - local_set), reverse=True),
"both": sorted(list(local_set & remote_set), reverse=True),
}
return result
except MCPError as e:
return {
"success": False,
"error": e.to_dict()
}
except Exception as e:
return {
"success": False,
"error": {
"code": "INTERNAL_ERROR",
"message": str(e)
}
}

View File

@ -87,13 +87,13 @@ class SystemManagementTools:
>>> print(result['saved_files']) >>> print(result['saved_files'])
""" """
try: try:
import json
import time import time
import random
import requests
from datetime import datetime
import pytz
import yaml import yaml
from trendradar.crawler.fetcher import DataFetcher
from trendradar.storage.local import LocalStorageBackend
from trendradar.storage.base import convert_crawl_results_to_news_data
from trendradar.utils.time import get_configured_time, format_date_folder, format_time_filename
from ..services.cache_service import get_cache
# 参数验证 # 参数验证
platforms = validate_platforms(platforms) platforms = validate_platforms(platforms)
@ -129,9 +129,6 @@ class SystemManagementTools:
else: else:
target_platforms = all_platforms target_platforms = all_platforms
# 获取请求间隔
request_interval = config_data.get("crawler", {}).get("request_interval", 100)
# 构建平台ID列表 # 构建平台ID列表
ids = [] ids = []
for platform in target_platforms: for platform in target_platforms:
@ -142,87 +139,82 @@ class SystemManagementTools:
print(f"开始临时爬取,平台: {[p.get('name', p['id']) for p in target_platforms]}") print(f"开始临时爬取,平台: {[p.get('name', p['id']) for p in target_platforms]}")
# 爬取数据 # 初始化数据获取器
results = {} crawler_config = config_data.get("crawler", {})
id_to_name = {} proxy_url = None
failed_ids = [] if crawler_config.get("use_proxy"):
proxy_url = crawler_config.get("proxy_url")
for i, id_info in enumerate(ids): fetcher = DataFetcher(proxy_url=proxy_url)
if isinstance(id_info, tuple): request_interval = crawler_config.get("request_interval", 100)
id_value, name = id_info
else:
id_value = id_info
name = id_value
id_to_name[id_value] = name # 执行爬取
results, id_to_name, failed_ids = fetcher.crawl_websites(
ids_list=ids,
request_interval=request_interval
)
# 构建请求URL # 获取当前时间(统一使用 trendradar 的时间工具)
url = f"https://newsnow.busiyi.world/api/s?id={id_value}&latest" # 从配置中读取时区,默认为 Asia/Shanghai
timezone = config_data.get("app", {}).get("timezone", "Asia/Shanghai")
current_time = get_configured_time(timezone)
crawl_date = format_date_folder(None, timezone)
crawl_time_str = format_time_filename(timezone)
headers = { # 转换为标准数据模型
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", news_data = convert_crawl_results_to_news_data(
"Accept": "application/json, text/plain, */*", results=results,
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", id_to_name=id_to_name,
"Connection": "keep-alive", failed_ids=failed_ids,
"Cache-Control": "no-cache", crawl_time=crawl_time_str,
} crawl_date=crawl_date
)
# 重试机制 # 初始化存储后端
max_retries = 2 storage = LocalStorageBackend(
retries = 0 data_dir=str(self.project_root / "output"),
success = False enable_txt=True,
enable_html=True,
timezone=timezone
)
# 尝试持久化数据
save_success = False
save_error_msg = ""
saved_files = {}
while retries <= max_retries and not success:
try: try:
response = requests.get(url, headers=headers, timeout=10) # 1. 保存到 SQLite (核心持久化)
response.raise_for_status() if storage.save_news_data(news_data):
save_success = True
data_text = response.text # 2. 如果请求保存到本地,生成 TXT/HTML 快照
data_json = json.loads(data_text) if save_to_local:
# 保存 TXT
txt_path = storage.save_txt_snapshot(news_data)
if txt_path:
saved_files["txt"] = txt_path
status = data_json.get("status", "未知") # 保存 HTML (使用简化版生成器)
if status not in ["success", "cache"]: html_content = self._generate_simple_html(results, id_to_name, failed_ids, current_time)
raise ValueError(f"响应状态异常: {status}") html_filename = f"{crawl_time_str}.html"
html_path = storage.save_html_report(html_content, html_filename)
status_info = "最新数据" if status == "success" else "缓存数据" if html_path:
print(f"获取 {id_value} 成功({status_info}") saved_files["html"] = html_path
# 解析数据
results[id_value] = {}
for index, item in enumerate(data_json.get("items", []), 1):
title = item["title"]
url_link = item.get("url", "")
mobile_url = item.get("mobileUrl", "")
if title in results[id_value]:
results[id_value][title]["ranks"].append(index)
else:
results[id_value][title] = {
"ranks": [index],
"url": url_link,
"mobileUrl": mobile_url,
}
success = True
except Exception as e: except Exception as e:
retries += 1 # 捕获所有保存错误(特别是 Docker 只读卷导致的 PermissionError
if retries <= max_retries: print(f"[System] 数据保存失败: {e}")
wait_time = random.uniform(3, 5) save_success = False
print(f"请求 {id_value} 失败: {e}. {wait_time:.2f}秒后重试...") save_error_msg = str(e)
time.sleep(wait_time)
else:
print(f"请求 {id_value} 失败: {e}")
failed_ids.append(id_value)
# 请求间隔 # 3. 清除缓存,确保下次查询获取最新数据
if i < len(ids) - 1: # 即使保存失败,内存中的数据可能已经通过其他方式更新,或者是临时的
actual_interval = request_interval + random.randint(-10, 20) get_cache().clear()
actual_interval = max(50, actual_interval) print("[System] 缓存已清除")
time.sleep(actual_interval / 1000)
# 格式化返回数据 # 构建返回结果
news_data = [] news_response_data = []
for platform_id, titles_data in results.items(): for platform_id, titles_data in results.items():
platform_name = id_to_name.get(platform_id, platform_id) platform_name = id_to_name.get(platform_id, platform_id)
for title, info in titles_data.items(): for title, info in titles_data.items():
@ -230,131 +222,42 @@ class SystemManagementTools:
"platform_id": platform_id, "platform_id": platform_id,
"platform_name": platform_name, "platform_name": platform_name,
"title": title, "title": title,
"ranks": info["ranks"] "ranks": info.get("ranks", [])
} }
# 条件性添加 URL 字段
if include_url: if include_url:
news_item["url"] = info.get("url", "") news_item["url"] = info.get("url", "")
news_item["mobile_url"] = info.get("mobileUrl", "") news_item["mobile_url"] = info.get("mobileUrl", "")
news_response_data.append(news_item)
news_data.append(news_item)
# 获取北京时间
beijing_tz = pytz.timezone("Asia/Shanghai")
now = datetime.now(beijing_tz)
# 构建返回结果
result = { result = {
"success": True, "success": True,
"task_id": f"crawl_{int(time.time())}", "task_id": f"crawl_{int(time.time())}",
"status": "completed", "status": "completed",
"crawl_time": now.strftime("%Y-%m-%d %H:%M:%S"), "crawl_time": current_time.strftime("%Y-%m-%d %H:%M:%S"),
"platforms": list(results.keys()), "platforms": list(results.keys()),
"total_news": len(news_data), "total_news": len(news_response_data),
"failed_platforms": failed_ids, "failed_platforms": failed_ids,
"data": news_data, "data": news_response_data,
"saved_to_local": save_to_local "saved_to_local": save_success and save_to_local
} }
# 如果需要持久化,调用保存逻辑 if save_success:
if save_to_local: if save_to_local:
try: result["saved_files"] = saved_files
import re result["note"] = "数据已保存到 SQLite 数据库及 output 文件夹"
# 辅助函数:清理标题
def clean_title(title: str) -> str:
"""清理标题中的特殊字符"""
if not isinstance(title, str):
title = str(title)
cleaned_title = title.replace("\n", " ").replace("\r", " ")
cleaned_title = re.sub(r"\s+", " ", cleaned_title)
cleaned_title = cleaned_title.strip()
return cleaned_title
# 辅助函数:创建目录
def ensure_directory_exists(directory: str):
"""确保目录存在"""
Path(directory).mkdir(parents=True, exist_ok=True)
# 格式化日期和时间
date_folder = now.strftime("%Y年%m月%d")
time_filename = now.strftime("%H时%M分")
# 创建 txt 文件路径
txt_dir = self.project_root / "output" / date_folder / "txt"
ensure_directory_exists(str(txt_dir))
txt_file_path = txt_dir / f"{time_filename}.txt"
# 创建 html 文件路径
html_dir = self.project_root / "output" / date_folder / "html"
ensure_directory_exists(str(html_dir))
html_file_path = html_dir / f"{time_filename}.html"
# 保存 txt 文件(按照 main.py 的格式)
with open(txt_file_path, "w", encoding="utf-8") as f:
for id_value, title_data in results.items():
# id | name 或 id
name = id_to_name.get(id_value)
if name and name != id_value:
f.write(f"{id_value} | {name}\n")
else: else:
f.write(f"{id_value}\n") result["note"] = "数据已保存到 SQLite 数据库 (仅内存中返回结果未生成TXT快照)"
# 按排名排序标题
sorted_titles = []
for title, info in title_data.items():
cleaned = clean_title(title)
if isinstance(info, dict):
ranks = info.get("ranks", [])
url = info.get("url", "")
mobile_url = info.get("mobileUrl", "")
else: else:
ranks = info if isinstance(info, list) else [] # 明确告知用户保存失败
url = "" result["saved_to_local"] = False
mobile_url = "" result["save_error"] = save_error_msg
if "Read-only file system" in save_error_msg or "Permission denied" in save_error_msg:
rank = ranks[0] if ranks else 1 result["note"] = "爬取成功但无法写入数据库Docker只读模式。数据仅在本次返回中有效。"
sorted_titles.append((rank, cleaned, url, mobile_url))
sorted_titles.sort(key=lambda x: x[0])
for rank, cleaned, url, mobile_url in sorted_titles:
line = f"{rank}. {cleaned}"
if url:
line += f" [URL:{url}]"
if mobile_url:
line += f" [MOBILE:{mobile_url}]"
f.write(line + "\n")
f.write("\n")
if failed_ids:
f.write("==== 以下ID请求失败 ====\n")
for id_value in failed_ids:
f.write(f"{id_value}\n")
# 保存 html 文件(简化版)
html_content = self._generate_simple_html(results, id_to_name, failed_ids, now)
with open(html_file_path, "w", encoding="utf-8") as f:
f.write(html_content)
print(f"数据已保存到:")
print(f" TXT: {txt_file_path}")
print(f" HTML: {html_file_path}")
result["saved_files"] = {
"txt": str(txt_file_path),
"html": str(html_file_path)
}
result["note"] = "数据已持久化到 output 文件夹"
except Exception as e:
print(f"保存文件失败: {e}")
result["save_error"] = str(e)
result["note"] = "爬取成功但保存失败,数据仅在内存中"
else: else:
result["note"] = "临时爬取结果未持久化到output文件夹" result["note"] = f"爬取成功但保存失败: {save_error_msg}"
# 清理资源
storage.cleanup()
return result return result

View File

@ -283,13 +283,13 @@ class DateParser:
date: datetime对象 date: datetime对象
Returns: Returns:
文件夹名称格式: YYYY年MM月DD日 文件夹名称格式: YYYY-MM-DD
Examples: Examples:
>>> DateParser.format_date_folder(datetime(2025, 10, 11)) >>> DateParser.format_date_folder(datetime(2025, 10, 11))
'2025年10月11日' '2025-10-11'
""" """
return date.strftime("%Y%m月%d") return date.strftime("%Y-%m-%d")
@staticmethod @staticmethod
def validate_date_not_future(date: datetime) -> None: def validate_date_not_future(date: datetime) -> None:

View File

@ -1,6 +1,6 @@
[project] [project]
name = "trendradar-mcp" name = "trendradar-mcp"
version = "1.0.3" version = "1.1.0"
description = "TrendRadar MCP Server - 新闻热点聚合工具" description = "TrendRadar MCP Server - 新闻热点聚合工具"
requires-python = ">=3.10" requires-python = ">=3.10"
dependencies = [ dependencies = [

View File

@ -3,3 +3,4 @@ pytz>=2025.2,<2026.0
PyYAML>=6.0.3,<7.0.0 PyYAML>=6.0.3,<7.0.0
fastmcp>=2.12.0,<2.14.0 fastmcp>=2.12.0,<2.14.0
websockets>=13.0,<14.0 websockets>=13.0,<14.0
boto3>=1.35.0,<2.0.0

13
trendradar/__init__.py Normal file
View File

@ -0,0 +1,13 @@
# coding=utf-8
"""
TrendRadar - 热点新闻聚合与分析工具
使用方式:
python -m trendradar # 模块执行
trendradar # 安装后执行
"""
from trendradar.context import AppContext
__version__ = "4.0.0"
__all__ = ["AppContext", "__version__"]

719
trendradar/__main__.py Normal file
View File

@ -0,0 +1,719 @@
# coding=utf-8
"""
TrendRadar 主程序
热点新闻聚合与分析工具
支持: python -m trendradar
"""
import os
import webbrowser
from pathlib import Path
from typing import Dict, List, Tuple, Optional
import requests
from trendradar.context import AppContext
# 版本号直接定义,避免循环导入
VERSION = "4.0.0"
from trendradar.core import load_config
from trendradar.crawler import DataFetcher
from trendradar.storage import convert_crawl_results_to_news_data
def check_version_update(
current_version: str, version_url: str, proxy_url: Optional[str] = None
) -> Tuple[bool, Optional[str]]:
"""检查版本更新"""
try:
proxies = None
if proxy_url:
proxies = {"http": proxy_url, "https": proxy_url}
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
"Accept": "text/plain, */*",
"Cache-Control": "no-cache",
}
response = requests.get(
version_url, proxies=proxies, headers=headers, timeout=10
)
response.raise_for_status()
remote_version = response.text.strip()
print(f"当前版本: {current_version}, 远程版本: {remote_version}")
# 比较版本
def parse_version(version_str):
try:
parts = version_str.strip().split(".")
if len(parts) != 3:
raise ValueError("版本号格式不正确")
return int(parts[0]), int(parts[1]), int(parts[2])
except:
return 0, 0, 0
current_tuple = parse_version(current_version)
remote_tuple = parse_version(remote_version)
need_update = current_tuple < remote_tuple
return need_update, remote_version if need_update else None
except Exception as e:
print(f"版本检查失败: {e}")
return False, None
# === 主分析器 ===
class NewsAnalyzer:
"""新闻分析器"""
# 模式策略定义
MODE_STRATEGIES = {
"incremental": {
"mode_name": "增量模式",
"description": "增量模式(只关注新增新闻,无新增时不推送)",
"realtime_report_type": "实时增量",
"summary_report_type": "当日汇总",
"should_send_realtime": True,
"should_generate_summary": True,
"summary_mode": "daily",
},
"current": {
"mode_name": "当前榜单模式",
"description": "当前榜单模式(当前榜单匹配新闻 + 新增新闻区域 + 按时推送)",
"realtime_report_type": "实时当前榜单",
"summary_report_type": "当前榜单汇总",
"should_send_realtime": True,
"should_generate_summary": True,
"summary_mode": "current",
},
"daily": {
"mode_name": "当日汇总模式",
"description": "当日汇总模式(所有匹配新闻 + 新增新闻区域 + 按时推送)",
"realtime_report_type": "",
"summary_report_type": "当日汇总",
"should_send_realtime": False,
"should_generate_summary": True,
"summary_mode": "daily",
},
}
def __init__(self):
# 加载配置
print("正在加载配置...")
config = load_config()
print(f"TrendRadar v{VERSION} 配置加载完成")
print(f"监控平台数量: {len(config['PLATFORMS'])}")
print(f"时区: {config.get('TIMEZONE', 'Asia/Shanghai')}")
# 创建应用上下文
self.ctx = AppContext(config)
self.request_interval = self.ctx.config["REQUEST_INTERVAL"]
self.report_mode = self.ctx.config["REPORT_MODE"]
self.rank_threshold = self.ctx.rank_threshold
self.is_github_actions = os.environ.get("GITHUB_ACTIONS") == "true"
self.is_docker_container = self._detect_docker_environment()
self.update_info = None
self.proxy_url = None
self._setup_proxy()
self.data_fetcher = DataFetcher(self.proxy_url)
# 初始化存储管理器(使用 AppContext
self._init_storage_manager()
if self.is_github_actions:
self._check_version_update()
def _init_storage_manager(self) -> None:
"""初始化存储管理器(使用 AppContext"""
# 获取数据保留天数(支持环境变量覆盖)
env_retention = os.environ.get("STORAGE_RETENTION_DAYS", "").strip()
if env_retention:
# 环境变量覆盖配置
self.ctx.config["STORAGE"]["RETENTION_DAYS"] = int(env_retention)
self.storage_manager = self.ctx.get_storage_manager()
print(f"存储后端: {self.storage_manager.backend_name}")
retention_days = self.ctx.config.get("STORAGE", {}).get("RETENTION_DAYS", 0)
if retention_days > 0:
print(f"数据保留天数: {retention_days}")
def _detect_docker_environment(self) -> bool:
"""检测是否运行在 Docker 容器中"""
try:
if os.environ.get("DOCKER_CONTAINER") == "true":
return True
if os.path.exists("/.dockerenv"):
return True
return False
except Exception:
return False
def _should_open_browser(self) -> bool:
"""判断是否应该打开浏览器"""
return not self.is_github_actions and not self.is_docker_container
def _setup_proxy(self) -> None:
"""设置代理配置"""
if not self.is_github_actions and self.ctx.config["USE_PROXY"]:
self.proxy_url = self.ctx.config["DEFAULT_PROXY"]
print("本地环境,使用代理")
elif not self.is_github_actions and not self.ctx.config["USE_PROXY"]:
print("本地环境,未启用代理")
else:
print("GitHub Actions环境不使用代理")
def _check_version_update(self) -> None:
"""检查版本更新"""
try:
need_update, remote_version = check_version_update(
VERSION, self.ctx.config["VERSION_CHECK_URL"], self.proxy_url
)
if need_update and remote_version:
self.update_info = {
"current_version": VERSION,
"remote_version": remote_version,
}
print(f"发现新版本: {remote_version} (当前: {VERSION})")
else:
print("版本检查完成,当前为最新版本")
except Exception as e:
print(f"版本检查出错: {e}")
def _get_mode_strategy(self) -> Dict:
"""获取当前模式的策略配置"""
return self.MODE_STRATEGIES.get(self.report_mode, self.MODE_STRATEGIES["daily"])
def _has_notification_configured(self) -> bool:
"""检查是否配置了任何通知渠道"""
cfg = self.ctx.config
return any(
[
cfg["FEISHU_WEBHOOK_URL"],
cfg["DINGTALK_WEBHOOK_URL"],
cfg["WEWORK_WEBHOOK_URL"],
(cfg["TELEGRAM_BOT_TOKEN"] and cfg["TELEGRAM_CHAT_ID"]),
(
cfg["EMAIL_FROM"]
and cfg["EMAIL_PASSWORD"]
and cfg["EMAIL_TO"]
),
(cfg["NTFY_SERVER_URL"] and cfg["NTFY_TOPIC"]),
cfg["BARK_URL"],
cfg["SLACK_WEBHOOK_URL"],
]
)
def _has_valid_content(
self, stats: List[Dict], new_titles: Optional[Dict] = None
) -> bool:
"""检查是否有有效的新闻内容"""
if self.report_mode in ["incremental", "current"]:
# 增量模式和current模式下只要stats有内容就说明有匹配的新闻
return any(stat["count"] > 0 for stat in stats)
else:
# 当日汇总模式下,检查是否有匹配的频率词新闻或新增新闻
has_matched_news = any(stat["count"] > 0 for stat in stats)
has_new_news = bool(
new_titles and any(len(titles) > 0 for titles in new_titles.values())
)
return has_matched_news or has_new_news
def _load_analysis_data(
self,
) -> Optional[Tuple[Dict, Dict, Dict, Dict, List, List]]:
"""统一的数据加载和预处理,使用当前监控平台列表过滤历史数据"""
try:
# 获取当前配置的监控平台ID列表
current_platform_ids = self.ctx.platform_ids
print(f"当前监控平台: {current_platform_ids}")
all_results, id_to_name, title_info = self.ctx.read_today_titles(
current_platform_ids
)
if not all_results:
print("没有找到当天的数据")
return None
total_titles = sum(len(titles) for titles in all_results.values())
print(f"读取到 {total_titles} 个标题(已按当前监控平台过滤)")
new_titles = self.ctx.detect_new_titles(current_platform_ids)
word_groups, filter_words, global_filters = self.ctx.load_frequency_words()
return (
all_results,
id_to_name,
title_info,
new_titles,
word_groups,
filter_words,
global_filters,
)
except Exception as e:
print(f"数据加载失败: {e}")
return None
def _prepare_current_title_info(self, results: Dict, time_info: str) -> Dict:
"""从当前抓取结果构建标题信息"""
title_info = {}
for source_id, titles_data in results.items():
title_info[source_id] = {}
for title, title_data in titles_data.items():
ranks = title_data.get("ranks", [])
url = title_data.get("url", "")
mobile_url = title_data.get("mobileUrl", "")
title_info[source_id][title] = {
"first_time": time_info,
"last_time": time_info,
"count": 1,
"ranks": ranks,
"url": url,
"mobileUrl": mobile_url,
}
return title_info
def _run_analysis_pipeline(
self,
data_source: Dict,
mode: str,
title_info: Dict,
new_titles: Dict,
word_groups: List[Dict],
filter_words: List[str],
id_to_name: Dict,
failed_ids: Optional[List] = None,
is_daily_summary: bool = False,
global_filters: Optional[List[str]] = None,
) -> Tuple[List[Dict], Optional[str]]:
"""统一的分析流水线:数据处理 → 统计计算 → HTML生成"""
# 统计计算(使用 AppContext
stats, total_titles = self.ctx.count_frequency(
data_source,
word_groups,
filter_words,
id_to_name,
title_info,
new_titles,
mode=mode,
global_filters=global_filters,
)
# HTML生成如果启用
html_file = None
if self.ctx.config["STORAGE"]["FORMATS"]["HTML"]:
html_file = self.ctx.generate_html(
stats,
total_titles,
failed_ids=failed_ids,
new_titles=new_titles,
id_to_name=id_to_name,
mode=mode,
is_daily_summary=is_daily_summary,
update_info=self.update_info if self.ctx.config["SHOW_VERSION_UPDATE"] else None,
)
return stats, html_file
def _send_notification_if_needed(
self,
stats: List[Dict],
report_type: str,
mode: str,
failed_ids: Optional[List] = None,
new_titles: Optional[Dict] = None,
id_to_name: Optional[Dict] = None,
html_file_path: Optional[str] = None,
) -> bool:
"""统一的通知发送逻辑,包含所有判断条件"""
has_notification = self._has_notification_configured()
cfg = self.ctx.config
if (
cfg["ENABLE_NOTIFICATION"]
and has_notification
and self._has_valid_content(stats, new_titles)
):
# 推送窗口控制
if cfg["PUSH_WINDOW"]["ENABLED"]:
push_manager = self.ctx.create_push_manager()
time_range_start = cfg["PUSH_WINDOW"]["TIME_RANGE"]["START"]
time_range_end = cfg["PUSH_WINDOW"]["TIME_RANGE"]["END"]
if not push_manager.is_in_time_range(time_range_start, time_range_end):
now = self.ctx.get_time()
print(
f"推送窗口控制:当前时间 {now.strftime('%H:%M')} 不在推送时间窗口 {time_range_start}-{time_range_end} 内,跳过推送"
)
return False
if cfg["PUSH_WINDOW"]["ONCE_PER_DAY"]:
if push_manager.has_pushed_today():
print(f"推送窗口控制:今天已推送过,跳过本次推送")
return False
else:
print(f"推送窗口控制:今天首次推送")
# 准备报告数据
report_data = self.ctx.prepare_report(stats, failed_ids, new_titles, id_to_name, mode)
# 是否发送版本更新信息
update_info_to_send = self.update_info if cfg["SHOW_VERSION_UPDATE"] else None
# 使用 NotificationDispatcher 发送到所有渠道
dispatcher = self.ctx.create_notification_dispatcher()
results = dispatcher.dispatch_all(
report_data=report_data,
report_type=report_type,
update_info=update_info_to_send,
proxy_url=self.proxy_url,
mode=mode,
html_file_path=html_file_path,
)
if not results:
print("未配置任何通知渠道,跳过通知发送")
return False
# 如果成功发送了任何通知,且启用了每天只推一次,则记录推送
if (
cfg["PUSH_WINDOW"]["ENABLED"]
and cfg["PUSH_WINDOW"]["ONCE_PER_DAY"]
and any(results.values())
):
push_manager = self.ctx.create_push_manager()
push_manager.record_push(report_type)
return True
elif cfg["ENABLE_NOTIFICATION"] and not has_notification:
print("⚠️ 警告:通知功能已启用但未配置任何通知渠道,将跳过通知发送")
elif not cfg["ENABLE_NOTIFICATION"]:
print(f"跳过{report_type}通知:通知功能已禁用")
elif (
cfg["ENABLE_NOTIFICATION"]
and has_notification
and not self._has_valid_content(stats, new_titles)
):
mode_strategy = self._get_mode_strategy()
if "实时" in report_type:
print(
f"跳过实时推送通知:{mode_strategy['mode_name']}下未检测到匹配的新闻"
)
else:
print(
f"跳过{mode_strategy['summary_report_type']}通知:未匹配到有效的新闻内容"
)
return False
def _generate_summary_report(self, mode_strategy: Dict) -> Optional[str]:
"""生成汇总报告(带通知)"""
summary_type = (
"当前榜单汇总" if mode_strategy["summary_mode"] == "current" else "当日汇总"
)
print(f"生成{summary_type}报告...")
# 加载分析数据
analysis_data = self._load_analysis_data()
if not analysis_data:
return None
all_results, id_to_name, title_info, new_titles, word_groups, filter_words, global_filters = (
analysis_data
)
# 运行分析流水线
stats, html_file = self._run_analysis_pipeline(
all_results,
mode_strategy["summary_mode"],
title_info,
new_titles,
word_groups,
filter_words,
id_to_name,
is_daily_summary=True,
global_filters=global_filters,
)
if html_file:
print(f"{summary_type}报告已生成: {html_file}")
# 发送通知
self._send_notification_if_needed(
stats,
mode_strategy["summary_report_type"],
mode_strategy["summary_mode"],
failed_ids=[],
new_titles=new_titles,
id_to_name=id_to_name,
html_file_path=html_file,
)
return html_file
def _generate_summary_html(self, mode: str = "daily") -> Optional[str]:
"""生成汇总HTML"""
summary_type = "当前榜单汇总" if mode == "current" else "当日汇总"
print(f"生成{summary_type}HTML...")
# 加载分析数据
analysis_data = self._load_analysis_data()
if not analysis_data:
return None
all_results, id_to_name, title_info, new_titles, word_groups, filter_words, global_filters = (
analysis_data
)
# 运行分析流水线
_, html_file = self._run_analysis_pipeline(
all_results,
mode,
title_info,
new_titles,
word_groups,
filter_words,
id_to_name,
is_daily_summary=True,
global_filters=global_filters,
)
if html_file:
print(f"{summary_type}HTML已生成: {html_file}")
return html_file
def _initialize_and_check_config(self) -> None:
"""通用初始化和配置检查"""
now = self.ctx.get_time()
print(f"当前北京时间: {now.strftime('%Y-%m-%d %H:%M:%S')}")
if not self.ctx.config["ENABLE_CRAWLER"]:
print("爬虫功能已禁用ENABLE_CRAWLER=False程序退出")
return
has_notification = self._has_notification_configured()
if not self.ctx.config["ENABLE_NOTIFICATION"]:
print("通知功能已禁用ENABLE_NOTIFICATION=False将只进行数据抓取")
elif not has_notification:
print("未配置任何通知渠道,将只进行数据抓取,不发送通知")
else:
print("通知功能已启用,将发送通知")
mode_strategy = self._get_mode_strategy()
print(f"报告模式: {self.report_mode}")
print(f"运行模式: {mode_strategy['description']}")
def _crawl_data(self) -> Tuple[Dict, Dict, List]:
"""执行数据爬取"""
ids = []
for platform in self.ctx.platforms:
if "name" in platform:
ids.append((platform["id"], platform["name"]))
else:
ids.append(platform["id"])
print(
f"配置的监控平台: {[p.get('name', p['id']) for p in self.ctx.platforms]}"
)
print(f"开始爬取数据,请求间隔 {self.request_interval} 毫秒")
Path("output").mkdir(parents=True, exist_ok=True)
results, id_to_name, failed_ids = self.data_fetcher.crawl_websites(
ids, self.request_interval
)
# 转换为 NewsData 格式并保存到存储后端
crawl_time = self.ctx.format_time()
crawl_date = self.ctx.format_date()
news_data = convert_crawl_results_to_news_data(
results, id_to_name, failed_ids, crawl_time, crawl_date
)
# 保存到存储后端SQLite
if self.storage_manager.save_news_data(news_data):
print(f"数据已保存到存储后端: {self.storage_manager.backend_name}")
# 保存 TXT 快照(如果启用)
txt_file = self.storage_manager.save_txt_snapshot(news_data)
if txt_file:
print(f"TXT 快照已保存: {txt_file}")
# 兼容:同时保存到原有 TXT 格式(确保向后兼容)
if self.ctx.config["STORAGE"]["FORMATS"]["TXT"]:
title_file = self.ctx.save_titles(results, id_to_name, failed_ids)
print(f"标题已保存到: {title_file}")
return results, id_to_name, failed_ids
def _execute_mode_strategy(
self, mode_strategy: Dict, results: Dict, id_to_name: Dict, failed_ids: List
) -> Optional[str]:
"""执行模式特定逻辑"""
# 获取当前监控平台ID列表
current_platform_ids = self.ctx.platform_ids
new_titles = self.ctx.detect_new_titles(current_platform_ids)
time_info = self.ctx.format_time()
if self.ctx.config["STORAGE"]["FORMATS"]["TXT"]:
self.ctx.save_titles(results, id_to_name, failed_ids)
word_groups, filter_words, global_filters = self.ctx.load_frequency_words()
# current模式下实时推送需要使用完整的历史数据来保证统计信息的完整性
if self.report_mode == "current":
# 加载完整的历史数据(已按当前平台过滤)
analysis_data = self._load_analysis_data()
if analysis_data:
(
all_results,
historical_id_to_name,
historical_title_info,
historical_new_titles,
_,
_,
_,
) = analysis_data
print(
f"current模式使用过滤后的历史数据包含平台{list(all_results.keys())}"
)
stats, html_file = self._run_analysis_pipeline(
all_results,
self.report_mode,
historical_title_info,
historical_new_titles,
word_groups,
filter_words,
historical_id_to_name,
failed_ids=failed_ids,
global_filters=global_filters,
)
combined_id_to_name = {**historical_id_to_name, **id_to_name}
if html_file:
print(f"HTML报告已生成: {html_file}")
# 发送实时通知(使用完整历史数据的统计结果)
summary_html = None
if mode_strategy["should_send_realtime"]:
self._send_notification_if_needed(
stats,
mode_strategy["realtime_report_type"],
self.report_mode,
failed_ids=failed_ids,
new_titles=historical_new_titles,
id_to_name=combined_id_to_name,
html_file_path=html_file,
)
else:
print("❌ 严重错误:无法读取刚保存的数据文件")
raise RuntimeError("数据一致性检查失败:保存后立即读取失败")
else:
title_info = self._prepare_current_title_info(results, time_info)
stats, html_file = self._run_analysis_pipeline(
results,
self.report_mode,
title_info,
new_titles,
word_groups,
filter_words,
id_to_name,
failed_ids=failed_ids,
global_filters=global_filters,
)
if html_file:
print(f"HTML报告已生成: {html_file}")
# 发送实时通知(如果需要)
summary_html = None
if mode_strategy["should_send_realtime"]:
self._send_notification_if_needed(
stats,
mode_strategy["realtime_report_type"],
self.report_mode,
failed_ids=failed_ids,
new_titles=new_titles,
id_to_name=id_to_name,
html_file_path=html_file,
)
# 生成汇总报告(如果需要)
summary_html = None
if mode_strategy["should_generate_summary"]:
if mode_strategy["should_send_realtime"]:
# 如果已经发送了实时通知汇总只生成HTML不发送通知
summary_html = self._generate_summary_html(
mode_strategy["summary_mode"]
)
else:
# daily模式直接生成汇总报告并发送通知
summary_html = self._generate_summary_report(mode_strategy)
# 打开浏览器(仅在非容器环境)
if self._should_open_browser() and html_file:
if summary_html:
summary_url = "file://" + str(Path(summary_html).resolve())
print(f"正在打开汇总报告: {summary_url}")
webbrowser.open(summary_url)
else:
file_url = "file://" + str(Path(html_file).resolve())
print(f"正在打开HTML报告: {file_url}")
webbrowser.open(file_url)
elif self.is_docker_container and html_file:
if summary_html:
print(f"汇总报告已生成Docker环境: {summary_html}")
else:
print(f"HTML报告已生成Docker环境: {html_file}")
return summary_html
def run(self) -> None:
"""执行分析流程"""
try:
self._initialize_and_check_config()
mode_strategy = self._get_mode_strategy()
results, id_to_name, failed_ids = self._crawl_data()
self._execute_mode_strategy(mode_strategy, results, id_to_name, failed_ids)
except Exception as e:
print(f"分析流程执行出错: {e}")
raise
finally:
# 清理资源(包括过期数据清理和数据库连接关闭)
self.ctx.cleanup()
def main():
"""主程序入口"""
try:
analyzer = NewsAnalyzer()
analyzer.run()
except FileNotFoundError as e:
print(f"❌ 配置文件错误: {e}")
print("\n请确保以下文件存在:")
print(" • config/config.yaml")
print(" • config/frequency_words.txt")
print("\n参考项目文档进行正确配置")
except Exception as e:
print(f"❌ 程序运行错误: {e}")
raise
if __name__ == "__main__":
main()
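
A minimal usage sketch for the new entry point (illustrative only, not part of this commit; it assumes config/config.yaml and config/frequency_words.txt are already in place):

# Command line (environment variables can override parts of the config):
#   REPORT_MODE=current ENABLE_NOTIFICATION=false python -m trendradar
# Programmatic equivalent:
from trendradar.__main__ import NewsAnalyzer

analyzer = NewsAnalyzer()   # loads config, picks the storage backend, detects the runtime environment
analyzer.run()              # crawl -> analyze -> generate reports -> push if configured, then clean up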

388
trendradar/context.py Normal file
View File

@ -0,0 +1,388 @@
# coding=utf-8
"""
应用上下文模块
提供配置上下文类,封装所有依赖配置的操作,消除全局状态和包装函数
"""
from datetime import datetime
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple
from trendradar.utils.time import (
get_configured_time,
format_date_folder,
format_time_filename,
get_current_time_display,
convert_time_for_display,
)
from trendradar.core import (
load_frequency_words,
matches_word_groups,
save_titles_to_file,
read_all_today_titles,
detect_latest_new_titles,
is_first_crawl_today,
count_word_frequency,
)
from trendradar.report import (
clean_title,
prepare_report_data,
generate_html_report,
render_html_content,
)
from trendradar.notification import (
render_feishu_content,
render_dingtalk_content,
split_content_into_batches,
NotificationDispatcher,
PushRecordManager,
)
from trendradar.storage import get_storage_manager
class AppContext:
"""
应用上下文类
封装所有依赖配置的操作,提供统一的接口
消除对全局 CONFIG 的依赖,提高可测试性
使用示例:
config = load_config()
ctx = AppContext(config)
# 时间操作
now = ctx.get_time()
date_folder = ctx.format_date()
# 存储操作
storage = ctx.get_storage_manager()
# 报告生成
html = ctx.generate_html_report(stats, total_titles, ...)
"""
def __init__(self, config: Dict[str, Any]):
"""
初始化应用上下文
Args:
config: 完整的配置字典
"""
self.config = config
self._storage_manager = None
# === 配置访问 ===
@property
def timezone(self) -> str:
"""获取配置的时区"""
return self.config.get("TIMEZONE", "Asia/Shanghai")
@property
def rank_threshold(self) -> int:
"""获取排名阈值"""
return self.config.get("RANK_THRESHOLD", 50)
@property
def weight_config(self) -> Dict:
"""获取权重配置"""
return self.config.get("WEIGHT_CONFIG", {})
@property
def platforms(self) -> List[Dict]:
"""获取平台配置列表"""
return self.config.get("PLATFORMS", [])
@property
def platform_ids(self) -> List[str]:
"""获取平台ID列表"""
return [p["id"] for p in self.platforms]
# === 时间操作 ===
def get_time(self) -> datetime:
"""获取当前配置时区的时间"""
return get_configured_time(self.timezone)
def format_date(self) -> str:
"""格式化日期文件夹 (YYYY-MM-DD)"""
return format_date_folder(timezone=self.timezone)
def format_time(self) -> str:
"""格式化时间文件名 (HH-MM)"""
return format_time_filename(self.timezone)
def get_time_display(self) -> str:
"""获取时间显示 (HH:MM)"""
return get_current_time_display(self.timezone)
@staticmethod
def convert_time_display(time_str: str) -> str:
"""将 HH-MM 转换为 HH:MM"""
return convert_time_for_display(time_str)
# === 存储操作 ===
def get_storage_manager(self):
"""获取存储管理器(延迟初始化,单例)"""
if self._storage_manager is None:
storage_config = self.config.get("STORAGE", {})
remote_config = storage_config.get("REMOTE", {})
local_config = storage_config.get("LOCAL", {})
pull_config = storage_config.get("PULL", {})
self._storage_manager = get_storage_manager(
backend_type=storage_config.get("BACKEND", "auto"),
data_dir=local_config.get("DATA_DIR", "output"),
enable_txt=storage_config.get("FORMATS", {}).get("TXT", True),
enable_html=storage_config.get("FORMATS", {}).get("HTML", True),
remote_config={
"bucket_name": remote_config.get("BUCKET_NAME", ""),
"access_key_id": remote_config.get("ACCESS_KEY_ID", ""),
"secret_access_key": remote_config.get("SECRET_ACCESS_KEY", ""),
"endpoint_url": remote_config.get("ENDPOINT_URL", ""),
"region": remote_config.get("REGION", ""),
},
local_retention_days=local_config.get("RETENTION_DAYS", 0),
remote_retention_days=remote_config.get("RETENTION_DAYS", 0),
pull_enabled=pull_config.get("ENABLED", False),
pull_days=pull_config.get("DAYS", 7),
timezone=self.timezone,
)
return self._storage_manager
def get_output_path(self, subfolder: str, filename: str) -> str:
"""获取输出路径"""
output_dir = Path("output") / self.format_date() / subfolder
output_dir.mkdir(parents=True, exist_ok=True)
return str(output_dir / filename)
# === 数据处理 ===
def save_titles(self, results: Dict, id_to_name: Dict, failed_ids: List) -> str:
"""保存标题到文件"""
output_path = self.get_output_path("txt", f"{self.format_time()}.txt")
return save_titles_to_file(results, id_to_name, failed_ids, output_path, clean_title)
def read_today_titles(
self, platform_ids: Optional[List[str]] = None
) -> Tuple[Dict, Dict, Dict]:
"""读取当天所有标题"""
return read_all_today_titles(self.get_storage_manager(), platform_ids)
def detect_new_titles(
self, platform_ids: Optional[List[str]] = None
) -> Dict:
"""检测最新批次的新增标题"""
return detect_latest_new_titles(self.get_storage_manager(), platform_ids)
def is_first_crawl(self) -> bool:
"""检测是否是当天第一次爬取"""
return is_first_crawl_today("output", self.format_date())
# === 频率词处理 ===
def load_frequency_words(
self, frequency_file: Optional[str] = None
) -> Tuple[List[Dict], List[str], List[str]]:
"""加载频率词配置"""
return load_frequency_words(frequency_file)
def matches_word_groups(
self,
title: str,
word_groups: List[Dict],
filter_words: List[str],
global_filters: Optional[List[str]] = None,
) -> bool:
"""检查标题是否匹配词组规则"""
return matches_word_groups(title, word_groups, filter_words, global_filters)
# === 统计分析 ===
def count_frequency(
self,
results: Dict,
word_groups: List[Dict],
filter_words: List[str],
id_to_name: Dict,
title_info: Optional[Dict] = None,
new_titles: Optional[Dict] = None,
mode: str = "daily",
global_filters: Optional[List[str]] = None,
) -> Tuple[List[Dict], int]:
"""统计词频"""
return count_word_frequency(
results=results,
word_groups=word_groups,
filter_words=filter_words,
id_to_name=id_to_name,
title_info=title_info,
rank_threshold=self.rank_threshold,
new_titles=new_titles,
mode=mode,
global_filters=global_filters,
weight_config=self.weight_config,
max_news_per_keyword=self.config.get("MAX_NEWS_PER_KEYWORD", 0),
sort_by_position_first=self.config.get("SORT_BY_POSITION_FIRST", False),
is_first_crawl_func=self.is_first_crawl,
convert_time_func=self.convert_time_display,
)
# === 报告生成 ===
def prepare_report(
self,
stats: List[Dict],
failed_ids: Optional[List] = None,
new_titles: Optional[Dict] = None,
id_to_name: Optional[Dict] = None,
mode: str = "daily",
) -> Dict:
"""准备报告数据"""
return prepare_report_data(
stats=stats,
failed_ids=failed_ids,
new_titles=new_titles,
id_to_name=id_to_name,
mode=mode,
rank_threshold=self.rank_threshold,
matches_word_groups_func=self.matches_word_groups,
load_frequency_words_func=self.load_frequency_words,
)
def generate_html(
self,
stats: List[Dict],
total_titles: int,
failed_ids: Optional[List] = None,
new_titles: Optional[Dict] = None,
id_to_name: Optional[Dict] = None,
mode: str = "daily",
is_daily_summary: bool = False,
update_info: Optional[Dict] = None,
) -> str:
"""生成HTML报告"""
return generate_html_report(
stats=stats,
total_titles=total_titles,
failed_ids=failed_ids,
new_titles=new_titles,
id_to_name=id_to_name,
mode=mode,
is_daily_summary=is_daily_summary,
update_info=update_info,
rank_threshold=self.rank_threshold,
output_dir="output",
date_folder=self.format_date(),
time_filename=self.format_time(),
render_html_func=lambda *args, **kwargs: self.render_html(*args, **kwargs),
matches_word_groups_func=self.matches_word_groups,
load_frequency_words_func=self.load_frequency_words,
enable_index_copy=True,
)
def render_html(
self,
report_data: Dict,
total_titles: int,
is_daily_summary: bool = False,
mode: str = "daily",
update_info: Optional[Dict] = None,
) -> str:
"""渲染HTML内容"""
return render_html_content(
report_data=report_data,
total_titles=total_titles,
is_daily_summary=is_daily_summary,
mode=mode,
update_info=update_info,
reverse_content_order=self.config.get("REVERSE_CONTENT_ORDER", False),
get_time_func=self.get_time,
)
# === 通知内容渲染 ===
def render_feishu(
self,
report_data: Dict,
update_info: Optional[Dict] = None,
mode: str = "daily",
) -> str:
"""渲染飞书内容"""
return render_feishu_content(
report_data=report_data,
update_info=update_info,
mode=mode,
separator=self.config.get("FEISHU_MESSAGE_SEPARATOR", "---"),
reverse_content_order=self.config.get("REVERSE_CONTENT_ORDER", False),
get_time_func=self.get_time,
)
def render_dingtalk(
self,
report_data: Dict,
update_info: Optional[Dict] = None,
mode: str = "daily",
) -> str:
"""渲染钉钉内容"""
return render_dingtalk_content(
report_data=report_data,
update_info=update_info,
mode=mode,
reverse_content_order=self.config.get("REVERSE_CONTENT_ORDER", False),
get_time_func=self.get_time,
)
def split_content(
self,
report_data: Dict,
format_type: str,
update_info: Optional[Dict] = None,
max_bytes: Optional[int] = None,
mode: str = "daily",
) -> List[str]:
"""分批处理消息内容"""
return split_content_into_batches(
report_data=report_data,
format_type=format_type,
update_info=update_info,
max_bytes=max_bytes,
mode=mode,
batch_sizes={
"dingtalk": self.config.get("DINGTALK_BATCH_SIZE", 20000),
"feishu": self.config.get("FEISHU_BATCH_SIZE", 29000),
"default": self.config.get("MESSAGE_BATCH_SIZE", 4000),
},
feishu_separator=self.config.get("FEISHU_MESSAGE_SEPARATOR", "---"),
reverse_content_order=self.config.get("REVERSE_CONTENT_ORDER", False),
get_time_func=self.get_time,
)
# === 通知发送 ===
def create_notification_dispatcher(self) -> NotificationDispatcher:
"""创建通知调度器"""
return NotificationDispatcher(
config=self.config,
get_time_func=self.get_time,
split_content_func=self.split_content,
)
def create_push_manager(self) -> PushRecordManager:
"""创建推送记录管理器"""
return PushRecordManager(
storage_backend=self.get_storage_manager(),
get_time_func=self.get_time,
)
# === 资源清理 ===
def cleanup(self):
"""清理资源"""
if self._storage_manager:
self._storage_manager.cleanup_old_data()
self._storage_manager.cleanup()
self._storage_manager = None
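
A hedged lifecycle sketch for AppContext (not part of this commit), showing the lazily created storage manager and the cleanup step:

from trendradar.core import load_config
from trendradar.context import AppContext

ctx = AppContext(load_config())
try:
    storage = ctx.get_storage_manager()   # created on first access, cached on the context
    print(ctx.format_date(), storage.backend_name)
finally:
    ctx.cleanup()   # purge expired data, close the backend, drop the cached manager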

View File

@ -0,0 +1,47 @@
# coding=utf-8
"""
核心模块 - 配置管理和核心工具
"""
from trendradar.core.config import (
parse_multi_account_config,
validate_paired_configs,
limit_accounts,
get_account_at_index,
)
from trendradar.core.loader import load_config
from trendradar.core.frequency import load_frequency_words, matches_word_groups
from trendradar.core.data import (
save_titles_to_file,
read_all_today_titles_from_storage,
read_all_today_titles,
detect_latest_new_titles_from_storage,
detect_latest_new_titles,
is_first_crawl_today,
)
from trendradar.core.analyzer import (
calculate_news_weight,
format_time_display,
count_word_frequency,
)
__all__ = [
"parse_multi_account_config",
"validate_paired_configs",
"limit_accounts",
"get_account_at_index",
"load_config",
"load_frequency_words",
"matches_word_groups",
# 数据处理
"save_titles_to_file",
"read_all_today_titles_from_storage",
"read_all_today_titles",
"detect_latest_new_titles_from_storage",
"detect_latest_new_titles",
"is_first_crawl_today",
# 统计分析
"calculate_news_weight",
"format_time_display",
"count_word_frequency",
]

469
trendradar/core/analyzer.py Normal file
View File

@ -0,0 +1,469 @@
# coding=utf-8
"""
统计分析模块
提供新闻统计和分析功能
- calculate_news_weight: 计算新闻权重
- format_time_display: 格式化时间显示
- count_word_frequency: 统计词频
"""
from typing import Dict, List, Tuple, Optional, Callable
from trendradar.core.frequency import matches_word_groups
def calculate_news_weight(
title_data: Dict,
rank_threshold: int,
weight_config: Dict,
) -> float:
"""
计算新闻权重,用于排序
Args:
title_data: 标题数据,包含 ranks 和 count
rank_threshold: 排名阈值
weight_config: 权重配置 {RANK_WEIGHT, FREQUENCY_WEIGHT, HOTNESS_WEIGHT}
Returns:
float: 计算出的权重值
"""
ranks = title_data.get("ranks", [])
if not ranks:
return 0.0
count = title_data.get("count", len(ranks))
# 排名权重:Σ(11 - min(rank, 10)) / 出现次数
rank_scores = []
for rank in ranks:
score = 11 - min(rank, 10)
rank_scores.append(score)
rank_weight = sum(rank_scores) / len(ranks) if ranks else 0
# 频次权重min(出现次数, 10) × 10
frequency_weight = min(count, 10) * 10
# 热度加成:高排名次数 / 总出现次数 × 100
high_rank_count = sum(1 for rank in ranks if rank <= rank_threshold)
hotness_ratio = high_rank_count / len(ranks) if ranks else 0
hotness_weight = hotness_ratio * 100
total_weight = (
rank_weight * weight_config["RANK_WEIGHT"]
+ frequency_weight * weight_config["FREQUENCY_WEIGHT"]
+ hotness_weight * weight_config["HOTNESS_WEIGHT"]
)
return total_weight
def format_time_display(
first_time: str,
last_time: str,
convert_time_func: Callable[[str], str],
) -> str:
"""
格式化时间显示,将 HH-MM 转换为 HH:MM
Args:
first_time: 首次出现时间
last_time: 最后出现时间
convert_time_func: 时间格式转换函数
Returns:
str: 格式化后的时间显示字符串
"""
if not first_time:
return ""
# 转换为显示格式
first_display = convert_time_func(first_time)
last_display = convert_time_func(last_time)
if first_display == last_display or not last_display:
return first_display
else:
return f"[{first_display} ~ {last_display}]"
def count_word_frequency(
results: Dict,
word_groups: List[Dict],
filter_words: List[str],
id_to_name: Dict,
title_info: Optional[Dict] = None,
rank_threshold: int = 3,
new_titles: Optional[Dict] = None,
mode: str = "daily",
global_filters: Optional[List[str]] = None,
weight_config: Optional[Dict] = None,
max_news_per_keyword: int = 0,
sort_by_position_first: bool = False,
is_first_crawl_func: Optional[Callable[[], bool]] = None,
convert_time_func: Optional[Callable[[str], str]] = None,
) -> Tuple[List[Dict], int]:
"""
统计词频,支持必须词、频率词、过滤词、全局过滤词,并标记新增标题
Args:
results: 抓取结果 {source_id: {title: title_data}}
word_groups: 词组配置列表
filter_words: 过滤词列表
id_to_name: ID 到名称的映射
title_info: 标题统计信息(可选)
rank_threshold: 排名阈值
new_titles: 新增标题(可选)
mode: 报告模式 (daily/incremental/current)
global_filters: 全局过滤词(可选)
weight_config: 权重配置
max_news_per_keyword: 每个关键词最大显示数量
sort_by_position_first: 是否优先按配置位置排序
is_first_crawl_func: 检测是否是当天第一次爬取的函数
convert_time_func: 时间格式转换函数
Returns:
Tuple[List[Dict], int]: (统计结果列表, 总标题数)
"""
# 默认权重配置
if weight_config is None:
weight_config = {
"RANK_WEIGHT": 0.4,
"FREQUENCY_WEIGHT": 0.3,
"HOTNESS_WEIGHT": 0.3,
}
# 默认时间转换函数
if convert_time_func is None:
convert_time_func = lambda x: x
# 默认首次爬取检测函数
if is_first_crawl_func is None:
is_first_crawl_func = lambda: True
# 如果没有配置词组,创建一个包含所有新闻的虚拟词组
if not word_groups:
print("频率词配置为空,将显示所有新闻")
word_groups = [{"required": [], "normal": [], "group_key": "全部新闻"}]
filter_words = [] # 清空过滤词,显示所有新闻
is_first_today = is_first_crawl_func()
# 确定处理的数据源和新增标记逻辑
if mode == "incremental":
if is_first_today:
# 增量模式 + 当天第一次:处理所有新闻,都标记为新增
results_to_process = results
all_news_are_new = True
else:
# 增量模式 + 当天非第一次:只处理新增的新闻
results_to_process = new_titles if new_titles else {}
all_news_are_new = True
elif mode == "current":
# current 模式:只处理当前时间批次的新闻,但统计信息来自全部历史
if title_info:
latest_time = None
for source_titles in title_info.values():
for title_data in source_titles.values():
last_time = title_data.get("last_time", "")
if last_time:
if latest_time is None or last_time > latest_time:
latest_time = last_time
# 只处理 last_time 等于最新时间的新闻
if latest_time:
results_to_process = {}
for source_id, source_titles in results.items():
if source_id in title_info:
filtered_titles = {}
for title, title_data in source_titles.items():
if title in title_info[source_id]:
info = title_info[source_id][title]
if info.get("last_time") == latest_time:
filtered_titles[title] = title_data
if filtered_titles:
results_to_process[source_id] = filtered_titles
print(
f"当前榜单模式:最新时间 {latest_time},筛选出 {sum(len(titles) for titles in results_to_process.values())} 条当前榜单新闻"
)
else:
results_to_process = results
else:
results_to_process = results
all_news_are_new = False
else:
# 当日汇总模式:处理所有新闻
results_to_process = results
all_news_are_new = False
total_input_news = sum(len(titles) for titles in results.values())
filter_status = (
"全部显示"
if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
else "频率词过滤"
)
print(f"当日汇总模式:处理 {total_input_news} 条新闻,模式:{filter_status}")
word_stats = {}
total_titles = 0
processed_titles = {}
matched_new_count = 0
if title_info is None:
title_info = {}
if new_titles is None:
new_titles = {}
for group in word_groups:
group_key = group["group_key"]
word_stats[group_key] = {"count": 0, "titles": {}}
for source_id, titles_data in results_to_process.items():
total_titles += len(titles_data)
if source_id not in processed_titles:
processed_titles[source_id] = {}
for title, title_data in titles_data.items():
if title in processed_titles.get(source_id, {}):
continue
# 使用统一的匹配逻辑
matches_frequency_words = matches_word_groups(
title, word_groups, filter_words, global_filters
)
if not matches_frequency_words:
continue
# 如果是增量模式或 current 模式第一次,统计匹配的新增新闻数量
if (mode == "incremental" and all_news_are_new) or (
mode == "current" and is_first_today
):
matched_new_count += 1
source_ranks = title_data.get("ranks", [])
source_url = title_data.get("url", "")
source_mobile_url = title_data.get("mobileUrl", "")
# 找到匹配的词组(防御性转换确保类型安全)
title_lower = str(title).lower() if not isinstance(title, str) else title.lower()
for group in word_groups:
required_words = group["required"]
normal_words = group["normal"]
# 如果是"全部新闻"模式,所有标题都匹配第一个(唯一的)词组
if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻":
group_key = group["group_key"]
word_stats[group_key]["count"] += 1
if source_id not in word_stats[group_key]["titles"]:
word_stats[group_key]["titles"][source_id] = []
else:
# 原有的匹配逻辑
if required_words:
all_required_present = all(
req_word.lower() in title_lower
for req_word in required_words
)
if not all_required_present:
continue
if normal_words:
any_normal_present = any(
normal_word.lower() in title_lower
for normal_word in normal_words
)
if not any_normal_present:
continue
group_key = group["group_key"]
word_stats[group_key]["count"] += 1
if source_id not in word_stats[group_key]["titles"]:
word_stats[group_key]["titles"][source_id] = []
first_time = ""
last_time = ""
count_info = 1
ranks = source_ranks if source_ranks else []
url = source_url
mobile_url = source_mobile_url
# 对于 current 模式,从历史统计信息中获取完整数据
if (
mode == "current"
and title_info
and source_id in title_info
and title in title_info[source_id]
):
info = title_info[source_id][title]
first_time = info.get("first_time", "")
last_time = info.get("last_time", "")
count_info = info.get("count", 1)
if "ranks" in info and info["ranks"]:
ranks = info["ranks"]
url = info.get("url", source_url)
mobile_url = info.get("mobileUrl", source_mobile_url)
elif (
title_info
and source_id in title_info
and title in title_info[source_id]
):
info = title_info[source_id][title]
first_time = info.get("first_time", "")
last_time = info.get("last_time", "")
count_info = info.get("count", 1)
if "ranks" in info and info["ranks"]:
ranks = info["ranks"]
url = info.get("url", source_url)
mobile_url = info.get("mobileUrl", source_mobile_url)
if not ranks:
ranks = [99]
time_display = format_time_display(first_time, last_time, convert_time_func)
source_name = id_to_name.get(source_id, source_id)
# 判断是否为新增
is_new = False
if all_news_are_new:
# 增量模式下所有处理的新闻都是新增,或者当天第一次的所有新闻都是新增
is_new = True
elif new_titles and source_id in new_titles:
# 检查是否在新增列表中
new_titles_for_source = new_titles[source_id]
is_new = title in new_titles_for_source
word_stats[group_key]["titles"][source_id].append(
{
"title": title,
"source_name": source_name,
"first_time": first_time,
"last_time": last_time,
"time_display": time_display,
"count": count_info,
"ranks": ranks,
"rank_threshold": rank_threshold,
"url": url,
"mobileUrl": mobile_url,
"is_new": is_new,
}
)
if source_id not in processed_titles:
processed_titles[source_id] = {}
processed_titles[source_id][title] = True
break
# 最后统一打印汇总信息
if mode == "incremental":
if is_first_today:
total_input_news = sum(len(titles) for titles in results.values())
filter_status = (
"全部显示"
if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
else "频率词匹配"
)
print(
f"增量模式:当天第一次爬取,{total_input_news} 条新闻中有 {matched_new_count}{filter_status}"
)
else:
if new_titles:
total_new_count = sum(len(titles) for titles in new_titles.values())
filter_status = (
"全部显示"
if len(word_groups) == 1
and word_groups[0]["group_key"] == "全部新闻"
else "匹配频率词"
)
print(
f"增量模式:{total_new_count} 条新增新闻中,有 {matched_new_count}{filter_status}"
)
if matched_new_count == 0 and len(word_groups) > 1:
print("增量模式:没有新增新闻匹配频率词,将不会发送通知")
else:
print("增量模式:未检测到新增新闻")
elif mode == "current":
total_input_news = sum(len(titles) for titles in results_to_process.values())
if is_first_today:
filter_status = (
"全部显示"
if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
else "频率词匹配"
)
print(
f"当前榜单模式:当天第一次爬取,{total_input_news} 条当前榜单新闻中有 {matched_new_count}{filter_status}"
)
else:
matched_count = sum(stat["count"] for stat in word_stats.values())
filter_status = (
"全部显示"
if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
else "频率词匹配"
)
print(
f"当前榜单模式:{total_input_news} 条当前榜单新闻中有 {matched_count}{filter_status}"
)
stats = []
# 创建 group_key 到位置和最大数量的映射
group_key_to_position = {
group["group_key"]: idx for idx, group in enumerate(word_groups)
}
group_key_to_max_count = {
group["group_key"]: group.get("max_count", 0) for group in word_groups
}
for group_key, data in word_stats.items():
all_titles = []
for source_id, title_list in data["titles"].items():
all_titles.extend(title_list)
# 按权重排序
sorted_titles = sorted(
all_titles,
key=lambda x: (
-calculate_news_weight(x, rank_threshold, weight_config),
min(x["ranks"]) if x["ranks"] else 999,
-x["count"],
),
)
# 应用最大显示数量限制(优先级:单独配置 > 全局配置)
group_max_count = group_key_to_max_count.get(group_key, 0)
if group_max_count == 0:
# 使用全局配置
group_max_count = max_news_per_keyword
if group_max_count > 0:
sorted_titles = sorted_titles[:group_max_count]
stats.append(
{
"word": group_key,
"count": data["count"],
"position": group_key_to_position.get(group_key, 999),
"titles": sorted_titles,
"percentage": (
round(data["count"] / total_titles * 100, 2)
if total_titles > 0
else 0
),
}
)
# 根据配置选择排序优先级
if sort_by_position_first:
# 先按配置位置,再按热点条数
stats.sort(key=lambda x: (x["position"], -x["count"]))
else:
# 先按热点条数,再按配置位置(原逻辑)
stats.sort(key=lambda x: (-x["count"], x["position"]))
# 打印过滤后的匹配新闻数(与推送显示一致)
matched_news_count = sum(len(stat["titles"]) for stat in stats if stat["count"] > 0)
if mode == "daily":
print(f"频率词过滤后:{matched_news_count} 条新闻匹配(将显示在推送中)")
return stats, total_titles
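
A worked example of the weight formula above (illustrative only; the numbers are invented): a title seen at ranks 1, 3 and 12 across three crawls, with rank_threshold=10 and the default 0.4/0.3/0.3 weights used by count_word_frequency.

from trendradar.core.analyzer import calculate_news_weight

title_data = {"ranks": [1, 3, 12], "count": 3}
weights = {"RANK_WEIGHT": 0.4, "FREQUENCY_WEIGHT": 0.3, "HOTNESS_WEIGHT": 0.3}

# rank component:      ((11-1) + (11-3) + (11-10)) / 3 = 19/3 ≈ 6.33
# frequency component: min(3, 10) * 10 = 30
# hotness component:   2 of 3 ranks are <= 10 -> 2/3 * 100 ≈ 66.67
# total ≈ 6.33*0.4 + 30*0.3 + 66.67*0.3 ≈ 31.5
print(round(calculate_news_weight(title_data, 10, weights), 1))  # 31.5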

152
trendradar/core/config.py Normal file
View File

@ -0,0 +1,152 @@
# coding=utf-8
"""
配置工具模块 - 多账号配置解析和验证
提供多账号推送配置的解析、验证和限制功能
"""
from typing import Dict, List, Optional, Tuple
def parse_multi_account_config(config_value: str, separator: str = ";") -> List[str]:
"""
解析多账号配置,返回账号列表
Args:
config_value: 配置值字符串,多个账号用分隔符分隔
separator: 分隔符,默认为 ;
Returns:
账号列表,空字符串会被保留用于占位
Examples:
>>> parse_multi_account_config("url1;url2;url3")
['url1', 'url2', 'url3']
>>> parse_multi_account_config(";token2") # 第一个账号无token
['', 'token2']
>>> parse_multi_account_config("")
[]
"""
if not config_value:
return []
# 保留空字符串用于占位(如 ";token2" 表示第一个账号无token
accounts = [acc.strip() for acc in config_value.split(separator)]
# 过滤掉全部为空的情况
if all(not acc for acc in accounts):
return []
return accounts
def validate_paired_configs(
configs: Dict[str, List[str]],
channel_name: str,
required_keys: Optional[List[str]] = None
) -> Tuple[bool, int]:
"""
验证配对配置的数量是否一致
对于需要多个配置项配对的渠道(如 Telegram 的 token 和 chat_id),
验证所有配置项的账号数量是否一致
Args:
configs: 配置字典,key 为配置名,value 为账号列表
channel_name: 渠道名称,用于日志输出
required_keys: 必须有值的配置项列表
Returns:
(是否验证通过, 账号数量)
Examples:
>>> validate_paired_configs({
... "token": ["t1", "t2"],
... "chat_id": ["c1", "c2"]
... }, "Telegram", ["token", "chat_id"])
(True, 2)
>>> validate_paired_configs({
... "token": ["t1", "t2"],
... "chat_id": ["c1"] # 数量不匹配
... }, "Telegram", ["token", "chat_id"])
(False, 0)
"""
# 过滤掉空列表
non_empty_configs = {k: v for k, v in configs.items() if v}
if not non_empty_configs:
return True, 0
# 检查必须项
if required_keys:
for key in required_keys:
if key not in non_empty_configs or not non_empty_configs[key]:
return True, 0 # 必须项为空,视为未配置
# 获取所有非空配置的长度
lengths = {k: len(v) for k, v in non_empty_configs.items()}
unique_lengths = set(lengths.values())
if len(unique_lengths) > 1:
print(f"{channel_name} 配置错误:配对配置数量不一致,将跳过该渠道推送")
for key, length in lengths.items():
print(f" - {key}: {length}")
return False, 0
return True, list(unique_lengths)[0] if unique_lengths else 0
def limit_accounts(
accounts: List[str],
max_count: int,
channel_name: str
) -> List[str]:
"""
限制账号数量
当配置的账号数量超过最大限制时,只使用前 N 个账号,
并输出警告信息
Args:
accounts: 账号列表
max_count: 最大账号数量
channel_name: 渠道名称,用于日志输出
Returns:
限制后的账号列表
Examples:
>>> limit_accounts(["a1", "a2", "a3"], 2, "飞书")
飞书 配置了 3 个账号,超过最大限制 2,只使用前 2 个
['a1', 'a2']
"""
if len(accounts) > max_count:
print(f"⚠️ {channel_name} 配置了 {len(accounts)} 个账号,超过最大限制 {max_count},只使用前 {max_count}")
print(f" ⚠️ 警告:如果您是 fork 用户,过多账号可能导致 GitHub Actions 运行时间过长,存在账号风险")
return accounts[:max_count]
return accounts
def get_account_at_index(accounts: List[str], index: int, default: str = "") -> str:
"""
安全获取指定索引的账号值
当索引超出范围或账号值为空时返回默认值
Args:
accounts: 账号列表
index: 索引
default: 默认值
Returns:
账号值或默认值
Examples:
>>> get_account_at_index(["a", "b", "c"], 1)
'b'
>>> get_account_at_index(["a", "", "c"], 1, "default")
'default'
>>> get_account_at_index(["a"], 5, "default")
'default'
"""
if index < len(accounts):
return accounts[index] if accounts[index] else default
return default
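
A hypothetical end-to-end sketch (not part of this commit) of how these helpers compose for a paired channel such as Telegram:

from trendradar.core.config import (
    parse_multi_account_config, validate_paired_configs,
    limit_accounts, get_account_at_index,
)

tokens = parse_multi_account_config("t1;t2;t3")
chat_ids = parse_multi_account_config("c1;c2;c3")
ok, count = validate_paired_configs(
    {"token": tokens, "chat_id": chat_ids}, "Telegram", ["token", "chat_id"]
)
if ok and count:
    tokens = limit_accounts(tokens, 3, "Telegram")      # cap at MAX_ACCOUNTS_PER_CHANNEL
    chat_ids = limit_accounts(chat_ids, 3, "Telegram")
    for i in range(min(count, 3)):
        token = get_account_at_index(tokens, i)
        chat_id = get_account_at_index(chat_ids, i)
        # ... push to this (token, chat_id) pair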

291
trendradar/core/data.py Normal file
View File

@ -0,0 +1,291 @@
# coding=utf-8
"""
数据处理模块
提供数据读取、保存和检测功能
- save_titles_to_file: 保存标题到 TXT 文件
- read_all_today_titles: 从存储后端读取当天所有标题
- detect_latest_new_titles: 检测最新批次的新增标题
Author: TrendRadar Team
"""
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Callable
def save_titles_to_file(
results: Dict,
id_to_name: Dict,
failed_ids: List,
output_path: str,
clean_title_func: Callable[[str], str],
) -> str:
"""
保存标题到 TXT 文件
Args:
results: 抓取结果 {source_id: {title: title_data}}
id_to_name: ID 到名称的映射
failed_ids: 失败的 ID 列表
output_path: 输出文件路径
clean_title_func: 标题清理函数
Returns:
str: 保存的文件路径
"""
# 确保目录存在
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
for id_value, title_data in results.items():
# id | name 或 id
name = id_to_name.get(id_value)
if name and name != id_value:
f.write(f"{id_value} | {name}\n")
else:
f.write(f"{id_value}\n")
# 按排名排序标题
sorted_titles = []
for title, info in title_data.items():
cleaned_title = clean_title_func(title)
if isinstance(info, dict):
ranks = info.get("ranks", [])
url = info.get("url", "")
mobile_url = info.get("mobileUrl", "")
else:
ranks = info if isinstance(info, list) else []
url = ""
mobile_url = ""
rank = ranks[0] if ranks else 1
sorted_titles.append((rank, cleaned_title, url, mobile_url))
sorted_titles.sort(key=lambda x: x[0])
for rank, cleaned_title, url, mobile_url in sorted_titles:
line = f"{rank}. {cleaned_title}"
if url:
line += f" [URL:{url}]"
if mobile_url:
line += f" [MOBILE:{mobile_url}]"
f.write(line + "\n")
f.write("\n")
if failed_ids:
f.write("==== 以下ID请求失败 ====\n")
for id_value in failed_ids:
f.write(f"{id_value}\n")
return output_path
def read_all_today_titles_from_storage(
storage_manager,
current_platform_ids: Optional[List[str]] = None,
) -> Tuple[Dict, Dict, Dict]:
"""
从存储后端读取当天所有标题(SQLite 数据)
Args:
storage_manager: 存储管理器实例
current_platform_ids: 当前监控的平台 ID 列表用于过滤
Returns:
Tuple[Dict, Dict, Dict]: (all_results, id_to_name, title_info)
"""
try:
news_data = storage_manager.get_today_all_data()
if not news_data or not news_data.items:
return {}, {}, {}
all_results = {}
final_id_to_name = {}
title_info = {}
for source_id, news_list in news_data.items.items():
# 按平台过滤
if current_platform_ids is not None and source_id not in current_platform_ids:
continue
# 获取来源名称
source_name = news_data.id_to_name.get(source_id, source_id)
final_id_to_name[source_id] = source_name
if source_id not in all_results:
all_results[source_id] = {}
title_info[source_id] = {}
for item in news_list:
title = item.title
ranks = getattr(item, 'ranks', [item.rank])
first_time = getattr(item, 'first_time', item.crawl_time)
last_time = getattr(item, 'last_time', item.crawl_time)
count = getattr(item, 'count', 1)
all_results[source_id][title] = {
"ranks": ranks,
"url": item.url or "",
"mobileUrl": item.mobile_url or "",
}
title_info[source_id][title] = {
"first_time": first_time,
"last_time": last_time,
"count": count,
"ranks": ranks,
"url": item.url or "",
"mobileUrl": item.mobile_url or "",
}
return all_results, final_id_to_name, title_info
except Exception as e:
print(f"[存储] 从存储后端读取数据失败: {e}")
return {}, {}, {}
def read_all_today_titles(
storage_manager,
current_platform_ids: Optional[List[str]] = None,
) -> Tuple[Dict, Dict, Dict]:
"""
读取当天所有标题(从存储后端)
Args:
storage_manager: 存储管理器实例
current_platform_ids: 当前监控的平台 ID 列表用于过滤
Returns:
Tuple[Dict, Dict, Dict]: (all_results, id_to_name, title_info)
"""
all_results, final_id_to_name, title_info = read_all_today_titles_from_storage(
storage_manager, current_platform_ids
)
if all_results:
total_count = sum(len(titles) for titles in all_results.values())
print(f"[存储] 已从存储后端读取 {total_count} 条标题")
else:
print("[存储] 当天暂无数据")
return all_results, final_id_to_name, title_info
def detect_latest_new_titles_from_storage(
storage_manager,
current_platform_ids: Optional[List[str]] = None,
) -> Dict:
"""
从存储后端检测最新批次的新增标题
Args:
storage_manager: 存储管理器实例
current_platform_ids: 当前监控的平台 ID 列表用于过滤
Returns:
Dict: 新增标题 {source_id: {title: title_data}}
"""
try:
# 获取最新抓取数据
latest_data = storage_manager.get_latest_crawl_data()
if not latest_data or not latest_data.items:
return {}
# 获取所有历史数据
all_data = storage_manager.get_today_all_data()
if not all_data or not all_data.items:
# 没有历史数据(第一次抓取),不应该有"新增"标题
return {}
# 收集历史标题(不包括最新批次的时间)
latest_time = latest_data.crawl_time
historical_titles = {}
for source_id, news_list in all_data.items.items():
if current_platform_ids is not None and source_id not in current_platform_ids:
continue
historical_titles[source_id] = set()
for item in news_list:
# 只统计非最新批次的标题
first_time = getattr(item, 'first_time', item.crawl_time)
if first_time != latest_time:
historical_titles[source_id].add(item.title)
# 检查是否是当天第一次抓取(没有任何历史标题)
# 如果所有平台的历史标题集合都为空,说明只有一个抓取批次,不应该有"新增"标题
has_historical_data = any(len(titles) > 0 for titles in historical_titles.values())
if not has_historical_data:
return {}
# 找出新增标题
new_titles = {}
for source_id, news_list in latest_data.items.items():
if current_platform_ids is not None and source_id not in current_platform_ids:
continue
historical_set = historical_titles.get(source_id, set())
source_new_titles = {}
for item in news_list:
if item.title not in historical_set:
source_new_titles[item.title] = {
"ranks": [item.rank],
"url": item.url or "",
"mobileUrl": item.mobile_url or "",
}
if source_new_titles:
new_titles[source_id] = source_new_titles
return new_titles
except Exception as e:
print(f"[存储] 从存储后端检测新标题失败: {e}")
return {}
def detect_latest_new_titles(
storage_manager,
current_platform_ids: Optional[List[str]] = None,
) -> Dict:
"""
检测当日最新批次的新增标题(从存储后端)
Args:
storage_manager: 存储管理器实例
current_platform_ids: 当前监控的平台 ID 列表用于过滤
Returns:
Dict: 新增标题 {source_id: {title: title_data}}
"""
new_titles = detect_latest_new_titles_from_storage(storage_manager, current_platform_ids)
if new_titles:
total_new = sum(len(titles) for titles in new_titles.values())
print(f"[存储] 从存储后端检测到 {total_new} 条新增标题")
return new_titles
def is_first_crawl_today(output_dir: str, date_folder: str) -> bool:
"""
检测是否是当天第一次爬取
Args:
output_dir: 输出目录
date_folder: 日期文件夹名称
Returns:
bool: 是否是当天第一次爬取
"""
txt_dir = Path(output_dir) / date_folder / "txt"
if not txt_dir.exists():
return True
files = sorted([f for f in txt_dir.iterdir() if f.suffix == ".txt"])
return len(files) <= 1
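
A hedged sketch (not part of this commit) of reading today's titles and detecting the latest new ones through a storage manager obtained from AppContext:

from trendradar.core import load_config
from trendradar.core.data import read_all_today_titles, detect_latest_new_titles
from trendradar.context import AppContext

ctx = AppContext(load_config())
storage = ctx.get_storage_manager()
all_results, id_to_name, title_info = read_all_today_titles(storage, ctx.platform_ids)
new_titles = detect_latest_new_titles(storage, ctx.platform_ids)
print(len(all_results), sum(len(t) for t in new_titles.values()))
ctx.cleanup()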

View File

@ -0,0 +1,194 @@
# coding=utf-8
"""
频率词配置加载模块
负责从配置文件加载频率词规则,支持:
- 普通词组
- 必须词(+ 前缀)
- 过滤词(! 前缀)
- 全局过滤词([GLOBAL_FILTER] 区域)
- 最大显示数量(@ 前缀)
"""
import os
from pathlib import Path
from typing import Dict, List, Tuple, Optional
def load_frequency_words(
frequency_file: Optional[str] = None,
) -> Tuple[List[Dict], List[str], List[str]]:
"""
加载频率词配置
配置文件格式说明:
- 每个词组由空行分隔
- [GLOBAL_FILTER] 区域定义全局过滤词
- [WORD_GROUPS] 区域定义词组(默认)
词组语法:
- 普通词:直接写入,任意匹配即可
- +必须词:所有必须词都要匹配
- !过滤词:匹配则排除
- @数字:该词组最多显示的条数
Args:
frequency_file: 频率词配置文件路径,默认从环境变量 FREQUENCY_WORDS_PATH 获取,或使用 config/frequency_words.txt
Returns:
(词组列表, 词组内过滤词, 全局过滤词)
Raises:
FileNotFoundError: 频率词文件不存在
"""
if frequency_file is None:
frequency_file = os.environ.get(
"FREQUENCY_WORDS_PATH", "config/frequency_words.txt"
)
frequency_path = Path(frequency_file)
if not frequency_path.exists():
raise FileNotFoundError(f"频率词文件 {frequency_file} 不存在")
with open(frequency_path, "r", encoding="utf-8") as f:
content = f.read()
word_groups = [group.strip() for group in content.split("\n\n") if group.strip()]
processed_groups = []
filter_words = []
global_filters = []
# 默认区域(向后兼容)
current_section = "WORD_GROUPS"
for group in word_groups:
lines = [line.strip() for line in group.split("\n") if line.strip()]
if not lines:
continue
# 检查是否为区域标记
if lines[0].startswith("[") and lines[0].endswith("]"):
section_name = lines[0][1:-1].upper()
if section_name in ("GLOBAL_FILTER", "WORD_GROUPS"):
current_section = section_name
lines = lines[1:] # 移除标记行
# 处理全局过滤区域
if current_section == "GLOBAL_FILTER":
# 直接添加所有非空行到全局过滤列表
for line in lines:
# 忽略特殊语法前缀,只提取纯文本
if line.startswith(("!", "+", "@")):
continue # 全局过滤区不支持特殊语法
if line:
global_filters.append(line)
continue
# 处理词组区域
words = lines
group_required_words = []
group_normal_words = []
group_filter_words = []
group_max_count = 0 # 默认不限制
for word in words:
if word.startswith("@"):
# 解析最大显示数量(只接受正整数)
try:
count = int(word[1:])
if count > 0:
group_max_count = count
except (ValueError, IndexError):
pass # 忽略无效的@数字格式
elif word.startswith("!"):
filter_words.append(word[1:])
group_filter_words.append(word[1:])
elif word.startswith("+"):
group_required_words.append(word[1:])
else:
group_normal_words.append(word)
if group_required_words or group_normal_words:
if group_normal_words:
group_key = " ".join(group_normal_words)
else:
group_key = " ".join(group_required_words)
processed_groups.append(
{
"required": group_required_words,
"normal": group_normal_words,
"group_key": group_key,
"max_count": group_max_count,
}
)
return processed_groups, filter_words, global_filters
def matches_word_groups(
title: str,
word_groups: List[Dict],
filter_words: List[str],
global_filters: Optional[List[str]] = None
) -> bool:
"""
检查标题是否匹配词组规则
Args:
title: 标题文本
word_groups: 词组列表
filter_words: 过滤词列表
global_filters: 全局过滤词列表
Returns:
是否匹配
"""
# 防御性类型检查:确保 title 是有效字符串
if not isinstance(title, str):
title = str(title) if title is not None else ""
if not title.strip():
return False
title_lower = title.lower()
# 全局过滤检查(优先级最高)
if global_filters:
if any(global_word.lower() in title_lower for global_word in global_filters):
return False
# 如果没有配置词组,则匹配所有标题(支持显示全部新闻)
if not word_groups:
return True
# 过滤词检查
if any(filter_word.lower() in title_lower for filter_word in filter_words):
return False
# 词组匹配检查
for group in word_groups:
required_words = group["required"]
normal_words = group["normal"]
# 必须词检查
if required_words:
all_required_present = all(
req_word.lower() in title_lower for req_word in required_words
)
if not all_required_present:
continue
# 普通词检查
if normal_words:
any_normal_present = any(
normal_word.lower() in title_lower for normal_word in normal_words
)
if not any_normal_present:
continue
return True
return False
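
An illustrative frequency_words.txt (the file contents below are invented for this sketch) together with the matching behaviour defined above:

from trendradar.core.frequency import load_frequency_words, matches_word_groups

# Hypothetical config/frequency_words.txt, blocks separated by blank lines:
#
#   [GLOBAL_FILTER]
#   广告
#
#   [WORD_GROUPS]
#   AI
#   大模型
#   !招聘
#   @5
#
#   +世界杯
#   +决赛
groups, filters, global_filters = load_frequency_words("config/frequency_words.txt")

matches_word_groups("大模型发布新版本", groups, filters, global_filters)    # True,命中普通词
matches_word_groups("大模型岗位招聘", groups, filters, global_filters)      # False,被 !招聘 过滤
matches_word_groups("广告:AI 课程促销", groups, filters, global_filters)    # False,被全局过滤词拦截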

332
trendradar/core/loader.py Normal file
View File

@ -0,0 +1,332 @@
# coding=utf-8
"""
配置加载模块
负责从 YAML 配置文件和环境变量加载配置
"""
import os
from pathlib import Path
from typing import Dict, Any, Optional
import yaml
from .config import parse_multi_account_config, validate_paired_configs
def _get_env_bool(key: str, default: bool = False) -> Optional[bool]:
"""从环境变量获取布尔值,如果未设置返回 None"""
value = os.environ.get(key, "").strip().lower()
if not value:
return None
return value in ("true", "1")
def _get_env_int(key: str, default: int = 0) -> int:
"""从环境变量获取整数值"""
value = os.environ.get(key, "").strip()
if not value:
return default
try:
return int(value)
except ValueError:
return default
def _get_env_str(key: str, default: str = "") -> str:
"""从环境变量获取字符串值"""
return os.environ.get(key, "").strip() or default
def _load_app_config(config_data: Dict) -> Dict:
"""加载应用配置"""
app_config = config_data.get("app", {})
return {
"VERSION_CHECK_URL": app_config.get("version_check_url", ""),
"SHOW_VERSION_UPDATE": app_config.get("show_version_update", True),
"TIMEZONE": _get_env_str("TIMEZONE") or app_config.get("timezone", "Asia/Shanghai"),
}
def _load_crawler_config(config_data: Dict) -> Dict:
"""加载爬虫配置"""
crawler_config = config_data.get("crawler", {})
enable_crawler_env = _get_env_bool("ENABLE_CRAWLER")
return {
"REQUEST_INTERVAL": crawler_config.get("request_interval", 100),
"USE_PROXY": crawler_config.get("use_proxy", False),
"DEFAULT_PROXY": crawler_config.get("default_proxy", ""),
"ENABLE_CRAWLER": enable_crawler_env if enable_crawler_env is not None else crawler_config.get("enable_crawler", True),
}
def _load_report_config(config_data: Dict) -> Dict:
"""加载报告配置"""
report_config = config_data.get("report", {})
# 环境变量覆盖
sort_by_position_env = _get_env_bool("SORT_BY_POSITION_FIRST")
reverse_content_env = _get_env_bool("REVERSE_CONTENT_ORDER")
max_news_env = _get_env_int("MAX_NEWS_PER_KEYWORD")
return {
"REPORT_MODE": _get_env_str("REPORT_MODE") or report_config.get("mode", "daily"),
"RANK_THRESHOLD": report_config.get("rank_threshold", 10),
"SORT_BY_POSITION_FIRST": sort_by_position_env if sort_by_position_env is not None else report_config.get("sort_by_position_first", False),
"MAX_NEWS_PER_KEYWORD": max_news_env or report_config.get("max_news_per_keyword", 0),
"REVERSE_CONTENT_ORDER": reverse_content_env if reverse_content_env is not None else report_config.get("reverse_content_order", False),
}
def _load_notification_config(config_data: Dict) -> Dict:
"""加载通知配置"""
notification = config_data.get("notification", {})
enable_notification_env = _get_env_bool("ENABLE_NOTIFICATION")
return {
"ENABLE_NOTIFICATION": enable_notification_env if enable_notification_env is not None else notification.get("enable_notification", True),
"MESSAGE_BATCH_SIZE": notification.get("message_batch_size", 4000),
"DINGTALK_BATCH_SIZE": notification.get("dingtalk_batch_size", 20000),
"FEISHU_BATCH_SIZE": notification.get("feishu_batch_size", 29000),
"BARK_BATCH_SIZE": notification.get("bark_batch_size", 3600),
"SLACK_BATCH_SIZE": notification.get("slack_batch_size", 4000),
"BATCH_SEND_INTERVAL": notification.get("batch_send_interval", 1.0),
"FEISHU_MESSAGE_SEPARATOR": notification.get("feishu_message_separator", "---"),
"MAX_ACCOUNTS_PER_CHANNEL": _get_env_int("MAX_ACCOUNTS_PER_CHANNEL") or notification.get("max_accounts_per_channel", 3),
}
def _load_push_window_config(config_data: Dict) -> Dict:
"""加载推送窗口配置"""
notification = config_data.get("notification", {})
push_window = notification.get("push_window", {})
time_range = push_window.get("time_range", {})
enabled_env = _get_env_bool("PUSH_WINDOW_ENABLED")
once_per_day_env = _get_env_bool("PUSH_WINDOW_ONCE_PER_DAY")
return {
"ENABLED": enabled_env if enabled_env is not None else push_window.get("enabled", False),
"TIME_RANGE": {
"START": _get_env_str("PUSH_WINDOW_START") or time_range.get("start", "08:00"),
"END": _get_env_str("PUSH_WINDOW_END") or time_range.get("end", "22:00"),
},
"ONCE_PER_DAY": once_per_day_env if once_per_day_env is not None else push_window.get("once_per_day", True),
}
def _load_weight_config(config_data: Dict) -> Dict:
"""加载权重配置"""
weight = config_data.get("weight", {})
return {
"RANK_WEIGHT": weight.get("rank_weight", 1.0),
"FREQUENCY_WEIGHT": weight.get("frequency_weight", 1.0),
"HOTNESS_WEIGHT": weight.get("hotness_weight", 1.0),
}
def _load_storage_config(config_data: Dict) -> Dict:
"""加载存储配置"""
storage = config_data.get("storage", {})
formats = storage.get("formats", {})
local = storage.get("local", {})
remote = storage.get("remote", {})
pull = storage.get("pull", {})
txt_enabled_env = _get_env_bool("STORAGE_TXT_ENABLED")
html_enabled_env = _get_env_bool("STORAGE_HTML_ENABLED")
pull_enabled_env = _get_env_bool("PULL_ENABLED")
return {
"BACKEND": _get_env_str("STORAGE_BACKEND") or storage.get("backend", "auto"),
"FORMATS": {
"SQLITE": formats.get("sqlite", True),
"TXT": txt_enabled_env if txt_enabled_env is not None else formats.get("txt", True),
"HTML": html_enabled_env if html_enabled_env is not None else formats.get("html", True),
},
"LOCAL": {
"DATA_DIR": local.get("data_dir", "output"),
"RETENTION_DAYS": _get_env_int("LOCAL_RETENTION_DAYS") or local.get("retention_days", 0),
},
"REMOTE": {
"ENDPOINT_URL": _get_env_str("S3_ENDPOINT_URL") or remote.get("endpoint_url", ""),
"BUCKET_NAME": _get_env_str("S3_BUCKET_NAME") or remote.get("bucket_name", ""),
"ACCESS_KEY_ID": _get_env_str("S3_ACCESS_KEY_ID") or remote.get("access_key_id", ""),
"SECRET_ACCESS_KEY": _get_env_str("S3_SECRET_ACCESS_KEY") or remote.get("secret_access_key", ""),
"REGION": _get_env_str("S3_REGION") or remote.get("region", ""),
"RETENTION_DAYS": _get_env_int("REMOTE_RETENTION_DAYS") or remote.get("retention_days", 0),
},
"PULL": {
"ENABLED": pull_enabled_env if pull_enabled_env is not None else pull.get("enabled", False),
"DAYS": _get_env_int("PULL_DAYS") or pull.get("days", 7),
},
}
def _load_webhook_config(config_data: Dict) -> Dict:
"""加载 Webhook 配置"""
notification = config_data.get("notification", {})
webhooks = notification.get("webhooks", {})
return {
# 飞书
"FEISHU_WEBHOOK_URL": _get_env_str("FEISHU_WEBHOOK_URL") or webhooks.get("feishu_url", ""),
# 钉钉
"DINGTALK_WEBHOOK_URL": _get_env_str("DINGTALK_WEBHOOK_URL") or webhooks.get("dingtalk_url", ""),
# 企业微信
"WEWORK_WEBHOOK_URL": _get_env_str("WEWORK_WEBHOOK_URL") or webhooks.get("wework_url", ""),
"WEWORK_MSG_TYPE": _get_env_str("WEWORK_MSG_TYPE") or webhooks.get("wework_msg_type", "markdown"),
# Telegram
"TELEGRAM_BOT_TOKEN": _get_env_str("TELEGRAM_BOT_TOKEN") or webhooks.get("telegram_bot_token", ""),
"TELEGRAM_CHAT_ID": _get_env_str("TELEGRAM_CHAT_ID") or webhooks.get("telegram_chat_id", ""),
# 邮件
"EMAIL_FROM": _get_env_str("EMAIL_FROM") or webhooks.get("email_from", ""),
"EMAIL_PASSWORD": _get_env_str("EMAIL_PASSWORD") or webhooks.get("email_password", ""),
"EMAIL_TO": _get_env_str("EMAIL_TO") or webhooks.get("email_to", ""),
"EMAIL_SMTP_SERVER": _get_env_str("EMAIL_SMTP_SERVER") or webhooks.get("email_smtp_server", ""),
"EMAIL_SMTP_PORT": _get_env_str("EMAIL_SMTP_PORT") or webhooks.get("email_smtp_port", ""),
# ntfy
"NTFY_SERVER_URL": _get_env_str("NTFY_SERVER_URL") or webhooks.get("ntfy_server_url") or "https://ntfy.sh",
"NTFY_TOPIC": _get_env_str("NTFY_TOPIC") or webhooks.get("ntfy_topic", ""),
"NTFY_TOKEN": _get_env_str("NTFY_TOKEN") or webhooks.get("ntfy_token", ""),
# Bark
"BARK_URL": _get_env_str("BARK_URL") or webhooks.get("bark_url", ""),
# Slack
"SLACK_WEBHOOK_URL": _get_env_str("SLACK_WEBHOOK_URL") or webhooks.get("slack_webhook_url", ""),
}
def _print_notification_sources(config: Dict) -> None:
"""打印通知渠道配置来源信息"""
notification_sources = []
max_accounts = config["MAX_ACCOUNTS_PER_CHANNEL"]
if config["FEISHU_WEBHOOK_URL"]:
accounts = parse_multi_account_config(config["FEISHU_WEBHOOK_URL"])
count = min(len(accounts), max_accounts)
source = "环境变量" if os.environ.get("FEISHU_WEBHOOK_URL") else "配置文件"
notification_sources.append(f"飞书({source}, {count}个账号)")
if config["DINGTALK_WEBHOOK_URL"]:
accounts = parse_multi_account_config(config["DINGTALK_WEBHOOK_URL"])
count = min(len(accounts), max_accounts)
source = "环境变量" if os.environ.get("DINGTALK_WEBHOOK_URL") else "配置文件"
notification_sources.append(f"钉钉({source}, {count}个账号)")
if config["WEWORK_WEBHOOK_URL"]:
accounts = parse_multi_account_config(config["WEWORK_WEBHOOK_URL"])
count = min(len(accounts), max_accounts)
source = "环境变量" if os.environ.get("WEWORK_WEBHOOK_URL") else "配置文件"
notification_sources.append(f"企业微信({source}, {count}个账号)")
if config["TELEGRAM_BOT_TOKEN"] and config["TELEGRAM_CHAT_ID"]:
tokens = parse_multi_account_config(config["TELEGRAM_BOT_TOKEN"])
chat_ids = parse_multi_account_config(config["TELEGRAM_CHAT_ID"])
valid, count = validate_paired_configs(
{"bot_token": tokens, "chat_id": chat_ids},
"Telegram",
required_keys=["bot_token", "chat_id"]
)
if valid and count > 0:
count = min(count, max_accounts)
token_source = "环境变量" if os.environ.get("TELEGRAM_BOT_TOKEN") else "配置文件"
notification_sources.append(f"Telegram({token_source}, {count}个账号)")
if config["EMAIL_FROM"] and config["EMAIL_PASSWORD"] and config["EMAIL_TO"]:
from_source = "环境变量" if os.environ.get("EMAIL_FROM") else "配置文件"
notification_sources.append(f"邮件({from_source})")
if config["NTFY_SERVER_URL"] and config["NTFY_TOPIC"]:
topics = parse_multi_account_config(config["NTFY_TOPIC"])
tokens = parse_multi_account_config(config["NTFY_TOKEN"])
if tokens:
valid, count = validate_paired_configs(
{"topic": topics, "token": tokens},
"ntfy"
)
if valid and count > 0:
count = min(count, max_accounts)
server_source = "环境变量" if os.environ.get("NTFY_SERVER_URL") else "配置文件"
notification_sources.append(f"ntfy({server_source}, {count}个账号)")
else:
count = min(len(topics), max_accounts)
server_source = "环境变量" if os.environ.get("NTFY_SERVER_URL") else "配置文件"
notification_sources.append(f"ntfy({server_source}, {count}个账号)")
if config["BARK_URL"]:
accounts = parse_multi_account_config(config["BARK_URL"])
count = min(len(accounts), max_accounts)
bark_source = "环境变量" if os.environ.get("BARK_URL") else "配置文件"
notification_sources.append(f"Bark({bark_source}, {count}个账号)")
if config["SLACK_WEBHOOK_URL"]:
accounts = parse_multi_account_config(config["SLACK_WEBHOOK_URL"])
count = min(len(accounts), max_accounts)
slack_source = "环境变量" if os.environ.get("SLACK_WEBHOOK_URL") else "配置文件"
notification_sources.append(f"Slack({slack_source}, {count}个账号)")
if notification_sources:
print(f"通知渠道配置来源: {', '.join(notification_sources)}")
print(f"每个渠道最大账号数: {max_accounts}")
else:
print("未配置任何通知渠道")
def load_config(config_path: Optional[str] = None) -> Dict[str, Any]:
"""
加载配置文件
Args:
config_path: 配置文件路径默认从环境变量 CONFIG_PATH 获取或使用 config/config.yaml
Returns:
包含所有配置的字典
Raises:
FileNotFoundError: 配置文件不存在
"""
if config_path is None:
config_path = os.environ.get("CONFIG_PATH", "config/config.yaml")
if not Path(config_path).exists():
raise FileNotFoundError(f"配置文件 {config_path} 不存在")
with open(config_path, "r", encoding="utf-8") as f:
config_data = yaml.safe_load(f)
print(f"配置文件加载成功: {config_path}")
# 合并所有配置
config = {}
# 应用配置
config.update(_load_app_config(config_data))
# 爬虫配置
config.update(_load_crawler_config(config_data))
# 报告配置
config.update(_load_report_config(config_data))
# 通知配置
config.update(_load_notification_config(config_data))
# 推送窗口配置
config["PUSH_WINDOW"] = _load_push_window_config(config_data)
# 权重配置
config["WEIGHT_CONFIG"] = _load_weight_config(config_data)
# 平台配置
config["PLATFORMS"] = config_data.get("platforms", [])
# 存储配置
config["STORAGE"] = _load_storage_config(config_data)
# Webhook 配置
config.update(_load_webhook_config(config_data))
# 打印通知渠道配置来源
_print_notification_sources(config)
return config

View File

@ -0,0 +1,8 @@
# coding=utf-8
"""
爬虫模块 - 数据抓取功能
"""
from trendradar.crawler.fetcher import DataFetcher
__all__ = ["DataFetcher"]

View File

@ -0,0 +1,184 @@
# coding=utf-8
"""
数据获取器模块
负责从 NewsNow API 抓取新闻数据支持
- 单个平台数据获取
- 批量平台数据爬取
- 自动重试机制
- 代理支持
"""
import json
import random
import time
from typing import Dict, List, Tuple, Optional, Union
import requests
class DataFetcher:
"""数据获取器"""
# 默认 API 地址
DEFAULT_API_URL = "https://newsnow.busiyi.world/api/s"
# 默认请求头
DEFAULT_HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
"Accept": "application/json, text/plain, */*",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Connection": "keep-alive",
"Cache-Control": "no-cache",
}
def __init__(
self,
proxy_url: Optional[str] = None,
api_url: Optional[str] = None,
):
"""
初始化数据获取器
Args:
proxy_url: 代理服务器 URL可选
api_url: API 基础 URL可选默认使用 DEFAULT_API_URL
"""
self.proxy_url = proxy_url
self.api_url = api_url or self.DEFAULT_API_URL
def fetch_data(
self,
id_info: Union[str, Tuple[str, str]],
max_retries: int = 2,
min_retry_wait: int = 3,
max_retry_wait: int = 5,
) -> Tuple[Optional[str], str, str]:
"""
获取指定ID数据支持重试
Args:
id_info: 平台ID (平台ID, 别名) 元组
max_retries: 最大重试次数
min_retry_wait: 最小重试等待时间
max_retry_wait: 最大重试等待时间
Returns:
(响应文本, 平台ID, 别名) 元组失败时响应文本为 None
"""
if isinstance(id_info, tuple):
id_value, alias = id_info
else:
id_value = id_info
alias = id_value
url = f"{self.api_url}?id={id_value}&latest"
proxies = None
if self.proxy_url:
proxies = {"http": self.proxy_url, "https": self.proxy_url}
retries = 0
while retries <= max_retries:
try:
response = requests.get(
url,
proxies=proxies,
headers=self.DEFAULT_HEADERS,
timeout=10,
)
response.raise_for_status()
data_text = response.text
data_json = json.loads(data_text)
status = data_json.get("status", "未知")
if status not in ["success", "cache"]:
raise ValueError(f"响应状态异常: {status}")
status_info = "最新数据" if status == "success" else "缓存数据"
print(f"获取 {id_value} 成功({status_info}")
return data_text, id_value, alias
except Exception as e:
retries += 1
if retries <= max_retries:
base_wait = random.uniform(min_retry_wait, max_retry_wait)
additional_wait = (retries - 1) * random.uniform(1, 2)
wait_time = base_wait + additional_wait
print(f"请求 {id_value} 失败: {e}. {wait_time:.2f}秒后重试...")
time.sleep(wait_time)
else:
print(f"请求 {id_value} 失败: {e}")
return None, id_value, alias
return None, id_value, alias
def crawl_websites(
self,
ids_list: List[Union[str, Tuple[str, str]]],
request_interval: int = 100,
) -> Tuple[Dict, Dict, List]:
"""
爬取多个网站数据
Args:
ids_list: 平台ID列表每个元素可以是字符串或 (平台ID, 别名) 元组
request_interval: 请求间隔毫秒
Returns:
(结果字典, ID到名称的映射, 失败ID列表) 元组
"""
results = {}
id_to_name = {}
failed_ids = []
for i, id_info in enumerate(ids_list):
if isinstance(id_info, tuple):
id_value, name = id_info
else:
id_value = id_info
name = id_value
id_to_name[id_value] = name
response, _, _ = self.fetch_data(id_info)
if response:
try:
data = json.loads(response)
results[id_value] = {}
for index, item in enumerate(data.get("items", []), 1):
title = item.get("title")
# 跳过无效标题None、float、空字符串
if title is None or isinstance(title, float) or not str(title).strip():
continue
title = str(title).strip()
url = item.get("url", "")
mobile_url = item.get("mobileUrl", "")
if title in results[id_value]:
results[id_value][title]["ranks"].append(index)
else:
results[id_value][title] = {
"ranks": [index],
"url": url,
"mobileUrl": mobile_url,
}
except json.JSONDecodeError:
print(f"解析 {id_value} 响应失败")
failed_ids.append(id_value)
except Exception as e:
print(f"处理 {id_value} 数据出错: {e}")
failed_ids.append(id_value)
else:
failed_ids.append(id_value)
# 请求间隔(除了最后一个)
if i < len(ids_list) - 1:
actual_interval = request_interval + random.randint(-10, 20)
actual_interval = max(50, actual_interval)
time.sleep(actual_interval / 1000)
print(f"成功: {list(results.keys())}, 失败: {failed_ids}")
return results, id_to_name, failed_ids

View File

@ -0,0 +1,81 @@
# coding=utf-8
"""
通知推送模块
提供多渠道通知推送功能包括
- 飞书钉钉企业微信
- TelegramSlack
- EmailntfyBark
模块结构
- push_manager: 推送记录管理
- formatters: 内容格式转换
- batch: 批次处理工具
- renderer: 通知内容渲染
- splitter: 消息分批拆分
- senders: 消息发送器各渠道发送函数
- dispatcher: 多账号通知调度器
"""
from trendradar.notification.push_manager import PushRecordManager
from trendradar.notification.formatters import (
strip_markdown,
convert_markdown_to_mrkdwn,
)
from trendradar.notification.batch import (
get_batch_header,
get_max_batch_header_size,
truncate_to_bytes,
add_batch_headers,
)
from trendradar.notification.renderer import (
render_feishu_content,
render_dingtalk_content,
)
from trendradar.notification.splitter import (
split_content_into_batches,
DEFAULT_BATCH_SIZES,
)
from trendradar.notification.senders import (
send_to_feishu,
send_to_dingtalk,
send_to_wework,
send_to_telegram,
send_to_email,
send_to_ntfy,
send_to_bark,
send_to_slack,
SMTP_CONFIGS,
)
from trendradar.notification.dispatcher import NotificationDispatcher
__all__ = [
# 推送记录管理
"PushRecordManager",
# 格式转换
"strip_markdown",
"convert_markdown_to_mrkdwn",
# 批次处理
"get_batch_header",
"get_max_batch_header_size",
"truncate_to_bytes",
"add_batch_headers",
# 内容渲染
"render_feishu_content",
"render_dingtalk_content",
# 消息分批
"split_content_into_batches",
"DEFAULT_BATCH_SIZES",
# 消息发送器
"send_to_feishu",
"send_to_dingtalk",
"send_to_wework",
"send_to_telegram",
"send_to_email",
"send_to_ntfy",
"send_to_bark",
"send_to_slack",
"SMTP_CONFIGS",
# 通知调度器
"NotificationDispatcher",
]

View File

@ -0,0 +1,115 @@
# coding=utf-8
"""
批次处理模块
提供消息分批发送的辅助函数
"""
from typing import List
def get_batch_header(format_type: str, batch_num: int, total_batches: int) -> str:
"""根据 format_type 生成对应格式的批次头部
Args:
format_type: 推送类型telegram, slack, wework_text, bark, feishu, dingtalk, ntfy, wework
batch_num: 当前批次编号
total_batches: 总批次数
Returns:
格式化的批次头部字符串
"""
if format_type == "telegram":
return f"<b>[第 {batch_num}/{total_batches} 批次]</b>\n\n"
elif format_type == "slack":
return f"*[第 {batch_num}/{total_batches} 批次]*\n\n"
elif format_type in ("wework_text", "bark"):
# 企业微信文本模式和 Bark 使用纯文本格式
return f"[第 {batch_num}/{total_batches} 批次]\n\n"
else:
# 飞书、钉钉、ntfy、企业微信 markdown 模式
return f"**[第 {batch_num}/{total_batches} 批次]**\n\n"
def get_max_batch_header_size(format_type: str) -> int:
"""估算批次头部的最大字节数(假设最多 99 批次)
用于在分批时预留空间避免事后截断破坏内容完整性
Args:
format_type: 推送类型
Returns:
最大头部字节数
"""
# 生成最坏情况的头部99/99 批次)
max_header = get_batch_header(format_type, 99, 99)
return len(max_header.encode("utf-8"))
def truncate_to_bytes(text: str, max_bytes: int) -> str:
"""安全截断字符串到指定字节数,避免截断多字节字符
Args:
text: 要截断的文本
max_bytes: 最大字节数
Returns:
截断后的文本
"""
text_bytes = text.encode("utf-8")
if len(text_bytes) <= max_bytes:
return text
# 截断到指定字节数
truncated = text_bytes[:max_bytes]
# 处理可能的不完整 UTF-8 字符
for i in range(min(4, len(truncated))):
try:
return truncated[: len(truncated) - i].decode("utf-8")
except UnicodeDecodeError:
continue
# 极端情况:返回空字符串
return ""
def add_batch_headers(
batches: List[str], format_type: str, max_bytes: int
) -> List[str]:
"""为批次添加头部,动态计算确保总大小不超过限制
Args:
batches: 原始批次列表
format_type: 推送类型bark, telegram, feishu
max_bytes: 该推送类型的最大字节限制
Returns:
添加头部后的批次列表
"""
if len(batches) <= 1:
return batches
total = len(batches)
result = []
for i, content in enumerate(batches, 1):
# 生成批次头部
header = get_batch_header(format_type, i, total)
header_size = len(header.encode("utf-8"))
# 动态计算允许的最大内容大小
max_content_size = max_bytes - header_size
content_size = len(content.encode("utf-8"))
# 如果超出,截断到安全大小
if content_size > max_content_size:
print(
f"警告:{format_type}{i}/{total} 批次内容({content_size}字节) + 头部({header_size}字节) 超出限制({max_bytes}字节),截断到 {max_content_size} 字节"
)
content = truncate_to_bytes(content, max_content_size)
result.append(header + content)
return result

View File

@ -0,0 +1,420 @@
# coding=utf-8
"""
通知调度器模块
提供统一的通知分发接口
支持所有通知渠道的多账号配置使用 `;` 分隔多个账号
使用示例:
dispatcher = NotificationDispatcher(config, get_time_func, split_content_func)
results = dispatcher.dispatch_all(report_data, report_type, ...)
"""
from typing import Any, Callable, Dict, List, Optional
from trendradar.core.config import (
get_account_at_index,
limit_accounts,
parse_multi_account_config,
validate_paired_configs,
)
from .senders import (
send_to_bark,
send_to_dingtalk,
send_to_email,
send_to_feishu,
send_to_ntfy,
send_to_slack,
send_to_telegram,
send_to_wework,
)
class NotificationDispatcher:
"""
统一的多账号通知调度器
将多账号发送逻辑封装提供简洁的 dispatch_all 接口
内部处理账号解析数量限制配对验证等逻辑
"""
def __init__(
self,
config: Dict[str, Any],
get_time_func: Callable,
split_content_func: Callable,
):
"""
初始化通知调度器
Args:
config: 完整的配置字典包含所有通知渠道的配置
get_time_func: 获取当前时间的函数
split_content_func: 内容分批函数
"""
self.config = config
self.get_time_func = get_time_func
self.split_content_func = split_content_func
self.max_accounts = config.get("MAX_ACCOUNTS_PER_CHANNEL", 3)
def dispatch_all(
self,
report_data: Dict,
report_type: str,
update_info: Optional[Dict] = None,
proxy_url: Optional[str] = None,
mode: str = "daily",
html_file_path: Optional[str] = None,
) -> Dict[str, bool]:
"""
分发通知到所有已配置的渠道
Args:
report_data: 报告数据 prepare_report_data 生成
report_type: 报告类型 "当日汇总""实时增量"
update_info: 版本更新信息可选
proxy_url: 代理 URL可选
mode: 报告模式 (daily/current/incremental)
html_file_path: HTML 报告文件路径邮件使用
Returns:
Dict[str, bool]: 每个渠道的发送结果key 为渠道名value 为是否成功
"""
results = {}
# 飞书
if self.config.get("FEISHU_WEBHOOK_URL"):
results["feishu"] = self._send_feishu(
report_data, report_type, update_info, proxy_url, mode
)
# 钉钉
if self.config.get("DINGTALK_WEBHOOK_URL"):
results["dingtalk"] = self._send_dingtalk(
report_data, report_type, update_info, proxy_url, mode
)
# 企业微信
if self.config.get("WEWORK_WEBHOOK_URL"):
results["wework"] = self._send_wework(
report_data, report_type, update_info, proxy_url, mode
)
# Telegram需要配对验证
if self.config.get("TELEGRAM_BOT_TOKEN") and self.config.get("TELEGRAM_CHAT_ID"):
results["telegram"] = self._send_telegram(
report_data, report_type, update_info, proxy_url, mode
)
# ntfy需要配对验证
if self.config.get("NTFY_SERVER_URL") and self.config.get("NTFY_TOPIC"):
results["ntfy"] = self._send_ntfy(
report_data, report_type, update_info, proxy_url, mode
)
# Bark
if self.config.get("BARK_URL"):
results["bark"] = self._send_bark(
report_data, report_type, update_info, proxy_url, mode
)
# Slack
if self.config.get("SLACK_WEBHOOK_URL"):
results["slack"] = self._send_slack(
report_data, report_type, update_info, proxy_url, mode
)
# 邮件(保持原有逻辑,已支持多收件人)
if (
self.config.get("EMAIL_FROM")
and self.config.get("EMAIL_PASSWORD")
and self.config.get("EMAIL_TO")
):
results["email"] = self._send_email(report_type, html_file_path)
return results
def _send_to_multi_accounts(
self,
channel_name: str,
config_value: str,
send_func: Callable[..., bool],
**kwargs,
) -> bool:
"""
通用多账号发送逻辑
Args:
channel_name: 渠道名称用于日志和账号数量限制提示
config_value: 配置值可能包含多个账号 ; 分隔
send_func: 发送函数签名为 (account, account_label=..., **kwargs) -> bool
**kwargs: 传递给发送函数的其他参数
Returns:
bool: 任一账号发送成功则返回 True
"""
accounts = parse_multi_account_config(config_value)
if not accounts:
return False
accounts = limit_accounts(accounts, self.max_accounts, channel_name)
results = []
for i, account in enumerate(accounts):
if account:
account_label = f"账号{i+1}" if len(accounts) > 1 else ""
result = send_func(account, account_label=account_label, **kwargs)
results.append(result)
return any(results) if results else False
def _send_feishu(
self,
report_data: Dict,
report_type: str,
update_info: Optional[Dict],
proxy_url: Optional[str],
mode: str,
) -> bool:
"""发送到飞书(多账号)"""
return self._send_to_multi_accounts(
channel_name="飞书",
config_value=self.config["FEISHU_WEBHOOK_URL"],
send_func=lambda url, account_label: send_to_feishu(
webhook_url=url,
report_data=report_data,
report_type=report_type,
update_info=update_info,
proxy_url=proxy_url,
mode=mode,
account_label=account_label,
batch_size=self.config.get("FEISHU_BATCH_SIZE", 29000),
batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
split_content_func=self.split_content_func,
get_time_func=self.get_time_func,
),
)
def _send_dingtalk(
self,
report_data: Dict,
report_type: str,
update_info: Optional[Dict],
proxy_url: Optional[str],
mode: str,
) -> bool:
"""发送到钉钉(多账号)"""
return self._send_to_multi_accounts(
channel_name="钉钉",
config_value=self.config["DINGTALK_WEBHOOK_URL"],
send_func=lambda url, account_label: send_to_dingtalk(
webhook_url=url,
report_data=report_data,
report_type=report_type,
update_info=update_info,
proxy_url=proxy_url,
mode=mode,
account_label=account_label,
batch_size=self.config.get("DINGTALK_BATCH_SIZE", 20000),
batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
split_content_func=self.split_content_func,
),
)
def _send_wework(
self,
report_data: Dict,
report_type: str,
update_info: Optional[Dict],
proxy_url: Optional[str],
mode: str,
) -> bool:
"""发送到企业微信(多账号)"""
return self._send_to_multi_accounts(
channel_name="企业微信",
config_value=self.config["WEWORK_WEBHOOK_URL"],
send_func=lambda url, account_label: send_to_wework(
webhook_url=url,
report_data=report_data,
report_type=report_type,
update_info=update_info,
proxy_url=proxy_url,
mode=mode,
account_label=account_label,
batch_size=self.config.get("MESSAGE_BATCH_SIZE", 4000),
batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
msg_type=self.config.get("WEWORK_MSG_TYPE", "markdown"),
split_content_func=self.split_content_func,
),
)
def _send_telegram(
self,
report_data: Dict,
report_type: str,
update_info: Optional[Dict],
proxy_url: Optional[str],
mode: str,
) -> bool:
"""发送到 Telegram多账号需验证 token 和 chat_id 配对)"""
telegram_tokens = parse_multi_account_config(self.config["TELEGRAM_BOT_TOKEN"])
telegram_chat_ids = parse_multi_account_config(self.config["TELEGRAM_CHAT_ID"])
if not telegram_tokens or not telegram_chat_ids:
return False
# 验证配对
valid, count = validate_paired_configs(
{"bot_token": telegram_tokens, "chat_id": telegram_chat_ids},
"Telegram",
required_keys=["bot_token", "chat_id"],
)
if not valid or count == 0:
return False
# 限制账号数量
telegram_tokens = limit_accounts(telegram_tokens, self.max_accounts, "Telegram")
telegram_chat_ids = telegram_chat_ids[: len(telegram_tokens)]
results = []
for i in range(len(telegram_tokens)):
token = telegram_tokens[i]
chat_id = telegram_chat_ids[i]
if token and chat_id:
account_label = f"账号{i+1}" if len(telegram_tokens) > 1 else ""
result = send_to_telegram(
bot_token=token,
chat_id=chat_id,
report_data=report_data,
report_type=report_type,
update_info=update_info,
proxy_url=proxy_url,
mode=mode,
account_label=account_label,
batch_size=self.config.get("MESSAGE_BATCH_SIZE", 4000),
batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
split_content_func=self.split_content_func,
)
results.append(result)
return any(results) if results else False
def _send_ntfy(
self,
report_data: Dict,
report_type: str,
update_info: Optional[Dict],
proxy_url: Optional[str],
mode: str,
) -> bool:
"""发送到 ntfy多账号需验证 topic 和 token 配对)"""
ntfy_server_url = self.config["NTFY_SERVER_URL"]
ntfy_topics = parse_multi_account_config(self.config["NTFY_TOPIC"])
ntfy_tokens = parse_multi_account_config(self.config.get("NTFY_TOKEN", ""))
if not ntfy_server_url or not ntfy_topics:
return False
# 验证 token 和 topic 数量一致(如果配置了 token
if ntfy_tokens and len(ntfy_tokens) != len(ntfy_topics):
print(
f"❌ ntfy 配置错误topic 数量({len(ntfy_topics)})与 token 数量({len(ntfy_tokens)})不一致,跳过 ntfy 推送"
)
return False
# 限制账号数量
ntfy_topics = limit_accounts(ntfy_topics, self.max_accounts, "ntfy")
if ntfy_tokens:
ntfy_tokens = ntfy_tokens[: len(ntfy_topics)]
results = []
for i, topic in enumerate(ntfy_topics):
if topic:
token = get_account_at_index(ntfy_tokens, i, "") if ntfy_tokens else ""
account_label = f"账号{i+1}" if len(ntfy_topics) > 1 else ""
result = send_to_ntfy(
server_url=ntfy_server_url,
topic=topic,
token=token,
report_data=report_data,
report_type=report_type,
update_info=update_info,
proxy_url=proxy_url,
mode=mode,
account_label=account_label,
batch_size=3800,
split_content_func=self.split_content_func,
)
results.append(result)
return any(results) if results else False
def _send_bark(
self,
report_data: Dict,
report_type: str,
update_info: Optional[Dict],
proxy_url: Optional[str],
mode: str,
) -> bool:
"""发送到 Bark多账号"""
return self._send_to_multi_accounts(
channel_name="Bark",
config_value=self.config["BARK_URL"],
send_func=lambda url, account_label: send_to_bark(
bark_url=url,
report_data=report_data,
report_type=report_type,
update_info=update_info,
proxy_url=proxy_url,
mode=mode,
account_label=account_label,
batch_size=self.config.get("BARK_BATCH_SIZE", 3600),
batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
split_content_func=self.split_content_func,
),
)
def _send_slack(
self,
report_data: Dict,
report_type: str,
update_info: Optional[Dict],
proxy_url: Optional[str],
mode: str,
) -> bool:
"""发送到 Slack多账号"""
return self._send_to_multi_accounts(
channel_name="Slack",
config_value=self.config["SLACK_WEBHOOK_URL"],
send_func=lambda url, account_label: send_to_slack(
webhook_url=url,
report_data=report_data,
report_type=report_type,
update_info=update_info,
proxy_url=proxy_url,
mode=mode,
account_label=account_label,
batch_size=self.config.get("SLACK_BATCH_SIZE", 4000),
batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
split_content_func=self.split_content_func,
),
)
def _send_email(
self,
report_type: str,
html_file_path: Optional[str],
) -> bool:
"""发送邮件(保持原有逻辑,已支持多收件人)"""
return send_to_email(
from_email=self.config["EMAIL_FROM"],
password=self.config["EMAIL_PASSWORD"],
to_email=self.config["EMAIL_TO"],
report_type=report_type,
html_file_path=html_file_path,
custom_smtp_server=self.config.get("EMAIL_SMTP_SERVER", ""),
custom_smtp_port=self.config.get("EMAIL_SMTP_PORT", ""),
get_time_func=self.get_time_func,
)

View File

@ -0,0 +1,80 @@
# coding=utf-8
"""
通知内容格式转换模块
提供不同推送平台间的格式转换功能
"""
import re
def strip_markdown(text: str) -> str:
"""去除文本中的 markdown 语法格式,用于个人微信推送
Args:
text: 包含 markdown 格式的文本
Returns:
纯文本内容
"""
# 去除粗体 **text** 或 __text__
text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
text = re.sub(r'__(.+?)__', r'\1', text)
# 去除斜体 *text* 或 _text_
text = re.sub(r'\*(.+?)\*', r'\1', text)
text = re.sub(r'_(.+?)_', r'\1', text)
# 去除删除线 ~~text~~
text = re.sub(r'~~(.+?)~~', r'\1', text)
# 转换链接 [text](url) -> text url保留 URL
text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1 \2', text)
# 去除图片 ![alt](url) -> alt
text = re.sub(r'!\[(.+?)\]\(.+?\)', r'\1', text)
# 去除行内代码 `code`
text = re.sub(r'`(.+?)`', r'\1', text)
# 去除引用符号 >
text = re.sub(r'^>\s*', '', text, flags=re.MULTILINE)
# 去除标题符号 # ## ### 等
text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)
# 去除水平分割线 --- 或 ***
text = re.sub(r'^[\-\*]{3,}\s*$', '', text, flags=re.MULTILINE)
# 去除 HTML 标签 <font color='xxx'>text</font> -> text
text = re.sub(r'<font[^>]*>(.+?)</font>', r'\1', text)
text = re.sub(r'<[^>]+>', '', text)
# 清理多余的空行(保留最多两个连续空行)
text = re.sub(r'\n{3,}', '\n\n', text)
return text.strip()
def convert_markdown_to_mrkdwn(content: str) -> str:
"""
将标准 Markdown 转换为 Slack mrkdwn 格式
转换规则
- **粗体** *粗体*
- [文本](url) <url|文本>
- 保留其他格式代码块列表等
Args:
content: Markdown 格式的内容
Returns:
Slack mrkdwn 格式的内容
"""
# 1. 转换链接格式: [文本](url) → <url|文本>
content = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<\2|\1>', content)
# 2. 转换粗体: **文本** → *文本*
content = re.sub(r'\*\*([^*]+)\*\*', r'*\1*', content)
return content

View File

@ -0,0 +1,109 @@
# coding=utf-8
"""
推送记录管理模块
管理推送记录支持每日只推送一次和时间窗口控制
通过 storage_backend 统一存储支持本地 SQLite 和远程云存储
"""
from datetime import datetime
from typing import Callable, Optional, Any
import pytz
class PushRecordManager:
"""
推送记录管理器
通过 storage_backend 统一管理推送记录
- 本地环境使用 LocalStorageBackend数据存储在本地 SQLite
- GitHub Actions使用 RemoteStorageBackend数据存储在云端
这样 once_per_day 功能在 GitHub Actions 上也能正常工作
"""
def __init__(
self,
storage_backend: Any,
get_time_func: Optional[Callable[[], datetime]] = None,
):
"""
初始化推送记录管理器
Args:
storage_backend: 存储后端实例LocalStorageBackend RemoteStorageBackend
get_time_func: 获取当前时间的函数应使用配置的时区
"""
self.storage_backend = storage_backend
self.get_time = get_time_func or self._default_get_time
print(f"[推送记录] 使用 {storage_backend.backend_name} 存储后端")
def _default_get_time(self) -> datetime:
"""默认时间获取函数UTC+8"""
return datetime.now(pytz.timezone("Asia/Shanghai"))
def has_pushed_today(self) -> bool:
"""
检查今天是否已经推送过
Returns:
是否已推送
"""
return self.storage_backend.has_pushed_today()
def record_push(self, report_type: str) -> bool:
"""
记录推送
Args:
report_type: 报告类型
Returns:
是否记录成功
"""
return self.storage_backend.record_push(report_type)
def is_in_time_range(self, start_time: str, end_time: str) -> bool:
"""
检查当前时间是否在指定时间范围内
Args:
start_time: 开始时间格式HH:MM
end_time: 结束时间格式HH:MM
Returns:
是否在时间范围内
"""
now = self.get_time()
current_time = now.strftime("%H:%M")
def normalize_time(time_str: str) -> str:
"""将时间字符串标准化为 HH:MM 格式"""
try:
parts = time_str.strip().split(":")
if len(parts) != 2:
raise ValueError(f"时间格式错误: {time_str}")
hour = int(parts[0])
minute = int(parts[1])
if not (0 <= hour <= 23 and 0 <= minute <= 59):
raise ValueError(f"时间范围错误: {time_str}")
return f"{hour:02d}:{minute:02d}"
except Exception as e:
print(f"时间格式化错误 '{time_str}': {e}")
return time_str
normalized_start = normalize_time(start_time)
normalized_end = normalize_time(end_time)
normalized_current = normalize_time(current_time)
result = normalized_start <= normalized_current <= normalized_end
if not result:
print(f"时间窗口判断:当前 {normalized_current},窗口 {normalized_start}-{normalized_end}")
return result

View File

@ -0,0 +1,260 @@
# coding=utf-8
"""
通知内容渲染模块
提供多平台通知内容渲染功能生成格式化的推送消息
"""
from datetime import datetime
from typing import Dict, List, Optional, Callable
from trendradar.report.formatter import format_title_for_platform
def render_feishu_content(
report_data: Dict,
update_info: Optional[Dict] = None,
mode: str = "daily",
separator: str = "---",
reverse_content_order: bool = False,
get_time_func: Optional[Callable[[], datetime]] = None,
) -> str:
"""渲染飞书通知内容
Args:
report_data: 报告数据字典包含 stats, new_titles, failed_ids, total_new_count
update_info: 版本更新信息可选
mode: 报告模式 ("daily", "incremental", "current")
separator: 内容分隔符
reverse_content_order: 是否反转内容顺序新增在前
get_time_func: 获取当前时间的函数可选默认使用 datetime.now()
Returns:
格式化的飞书消息内容
"""
# 生成热点词汇统计部分
stats_content = ""
if report_data["stats"]:
stats_content += "📊 **热点词汇统计**\n\n"
total_count = len(report_data["stats"])
for i, stat in enumerate(report_data["stats"]):
word = stat["word"]
count = stat["count"]
sequence_display = f"<font color='grey'>[{i + 1}/{total_count}]</font>"
if count >= 10:
stats_content += f"🔥 {sequence_display} **{word}** : <font color='red'>{count}</font> 条\n\n"
elif count >= 5:
stats_content += f"📈 {sequence_display} **{word}** : <font color='orange'>{count}</font> 条\n\n"
else:
stats_content += f"📌 {sequence_display} **{word}** : {count}\n\n"
for j, title_data in enumerate(stat["titles"], 1):
formatted_title = format_title_for_platform(
"feishu", title_data, show_source=True
)
stats_content += f" {j}. {formatted_title}\n"
if j < len(stat["titles"]):
stats_content += "\n"
if i < len(report_data["stats"]) - 1:
stats_content += f"\n{separator}\n\n"
# 生成新增新闻部分
new_titles_content = ""
if report_data["new_titles"]:
new_titles_content += (
f"🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
)
for source_data in report_data["new_titles"]:
new_titles_content += (
f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n"
)
for j, title_data in enumerate(source_data["titles"], 1):
title_data_copy = title_data.copy()
title_data_copy["is_new"] = False
formatted_title = format_title_for_platform(
"feishu", title_data_copy, show_source=False
)
new_titles_content += f" {j}. {formatted_title}\n"
new_titles_content += "\n"
# 根据配置决定内容顺序
text_content = ""
if reverse_content_order:
# 新增热点在前,热点词汇统计在后
if new_titles_content:
text_content += new_titles_content
if stats_content:
text_content += f"\n{separator}\n\n"
if stats_content:
text_content += stats_content
else:
# 默认:热点词汇统计在前,新增热点在后
if stats_content:
text_content += stats_content
if new_titles_content:
text_content += f"\n{separator}\n\n"
if new_titles_content:
text_content += new_titles_content
if not text_content:
if mode == "incremental":
mode_text = "增量模式下暂无新增匹配的热点词汇"
elif mode == "current":
mode_text = "当前榜单模式下暂无匹配的热点词汇"
else:
mode_text = "暂无匹配的热点词汇"
text_content = f"📭 {mode_text}\n\n"
if report_data["failed_ids"]:
if text_content and "暂无匹配" not in text_content:
text_content += f"\n{separator}\n\n"
text_content += "⚠️ **数据获取失败的平台:**\n\n"
for i, id_value in enumerate(report_data["failed_ids"], 1):
text_content += f" • <font color='red'>{id_value}</font>\n"
# 获取当前时间
now = get_time_func() if get_time_func else datetime.now()
text_content += (
f"\n\n<font color='grey'>更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}</font>"
)
if update_info:
text_content += f"\n<font color='grey'>TrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}</font>"
return text_content
def render_dingtalk_content(
report_data: Dict,
update_info: Optional[Dict] = None,
mode: str = "daily",
reverse_content_order: bool = False,
get_time_func: Optional[Callable[[], datetime]] = None,
) -> str:
"""渲染钉钉通知内容
Args:
report_data: 报告数据字典包含 stats, new_titles, failed_ids, total_new_count
update_info: 版本更新信息可选
mode: 报告模式 ("daily", "incremental", "current")
reverse_content_order: 是否反转内容顺序新增在前
get_time_func: 获取当前时间的函数可选默认使用 datetime.now()
Returns:
格式化的钉钉消息内容
"""
total_titles = sum(
len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0
)
now = get_time_func() if get_time_func else datetime.now()
# 头部信息
header_content = f"**总新闻数:** {total_titles}\n\n"
header_content += f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
header_content += "**类型:** 热点分析报告\n\n"
header_content += "---\n\n"
# 生成热点词汇统计部分
stats_content = ""
if report_data["stats"]:
stats_content += "📊 **热点词汇统计**\n\n"
total_count = len(report_data["stats"])
for i, stat in enumerate(report_data["stats"]):
word = stat["word"]
count = stat["count"]
sequence_display = f"[{i + 1}/{total_count}]"
if count >= 10:
stats_content += f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
elif count >= 5:
stats_content += f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
else:
stats_content += f"📌 {sequence_display} **{word}** : {count}\n\n"
for j, title_data in enumerate(stat["titles"], 1):
formatted_title = format_title_for_platform(
"dingtalk", title_data, show_source=True
)
stats_content += f" {j}. {formatted_title}\n"
if j < len(stat["titles"]):
stats_content += "\n"
if i < len(report_data["stats"]) - 1:
stats_content += "\n---\n\n"
# 生成新增新闻部分
new_titles_content = ""
if report_data["new_titles"]:
new_titles_content += (
f"🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
)
for source_data in report_data["new_titles"]:
new_titles_content += f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
for j, title_data in enumerate(source_data["titles"], 1):
title_data_copy = title_data.copy()
title_data_copy["is_new"] = False
formatted_title = format_title_for_platform(
"dingtalk", title_data_copy, show_source=False
)
new_titles_content += f" {j}. {formatted_title}\n"
new_titles_content += "\n"
# 根据配置决定内容顺序
text_content = header_content
if reverse_content_order:
# 新增热点在前,热点词汇统计在后
if new_titles_content:
text_content += new_titles_content
if stats_content:
text_content += "\n---\n\n"
if stats_content:
text_content += stats_content
else:
# 默认:热点词汇统计在前,新增热点在后
if stats_content:
text_content += stats_content
if new_titles_content:
text_content += "\n---\n\n"
if new_titles_content:
text_content += new_titles_content
if not stats_content and not new_titles_content:
if mode == "incremental":
mode_text = "增量模式下暂无新增匹配的热点词汇"
elif mode == "current":
mode_text = "当前榜单模式下暂无匹配的热点词汇"
else:
mode_text = "暂无匹配的热点词汇"
text_content += f"📭 {mode_text}\n\n"
if report_data["failed_ids"]:
if "暂无匹配" not in text_content:
text_content += "\n---\n\n"
text_content += "⚠️ **数据获取失败的平台:**\n\n"
for i, id_value in enumerate(report_data["failed_ids"], 1):
text_content += f" • **{id_value}**\n"
text_content += f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
if update_info:
text_content += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
return text_content

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,580 @@
# coding=utf-8
"""
消息分批处理模块
提供消息内容分批拆分功能确保消息大小不超过各平台限制
"""
from datetime import datetime
from typing import Dict, List, Optional, Callable
from trendradar.report.formatter import format_title_for_platform
# 默认批次大小配置
DEFAULT_BATCH_SIZES = {
"dingtalk": 20000,
"feishu": 29000,
"ntfy": 3800,
"default": 4000,
}
def split_content_into_batches(
report_data: Dict,
format_type: str,
update_info: Optional[Dict] = None,
max_bytes: Optional[int] = None,
mode: str = "daily",
batch_sizes: Optional[Dict[str, int]] = None,
feishu_separator: str = "---",
reverse_content_order: bool = False,
get_time_func: Optional[Callable[[], datetime]] = None,
) -> List[str]:
"""分批处理消息内容,确保词组标题+至少第一条新闻的完整性
Args:
report_data: 报告数据字典包含 stats, new_titles, failed_ids, total_new_count
format_type: 格式类型 (feishu, dingtalk, wework, telegram, ntfy, bark, slack)
update_info: 版本更新信息可选
max_bytes: 最大字节数可选如果不指定则使用默认配置
mode: 报告模式 (daily, incremental, current)
batch_sizes: 批次大小配置字典可选
feishu_separator: 飞书消息分隔符
reverse_content_order: 是否反转内容顺序新增在前
get_time_func: 获取当前时间的函数可选
Returns:
分批后的消息内容列表
"""
# 合并批次大小配置
sizes = {**DEFAULT_BATCH_SIZES, **(batch_sizes or {})}
if max_bytes is None:
if format_type == "dingtalk":
max_bytes = sizes.get("dingtalk", 20000)
elif format_type == "feishu":
max_bytes = sizes.get("feishu", 29000)
elif format_type == "ntfy":
max_bytes = sizes.get("ntfy", 3800)
else:
max_bytes = sizes.get("default", 4000)
batches = []
total_titles = sum(
len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0
)
now = get_time_func() if get_time_func else datetime.now()
base_header = ""
if format_type in ("wework", "bark"):
base_header = f"**总新闻数:** {total_titles}\n\n\n\n"
elif format_type == "telegram":
base_header = f"总新闻数: {total_titles}\n\n"
elif format_type == "ntfy":
base_header = f"**总新闻数:** {total_titles}\n\n"
elif format_type == "feishu":
base_header = ""
elif format_type == "dingtalk":
base_header = f"**总新闻数:** {total_titles}\n\n"
base_header += f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
base_header += f"**类型:** 热点分析报告\n\n"
base_header += "---\n\n"
elif format_type == "slack":
base_header = f"*总新闻数:* {total_titles}\n\n"
base_footer = ""
if format_type in ("wework", "bark"):
base_footer = f"\n\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
if update_info:
base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
elif format_type == "telegram":
base_footer = f"\n\n更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
if update_info:
base_footer += f"\nTrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}"
elif format_type == "ntfy":
base_footer = f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
if update_info:
base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
elif format_type == "feishu":
base_footer = f"\n\n<font color='grey'>更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}</font>"
if update_info:
base_footer += f"\n<font color='grey'>TrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}</font>"
elif format_type == "dingtalk":
base_footer = f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
if update_info:
base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
elif format_type == "slack":
base_footer = f"\n\n_更新时间{now.strftime('%Y-%m-%d %H:%M:%S')}_"
if update_info:
base_footer += f"\n_TrendRadar 发现新版本 *{update_info['remote_version']}*,当前 *{update_info['current_version']}_"
stats_header = ""
if report_data["stats"]:
if format_type in ("wework", "bark"):
stats_header = f"📊 **热点词汇统计**\n\n"
elif format_type == "telegram":
stats_header = f"📊 热点词汇统计\n\n"
elif format_type == "ntfy":
stats_header = f"📊 **热点词汇统计**\n\n"
elif format_type == "feishu":
stats_header = f"📊 **热点词汇统计**\n\n"
elif format_type == "dingtalk":
stats_header = f"📊 **热点词汇统计**\n\n"
elif format_type == "slack":
stats_header = f"📊 *热点词汇统计*\n\n"
current_batch = base_header
current_batch_has_content = False
if (
not report_data["stats"]
and not report_data["new_titles"]
and not report_data["failed_ids"]
):
if mode == "incremental":
mode_text = "增量模式下暂无新增匹配的热点词汇"
elif mode == "current":
mode_text = "当前榜单模式下暂无匹配的热点词汇"
else:
mode_text = "暂无匹配的热点词汇"
simple_content = f"📭 {mode_text}\n\n"
final_content = base_header + simple_content + base_footer
batches.append(final_content)
return batches
# 定义处理热点词汇统计的函数
def process_stats_section(current_batch, current_batch_has_content, batches):
"""处理热点词汇统计"""
if not report_data["stats"]:
return current_batch, current_batch_has_content, batches
total_count = len(report_data["stats"])
# 添加统计标题
test_content = current_batch + stats_header
if (
len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
< max_bytes
):
current_batch = test_content
current_batch_has_content = True
else:
if current_batch_has_content:
batches.append(current_batch + base_footer)
current_batch = base_header + stats_header
current_batch_has_content = True
# 逐个处理词组(确保词组标题+第一条新闻的原子性)
for i, stat in enumerate(report_data["stats"]):
word = stat["word"]
count = stat["count"]
sequence_display = f"[{i + 1}/{total_count}]"
# 构建词组标题
word_header = ""
if format_type in ("wework", "bark"):
if count >= 10:
word_header = (
f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
)
elif count >= 5:
word_header = (
f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
)
else:
word_header = f"📌 {sequence_display} **{word}** : {count}\n\n"
elif format_type == "telegram":
if count >= 10:
word_header = f"🔥 {sequence_display} {word} : {count}\n\n"
elif count >= 5:
word_header = f"📈 {sequence_display} {word} : {count}\n\n"
else:
word_header = f"📌 {sequence_display} {word} : {count}\n\n"
elif format_type == "ntfy":
if count >= 10:
word_header = (
f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
)
elif count >= 5:
word_header = (
f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
)
else:
word_header = f"📌 {sequence_display} **{word}** : {count}\n\n"
elif format_type == "feishu":
if count >= 10:
word_header = f"🔥 <font color='grey'>{sequence_display}</font> **{word}** : <font color='red'>{count}</font> 条\n\n"
elif count >= 5:
word_header = f"📈 <font color='grey'>{sequence_display}</font> **{word}** : <font color='orange'>{count}</font> 条\n\n"
else:
word_header = f"📌 <font color='grey'>{sequence_display}</font> **{word}** : {count}\n\n"
elif format_type == "dingtalk":
if count >= 10:
word_header = (
f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
)
elif count >= 5:
word_header = (
f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
)
else:
word_header = f"📌 {sequence_display} **{word}** : {count}\n\n"
elif format_type == "slack":
if count >= 10:
word_header = (
f"🔥 {sequence_display} *{word}* : *{count}* 条\n\n"
)
elif count >= 5:
word_header = (
f"📈 {sequence_display} *{word}* : *{count}* 条\n\n"
)
else:
word_header = f"📌 {sequence_display} *{word}* : {count}\n\n"
# 构建第一条新闻
first_news_line = ""
if stat["titles"]:
first_title_data = stat["titles"][0]
if format_type in ("wework", "bark"):
formatted_title = format_title_for_platform(
"wework", first_title_data, show_source=True
)
elif format_type == "telegram":
formatted_title = format_title_for_platform(
"telegram", first_title_data, show_source=True
)
elif format_type == "ntfy":
formatted_title = format_title_for_platform(
"ntfy", first_title_data, show_source=True
)
elif format_type == "feishu":
formatted_title = format_title_for_platform(
"feishu", first_title_data, show_source=True
)
elif format_type == "dingtalk":
formatted_title = format_title_for_platform(
"dingtalk", first_title_data, show_source=True
)
elif format_type == "slack":
formatted_title = format_title_for_platform(
"slack", first_title_data, show_source=True
)
else:
formatted_title = f"{first_title_data['title']}"
first_news_line = f" 1. {formatted_title}\n"
if len(stat["titles"]) > 1:
first_news_line += "\n"
# 原子性检查:词组标题+第一条新闻必须一起处理
word_with_first_news = word_header + first_news_line
test_content = current_batch + word_with_first_news
if (
len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
>= max_bytes
):
# 当前批次容纳不下,开启新批次
if current_batch_has_content:
batches.append(current_batch + base_footer)
current_batch = base_header + stats_header + word_with_first_news
current_batch_has_content = True
start_index = 1
else:
current_batch = test_content
current_batch_has_content = True
start_index = 1
# 处理剩余新闻条目
for j in range(start_index, len(stat["titles"])):
title_data = stat["titles"][j]
if format_type in ("wework", "bark"):
formatted_title = format_title_for_platform(
"wework", title_data, show_source=True
)
elif format_type == "telegram":
formatted_title = format_title_for_platform(
"telegram", title_data, show_source=True
)
elif format_type == "ntfy":
formatted_title = format_title_for_platform(
"ntfy", title_data, show_source=True
)
elif format_type == "feishu":
formatted_title = format_title_for_platform(
"feishu", title_data, show_source=True
)
elif format_type == "dingtalk":
formatted_title = format_title_for_platform(
"dingtalk", title_data, show_source=True
)
elif format_type == "slack":
formatted_title = format_title_for_platform(
"slack", title_data, show_source=True
)
else:
formatted_title = f"{title_data['title']}"
news_line = f" {j + 1}. {formatted_title}\n"
if j < len(stat["titles"]) - 1:
news_line += "\n"
test_content = current_batch + news_line
if (
len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
>= max_bytes
):
if current_batch_has_content:
batches.append(current_batch + base_footer)
current_batch = base_header + stats_header + word_header + news_line
current_batch_has_content = True
else:
current_batch = test_content
current_batch_has_content = True
# 词组间分隔符
if i < len(report_data["stats"]) - 1:
separator = ""
if format_type in ("wework", "bark"):
separator = f"\n\n\n\n"
elif format_type == "telegram":
separator = f"\n\n"
elif format_type == "ntfy":
separator = f"\n\n"
elif format_type == "feishu":
separator = f"\n{feishu_separator}\n\n"
elif format_type == "dingtalk":
separator = f"\n---\n\n"
elif format_type == "slack":
separator = f"\n\n"
test_content = current_batch + separator
if (
len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
< max_bytes
):
current_batch = test_content
return current_batch, current_batch_has_content, batches
# 定义处理新增新闻的函数
def process_new_titles_section(current_batch, current_batch_has_content, batches):
"""处理新增新闻"""
if not report_data["new_titles"]:
return current_batch, current_batch_has_content, batches
new_header = ""
if format_type in ("wework", "bark"):
new_header = f"\n\n\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
elif format_type == "telegram":
new_header = (
f"\n\n🆕 本次新增热点新闻 (共 {report_data['total_new_count']} 条)\n\n"
)
elif format_type == "ntfy":
new_header = f"\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
elif format_type == "feishu":
new_header = f"\n{feishu_separator}\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
elif format_type == "dingtalk":
new_header = f"\n---\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
elif format_type == "slack":
new_header = f"\n\n🆕 *本次新增热点新闻* (共 {report_data['total_new_count']} 条)\n\n"
test_content = current_batch + new_header
if (
len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
>= max_bytes
):
if current_batch_has_content:
batches.append(current_batch + base_footer)
current_batch = base_header + new_header
current_batch_has_content = True
else:
current_batch = test_content
current_batch_has_content = True
# 逐个处理新增新闻来源
for source_data in report_data["new_titles"]:
source_header = ""
if format_type in ("wework", "bark"):
source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
elif format_type == "telegram":
source_header = f"{source_data['source_name']} ({len(source_data['titles'])} 条):\n\n"
elif format_type == "ntfy":
source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
elif format_type == "feishu":
source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
elif format_type == "dingtalk":
source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
elif format_type == "slack":
source_header = f"*{source_data['source_name']}* ({len(source_data['titles'])} 条):\n\n"
# 构建第一条新增新闻
first_news_line = ""
if source_data["titles"]:
first_title_data = source_data["titles"][0]
title_data_copy = first_title_data.copy()
title_data_copy["is_new"] = False
if format_type in ("wework", "bark"):
formatted_title = format_title_for_platform(
"wework", title_data_copy, show_source=False
)
elif format_type == "telegram":
formatted_title = format_title_for_platform(
"telegram", title_data_copy, show_source=False
)
elif format_type == "feishu":
formatted_title = format_title_for_platform(
"feishu", title_data_copy, show_source=False
)
elif format_type == "dingtalk":
formatted_title = format_title_for_platform(
"dingtalk", title_data_copy, show_source=False
)
elif format_type == "slack":
formatted_title = format_title_for_platform(
"slack", title_data_copy, show_source=False
)
else:
formatted_title = f"{title_data_copy['title']}"
first_news_line = f" 1. {formatted_title}\n"
# 原子性检查:来源标题+第一条新闻
source_with_first_news = source_header + first_news_line
test_content = current_batch + source_with_first_news
if (
len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
>= max_bytes
):
if current_batch_has_content:
batches.append(current_batch + base_footer)
current_batch = base_header + new_header + source_with_first_news
current_batch_has_content = True
start_index = 1
else:
current_batch = test_content
current_batch_has_content = True
start_index = 1
# 处理剩余新增新闻
for j in range(start_index, len(source_data["titles"])):
title_data = source_data["titles"][j]
title_data_copy = title_data.copy()
title_data_copy["is_new"] = False
if format_type == "wework":
formatted_title = format_title_for_platform(
"wework", title_data_copy, show_source=False
)
elif format_type == "telegram":
formatted_title = format_title_for_platform(
"telegram", title_data_copy, show_source=False
)
elif format_type == "feishu":
formatted_title = format_title_for_platform(
"feishu", title_data_copy, show_source=False
)
elif format_type == "dingtalk":
formatted_title = format_title_for_platform(
"dingtalk", title_data_copy, show_source=False
)
elif format_type == "slack":
formatted_title = format_title_for_platform(
"slack", title_data_copy, show_source=False
)
else:
formatted_title = f"{title_data_copy['title']}"
news_line = f" {j + 1}. {formatted_title}\n"
test_content = current_batch + news_line
if (
len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
>= max_bytes
):
if current_batch_has_content:
batches.append(current_batch + base_footer)
current_batch = base_header + new_header + source_header + news_line
current_batch_has_content = True
else:
current_batch = test_content
current_batch_has_content = True
current_batch += "\n"
return current_batch, current_batch_has_content, batches
# 根据配置决定处理顺序
if reverse_content_order:
# 新增热点在前,热点词汇统计在后
current_batch, current_batch_has_content, batches = process_new_titles_section(
current_batch, current_batch_has_content, batches
)
current_batch, current_batch_has_content, batches = process_stats_section(
current_batch, current_batch_has_content, batches
)
else:
# 默认:热点词汇统计在前,新增热点在后
current_batch, current_batch_has_content, batches = process_stats_section(
current_batch, current_batch_has_content, batches
)
current_batch, current_batch_has_content, batches = process_new_titles_section(
current_batch, current_batch_has_content, batches
)
if report_data["failed_ids"]:
failed_header = ""
if format_type == "wework":
failed_header = f"\n\n\n\n⚠️ **数据获取失败的平台:**\n\n"
elif format_type == "telegram":
failed_header = f"\n\n⚠️ 数据获取失败的平台:\n\n"
elif format_type == "ntfy":
failed_header = f"\n\n⚠️ **数据获取失败的平台:**\n\n"
elif format_type == "feishu":
failed_header = f"\n{feishu_separator}\n\n⚠️ **数据获取失败的平台:**\n\n"
elif format_type == "dingtalk":
failed_header = f"\n---\n\n⚠️ **数据获取失败的平台:**\n\n"
test_content = current_batch + failed_header
if (
len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
>= max_bytes
):
if current_batch_has_content:
batches.append(current_batch + base_footer)
current_batch = base_header + failed_header
current_batch_has_content = True
else:
current_batch = test_content
current_batch_has_content = True
for i, id_value in enumerate(report_data["failed_ids"], 1):
if format_type == "feishu":
failed_line = f" • <font color='red'>{id_value}</font>\n"
elif format_type == "dingtalk":
failed_line = f" • **{id_value}**\n"
else:
failed_line = f"{id_value}\n"
test_content = current_batch + failed_line
if (
len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
>= max_bytes
):
if current_batch_has_content:
batches.append(current_batch + base_footer)
current_batch = base_header + failed_header + failed_line
current_batch_has_content = True
else:
current_batch = test_content
current_batch_has_content = True
# 完成最后批次
if current_batch_has_content:
batches.append(current_batch + base_footer)
return batches

View File

@ -0,0 +1,40 @@
# coding=utf-8
"""
报告生成模块
提供报告生成和格式化功能包括
- HTML 报告生成
- 标题格式化工具
模块结构
- helpers: 报告辅助函数清理转义格式化
- formatter: 平台标题格式化
- html: HTML 报告渲染
- generator: 报告生成器
"""
from trendradar.report.helpers import (
clean_title,
html_escape,
format_rank_display,
)
from trendradar.report.formatter import format_title_for_platform
from trendradar.report.html import render_html_content
from trendradar.report.generator import (
prepare_report_data,
generate_html_report,
)
__all__ = [
# 辅助函数
"clean_title",
"html_escape",
"format_rank_display",
# 格式化函数
"format_title_for_platform",
# HTML 渲染
"render_html_content",
# 报告生成器
"prepare_report_data",
"generate_html_report",
]

View File

@ -0,0 +1,223 @@
# coding=utf-8
"""
平台标题格式化模块
提供多平台标题格式化功能
"""
from typing import Dict
from trendradar.report.helpers import clean_title, html_escape, format_rank_display
def format_title_for_platform(
platform: str, title_data: Dict, show_source: bool = True
) -> str:
"""统一的标题格式化方法
为不同平台生成对应格式的标题字符串
Args:
platform: 目标平台支持:
- "feishu": 飞书
- "dingtalk": 钉钉
- "wework": 企业微信
- "bark": Bark
- "telegram": Telegram
- "ntfy": ntfy
- "slack": Slack
- "html": HTML 报告
title_data: 标题数据字典包含以下字段:
- title: 标题文本
- source_name: 来源名称
- time_display: 时间显示
- count: 出现次数
- ranks: 排名列表
- rank_threshold: 高亮阈值
- url: PC端链接
- mobile_url: 移动端链接优先使用
- is_new: 是否为新增标题可选
show_source: 是否显示来源名称
Returns:
格式化后的标题字符串
"""
rank_display = format_rank_display(
title_data["ranks"], title_data["rank_threshold"], platform
)
link_url = title_data["mobile_url"] or title_data["url"]
cleaned_title = clean_title(title_data["title"])
if platform == "feishu":
if link_url:
formatted_title = f"[{cleaned_title}]({link_url})"
else:
formatted_title = cleaned_title
title_prefix = "🆕 " if title_data.get("is_new") else ""
if show_source:
result = f"<font color='grey'>[{title_data['source_name']}]</font> {title_prefix}{formatted_title}"
else:
result = f"{title_prefix}{formatted_title}"
if rank_display:
result += f" {rank_display}"
if title_data["time_display"]:
result += f" <font color='grey'>- {title_data['time_display']}</font>"
if title_data["count"] > 1:
result += f" <font color='green'>({title_data['count']}次)</font>"
return result
elif platform == "dingtalk":
if link_url:
formatted_title = f"[{cleaned_title}]({link_url})"
else:
formatted_title = cleaned_title
title_prefix = "🆕 " if title_data.get("is_new") else ""
if show_source:
result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
else:
result = f"{title_prefix}{formatted_title}"
if rank_display:
result += f" {rank_display}"
if title_data["time_display"]:
result += f" - {title_data['time_display']}"
if title_data["count"] > 1:
result += f" ({title_data['count']}次)"
return result
elif platform in ("wework", "bark"):
# WeWork 和 Bark 使用 markdown 格式
if link_url:
formatted_title = f"[{cleaned_title}]({link_url})"
else:
formatted_title = cleaned_title
title_prefix = "🆕 " if title_data.get("is_new") else ""
if show_source:
result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
else:
result = f"{title_prefix}{formatted_title}"
if rank_display:
result += f" {rank_display}"
if title_data["time_display"]:
result += f" - {title_data['time_display']}"
if title_data["count"] > 1:
result += f" ({title_data['count']}次)"
return result
elif platform == "telegram":
if link_url:
formatted_title = f'<a href="{link_url}">{html_escape(cleaned_title)}</a>'
else:
formatted_title = cleaned_title
title_prefix = "🆕 " if title_data.get("is_new") else ""
if show_source:
result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
else:
result = f"{title_prefix}{formatted_title}"
if rank_display:
result += f" {rank_display}"
if title_data["time_display"]:
result += f" <code>- {title_data['time_display']}</code>"
if title_data["count"] > 1:
result += f" <code>({title_data['count']}次)</code>"
return result
elif platform == "ntfy":
if link_url:
formatted_title = f"[{cleaned_title}]({link_url})"
else:
formatted_title = cleaned_title
title_prefix = "🆕 " if title_data.get("is_new") else ""
if show_source:
result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
else:
result = f"{title_prefix}{formatted_title}"
if rank_display:
result += f" {rank_display}"
if title_data["time_display"]:
result += f" `- {title_data['time_display']}`"
if title_data["count"] > 1:
result += f" `({title_data['count']}次)`"
return result
elif platform == "slack":
# Slack 使用 mrkdwn 格式
if link_url:
# Slack 链接格式: <url|text>
formatted_title = f"<{link_url}|{cleaned_title}>"
else:
formatted_title = cleaned_title
title_prefix = "🆕 " if title_data.get("is_new") else ""
if show_source:
result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
else:
result = f"{title_prefix}{formatted_title}"
# 排名(使用 * 加粗)
rank_display = format_rank_display(
title_data["ranks"], title_data["rank_threshold"], "slack"
)
if rank_display:
result += f" {rank_display}"
if title_data["time_display"]:
result += f" `- {title_data['time_display']}`"
if title_data["count"] > 1:
result += f" `({title_data['count']}次)`"
return result
elif platform == "html":
rank_display = format_rank_display(
title_data["ranks"], title_data["rank_threshold"], "html"
)
link_url = title_data["mobile_url"] or title_data["url"]
escaped_title = html_escape(cleaned_title)
escaped_source_name = html_escape(title_data["source_name"])
if link_url:
escaped_url = html_escape(link_url)
formatted_title = f'[{escaped_source_name}] <a href="{escaped_url}" target="_blank" class="news-link">{escaped_title}</a>'
else:
formatted_title = (
f'[{escaped_source_name}] <span class="no-link">{escaped_title}</span>'
)
if rank_display:
formatted_title += f" {rank_display}"
if title_data["time_display"]:
escaped_time = html_escape(title_data["time_display"])
formatted_title += f" <font color='grey'>- {escaped_time}</font>"
if title_data["count"] > 1:
formatted_title += f" <font color='green'>({title_data['count']}次)</font>"
if title_data.get("is_new"):
formatted_title = f"<div class='new-title'>🆕 {formatted_title}</div>"
return formatted_title
else:
return cleaned_title

View File

@ -0,0 +1,235 @@
# coding=utf-8
"""
报告生成模块
提供报告数据准备和 HTML 生成功能
- prepare_report_data: 准备报告数据
- generate_html_report: 生成 HTML 报告
"""
from pathlib import Path
from typing import Dict, List, Optional, Callable
def prepare_report_data(
stats: List[Dict],
failed_ids: Optional[List] = None,
new_titles: Optional[Dict] = None,
id_to_name: Optional[Dict] = None,
mode: str = "daily",
rank_threshold: int = 3,
matches_word_groups_func: Optional[Callable] = None,
load_frequency_words_func: Optional[Callable] = None,
) -> Dict:
"""
准备报告数据
Args:
stats: 统计结果列表
failed_ids: 失败的 ID 列表
new_titles: 新增标题
id_to_name: ID 到名称的映射
mode: 报告模式 (daily/incremental/current)
rank_threshold: 排名阈值
matches_word_groups_func: 词组匹配函数
load_frequency_words_func: 加载频率词函数
Returns:
Dict: 准备好的报告数据
"""
processed_new_titles = []
# 在增量模式下隐藏新增新闻区域
hide_new_section = mode == "incremental"
# 只有在非隐藏模式下才处理新增新闻部分
if not hide_new_section:
filtered_new_titles = {}
if new_titles and id_to_name:
# 如果提供了匹配函数,使用它过滤
if matches_word_groups_func and load_frequency_words_func:
word_groups, filter_words, global_filters = load_frequency_words_func()
for source_id, titles_data in new_titles.items():
filtered_titles = {}
for title, title_data in titles_data.items():
if matches_word_groups_func(title, word_groups, filter_words, global_filters):
filtered_titles[title] = title_data
if filtered_titles:
filtered_new_titles[source_id] = filtered_titles
else:
# 没有匹配函数时,使用全部
filtered_new_titles = new_titles
# 打印过滤后的新增热点数(与推送显示一致)
original_new_count = sum(len(titles) for titles in new_titles.values()) if new_titles else 0
filtered_new_count = sum(len(titles) for titles in filtered_new_titles.values()) if filtered_new_titles else 0
if original_new_count > 0:
print(f"频率词过滤后:{filtered_new_count} 条新增热点匹配(原始 {original_new_count} 条)")
if filtered_new_titles and id_to_name:
for source_id, titles_data in filtered_new_titles.items():
source_name = id_to_name.get(source_id, source_id)
source_titles = []
for title, title_data in titles_data.items():
url = title_data.get("url", "")
mobile_url = title_data.get("mobileUrl", "")
ranks = title_data.get("ranks", [])
processed_title = {
"title": title,
"source_name": source_name,
"time_display": "",
"count": 1,
"ranks": ranks,
"rank_threshold": rank_threshold,
"url": url,
"mobile_url": mobile_url,
"is_new": True,
}
source_titles.append(processed_title)
if source_titles:
processed_new_titles.append(
{
"source_id": source_id,
"source_name": source_name,
"titles": source_titles,
}
)
processed_stats = []
for stat in stats:
if stat["count"] <= 0:
continue
processed_titles = []
for title_data in stat["titles"]:
processed_title = {
"title": title_data["title"],
"source_name": title_data["source_name"],
"time_display": title_data["time_display"],
"count": title_data["count"],
"ranks": title_data["ranks"],
"rank_threshold": title_data["rank_threshold"],
"url": title_data.get("url", ""),
"mobile_url": title_data.get("mobileUrl", ""),
"is_new": title_data.get("is_new", False),
}
processed_titles.append(processed_title)
processed_stats.append(
{
"word": stat["word"],
"count": stat["count"],
"percentage": stat.get("percentage", 0),
"titles": processed_titles,
}
)
return {
"stats": processed_stats,
"new_titles": processed_new_titles,
"failed_ids": failed_ids or [],
"total_new_count": sum(
len(source["titles"]) for source in processed_new_titles
),
}
def generate_html_report(
stats: List[Dict],
total_titles: int,
failed_ids: Optional[List] = None,
new_titles: Optional[Dict] = None,
id_to_name: Optional[Dict] = None,
mode: str = "daily",
is_daily_summary: bool = False,
update_info: Optional[Dict] = None,
rank_threshold: int = 3,
output_dir: str = "output",
date_folder: str = "",
time_filename: str = "",
render_html_func: Optional[Callable] = None,
matches_word_groups_func: Optional[Callable] = None,
load_frequency_words_func: Optional[Callable] = None,
enable_index_copy: bool = True,
) -> str:
"""
生成 HTML 报告
Args:
stats: 统计结果列表
total_titles: 总标题数
failed_ids: 失败的 ID 列表
new_titles: 新增标题
id_to_name: ID 到名称的映射
mode: 报告模式 (daily/incremental/current)
is_daily_summary: 是否是每日汇总
update_info: 更新信息
rank_threshold: 排名阈值
output_dir: 输出目录
date_folder: 日期文件夹名称
time_filename: 时间文件名
render_html_func: HTML 渲染函数
matches_word_groups_func: 词组匹配函数
load_frequency_words_func: 加载频率词函数
enable_index_copy: 是否复制到 index.html
Returns:
str: 生成的 HTML 文件路径
"""
if is_daily_summary:
if mode == "current":
filename = "当前榜单汇总.html"
elif mode == "incremental":
filename = "当日增量.html"
else:
filename = "当日汇总.html"
else:
filename = f"{time_filename}.html"
# 构建输出路径
output_path = Path(output_dir) / date_folder / "html"
output_path.mkdir(parents=True, exist_ok=True)
file_path = str(output_path / filename)
# 准备报告数据
report_data = prepare_report_data(
stats,
failed_ids,
new_titles,
id_to_name,
mode,
rank_threshold,
matches_word_groups_func,
load_frequency_words_func,
)
# 渲染 HTML 内容
if render_html_func:
html_content = render_html_func(
report_data, total_titles, is_daily_summary, mode, update_info
)
else:
# 默认简单 HTML
html_content = f"<html><body><h1>Report</h1><pre>{report_data}</pre></body></html>"
# 写入文件
with open(file_path, "w", encoding="utf-8") as f:
f.write(html_content)
# 如果是每日汇总且启用 index 复制
if is_daily_summary and enable_index_copy:
# 生成到根目录(供 GitHub Pages 访问)
root_index_path = Path("index.html")
with open(root_index_path, "w", encoding="utf-8") as f:
f.write(html_content)
# 同时生成到 output 目录(供 Docker Volume 挂载访问)
output_index_path = Path(output_dir) / "index.html"
Path(output_dir).mkdir(parents=True, exist_ok=True)
with open(output_index_path, "w", encoding="utf-8") as f:
f.write(html_content)
return file_path
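A minimal sketch of driving generate_html_report directly with hand-built stats; all values here are illustrative, and with render_html_func left as None the fallback template above is used.
stats = [{
    "word": "AI",
    "count": 2,
    "percentage": 50.0,
    "titles": [{
        "title": "示例标题", "source_name": "示例平台", "time_display": "08:00",
        "count": 1, "ranks": [1], "rank_threshold": 3,
        "url": "https://example.com", "mobileUrl": "", "is_new": False,
    }],
}]
path = generate_html_report(
    stats=stats,
    total_titles=4,
    mode="daily",
    is_daily_summary=True,
    date_folder="2025-12-13",
    enable_index_copy=False,  # keep the repo-root index.html untouched
)
print(path)  # output/2025-12-13/html/当日汇总.html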

View File

@@ -0,0 +1,125 @@
# coding=utf-8
"""
报告辅助函数模块
提供报告生成相关的通用辅助函数
"""
import re
from typing import List
def clean_title(title: str) -> str:
"""清理标题中的特殊字符
清理规则:
- 将换行符(\n, \r)替换为空格
- 将多个连续空白字符合并为单个空格
- 去除首尾空白
Args:
title: 原始标题字符串
Returns:
清理后的标题字符串
"""
if not isinstance(title, str):
title = str(title)
cleaned_title = title.replace("\n", " ").replace("\r", " ")
cleaned_title = re.sub(r"\s+", " ", cleaned_title)
cleaned_title = cleaned_title.strip()
return cleaned_title
def html_escape(text: str) -> str:
"""HTML特殊字符转义
转义规则(按顺序):
- & → &amp;
- < → &lt;
- > → &gt;
- " → &quot;
- ' → &#x27;
Args:
text: 原始文本
Returns:
转义后的文本
"""
if not isinstance(text, str):
text = str(text)
return (
text.replace("&", "&amp;")
.replace("<", "&lt;")
.replace(">", "&gt;")
.replace('"', "&quot;")
.replace("'", "&#x27;")
)
def format_rank_display(ranks: List[int], rank_threshold: int, format_type: str) -> str:
"""格式化排名显示
根据不同平台类型生成对应格式的排名字符串,
当最小排名小于等于阈值时使用高亮格式
Args:
ranks: 排名列表(可能包含重复值)
rank_threshold: 高亮阈值,小于等于此值的排名会高亮显示
format_type: 平台类型,支持:
- "html": HTML格式
- "feishu": 飞书格式
- "dingtalk": 钉钉格式
- "wework": 企业微信格式
- "telegram": Telegram格式
- "slack": Slack格式
- 其他: 默认markdown格式
Returns:
格式化后的排名字符串,如 "[1]" 或 "[1 - 5]"
如果排名列表为空,返回空字符串
"""
if not ranks:
return ""
unique_ranks = sorted(set(ranks))
min_rank = unique_ranks[0]
max_rank = unique_ranks[-1]
# 根据平台类型选择高亮格式
if format_type == "html":
highlight_start = "<font color='red'><strong>"
highlight_end = "</strong></font>"
elif format_type == "feishu":
highlight_start = "<font color='red'>**"
highlight_end = "**</font>"
elif format_type == "dingtalk":
highlight_start = "**"
highlight_end = "**"
elif format_type == "wework":
highlight_start = "**"
highlight_end = "**"
elif format_type == "telegram":
highlight_start = "<b>"
highlight_end = "</b>"
elif format_type == "slack":
highlight_start = "*"
highlight_end = "*"
else:
# 默认 markdown 格式
highlight_start = "**"
highlight_end = "**"
# 生成排名显示
if min_rank <= rank_threshold:
if min_rank == max_rank:
return f"{highlight_start}[{min_rank}]{highlight_end}"
else:
return f"{highlight_start}[{min_rank} - {max_rank}]{highlight_end}"
else:
if min_rank == max_rank:
return f"[{min_rank}]"
else:
return f"[{min_rank} - {max_rank}]"

1050
trendradar/report/html.py Normal file

File diff suppressed because it is too large

View File

@@ -0,0 +1,44 @@
# coding=utf-8
"""
存储模块 - 支持多种存储后端
支持的存储后端:
- local: 本地 SQLite + TXT/HTML 文件
- remote: 远程云存储(S3 兼容协议:R2/OSS/COS/S3)
- auto: 根据环境自动选择(GitHub Actions 用 remote,其他用 local)
"""
from trendradar.storage.base import (
StorageBackend,
NewsItem,
NewsData,
convert_crawl_results_to_news_data,
convert_news_data_to_results,
)
from trendradar.storage.local import LocalStorageBackend
from trendradar.storage.manager import StorageManager, get_storage_manager
# 远程后端可选导入(需要 boto3)
try:
from trendradar.storage.remote import RemoteStorageBackend
HAS_REMOTE = True
except ImportError:
RemoteStorageBackend = None
HAS_REMOTE = False
__all__ = [
# 基础类
"StorageBackend",
"NewsItem",
"NewsData",
# 转换函数
"convert_crawl_results_to_news_data",
"convert_news_data_to_results",
# 后端实现
"LocalStorageBackend",
"RemoteStorageBackend",
"HAS_REMOTE",
# 管理器
"StorageManager",
"get_storage_manager",
]
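A small sketch of how the package is intended to be consumed; the local backend and default output directory are assumptions for illustration.
from trendradar.storage import get_storage_manager, HAS_REMOTE

manager = get_storage_manager(backend_type="local", data_dir="output")
print(manager.backend_name)  # "local"
if not HAS_REMOTE:
    print("boto3 is not installed; the remote backend is unavailable")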

457
trendradar/storage/base.py Normal file
View File

@@ -0,0 +1,457 @@
# coding=utf-8
"""
存储后端抽象基类和数据模型
定义统一的存储接口所有存储后端都需要实现这些方法
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from typing import Dict, List, Optional, Any
import json
@dataclass
class NewsItem:
"""新闻条目数据模型"""
title: str # 新闻标题
source_id: str # 来源平台ID(如 toutiao, baidu)
source_name: str = "" # 来源平台名称(运行时使用,数据库不存储)
rank: int = 0 # 排名
url: str = "" # 链接 URL
mobile_url: str = "" # 移动端 URL
crawl_time: str = "" # 抓取时间HH:MM 格式)
# 统计信息(用于分析)
ranks: List[int] = field(default_factory=list) # 历史排名列表
first_time: str = "" # 首次出现时间
last_time: str = "" # 最后出现时间
count: int = 1 # 出现次数
def to_dict(self) -> Dict[str, Any]:
"""转换为字典"""
return {
"title": self.title,
"source_id": self.source_id,
"source_name": self.source_name,
"rank": self.rank,
"url": self.url,
"mobile_url": self.mobile_url,
"crawl_time": self.crawl_time,
"ranks": self.ranks,
"first_time": self.first_time,
"last_time": self.last_time,
"count": self.count,
}
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "NewsItem":
"""从字典创建"""
return cls(
title=data.get("title", ""),
source_id=data.get("source_id", ""),
source_name=data.get("source_name", ""),
rank=data.get("rank", 0),
url=data.get("url", ""),
mobile_url=data.get("mobile_url", ""),
crawl_time=data.get("crawl_time", ""),
ranks=data.get("ranks", []),
first_time=data.get("first_time", ""),
last_time=data.get("last_time", ""),
count=data.get("count", 1),
)
@dataclass
class NewsData:
"""
新闻数据集合
结构:
- date: 日期(YYYY-MM-DD)
- crawl_time: 抓取时间(HH时MM分)
- items: 按来源ID分组的新闻条目
- id_to_name: 来源ID到名称的映射
- failed_ids: 失败的来源ID列表
"""
date: str # 日期
crawl_time: str # 抓取时间
items: Dict[str, List[NewsItem]] # 按来源分组的新闻
id_to_name: Dict[str, str] = field(default_factory=dict) # ID到名称映射
failed_ids: List[str] = field(default_factory=list) # 失败的ID
def to_dict(self) -> Dict[str, Any]:
"""转换为字典"""
items_dict = {}
for source_id, news_list in self.items.items():
items_dict[source_id] = [item.to_dict() for item in news_list]
return {
"date": self.date,
"crawl_time": self.crawl_time,
"items": items_dict,
"id_to_name": self.id_to_name,
"failed_ids": self.failed_ids,
}
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "NewsData":
"""从字典创建"""
items = {}
items_data = data.get("items", {})
for source_id, news_list in items_data.items():
items[source_id] = [NewsItem.from_dict(item) for item in news_list]
return cls(
date=data.get("date", ""),
crawl_time=data.get("crawl_time", ""),
items=items,
id_to_name=data.get("id_to_name", {}),
failed_ids=data.get("failed_ids", []),
)
def get_total_count(self) -> int:
"""获取新闻总数"""
return sum(len(news_list) for news_list in self.items.values())
def merge_with(self, other: "NewsData") -> "NewsData":
"""
合并另一个 NewsData 到当前数据
合并规则:
- 相同 source_id + title 的新闻,合并排名历史
- 更新 last_time 和 count
- 保留较早的 first_time
"""
merged_items = {}
# 复制当前数据
for source_id, news_list in self.items.items():
merged_items[source_id] = {item.title: item for item in news_list}
# 合并其他数据
for source_id, news_list in other.items.items():
if source_id not in merged_items:
merged_items[source_id] = {}
for item in news_list:
if item.title in merged_items[source_id]:
# 合并已存在的新闻
existing = merged_items[source_id][item.title]
# 合并排名
existing_ranks = set(existing.ranks) if existing.ranks else set()
new_ranks = set(item.ranks) if item.ranks else set()
merged_ranks = sorted(existing_ranks | new_ranks)
existing.ranks = merged_ranks
# 更新时间
if item.first_time and (not existing.first_time or item.first_time < existing.first_time):
existing.first_time = item.first_time
if item.last_time and (not existing.last_time or item.last_time > existing.last_time):
existing.last_time = item.last_time
# 更新计数
existing.count += 1
# 保留URL(如果原来没有)
if not existing.url and item.url:
existing.url = item.url
if not existing.mobile_url and item.mobile_url:
existing.mobile_url = item.mobile_url
else:
# 添加新新闻
merged_items[source_id][item.title] = item
# 转换回列表格式
final_items = {}
for source_id, items_dict in merged_items.items():
final_items[source_id] = list(items_dict.values())
# 合并 id_to_name
merged_id_to_name = {**self.id_to_name, **other.id_to_name}
# 合并 failed_ids(去重)
merged_failed_ids = list(set(self.failed_ids + other.failed_ids))
return NewsData(
date=self.date or other.date,
crawl_time=other.crawl_time, # 使用较新的抓取时间
items=final_items,
id_to_name=merged_id_to_name,
failed_ids=merged_failed_ids,
)
class StorageBackend(ABC):
"""
存储后端抽象基类
所有存储后端都需要实现这些方法以支持:
- 保存新闻数据
- 读取当天所有数据
- 检测新增新闻
- 生成报告文件(TXT/HTML)
"""
@abstractmethod
def save_news_data(self, data: NewsData) -> bool:
"""
保存新闻数据
Args:
data: 新闻数据
Returns:
是否保存成功
"""
pass
@abstractmethod
def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
"""
获取指定日期的所有新闻数据
Args:
date: 日期字符串(YYYY-MM-DD),默认为今天
Returns:
合并后的新闻数据如果没有数据返回 None
"""
pass
@abstractmethod
def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
"""
获取最新一次抓取的数据
Args:
date: 日期字符串默认为今天
Returns:
最新抓取的新闻数据
"""
pass
@abstractmethod
def detect_new_titles(self, current_data: NewsData) -> Dict[str, Dict]:
"""
检测新增的标题
Args:
current_data: 当前抓取的数据
Returns:
新增的标题数据,格式: {source_id: {title: title_data}}
"""
pass
@abstractmethod
def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
"""
保存 TXT 快照(可选功能,本地环境可用)
Args:
data: 新闻数据
Returns:
保存的文件路径,如果不支持返回 None
"""
pass
@abstractmethod
def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
"""
保存 HTML 报告
Args:
html_content: HTML 内容
filename: 文件名
is_summary: 是否为汇总报告
Returns:
保存的文件路径
"""
pass
@abstractmethod
def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
"""
检查是否是当天第一次抓取
Args:
date: 日期字符串默认为今天
Returns:
是否是第一次抓取
"""
pass
@abstractmethod
def cleanup(self) -> None:
"""
清理资源(如临时文件、数据库连接等)
"""
pass
@abstractmethod
def cleanup_old_data(self, retention_days: int) -> int:
"""
清理过期数据
Args:
retention_days: 保留天数,0 表示不清理
Returns:
删除的日期目录数量
"""
pass
@property
@abstractmethod
def backend_name(self) -> str:
"""
存储后端名称
"""
pass
@property
@abstractmethod
def supports_txt(self) -> bool:
"""
是否支持生成 TXT 快照
"""
pass
# === 推送记录相关方法 ===
@abstractmethod
def has_pushed_today(self, date: Optional[str] = None) -> bool:
"""
检查指定日期是否已推送过
Args:
date: 日期字符串(YYYY-MM-DD),默认为今天
Returns:
是否已推送
"""
pass
@abstractmethod
def record_push(self, report_type: str, date: Optional[str] = None) -> bool:
"""
记录推送
Args:
report_type: 报告类型
date: 日期字符串(YYYY-MM-DD),默认为今天
Returns:
是否记录成功
"""
pass
def convert_crawl_results_to_news_data(
results: Dict[str, Dict],
id_to_name: Dict[str, str],
failed_ids: List[str],
crawl_time: str,
crawl_date: str,
) -> NewsData:
"""
将爬虫结果转换为 NewsData 格式
Args:
results: 爬虫返回的结果 {source_id: {title: {ranks: [], url: "", mobileUrl: ""}}}
id_to_name: 来源ID到名称的映射
failed_ids: 失败的来源ID
crawl_time: 抓取时间(HH:MM)
crawl_date: 抓取日期(YYYY-MM-DD)
Returns:
NewsData 对象
"""
items = {}
for source_id, titles_data in results.items():
source_name = id_to_name.get(source_id, source_id)
news_list = []
for title, data in titles_data.items():
if isinstance(data, dict):
ranks = data.get("ranks", [])
url = data.get("url", "")
mobile_url = data.get("mobileUrl", "")
else:
# 兼容旧格式
ranks = data if isinstance(data, list) else []
url = ""
mobile_url = ""
rank = ranks[0] if ranks else 99
news_item = NewsItem(
title=title,
source_id=source_id,
source_name=source_name,
rank=rank,
url=url,
mobile_url=mobile_url,
crawl_time=crawl_time,
ranks=ranks,
first_time=crawl_time,
last_time=crawl_time,
count=1,
)
news_list.append(news_item)
items[source_id] = news_list
return NewsData(
date=crawl_date,
crawl_time=crawl_time,
items=items,
id_to_name=id_to_name,
failed_ids=failed_ids,
)
def convert_news_data_to_results(data: NewsData) -> tuple:
"""
NewsData 转换回原有的 results 格式用于兼容现有代码
Args:
data: NewsData 对象
Returns:
(results, id_to_name, title_info) 元组
"""
results = {}
title_info = {}
for source_id, news_list in data.items.items():
results[source_id] = {}
title_info[source_id] = {}
for item in news_list:
results[source_id][item.title] = {
"ranks": item.ranks,
"url": item.url,
"mobileUrl": item.mobile_url,
}
title_info[source_id][item.title] = {
"first_time": item.first_time,
"last_time": item.last_time,
"count": item.count,
"ranks": item.ranks,
"url": item.url,
"mobileUrl": item.mobile_url,
}
return results, data.id_to_name, title_info
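A minimal round trip through the converters above, using made-up sample data:
raw_results = {
    "toutiao": {"示例标题": {"ranks": [3, 1], "url": "https://example.com/a", "mobileUrl": ""}},
}
news_data = convert_crawl_results_to_news_data(
    results=raw_results,
    id_to_name={"toutiao": "今日头条"},
    failed_ids=["baidu"],
    crawl_time="08:30",
    crawl_date="2025-12-13",
)
print(news_data.get_total_count())                      # 1
results, id_to_name, title_info = convert_news_data_to_results(news_data)
print(title_info["toutiao"]["示例标题"]["count"])         # 1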

869
trendradar/storage/local.py Normal file
View File

@@ -0,0 +1,869 @@
# coding=utf-8
"""
本地存储后端 - SQLite + TXT/HTML
使用 SQLite 作为主存储,支持可选的 TXT 快照和 HTML 报告
"""
import sqlite3
import os
import shutil
import pytz
import re
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Any
from trendradar.storage.base import StorageBackend, NewsItem, NewsData
from trendradar.utils.time import (
get_configured_time,
format_date_folder,
format_time_filename,
)
class LocalStorageBackend(StorageBackend):
"""
本地存储后端
使用 SQLite 数据库存储新闻数据,支持:
- 按日期组织的 SQLite 数据库文件
- 可选的 TXT 快照(用于调试)
- HTML 报告生成
"""
def __init__(
self,
data_dir: str = "output",
enable_txt: bool = True,
enable_html: bool = True,
timezone: str = "Asia/Shanghai",
):
"""
初始化本地存储后端
Args:
data_dir: 数据目录路径
enable_txt: 是否启用 TXT 快照
enable_html: 是否启用 HTML 报告
timezone: 时区配置,默认 Asia/Shanghai
"""
self.data_dir = Path(data_dir)
self.enable_txt = enable_txt
self.enable_html = enable_html
self.timezone = timezone
self._db_connections: Dict[str, sqlite3.Connection] = {}
@property
def backend_name(self) -> str:
return "local"
@property
def supports_txt(self) -> bool:
return self.enable_txt
def _get_configured_time(self) -> datetime:
"""获取配置时区的当前时间"""
return get_configured_time(self.timezone)
def _format_date_folder(self, date: Optional[str] = None) -> str:
"""格式化日期文件夹名 (ISO 格式: YYYY-MM-DD)"""
return format_date_folder(date, self.timezone)
def _format_time_filename(self) -> str:
"""格式化时间文件名 (格式: HH-MM)"""
return format_time_filename(self.timezone)
def _get_db_path(self, date: Optional[str] = None) -> Path:
"""获取 SQLite 数据库路径"""
date_folder = self._format_date_folder(date)
db_dir = self.data_dir / date_folder
db_dir.mkdir(parents=True, exist_ok=True)
return db_dir / "news.db"
def _get_connection(self, date: Optional[str] = None) -> sqlite3.Connection:
"""获取数据库连接(带缓存)"""
db_path = str(self._get_db_path(date))
if db_path not in self._db_connections:
conn = sqlite3.connect(db_path)
conn.row_factory = sqlite3.Row
self._init_tables(conn)
self._db_connections[db_path] = conn
return self._db_connections[db_path]
def _get_schema_path(self) -> Path:
"""获取 schema.sql 文件路径"""
return Path(__file__).parent / "schema.sql"
def _init_tables(self, conn: sqlite3.Connection) -> None:
"""从 schema.sql 初始化数据库表结构"""
schema_path = self._get_schema_path()
if schema_path.exists():
with open(schema_path, "r", encoding="utf-8") as f:
schema_sql = f.read()
conn.executescript(schema_sql)
else:
raise FileNotFoundError(f"Schema file not found: {schema_path}")
conn.commit()
def save_news_data(self, data: NewsData) -> bool:
"""
保存新闻数据到 SQLite(以 URL 为唯一标识,支持标题更新检测)
Args:
data: 新闻数据
Returns:
是否保存成功
"""
try:
conn = self._get_connection(data.date)
cursor = conn.cursor()
# 获取配置时区的当前时间
now_str = self._get_configured_time().strftime("%Y-%m-%d %H:%M:%S")
# 首先同步平台信息到 platforms 表
for source_id, source_name in data.id_to_name.items():
cursor.execute("""
INSERT INTO platforms (id, name, updated_at)
VALUES (?, ?, ?)
ON CONFLICT(id) DO UPDATE SET
name = excluded.name,
updated_at = excluded.updated_at
""", (source_id, source_name, now_str))
# 统计计数器
new_count = 0
updated_count = 0
title_changed_count = 0
success_sources = []
for source_id, news_list in data.items.items():
success_sources.append(source_id)
for item in news_list:
try:
# 检查是否已存在(通过 URL + platform_id)
if item.url:
cursor.execute("""
SELECT id, title FROM news_items
WHERE url = ? AND platform_id = ?
""", (item.url, source_id))
existing = cursor.fetchone()
if existing:
# 已存在,更新记录
existing_id, existing_title = existing
# 检查标题是否变化
if existing_title != item.title:
# 记录标题变更
cursor.execute("""
INSERT INTO title_changes
(news_item_id, old_title, new_title, changed_at)
VALUES (?, ?, ?, ?)
""", (existing_id, existing_title, item.title, now_str))
title_changed_count += 1
# 记录排名历史
cursor.execute("""
INSERT INTO rank_history
(news_item_id, rank, crawl_time, created_at)
VALUES (?, ?, ?, ?)
""", (existing_id, item.rank, data.crawl_time, now_str))
# 更新现有记录
cursor.execute("""
UPDATE news_items SET
title = ?,
rank = ?,
mobile_url = ?,
last_crawl_time = ?,
crawl_count = crawl_count + 1,
updated_at = ?
WHERE id = ?
""", (item.title, item.rank, item.mobile_url,
data.crawl_time, now_str, existing_id))
updated_count += 1
else:
# 不存在,插入新记录
cursor.execute("""
INSERT INTO news_items
(title, platform_id, rank, url, mobile_url,
first_crawl_time, last_crawl_time, crawl_count,
created_at, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?)
""", (item.title, source_id, item.rank, item.url,
item.mobile_url, data.crawl_time, data.crawl_time,
now_str, now_str))
new_id = cursor.lastrowid
# 记录初始排名
cursor.execute("""
INSERT INTO rank_history
(news_item_id, rank, crawl_time, created_at)
VALUES (?, ?, ?, ?)
""", (new_id, item.rank, data.crawl_time, now_str))
new_count += 1
else:
# URL 为空的情况,直接插入(不做去重)
cursor.execute("""
INSERT INTO news_items
(title, platform_id, rank, url, mobile_url,
first_crawl_time, last_crawl_time, crawl_count,
created_at, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?)
""", (item.title, source_id, item.rank, item.url,
item.mobile_url, data.crawl_time, data.crawl_time,
now_str, now_str))
new_id = cursor.lastrowid
# 记录初始排名
cursor.execute("""
INSERT INTO rank_history
(news_item_id, rank, crawl_time, created_at)
VALUES (?, ?, ?, ?)
""", (new_id, item.rank, data.crawl_time, now_str))
new_count += 1
except sqlite3.Error as e:
print(f"保存新闻条目失败 [{item.title[:30]}...]: {e}")
total_items = new_count + updated_count
# 记录抓取信息
cursor.execute("""
INSERT OR REPLACE INTO crawl_records
(crawl_time, total_items, created_at)
VALUES (?, ?, ?)
""", (data.crawl_time, total_items, now_str))
# 获取刚插入的 crawl_record 的 ID
cursor.execute("""
SELECT id FROM crawl_records WHERE crawl_time = ?
""", (data.crawl_time,))
record_row = cursor.fetchone()
if record_row:
crawl_record_id = record_row[0]
# 记录成功的来源
for source_id in success_sources:
cursor.execute("""
INSERT OR REPLACE INTO crawl_source_status
(crawl_record_id, platform_id, status)
VALUES (?, ?, 'success')
""", (crawl_record_id, source_id))
# 记录失败的来源
for failed_id in data.failed_ids:
# 确保失败的平台也在 platforms 表中
cursor.execute("""
INSERT OR IGNORE INTO platforms (id, name, updated_at)
VALUES (?, ?, ?)
""", (failed_id, failed_id, now_str))
cursor.execute("""
INSERT OR REPLACE INTO crawl_source_status
(crawl_record_id, platform_id, status)
VALUES (?, ?, 'failed')
""", (crawl_record_id, failed_id))
conn.commit()
# 输出详细的存储统计日志
log_parts = [f"[本地存储] 处理完成:新增 {new_count}"]
if updated_count > 0:
log_parts.append(f"更新 {updated_count}")
if title_changed_count > 0:
log_parts.append(f"标题变更 {title_changed_count}")
print("".join(log_parts))
return True
except Exception as e:
print(f"[本地存储] 保存失败: {e}")
return False
def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
"""
获取指定日期的所有新闻数据(合并后)
Args:
date: 日期字符串默认为今天
Returns:
合并后的新闻数据
"""
try:
db_path = self._get_db_path(date)
if not db_path.exists():
return None
conn = self._get_connection(date)
cursor = conn.cursor()
# 获取所有新闻数据(包含 id 用于查询排名历史)
cursor.execute("""
SELECT n.id, n.title, n.platform_id, p.name as platform_name,
n.rank, n.url, n.mobile_url,
n.first_crawl_time, n.last_crawl_time, n.crawl_count
FROM news_items n
LEFT JOIN platforms p ON n.platform_id = p.id
ORDER BY n.platform_id, n.last_crawl_time
""")
rows = cursor.fetchall()
if not rows:
return None
# 收集所有 news_item_id
news_ids = [row[0] for row in rows]
# 批量查询排名历史
rank_history_map: Dict[int, List[int]] = {}
if news_ids:
placeholders = ",".join("?" * len(news_ids))
cursor.execute(f"""
SELECT news_item_id, rank FROM rank_history
WHERE news_item_id IN ({placeholders})
ORDER BY news_item_id, crawl_time
""", news_ids)
for rh_row in cursor.fetchall():
news_id, rank = rh_row[0], rh_row[1]
if news_id not in rank_history_map:
rank_history_map[news_id] = []
if rank not in rank_history_map[news_id]:
rank_history_map[news_id].append(rank)
# 按 platform_id 分组
items: Dict[str, List[NewsItem]] = {}
id_to_name: Dict[str, str] = {}
crawl_date = self._format_date_folder(date)
for row in rows:
news_id = row[0]
platform_id = row[2]
title = row[1]
platform_name = row[3] or platform_id
id_to_name[platform_id] = platform_name
if platform_id not in items:
items[platform_id] = []
# 获取排名历史,如果没有则使用当前排名
ranks = rank_history_map.get(news_id, [row[4]])
items[platform_id].append(NewsItem(
title=title,
source_id=platform_id,
source_name=platform_name,
rank=row[4],
url=row[5] or "",
mobile_url=row[6] or "",
crawl_time=row[8], # last_crawl_time
ranks=ranks,
first_time=row[7], # first_crawl_time
last_time=row[8], # last_crawl_time
count=row[9], # crawl_count
))
final_items = items
# 获取失败的来源
cursor.execute("""
SELECT DISTINCT css.platform_id
FROM crawl_source_status css
JOIN crawl_records cr ON css.crawl_record_id = cr.id
WHERE css.status = 'failed'
""")
failed_ids = [row[0] for row in cursor.fetchall()]
# 获取最新的抓取时间
cursor.execute("""
SELECT crawl_time FROM crawl_records
ORDER BY crawl_time DESC
LIMIT 1
""")
time_row = cursor.fetchone()
crawl_time = time_row[0] if time_row else self._format_time_filename()
return NewsData(
date=crawl_date,
crawl_time=crawl_time,
items=final_items,
id_to_name=id_to_name,
failed_ids=failed_ids,
)
except Exception as e:
print(f"[本地存储] 读取数据失败: {e}")
return None
def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
"""
获取最新一次抓取的数据
Args:
date: 日期字符串默认为今天
Returns:
最新抓取的新闻数据
"""
try:
db_path = self._get_db_path(date)
if not db_path.exists():
return None
conn = self._get_connection(date)
cursor = conn.cursor()
# 获取最新的抓取时间
cursor.execute("""
SELECT crawl_time FROM crawl_records
ORDER BY crawl_time DESC
LIMIT 1
""")
time_row = cursor.fetchone()
if not time_row:
return None
latest_time = time_row[0]
# 获取该时间的新闻数据(包含 id 用于查询排名历史)
cursor.execute("""
SELECT n.id, n.title, n.platform_id, p.name as platform_name,
n.rank, n.url, n.mobile_url,
n.first_crawl_time, n.last_crawl_time, n.crawl_count
FROM news_items n
LEFT JOIN platforms p ON n.platform_id = p.id
WHERE n.last_crawl_time = ?
""", (latest_time,))
rows = cursor.fetchall()
if not rows:
return None
# 收集所有 news_item_id
news_ids = [row[0] for row in rows]
# 批量查询排名历史
rank_history_map: Dict[int, List[int]] = {}
if news_ids:
placeholders = ",".join("?" * len(news_ids))
cursor.execute(f"""
SELECT news_item_id, rank FROM rank_history
WHERE news_item_id IN ({placeholders})
ORDER BY news_item_id, crawl_time
""", news_ids)
for rh_row in cursor.fetchall():
news_id, rank = rh_row[0], rh_row[1]
if news_id not in rank_history_map:
rank_history_map[news_id] = []
if rank not in rank_history_map[news_id]:
rank_history_map[news_id].append(rank)
items: Dict[str, List[NewsItem]] = {}
id_to_name: Dict[str, str] = {}
crawl_date = self._format_date_folder(date)
for row in rows:
news_id = row[0]
platform_id = row[2]
platform_name = row[3] or platform_id
id_to_name[platform_id] = platform_name
if platform_id not in items:
items[platform_id] = []
# 获取排名历史,如果没有则使用当前排名
ranks = rank_history_map.get(news_id, [row[4]])
items[platform_id].append(NewsItem(
title=row[1],
source_id=platform_id,
source_name=platform_name,
rank=row[4],
url=row[5] or "",
mobile_url=row[6] or "",
crawl_time=row[8], # last_crawl_time
ranks=ranks,
first_time=row[7], # first_crawl_time
last_time=row[8], # last_crawl_time
count=row[9], # crawl_count
))
# 获取失败的来源(针对最新一次抓取)
cursor.execute("""
SELECT css.platform_id
FROM crawl_source_status css
JOIN crawl_records cr ON css.crawl_record_id = cr.id
WHERE cr.crawl_time = ? AND css.status = 'failed'
""", (latest_time,))
failed_ids = [row[0] for row in cursor.fetchall()]
return NewsData(
date=crawl_date,
crawl_time=latest_time,
items=items,
id_to_name=id_to_name,
failed_ids=failed_ids,
)
except Exception as e:
print(f"[本地存储] 获取最新数据失败: {e}")
return None
def detect_new_titles(self, current_data: NewsData) -> Dict[str, Dict]:
"""
检测新增的标题
Args:
current_data: 当前抓取的数据
Returns:
新增的标题数据 {source_id: {title: NewsItem}}
"""
try:
# 获取历史数据
historical_data = self.get_today_all_data(current_data.date)
if not historical_data:
# 没有历史数据,所有都是新的
new_titles = {}
for source_id, news_list in current_data.items.items():
new_titles[source_id] = {item.title: item for item in news_list}
return new_titles
# 收集历史标题
historical_titles: Dict[str, set] = {}
for source_id, news_list in historical_data.items.items():
historical_titles[source_id] = {item.title for item in news_list}
# 检测新增
new_titles = {}
for source_id, news_list in current_data.items.items():
hist_set = historical_titles.get(source_id, set())
for item in news_list:
if item.title not in hist_set:
if source_id not in new_titles:
new_titles[source_id] = {}
new_titles[source_id][item.title] = item
return new_titles
except Exception as e:
print(f"[本地存储] 检测新标题失败: {e}")
return {}
def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
"""
保存 TXT 快照
Args:
data: 新闻数据
Returns:
保存的文件路径
"""
if not self.enable_txt:
return None
try:
date_folder = self._format_date_folder(data.date)
txt_dir = self.data_dir / date_folder / "txt"
txt_dir.mkdir(parents=True, exist_ok=True)
file_path = txt_dir / f"{data.crawl_time}.txt"
with open(file_path, "w", encoding="utf-8") as f:
for source_id, news_list in data.items.items():
source_name = data.id_to_name.get(source_id, source_id)
# 写入来源标题
if source_name and source_name != source_id:
f.write(f"{source_id} | {source_name}\n")
else:
f.write(f"{source_id}\n")
# 按排名排序
sorted_news = sorted(news_list, key=lambda x: x.rank)
for item in sorted_news:
line = f"{item.rank}. {item.title}"
if item.url:
line += f" [URL:{item.url}]"
if item.mobile_url:
line += f" [MOBILE:{item.mobile_url}]"
f.write(line + "\n")
f.write("\n")
# 写入失败的来源
if data.failed_ids:
f.write("==== 以下ID请求失败 ====\n")
for failed_id in data.failed_ids:
f.write(f"{failed_id}\n")
print(f"[本地存储] TXT 快照已保存: {file_path}")
return str(file_path)
except Exception as e:
print(f"[本地存储] 保存 TXT 快照失败: {e}")
return None
def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
"""
保存 HTML 报告
Args:
html_content: HTML 内容
filename: 文件名
is_summary: 是否为汇总报告
Returns:
保存的文件路径
"""
if not self.enable_html:
return None
try:
date_folder = self._format_date_folder()
html_dir = self.data_dir / date_folder / "html"
html_dir.mkdir(parents=True, exist_ok=True)
file_path = html_dir / filename
with open(file_path, "w", encoding="utf-8") as f:
f.write(html_content)
print(f"[本地存储] HTML 报告已保存: {file_path}")
return str(file_path)
except Exception as e:
print(f"[本地存储] 保存 HTML 报告失败: {e}")
return None
def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
"""
检查是否是当天第一次抓取
Args:
date: 日期字符串默认为今天
Returns:
是否是第一次抓取
"""
try:
db_path = self._get_db_path(date)
if not db_path.exists():
return True
conn = self._get_connection(date)
cursor = conn.cursor()
cursor.execute("""
SELECT COUNT(*) as count FROM crawl_records
""")
row = cursor.fetchone()
count = row[0] if row else 0
# 如果只有一条或没有记录,视为第一次抓取
return count <= 1
except Exception as e:
print(f"[本地存储] 检查首次抓取失败: {e}")
return True
def get_crawl_times(self, date: Optional[str] = None) -> List[str]:
"""
获取指定日期的所有抓取时间列表
Args:
date: 日期字符串默认为今天
Returns:
抓取时间列表按时间排序
"""
try:
db_path = self._get_db_path(date)
if not db_path.exists():
return []
conn = self._get_connection(date)
cursor = conn.cursor()
cursor.execute("""
SELECT crawl_time FROM crawl_records
ORDER BY crawl_time
""")
rows = cursor.fetchall()
return [row[0] for row in rows]
except Exception as e:
print(f"[本地存储] 获取抓取时间列表失败: {e}")
return []
def cleanup(self) -> None:
"""清理资源(关闭数据库连接)"""
for db_path, conn in self._db_connections.items():
try:
conn.close()
print(f"[本地存储] 关闭数据库连接: {db_path}")
except Exception as e:
print(f"[本地存储] 关闭连接失败 {db_path}: {e}")
self._db_connections.clear()
def cleanup_old_data(self, retention_days: int) -> int:
"""
清理过期数据
Args:
retention_days: 保留天数,0 表示不清理
Returns:
删除的日期目录数量
"""
if retention_days <= 0:
return 0
deleted_count = 0
cutoff_date = self._get_configured_time() - timedelta(days=retention_days)
try:
if not self.data_dir.exists():
return 0
for date_folder in self.data_dir.iterdir():
if not date_folder.is_dir() or date_folder.name.startswith('.'):
continue
# 解析日期文件夹名(支持两种格式)
folder_date = None
try:
# ISO 格式: YYYY-MM-DD
date_match = re.match(r'(\d{4})-(\d{2})-(\d{2})', date_folder.name)
if date_match:
folder_date = datetime(
int(date_match.group(1)),
int(date_match.group(2)),
int(date_match.group(3)),
tzinfo=pytz.timezone("Asia/Shanghai")
)
else:
# 旧中文格式: YYYY年MM月DD日
date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_folder.name)
if date_match:
folder_date = datetime(
int(date_match.group(1)),
int(date_match.group(2)),
int(date_match.group(3)),
tzinfo=pytz.timezone("Asia/Shanghai")
)
except Exception:
continue
if folder_date and folder_date < cutoff_date:
# 先关闭该日期的数据库连接
db_path = str(self._get_db_path(date_folder.name))
if db_path in self._db_connections:
try:
self._db_connections[db_path].close()
del self._db_connections[db_path]
except Exception:
pass
# 删除整个日期目录
try:
shutil.rmtree(date_folder)
deleted_count += 1
print(f"[本地存储] 清理过期数据: {date_folder.name}")
except Exception as e:
print(f"[本地存储] 删除目录失败 {date_folder.name}: {e}")
if deleted_count > 0:
print(f"[本地存储] 共清理 {deleted_count} 个过期日期目录")
return deleted_count
except Exception as e:
print(f"[本地存储] 清理过期数据失败: {e}")
return deleted_count
def has_pushed_today(self, date: Optional[str] = None) -> bool:
"""
检查指定日期是否已推送过
Args:
date: 日期字符串(YYYY-MM-DD),默认为今天
Returns:
是否已推送
"""
try:
conn = self._get_connection(date)
cursor = conn.cursor()
target_date = self._format_date_folder(date)
cursor.execute("""
SELECT pushed FROM push_records WHERE date = ?
""", (target_date,))
row = cursor.fetchone()
if row:
return bool(row[0])
return False
except Exception as e:
print(f"[本地存储] 检查推送记录失败: {e}")
return False
def record_push(self, report_type: str, date: Optional[str] = None) -> bool:
"""
记录推送
Args:
report_type: 报告类型
date: 日期字符串(YYYY-MM-DD),默认为今天
Returns:
是否记录成功
"""
try:
conn = self._get_connection(date)
cursor = conn.cursor()
target_date = self._format_date_folder(date)
now_str = self._get_configured_time().strftime("%Y-%m-%d %H:%M:%S")
cursor.execute("""
INSERT INTO push_records (date, pushed, push_time, report_type, created_at)
VALUES (?, 1, ?, ?, ?)
ON CONFLICT(date) DO UPDATE SET
pushed = 1,
push_time = excluded.push_time,
report_type = excluded.report_type
""", (target_date, now_str, report_type, now_str))
conn.commit()
print(f"[本地存储] 推送记录已保存: {report_type} at {now_str}")
return True
except Exception as e:
print(f"[本地存储] 记录推送失败: {e}")
return False
def __del__(self):
"""析构函数,确保关闭连接"""
self.cleanup()
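A minimal usage sketch for the backend above; paths and data are illustrative, and the news_data would normally come from convert_crawl_results_to_news_data in base.py.
backend = LocalStorageBackend(data_dir="output", enable_txt=True)
print(backend.is_first_crawl_today())   # True while today's news.db has at most one crawl record
# backend.save_news_data(news_data)     # persist one crawl into output/<date>/news.db
# backend.save_txt_snapshot(news_data)  # optional TXT snapshot under output/<date>/txt/
# merged = backend.get_today_all_data() # merged view of every crawl stored today
backend.cleanup()                       # close cached SQLite connections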

View File

@@ -0,0 +1,316 @@
# coding=utf-8
"""
存储管理器 - 统一管理存储后端
根据环境和配置自动选择合适的存储后端
"""
import os
from typing import Optional
from trendradar.storage.base import StorageBackend, NewsData
# 存储管理器单例
_storage_manager: Optional["StorageManager"] = None
class StorageManager:
"""
存储管理器
功能
- 自动检测运行环境(GitHub Actions / Docker / 本地)
- 根据配置选择存储后端(local / remote / auto)
- 提供统一的存储接口
- 支持从远程拉取数据到本地
"""
def __init__(
self,
backend_type: str = "auto",
data_dir: str = "output",
enable_txt: bool = True,
enable_html: bool = True,
remote_config: Optional[dict] = None,
local_retention_days: int = 0,
remote_retention_days: int = 0,
pull_enabled: bool = False,
pull_days: int = 0,
timezone: str = "Asia/Shanghai",
):
"""
初始化存储管理器
Args:
backend_type: 存储后端类型 (local / remote / auto)
data_dir: 本地数据目录
enable_txt: 是否启用 TXT 快照
enable_html: 是否启用 HTML 报告
remote_config: 远程存储配置(endpoint_url, bucket_name, access_key_id)
local_retention_days: 本地数据保留天数(0 = 无限制)
remote_retention_days: 远程数据保留天数(0 = 无限制)
pull_enabled: 是否启用启动时自动拉取
pull_days: 拉取最近 N 天的数据
timezone: 时区配置,默认 Asia/Shanghai
"""
self.backend_type = backend_type
self.data_dir = data_dir
self.enable_txt = enable_txt
self.enable_html = enable_html
self.remote_config = remote_config or {}
self.local_retention_days = local_retention_days
self.remote_retention_days = remote_retention_days
self.pull_enabled = pull_enabled
self.pull_days = pull_days
self.timezone = timezone
self._backend: Optional[StorageBackend] = None
self._remote_backend: Optional[StorageBackend] = None
@staticmethod
def is_github_actions() -> bool:
"""检测是否在 GitHub Actions 环境中运行"""
return os.environ.get("GITHUB_ACTIONS") == "true"
@staticmethod
def is_docker() -> bool:
"""检测是否在 Docker 容器中运行"""
# 方法1: 检查 /.dockerenv 文件
if os.path.exists("/.dockerenv"):
return True
# 方法2: 检查 cgroupLinux
try:
with open("/proc/1/cgroup", "r") as f:
return "docker" in f.read()
except (FileNotFoundError, PermissionError):
pass
# 方法3: 检查环境变量
return os.environ.get("DOCKER_CONTAINER") == "true"
def _resolve_backend_type(self) -> str:
"""解析实际使用的后端类型"""
if self.backend_type == "auto":
if self.is_github_actions():
# GitHub Actions 环境,检查是否配置了远程存储
if self._has_remote_config():
return "remote"
else:
print("[存储管理器] GitHub Actions 环境但未配置远程存储,使用本地存储")
return "local"
else:
return "local"
return self.backend_type
def _has_remote_config(self) -> bool:
"""检查是否有有效的远程存储配置"""
# 检查配置或环境变量
bucket_name = self.remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME")
access_key = self.remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID")
secret_key = self.remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY")
endpoint = self.remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL")
# 调试日志
has_config = bool(bucket_name and access_key and secret_key and endpoint)
if not has_config:
print(f"[存储管理器] 远程存储配置检查失败:")
print(f" - bucket_name: {'已配置' if bucket_name else '未配置'}")
print(f" - access_key_id: {'已配置' if access_key else '未配置'}")
print(f" - secret_access_key: {'已配置' if secret_key else '未配置'}")
print(f" - endpoint_url: {'已配置' if endpoint else '未配置'}")
return has_config
def _create_remote_backend(self) -> Optional[StorageBackend]:
"""创建远程存储后端"""
try:
from trendradar.storage.remote import RemoteStorageBackend
return RemoteStorageBackend(
bucket_name=self.remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME", ""),
access_key_id=self.remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID", ""),
secret_access_key=self.remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY", ""),
endpoint_url=self.remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL", ""),
region=self.remote_config.get("region") or os.environ.get("S3_REGION", ""),
enable_txt=self.enable_txt,
enable_html=self.enable_html,
timezone=self.timezone,
)
except ImportError as e:
print(f"[存储管理器] 远程后端导入失败: {e}")
print("[存储管理器] 请确保已安装 boto3: pip install boto3")
return None
except Exception as e:
print(f"[存储管理器] 远程后端初始化失败: {e}")
return None
def get_backend(self) -> StorageBackend:
"""获取存储后端实例"""
if self._backend is None:
resolved_type = self._resolve_backend_type()
if resolved_type == "remote":
self._backend = self._create_remote_backend()
if self._backend:
print(f"[存储管理器] 使用远程存储后端")
else:
print("[存储管理器] 回退到本地存储")
resolved_type = "local"
if resolved_type == "local" or self._backend is None:
from trendradar.storage.local import LocalStorageBackend
self._backend = LocalStorageBackend(
data_dir=self.data_dir,
enable_txt=self.enable_txt,
enable_html=self.enable_html,
timezone=self.timezone,
)
print(f"[存储管理器] 使用本地存储后端 (数据目录: {self.data_dir})")
return self._backend
def pull_from_remote(self) -> int:
"""
从远程拉取数据到本地
Returns:
成功拉取的文件数量
"""
if not self.pull_enabled or self.pull_days <= 0:
return 0
if not self._has_remote_config():
print("[存储管理器] 未配置远程存储,无法拉取")
return 0
# 创建远程后端(如果还没有)
if self._remote_backend is None:
self._remote_backend = self._create_remote_backend()
if self._remote_backend is None:
print("[存储管理器] 无法创建远程后端,拉取失败")
return 0
# 调用拉取方法
return self._remote_backend.pull_recent_days(self.pull_days, self.data_dir)
def save_news_data(self, data: NewsData) -> bool:
"""保存新闻数据"""
return self.get_backend().save_news_data(data)
def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
"""获取当天所有数据"""
return self.get_backend().get_today_all_data(date)
def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
"""获取最新抓取数据"""
return self.get_backend().get_latest_crawl_data(date)
def detect_new_titles(self, current_data: NewsData) -> dict:
"""检测新增标题"""
return self.get_backend().detect_new_titles(current_data)
def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
"""保存 TXT 快照"""
return self.get_backend().save_txt_snapshot(data)
def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
"""保存 HTML 报告"""
return self.get_backend().save_html_report(html_content, filename, is_summary)
def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
"""检查是否是当天第一次抓取"""
return self.get_backend().is_first_crawl_today(date)
def cleanup(self) -> None:
"""清理资源"""
if self._backend:
self._backend.cleanup()
if self._remote_backend:
self._remote_backend.cleanup()
def cleanup_old_data(self) -> int:
"""
清理过期数据
Returns:
删除的日期目录数量
"""
total_deleted = 0
# 清理本地数据
if self.local_retention_days > 0:
total_deleted += self.get_backend().cleanup_old_data(self.local_retention_days)
# 清理远程数据(如果配置了)
if self.remote_retention_days > 0 and self._has_remote_config():
if self._remote_backend is None:
self._remote_backend = self._create_remote_backend()
if self._remote_backend:
total_deleted += self._remote_backend.cleanup_old_data(self.remote_retention_days)
return total_deleted
@property
def backend_name(self) -> str:
"""获取当前后端名称"""
return self.get_backend().backend_name
@property
def supports_txt(self) -> bool:
"""是否支持 TXT 快照"""
return self.get_backend().supports_txt
def get_storage_manager(
backend_type: str = "auto",
data_dir: str = "output",
enable_txt: bool = True,
enable_html: bool = True,
remote_config: Optional[dict] = None,
local_retention_days: int = 0,
remote_retention_days: int = 0,
pull_enabled: bool = False,
pull_days: int = 0,
timezone: str = "Asia/Shanghai",
force_new: bool = False,
) -> StorageManager:
"""
获取存储管理器单例
Args:
backend_type: 存储后端类型
data_dir: 本地数据目录
enable_txt: 是否启用 TXT 快照
enable_html: 是否启用 HTML 报告
remote_config: 远程存储配置
local_retention_days: 本地数据保留天数(0 = 无限制)
remote_retention_days: 远程数据保留天数(0 = 无限制)
pull_enabled: 是否启用启动时自动拉取
pull_days: 拉取最近 N 天的数据
timezone: 时区配置,默认 Asia/Shanghai
force_new: 是否强制创建新实例
Returns:
StorageManager 实例
"""
global _storage_manager
if _storage_manager is None or force_new:
_storage_manager = StorageManager(
backend_type=backend_type,
data_dir=data_dir,
enable_txt=enable_txt,
enable_html=enable_html,
remote_config=remote_config,
local_retention_days=local_retention_days,
remote_retention_days=remote_retention_days,
pull_enabled=pull_enabled,
pull_days=pull_days,
timezone=timezone,
)
return _storage_manager
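A sketch of wiring the manager at startup; the retention value is a placeholder and backend selection follows the auto rules above.
manager = get_storage_manager(
    backend_type="auto",        # local / remote / auto
    data_dir="output",
    local_retention_days=30,    # prune local date folders older than 30 days
)
manager.cleanup_old_data()
print(manager.backend_name)     # "local" outside GitHub Actions without S3 config
manager.cleanup()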

1071
trendradar/storage/remote.py Normal file

File diff suppressed because it is too large

View File

@@ -0,0 +1,117 @@
-- TrendRadar 数据库表结构
-- ============================================
-- 平台信息表
-- 核心:id 不变,name 可变
-- ============================================
CREATE TABLE IF NOT EXISTS platforms (
id TEXT PRIMARY KEY,
name TEXT NOT NULL,
is_active INTEGER DEFAULT 1,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- ============================================
-- 新闻条目表
-- 以 URL + platform_id 为唯一标识,支持去重存储
-- ============================================
CREATE TABLE IF NOT EXISTS news_items (
id INTEGER PRIMARY KEY AUTOINCREMENT,
title TEXT NOT NULL,
platform_id TEXT NOT NULL,
rank INTEGER NOT NULL,
url TEXT DEFAULT '',
mobile_url TEXT DEFAULT '',
first_crawl_time TEXT NOT NULL, -- 首次抓取时间
last_crawl_time TEXT NOT NULL, -- 最后抓取时间
crawl_count INTEGER DEFAULT 1, -- 抓取次数
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (platform_id) REFERENCES platforms(id)
);
-- ============================================
-- 标题变更历史表
-- 记录同一 URL 下标题的变化
-- ============================================
CREATE TABLE IF NOT EXISTS title_changes (
id INTEGER PRIMARY KEY AUTOINCREMENT,
news_item_id INTEGER NOT NULL,
old_title TEXT NOT NULL,
new_title TEXT NOT NULL,
changed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (news_item_id) REFERENCES news_items(id)
);
-- ============================================
-- 排名历史表
-- 记录每次抓取时的排名变化
-- ============================================
CREATE TABLE IF NOT EXISTS rank_history (
id INTEGER PRIMARY KEY AUTOINCREMENT,
news_item_id INTEGER NOT NULL,
rank INTEGER NOT NULL,
crawl_time TEXT NOT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (news_item_id) REFERENCES news_items(id)
);
-- ============================================
-- 抓取记录表
-- 记录每次抓取的时间和数量
-- ============================================
CREATE TABLE IF NOT EXISTS crawl_records (
id INTEGER PRIMARY KEY AUTOINCREMENT,
crawl_time TEXT NOT NULL UNIQUE,
total_items INTEGER DEFAULT 0,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- ============================================
-- 抓取来源状态表
-- 记录每次抓取各平台的成功/失败状态
-- ============================================
CREATE TABLE IF NOT EXISTS crawl_source_status (
crawl_record_id INTEGER NOT NULL,
platform_id TEXT NOT NULL,
status TEXT NOT NULL CHECK(status IN ('success', 'failed')),
PRIMARY KEY (crawl_record_id, platform_id),
FOREIGN KEY (crawl_record_id) REFERENCES crawl_records(id),
FOREIGN KEY (platform_id) REFERENCES platforms(id)
);
-- ============================================
-- 推送记录表
-- 用于 push_window once_per_day 功能
-- ============================================
CREATE TABLE IF NOT EXISTS push_records (
id INTEGER PRIMARY KEY AUTOINCREMENT,
date TEXT NOT NULL UNIQUE,
pushed INTEGER DEFAULT 0,
push_time TEXT,
report_type TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- ============================================
-- 索引定义
-- ============================================
-- 平台索引
CREATE INDEX IF NOT EXISTS idx_news_platform ON news_items(platform_id);
-- 时间索引(用于查询最新数据)
CREATE INDEX IF NOT EXISTS idx_news_crawl_time ON news_items(last_crawl_time);
-- 标题索引(用于标题搜索)
CREATE INDEX IF NOT EXISTS idx_news_title ON news_items(title);
-- URL + platform_id 唯一索引(仅对非空 URL),实现去重
CREATE UNIQUE INDEX IF NOT EXISTS idx_news_url_platform
ON news_items(url, platform_id) WHERE url != '';
-- 抓取状态索引
CREATE INDEX IF NOT EXISTS idx_crawl_status_record ON crawl_source_status(crawl_record_id);
-- 排名历史索引
CREATE INDEX IF NOT EXISTS idx_rank_history_news ON rank_history(news_item_id);
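As an illustration of how the tables fit together, a query like the following (run against one day's news.db, path assumed from LocalStorageBackend) lists items whose best recorded rank reached the top 3:
import sqlite3

conn = sqlite3.connect("output/2025-12-13/news.db")
rows = conn.execute("""
    SELECT p.name, n.title, MIN(r.rank) AS best_rank, n.crawl_count
    FROM news_items n
    JOIN platforms p ON p.id = n.platform_id
    JOIN rank_history r ON r.news_item_id = n.id
    GROUP BY n.id
    HAVING MIN(r.rank) <= 3
    ORDER BY best_rank
""").fetchall()
for name, title, best_rank, crawl_count in rows:
    print(f"[{name}] #{best_rank} {title} ({crawl_count} crawls)")
conn.close()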

View File

@@ -0,0 +1,20 @@
# coding=utf-8
"""
工具模块 - 公共工具函数
"""
from trendradar.utils.time import (
get_configured_time,
format_date_folder,
format_time_filename,
get_current_time_display,
convert_time_for_display,
)
__all__ = [
"get_configured_time",
"format_date_folder",
"format_time_filename",
"get_current_time_display",
"convert_time_for_display",
]

91
trendradar/utils/time.py Normal file
View File

@@ -0,0 +1,91 @@
# coding=utf-8
"""
时间工具模块 - 统一时间处理函数
"""
from datetime import datetime
from typing import Optional
import pytz
# 默认时区
DEFAULT_TIMEZONE = "Asia/Shanghai"
def get_configured_time(timezone: str = DEFAULT_TIMEZONE) -> datetime:
"""
获取配置时区的当前时间
Args:
timezone: 时区名称 'Asia/Shanghai', 'America/Los_Angeles'
Returns:
带时区信息的当前时间
"""
try:
tz = pytz.timezone(timezone)
except pytz.UnknownTimeZoneError:
print(f"[警告] 未知时区 '{timezone}',使用默认时区 {DEFAULT_TIMEZONE}")
tz = pytz.timezone(DEFAULT_TIMEZONE)
return datetime.now(tz)
def format_date_folder(
date: Optional[str] = None, timezone: str = DEFAULT_TIMEZONE
) -> str:
"""
格式化日期文件夹名 (ISO 格式: YYYY-MM-DD)
Args:
date: 指定日期字符串,None 则使用当前日期
timezone: 时区名称
Returns:
格式化后的日期字符串 '2025-12-09'
"""
if date:
return date
return get_configured_time(timezone).strftime("%Y-%m-%d")
def format_time_filename(timezone: str = DEFAULT_TIMEZONE) -> str:
"""
格式化时间文件名 (格式: HH-MM,用于文件名)
Windows 系统不支持冒号作为文件名,因此使用连字符
Args:
timezone: 时区名称
Returns:
格式化后的时间字符串 '15-30'
"""
return get_configured_time(timezone).strftime("%H-%M")
def get_current_time_display(timezone: str = DEFAULT_TIMEZONE) -> str:
"""
获取当前时间显示 (格式: HH:MM,用于显示)
Args:
timezone: 时区名称
Returns:
格式化后的时间字符串 '15:30'
"""
return get_configured_time(timezone).strftime("%H:%M")
def convert_time_for_display(time_str: str) -> str:
"""
HH-MM 格式转换为 HH:MM 格式用于显示
Args:
time_str: 输入时间字符串 '15-30'
Returns:
转换后的时间字符串 '15:30'
"""
if time_str and "-" in time_str and len(time_str) == 5:
return time_str.replace("-", ":")
return time_str
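A quick illustration of the helpers above (outputs depend on the current time):
print(format_date_folder(timezone="Asia/Shanghai"))   # e.g. "2025-12-13"
print(format_time_filename())                         # e.g. "15-30"  (file-name safe)
print(get_current_time_display())                     # e.g. "15:30"
print(convert_time_for_display("15-30"))              # "15:30"
print(convert_time_for_display("2025-12-13"))         # unchanged (length != 5)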

View File

@@ -1 +1 @@
3.5.0 4.0.0