diff --git a/.github/ISSUE_TEMPLATE/01-bug-report.yml b/.github/ISSUE_TEMPLATE/01-bug-report.yml index f028116..0c3db59 100644 --- a/.github/ISSUE_TEMPLATE/01-bug-report.yml +++ b/.github/ISSUE_TEMPLATE/01-bug-report.yml @@ -4,8 +4,6 @@ name: 🐛 遇到问题了 description: 程序运行不正常或出现错误 title: "[问题] " labels: ["bug"] -assignees: - - sansan0 body: - type: markdown attributes: diff --git a/.github/ISSUE_TEMPLATE/02-feature-request.yml b/.github/ISSUE_TEMPLATE/02-feature-request.yml index 227625c..77963eb 100644 --- a/.github/ISSUE_TEMPLATE/02-feature-request.yml +++ b/.github/ISSUE_TEMPLATE/02-feature-request.yml @@ -4,8 +4,6 @@ name: 💡 我有个想法 description: 建议新功能或改进现有功能 title: "[建议] " labels: ["enhancement"] -assignees: - - sansan0 body: - type: markdown attributes: diff --git a/.github/ISSUE_TEMPLATE/03-config-help.yml b/.github/ISSUE_TEMPLATE/03-config-help.yml index bdaff53..e86cbaa 100644 --- a/.github/ISSUE_TEMPLATE/03-config-help.yml +++ b/.github/ISSUE_TEMPLATE/03-config-help.yml @@ -4,8 +4,6 @@ name: ⚙️ 设置遇到困难 description: 配置相关的问题或需要帮助 title: "[设置] " labels: ["配置", "帮助"] -assignees: - - sansan0 body: - type: markdown attributes: diff --git a/.github/workflows/clean-crawler.yml b/.github/workflows/clean-crawler.yml new file mode 100644 index 0000000..a84a142 --- /dev/null +++ b/.github/workflows/clean-crawler.yml @@ -0,0 +1,28 @@ +name: Check In + +# ✅ 签到续期:运行此 workflow 可重置 7 天计时,保持 "Get Hot News" 正常运行 +# ✅ Renewal: Run this workflow to reset the 7-day timer and keep "Get Hot News" active +# +# 📌 操作方法 / How to use: +# 1. 点击 "Run workflow" 按钮 / Click "Run workflow" button +# 2. 每 7 天内至少运行一次 / Run at least once every 7 days + +on: + workflow_dispatch: + +jobs: + del_runs: + runs-on: ubuntu-latest + permissions: + actions: write + contents: read + steps: + - name: Delete all workflow runs + uses: Mattraks/delete-workflow-runs@v2 + with: + token: ${{ github.token }} + repository: ${{ github.repository }} + retain_days: 0 + keep_minimum_runs: 0 + delete_workflow_by_state_pattern: "ALL" + delete_run_by_conclusion_pattern: "ALL" \ No newline at end of file diff --git a/.github/workflows/crawler.yml b/.github/workflows/crawler.yml new file mode 100644 index 0000000..733de22 --- /dev/null +++ b/.github/workflows/crawler.yml @@ -0,0 +1,163 @@ +name: Get Hot News + +on: + schedule: + # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + # ⚠️ 试用版说明 / Trial Mode + # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + # + # 🔄 运行机制 / How it works: + # - 每个周期为 7 天,届时自动停止 + # - 运行 "Check In" 会重置周期(重新开始 7 天倒计时,而非累加) + # - Each cycle is 7 days, then auto-stops + # - "Check In" resets the cycle (restarts 7-day countdown, not cumulative) + # + # 💡 设计初衷 / Why this design: + # 如果 7 天都忘了签到,或许这些资讯对你来说并非刚需 + # 适时的暂停,能帮你从信息流中抽离,给大脑留出喘息的空间 + # If you forget for 7 days, maybe you don't really need it + # A timely pause helps you detach from the stream and gives your mind space + # + # 🙏 珍惜资源 / Respect shared resources: + # GitHub Actions 是平台提供的公共资源,每次运行都会消耗算力 + # 签到机制确保资源分配给真正需要的用户,感谢你的理解与配合 + # GitHub Actions is a shared public resource provided by the platform + # Check-in ensures resources go to those who truly need it — thank you + # + # 🚀 长期使用请部署 Docker 版本 / For long-term use, deploy Docker version + # + # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + # + # 📝 修改运行时间:只改第一个数字(0-59),表示每小时第几分钟运行 + # 📝 Change time: Only modify the first number (0-59) = minute of each hour + # + # 示例 / Examples: + # "15 * * * *" → 每小时第15分钟 / minute 15 every hour + 
# "30 0-14 * * *" → 北京时间 8:00-22:00 每小时第30分钟 / Beijing 8am-10pm + # + - cron: "33 * * * *" + + workflow_dispatch: + +concurrency: + group: crawler-${{ github.ref_name }} + cancel-in-progress: true + +permissions: + contents: read + actions: write + +jobs: + crawl: + runs-on: ubuntu-latest + timeout-minutes: 15 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 1 + clean: true + + - name: Check Expiration + env: + GH_TOKEN: ${{ github.token }} + run: | + WORKFLOW_FILE="crawler.yml" + API_URL="repos/${{ github.repository }}/actions/workflows/$WORKFLOW_FILE/runs" + + TOTAL=$(gh api "$API_URL" --jq '.total_count') + if [ -z "$TOTAL" ] || [ "$TOTAL" -eq 0 ]; then + echo "No previous runs found, skipping expiration check" + exit 0 + fi + + LAST_PAGE=$(( (TOTAL + 99) / 100 )) + FIRST_RUN_DATE=$(gh api "$API_URL?per_page=100&page=$LAST_PAGE" --jq '.workflow_runs[-1].created_at') + + if [ -n "$FIRST_RUN_DATE" ]; then + CURRENT_TIMESTAMP=$(date +%s) + FIRST_RUN_TIMESTAMP=$(date -d "$FIRST_RUN_DATE" +%s) + DIFF_SECONDS=$((CURRENT_TIMESTAMP - FIRST_RUN_TIMESTAMP)) + LIMIT_SECONDS=604800 + + if [ $DIFF_SECONDS -gt $LIMIT_SECONDS ]; then + echo "⚠️ 试用期已结束,请运行 'Check In' 签到续期" + echo "⚠️ Trial expired. Run 'Check In' to renew." + gh workflow disable "$WORKFLOW_FILE" + exit 1 + else + DAYS_LEFT=$(( (LIMIT_SECONDS - DIFF_SECONDS) / 86400 )) + echo "✅ 试用期剩余 ${DAYS_LEFT} 天,到期前请运行 'Check In' 签到续期" + echo "✅ Trial: ${DAYS_LEFT} days left. Run 'Check In' before expiry to renew." + fi + fi + + + # -------------------------------------------------------------------------------- + # 🚦 TRAFFIC CONTROL / 流量控制 + # -------------------------------------------------------------------------------- + # EN: Generates a random delay between 1 and 300 seconds (5 minutes). + # Critical for load balancing. + # + # CN: 生成 1 到 300 秒(5分钟)之间的随机延迟。 + # 这对负载均衡至关重要。 + - name: Random Delay (Traffic Control) + if: success() + run: | + echo "🎲 Traffic Control: Generating random delay..." + DELAY=$(( ( RANDOM % 300 ) + 1 )) + echo "⏸️ Sleeping for ${DELAY} seconds to spread the load..." + sleep ${DELAY}s + echo "▶️ Delay finished. Starting crawler..." + + - name: Set up Python + if: success() + uses: actions/setup-python@v5 + with: + python-version: "3.10" + cache: "pip" + + - name: Install dependencies + if: success() + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Verify required files + if: success() + run: | + if [ ! 
-f config/config.yaml ]; then + echo "Error: Config missing" + exit 1 + fi + + - name: Run crawler + if: success() + env: + FEISHU_WEBHOOK_URL: ${{ secrets.FEISHU_WEBHOOK_URL }} + TELEGRAM_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }} + TELEGRAM_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }} + DINGTALK_WEBHOOK_URL: ${{ secrets.DINGTALK_WEBHOOK_URL }} + WEWORK_WEBHOOK_URL: ${{ secrets.WEWORK_WEBHOOK_URL }} + WEWORK_MSG_TYPE: ${{ secrets.WEWORK_MSG_TYPE }} + EMAIL_FROM: ${{ secrets.EMAIL_FROM }} + EMAIL_PASSWORD: ${{ secrets.EMAIL_PASSWORD }} + EMAIL_TO: ${{ secrets.EMAIL_TO }} + EMAIL_SMTP_SERVER: ${{ secrets.EMAIL_SMTP_SERVER }} + EMAIL_SMTP_PORT: ${{ secrets.EMAIL_SMTP_PORT }} + NTFY_TOPIC: ${{ secrets.NTFY_TOPIC }} + NTFY_SERVER_URL: ${{ secrets.NTFY_SERVER_URL }} + NTFY_TOKEN: ${{ secrets.NTFY_TOKEN }} + BARK_URL: ${{ secrets.BARK_URL }} + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} + STORAGE_BACKEND: auto + LOCAL_RETENTION_DAYS: ${{ secrets.LOCAL_RETENTION_DAYS }} + REMOTE_RETENTION_DAYS: ${{ secrets.REMOTE_RETENTION_DAYS }} + S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }} + S3_ACCESS_KEY_ID: ${{ secrets.S3_ACCESS_KEY_ID }} + S3_SECRET_ACCESS_KEY: ${{ secrets.S3_SECRET_ACCESS_KEY }} + S3_ENDPOINT_URL: ${{ secrets.S3_ENDPOINT_URL }} + S3_REGION: ${{ secrets.S3_REGION }} + GITHUB_ACTIONS: true + run: python -m trendradar diff --git a/README-EN.md b/README-EN.md index 084dda9..36be381 100644 --- a/README-EN.md +++ b/README-EN.md @@ -1,6 +1,6 @@
-> **📢 Announcement:** After communicating with GitHub officials, "One-Click Fork Deployment" will be restored after compliance adjustments are completed. Please stay tuned for **v4.0.0** update +> **📢 Announcement:** **v4.0.0** has been released! Including storage architecture refactoring, database optimization, modularization improvements, and more major updates TrendRadar Banner @@ -16,8 +16,8 @@ [![GitHub Stars](https://img.shields.io/github/stars/sansan0/TrendRadar?style=flat-square&logo=github&color=yellow)](https://github.com/sansan0/TrendRadar/stargazers) [![GitHub Forks](https://img.shields.io/github/forks/sansan0/TrendRadar?style=flat-square&logo=github&color=blue)](https://github.com/sansan0/TrendRadar/network/members) [![License](https://img.shields.io/badge/license-GPL--3.0-blue.svg?style=flat-square)](LICENSE) -[![Version](https://img.shields.io/badge/version-v3.5.0-blue.svg)](https://github.com/sansan0/TrendRadar) -[![MCP](https://img.shields.io/badge/MCP-v1.0.3-green.svg)](https://github.com/sansan0/TrendRadar) +[![Version](https://img.shields.io/badge/version-v4.0.0-blue.svg)](https://github.com/sansan0/TrendRadar) +[![MCP](https://img.shields.io/badge/MCP-v1.1.0-green.svg)](https://github.com/sansan0/TrendRadar) [![WeWork](https://img.shields.io/badge/WeWork-Notification-00D4AA?style=flat-square)](https://work.weixin.qq.com/) [![WeChat](https://img.shields.io/badge/WeChat-Notification-00D4AA?style=flat-square)](https://weixin.qq.com/) @@ -48,62 +48,61 @@
-🚨 【MUST READ】Important Announcement: The Correct Way to Deploy This Project +🚨 【Must Read】Important Announcement: v4.0.0 Deployment & Storage Architecture Changes
-> **⚠️ December 2025 Urgent Notice** -> -> Due to a surge in Fork numbers causing excessive load on GitHub servers, **GitHub Actions and GitHub Pages deployments are currently restricted**. Please read the following instructions carefully to ensure successful deployment. +### 🛠️ Choose the Deployment Method That Fits You -### 1. ✅ Only Recommended Deployment Method: Docker +#### 🅰️ Option 1: Docker Deployment (Recommended 🔥) -**This is currently the most stable solution, free from GitHub restrictions.** Data is stored locally and won't be affected by GitHub policy changes. +* **Features**: Most stable and simplest. Data is stored in **local SQLite**, fully under your control. + +* **Best for**: Users with their own server, NAS, or an always-on PC. * 👉 [Jump to Docker Deployment Tutorial](#6-docker-deployment) --- -### 2. If You Were Planning to Fork This Project... +#### 🅱️ Option 2: GitHub Actions Deployment (Restored ✅) -To reduce pressure on GitHub servers, **please DO NOT directly click the "Fork" button!** +* **Features**: Data is no longer committed directly to the repo. Instead, it is stored in **Remote Cloud Storage** (supports S3-compatible protocols: Cloudflare R2, Alibaba Cloud OSS, Tencent Cloud COS, etc.). -Please use the **"Use this template"** feature instead of Fork: +* **Requirement**: You **must** configure an S3-compatible object storage service (Cloudflare R2 recommended, it's free). + +> **⚠️ Note**: If you choose this option, you must complete the following two configuration steps: + +#### 1. 🚀 Recommended Start: Use this template + +To keep the repository clean and avoid inheriting redundant history, I **recommend** using Template mode: + +1. **Click** the green **[Use this template]** button at the top right of the original repository page. -1. **Click** the green **[Use this template]** button in the top right corner of the original repository page. 2. **Select** "Create a new repository". -**Why do this?** -* **❌ Fork**: Copies complete history records. Many forks running simultaneously will trigger GitHub risk control. -* **✅ Use this template**: Creates a completely new independent repository without historical baggage, more server-friendly. +> **💡 Why do this?** +> * **Use this template**: Creates a brand new, clean repository with no historical baggage. +> * **Fork**: Retains the complete commit history and relationships, consuming more GitHub resources. ---- +#### 2. ☁️ About the Mandatory Remote Storage for GitHub Actions -### 3. About New Data Storage +If you choose **Option 2 (GitHub Actions)**, you must configure an S3-compatible object storage service. -The new version will use **Cloudflare R2** to store news data, ensuring data persistence. +**Supported Storage Services:** +- **Cloudflare R2** (Recommended, generous free tier) +- Other S3-compatible services -**⚠️ Configuration Prerequisites:** +**⚠️ Configuration Prerequisites (Using Cloudflare R2 as Example):** -According to Cloudflare platform rules, activating R2 requires binding a payment method. +According to Cloudflare platform rules, enabling R2 requires binding a payment method. -- **Purpose:** Identity verification only (Verify Only), no charges will be incurred. -- **Payment:** Supports credit cards or PayPal (China region). -- **Usage:** R2's free tier is sufficient to cover this project's daily operation, no payment required. +* **Purpose**: Identity verification only (Verify Only). **No charges will be incurred**. ---- +* **Payment**: Supports international credit cards or PayPal. -### 4. 
📅 Future Plans & Documentation Reading Notes +* **Usage**: The R2 free tier (10GB storage/month) is sufficient to cover the daily operation of this project. No need to worry about costs. -> **Future Plans:** -> - Exploring new approach: keep Actions for fetching and pushing, but no longer save data to repository, use external storage instead. - -**⚠️ Reading Note:** -Given that the above plans mean **Fork deployment mode may return in a new form in the future**, and the workload to fully revise documentation is massive, we have temporarily retained the old descriptions. - -**At the current stage, if "Fork" related expressions still appear in subsequent tutorials, please ignore them or understand them as "Use this template"**. - -👉 **[Click here to view TrendRadar's latest official documentation](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)** +👉 **[Click to View Detailed Configuration Tutorial](#-quick-start)**
@@ -287,10 +286,32 @@ Supports **WeWork** (+ WeChat push solution), **Feishu**, **DingTalk**, **Telegr - ⚠️ **Paired Configuration**: Telegram and ntfy require paired parameter quantities to match (e.g., token and chat_id both have 2 values) - ⚠️ **Quantity Limit**: Default maximum 3 accounts per channel, exceeded values will be truncated -### **Multi-Platform Support** -- **GitHub Pages**: Auto-generate beautiful web reports, PC/mobile adapted +### **Flexible Storage Architecture (v4.0.0 Major Update)** + +**Multi-Backend Support**: +- ☁️ **Remote Cloud Storage**: GitHub Actions environment default, supports S3-compatible protocols (R2/OSS/COS, etc.), data stored in cloud, keeping repository clean +- 💾 **Local SQLite**: Traditional SQLite database, stable and efficient (Docker/local deployment) +- 🔀 **Auto Selection**: Auto-selects appropriate backend based on runtime environment + +**Data Format Hierarchy**: + +| Format | Role | Description | +|--------|------|-------------| +| **SQLite** | Primary storage | Complete data with statistics information | +| **TXT** | Human-readable backup | Optional text records for manual viewing | +| **HTML** | Web report | Beautiful visual report (GitHub Pages) | + +**Data Management Features**: +- Auto data cleanup (configurable retention period) +- Timezone support (configurable IANA time zone) +- Cloud/local seamless switching + +> 💡 For storage configuration details, see [Configuration Details - Storage Configuration](#11-storage-configuration-v400-new) + +### **Multi-Platform Deployment** +- **GitHub Actions**: Cloud automated operations (7-day check-in cycle + remote cloud storage) - **Docker Deployment**: Supports multi-architecture containerized operation -- **Data Persistence**: HTML/TXT multi-format history saving +- **Local Running**: Python environment direct execution ### **AI Smart Analysis (v3.0.0 New)** @@ -341,10 +362,32 @@ Transform from "algorithm recommendation captivity" to "actively getting the inf >**Upgrade Instructions**: - **📌 Check Latest Updates**: **[Original Repository Changelog](https://github.com/sansan0/TrendRadar?tab=readme-ov-file#-changelog)** - **Tip**: Do NOT update this project via **Sync fork**. Check [Changelog] to understand specific [Upgrade Methods] and [Features] -- **Minor Version Update**: Upgrading from v2.x to v2.y, replace `main.py` in your forked repo with the latest version - **Major Version Upgrade**: Upgrading from v1.x to v2.y, recommend deleting existing fork and re-forking to save effort and avoid config conflicts +### 2025/12/13 - v4.0.0 + +**🎉 Major Update: Comprehensive Refactoring of Storage and Core Architecture** + +- **Multi-Storage Backend Support**: Introduced a brand new storage module supporting local SQLite and remote cloud storage (S3-compatible protocols, Cloudflare R2 recommended for free tier), adaptable to GitHub Actions, Docker, and local environments. +- **Database Structure Optimization**: Refactored SQLite database table structures to improve data efficiency and query performance. +- **Enhanced Features**: Implemented date format standardization, data retention policies, timezone configuration support, and optimized time display. Fixed remote storage data persistence issues to ensure accurate data merging. +- **Cleanup and Compatibility**: Removed most legacy compatibility code and unified data storage and retrieval methods. + +### 2025/12/13 - mcp-v1.1.0 + +**MCP Module Update:** +- Adapted for v4.0.0, while maintaining compatibility with v3.x data. 
+- Added storage sync tools: + - `sync_from_remote`: Pull data from remote storage to local + - `get_storage_status`: Get storage configuration and status + - `list_available_dates`: List available dates in local/remote storage + + +
+👉 Click to expand: Historical Updates + + ### 2025/12/03 - v3.5.0 **🎉 Core Feature Enhancements** @@ -397,7 +440,7 @@ Transform from "algorithm recommendation captivity" to "actively getting the inf **🔧 Upgrade Instructions**: - **GitHub Fork Users**: Update `main.py`, `config/config.yaml` (Added multi-account push support, existing single-account configuration unaffected) -- **Docker Users**: Update `.env`, `docker compose.yml` or set environment variables `REVERSE_CONTENT_ORDER`, `MAX_ACCOUNTS_PER_CHANNEL` +- **Docker Users**: Update `.env`, `docker-compose.yml` or set environment variables `REVERSE_CONTENT_ORDER`, `MAX_ACCOUNTS_PER_CHANNEL` - **Multi-Account Push**: New feature, disabled by default, existing single-account configuration unaffected @@ -431,10 +474,6 @@ Transform from "algorithm recommendation captivity" to "actively getting the inf - Tool count increased from 13 to 14 -
-👉 Click to expand: Historical Updates - - ### 2025/11/25 - v3.4.0 **🎉 Added Slack Push Support** @@ -819,11 +858,44 @@ frequency_words.txt file added **required word** feature, using + sign > **📖 Reminder**: Fork users should first **[check the latest official documentation](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)** to ensure the configuration steps are up to date. +### ⚠️ GitHub Actions Usage Instructions + +**v4.0.0 Important Change**: Introduced "Activity Detection" mechanism—GitHub Actions now requires periodic check-in to maintain operation. + +#### 🔄 Check-In Renewal Mechanism + +- **Running Cycle**: Valid for **7 days**—service will automatically suspend when countdown ends. +- **Renewal Method**: Manually trigger the "Check In" workflow on the Actions page to reset the 7-day validity period. +- **Operation Path**: `Actions` → `Check In` → `Run workflow` +- **Design Philosophy**: + - If you forget for 7 days, maybe you don't really need it. Letting it stop is a digital detox, freeing you from the constant impact. + - GitHub Actions is a valuable public computing resource. The check-in mechanism aims to prevent wasted computing cycles, ensuring resources are allocated to truly active users who need them. Thank you for your understanding and support. + +#### 📦 Data Storage (Required Configuration) + +In GitHub Actions environment, data is stored in **Remote Cloud Storage** (supports S3-compatible protocols, Cloudflare R2 recommended for free tier), keeping your repository clean (see **Required Configuration: Remote Cloud Storage** below). + +#### 🚀 Recommended: Docker Deployment + +For long-term stable operation, we recommend [Docker Deployment](#6-docker-deployment), with data stored locally and no check-in required—though it does require purchasing a cloud server. + +
+ +> 🎉 **Now Supported: Multi-Cloud Storage Options** +> +> This project now supports S3-compatible protocols. You can choose: +> - **Cloudflare R2** (Recommended, generous free tier) +> - Other S3-compatible storage services +> +> Simply configure the corresponding `S3_ENDPOINT_URL`, `S3_BUCKET_NAME` and other environment variables to switch. + +--- + 1. **Fork this project** to your GitHub account - Click the "Fork" button at the top right of this page -2. **Setup GitHub Secrets (Choose your needed platforms)**: +2. **Setup GitHub Secrets (Required + Optional Platforms)**: In your forked repo, go to `Settings` > `Secrets and variables` > `Actions` > `New repository secret` @@ -862,6 +934,35 @@ frequency_words.txt file added **required word** feature, using + sign
+
+ ⚠️ Required Configuration: Remote Cloud Storage (Required for GitHub Actions Environment, Cloudflare R2 Recommended) +
+ + **GitHub Secret Configuration (⚠️ All 4 configuration items below are required):** + + | Name | Secret (Value) Description | + |------|----------------------------| + | `S3_BUCKET_NAME` | Bucket name (e.g., `trendradar-data`) | + | `S3_ACCESS_KEY_ID` | Access key ID | + | `S3_SECRET_ACCESS_KEY` | Access key | + | `S3_ENDPOINT_URL` | S3 API endpoint (e.g., R2: `https://.r2.cloudflarestorage.com`) | + +
+ + **How to Get Credentials (Using Cloudflare R2 as an Example):** + + 1. Visit the [Cloudflare Dashboard](https://dash.cloudflare.com/) and log in + 2. Select `R2` in the left menu → Click `Create Bucket` → Enter a name (e.g., `trendradar-data`) + 3. Click `Manage R2 API Tokens` at the top right → `Create API Token` + 4. Select the `Object Read & Write` permission → After creation, the `Access Key ID` and `Secret Access Key` are displayed + 5. The endpoint URL can be found on the bucket details page (format: `https://.r2.cloudflarestorage.com`) + + **Notes**: + - R2 free tier: 10GB storage + 1 million reads per month, sufficient for this project + - Activation requires binding a payment method (identity verification only, no charges) + - Data is stored in the cloud, keeping the GitHub repository clean + +
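Before saving the four values above as GitHub Secrets, a quick local sanity check can save a failed workflow run. The Python sketch below is purely illustrative (it is not part of TrendRadar and assumes the `boto3` package is installed); it reads the same four values from environment variables and performs a small write/list/delete round-trip against the bucket, which fails fast if the endpoint, key pair, or bucket name is wrong.

```python
# Sanity-check the four S3/R2 values before saving them as GitHub Secrets.
# Illustrative only; not part of TrendRadar. Requires: pip install boto3
import os

import boto3

client = boto3.client(
    "s3",
    endpoint_url=os.environ["S3_ENDPOINT_URL"],  # S3 API endpoint from the bucket details page
    aws_access_key_id=os.environ["S3_ACCESS_KEY_ID"],
    aws_secret_access_key=os.environ["S3_SECRET_ACCESS_KEY"],
    region_name=os.environ.get("S3_REGION", "auto"),
)
bucket = os.environ["S3_BUCKET_NAME"]  # e.g. trendradar-data

# A successful write/list/delete round-trip confirms endpoint, keys and bucket all match.
client.put_object(Bucket=bucket, Key="connectivity-check.txt", Body=b"ok")
count = client.list_objects_v2(Bucket=bucket, MaxKeys=5).get("KeyCount", 0)
client.delete_object(Bucket=bucket, Key="connectivity-check.txt")
print(f"OK: credentials valid, {count} object(s) visible in {bucket}")
```

If this round-trip succeeds, the same four values should work unchanged inside the "Get Hot News" workflow.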
👉 Click to expand: WeWork Bot (Simplest and fastest configuration) @@ -2041,7 +2142,7 @@ TrendRadar provides two independent Docker images, deploy according to your need # Download docker compose config wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/.env -P docker/ - wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker compose.yml -P docker/ + wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker-compose.yml -P docker/ ``` > 💡 **Note**: Key directory structure required for Docker deployment: @@ -2052,7 +2153,7 @@ current directory/ │ └── frequency_words.txt └── docker/ ├── .env - └── docker compose.yml + └── docker-compose.yml ``` 2. **Config File Description**: @@ -2146,7 +2247,7 @@ vim config/frequency_words.txt # Use build version docker compose cd docker -cp docker compose-build.yml docker compose.yml +cp docker-compose-build.yml docker-compose.yml ``` **Build and Start Services**: @@ -2232,7 +2333,7 @@ docker rm trend-radar > 💡 **Web Server Notes**: > - After starting, access latest report at `http://localhost:8080` -> - Access historical reports via directory navigation (e.g., `http://localhost:8080/2025年xx月xx日/`) +> - Access historical reports via directory navigation (e.g., `http://localhost:8080/2025-xx-xx/`) > - Port can be configured in `.env` file with `WEBSERVER_PORT` parameter > - Auto-start: Set `ENABLE_WEBSERVER=true` in `.env` > - Security: Static files only, limited to output directory, localhost binding only @@ -2249,7 +2350,7 @@ TrendRadar generates daily summary HTML reports to two locations simultaneously: |--------------|---------------|----------| | `output/index.html` | Direct host access | **Docker Deployment** (via Volume mount, visible on host) | | `index.html` | Root directory access | **GitHub Pages** (repository root, auto-detected by Pages) | -| `output/YYYY年MM月DD日/html/当日汇总.html` | Historical reports | All environments (archived by date) | +| `output/YYYY-MM-DD/html/当日汇总.html` | Historical reports | All environments (archived by date) | **Local Access Examples**: ```bash @@ -2258,8 +2359,8 @@ TrendRadar generates daily summary HTML reports to two locations simultaneously: docker exec -it trend-radar python manage.py start_webserver # 2. 
Access in browser http://localhost:8080 # Access latest report (default index.html) -http://localhost:8080/2025年xx月xx日/ # Access reports for specific date -http://localhost:8080/2025年xx月xx日/html/ # Browse all HTML files for that date +http://localhost:8080/2025-xx-xx/ # Access reports for specific date +http://localhost:8080/2025-xx-xx/html/ # Browse all HTML files for that date # Method 2: Direct file access (local environment) open ./output/index.html # macOS @@ -2267,7 +2368,7 @@ start ./output/index.html # Windows xdg-open ./output/index.html # Linux # Method 3: Access historical archives -open ./output/2025年xx月xx日/html/当日汇总.html +open ./output/2025-xx-xx/html/当日汇总.html ``` **Why two index.html files?** @@ -2324,10 +2425,20 @@ flowchart TB Use docker compose to start both news push and MCP services: ```bash -# Download latest docker compose.yml (includes MCP service config) -wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker compose.yml +# Method 1: Clone project (Recommended) +git clone https://github.com/sansan0/TrendRadar.git +cd TrendRadar/docker +docker compose up -d -# Start all services +# Method 2: Download docker-compose.yml separately +mkdir trendradar && cd trendradar +wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker-compose.yml +wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/.env +mkdir -p config output +# Download config files +wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/config/config.yaml -P config/ +wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/config/frequency_words.txt -P config/ +# Modify volume paths in docker-compose.yml: ../config -> ./config, ../output -> ./output docker compose up -d # Check running status @@ -2337,18 +2448,29 @@ docker ps | grep trend-radar **Start MCP Service Separately**: ```bash +# Linux/Mac docker run -d --name trend-radar-mcp \ -p 127.0.0.1:3333:3333 \ - -v ./config:/app/config:ro \ - -v ./output:/app/output:ro \ + -v $(pwd)/config:/app/config:ro \ + -v $(pwd)/output:/app/output:ro \ -e TZ=Asia/Shanghai \ wantcat/trendradar-mcp:latest + +# Windows PowerShell +docker run -d --name trend-radar-mcp ` + -p 127.0.0.1:3333:3333 ` + -v ${PWD}/config:/app/config:ro ` + -v ${PWD}/output:/app/output:ro ` + -e TZ=Asia/Shanghai ` + wantcat/trendradar-mcp:latest ``` +> ⚠️ **Note**: Ensure `config/` and `output/` folders exist in current directory with config files and news data before running. 
+ **Verify Service**: ```bash -# Check if MCP service is running properly +# Check MCP service health curl http://127.0.0.1:3333/mcp # View MCP service logs @@ -2357,14 +2479,20 @@ docker logs -f trend-radar-mcp **Configure in AI Clients**: -After MCP service starts, configure in Claude Desktop, Cherry Studio, Cursor, etc.: +After MCP service starts, configure based on your client: +**Cherry Studio** (Recommended, GUI config): +- Settings → MCP Server → Add +- Type: `streamableHttp` +- URL: `http://127.0.0.1:3333/mcp` + +**Claude Desktop / Cline** (JSON config): ```json { "mcpServers": { "trendradar": { "url": "http://127.0.0.1:3333/mcp", - "description": "TrendRadar News Trending Analysis" + "type": "streamableHttp" } } } @@ -2452,7 +2580,6 @@ notification: start: "20:00" # Start time (Beijing time) end: "22:00" # End time (Beijing time) once_per_day: true # Push only once per day - push_record_retention_days: 7 # Push record retention days ``` #### Configuration Details @@ -2463,7 +2590,6 @@ notification: | `time_range.start` | string | `"20:00"` | Push window start time (Beijing time, HH:MM format) | | `time_range.end` | string | `"22:00"` | Push window end time (Beijing time, HH:MM format) | | `once_per_day` | bool | `true` | `true`=push only once per day within window, `false`=push every execution within window | -| `push_record_retention_days` | int | `7` | Push record retention days (used to determine if already pushed) | #### Use Cases @@ -2487,7 +2613,6 @@ PUSH_WINDOW_ENABLED=true PUSH_WINDOW_START=09:00 PUSH_WINDOW_END=18:00 PUSH_WINDOW_ONCE_PER_DAY=false -PUSH_WINDOW_RETENTION_DAYS=7 ``` #### Complete Configuration Examples @@ -2502,7 +2627,6 @@ notification: start: "20:00" end: "22:00" once_per_day: true - push_record_retention_days: 7 ``` **Scenario: Push every hour during working hours** @@ -2515,7 +2639,6 @@ notification: start: "09:00" end: "18:00" once_per_day: false - push_record_retention_days: 7 ```
@@ -2811,6 +2934,207 @@ notification:
+### 11. Storage Configuration (v4.0.0 New) + +
+👉 Click to expand: Storage Configuration Guide +
+ +#### Storage Backend Selection + +TrendRadar v4.0.0 introduces **multi-backend storage architecture**, supporting automatic backend selection or manual specification: + +| Configuration Value | Description | Applicable Scenarios | +|---------------------|-------------|---------------------| +| `auto` (default) | Auto-select backend: GitHub Actions→R2, other environments→Local | Most users (recommended) | +| `local` | Force use of local SQLite | Docker/local deployment | +| `r2` | Force use of Cloudflare R2 | Cloud storage required | + +**Configuration Location**: +- GitHub Actions: Set `STORAGE_BACKEND` environment variable in GitHub Secrets +- Docker: Configure `STORAGE_BACKEND=local` in `.env` file +- Local: Add `STORAGE_BACKEND` in environment variables or use auto mode + +--- + +#### Database Structure Optimization (v4.0.0) + +v4.0.0 made significant optimizations to database structure, removing redundant fields and improving data normalization: + +##### 1. Removed Redundant Fields + +Removed the following redundant fields from `news` table: + +| Field Name | Removal Reason | Alternative | +|------------|----------------|------------| +| `source_name` | Duplicate with platform name | Get via `platforms` table JOIN query | +| `crawl_date` | Duplicate with file path date | Infer from file path timestamp | + +**Migration Notes**: Old databases are incompatible, see [Breaking Changes](#breaking-changes-v400) section + +##### 2. New Platforms Table + +Added `platforms` table for unified management of platform information: + +```sql +CREATE TABLE IF NOT EXISTS platforms ( + id TEXT PRIMARY KEY, -- Platform ID (immutable, e.g., 'zhihu', 'weibo') + name TEXT NOT NULL, -- Platform display name (mutable, e.g., 'Zhihu', 'Weibo') + enabled INTEGER DEFAULT 1 -- Whether enabled (1=enabled, 0=disabled) +); +``` + +**Design Advantages**: +- `id` field is immutable, maintains data consistency +- `name` field is mutable, supports internationalization and customization +- Historical data remains valid when modifying platform names + +##### 3. Crawl Source Status Normalization + +Replaced original comma-separated string storage `successful_sources` field with normalized `crawl_source_status` table: + +```sql +CREATE TABLE IF NOT EXISTS crawl_source_status ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + file_path TEXT NOT NULL, -- File path (e.g., 'output/2025-12-09/news.db') + platform_id TEXT NOT NULL, -- Platform ID (foreign key to platforms.id) + success INTEGER NOT NULL, -- Whether crawl succeeded (1=success, 0=failed) + crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (platform_id) REFERENCES platforms(id) +); +``` + +**Design Advantages**: +- Supports efficient SQL queries (e.g., calculate success rate by platform) +- Easy statistics and analysis (no string splitting required) +- Normalized structure, avoids data redundancy + +##### 4. 
File Path Format Standardization + +**Old Format**: `output/2025年12月09日/news_14-30.txt` +**New Format**: `output/2025-12-09/news.db` + +**Changes**: +- Date format: Chinese format → ISO 8601 standard format +- Filename: Multiple time-stamped TXT files → single SQLite database file +- Extension: `.txt` → `.db` + +**Advantages**: +- Cross-platform compatibility (avoids Chinese path issues) +- Easier programmatic parsing +- International standard, better maintainability + +--- + +#### Remote Cloud Storage Configuration + +When using remote cloud storage (required for GitHub Actions environment), configure the following environment variables: + +| Environment Variable | Description | Required | Example Value | +|----------------------|-------------|----------|--------------| +| `S3_BUCKET_NAME` | Bucket name | ✅ Yes | `trendradar-data` | +| `S3_ACCESS_KEY_ID` | Access key ID | ✅ Yes | `abc123...` | +| `S3_SECRET_ACCESS_KEY` | Access key | ✅ Yes | `xyz789...` | +| `S3_ENDPOINT_URL` | S3 API endpoint | ✅ Yes | `https://.r2.cloudflarestorage.com` | +| `S3_REGION` | Region (optional) | ❌ No | `auto` | + +**Configuration Method**: +- GitHub Actions: Configure in GitHub Secrets (see [Quick Start - Remote Storage Configuration](#2-setup-github-secrets-required--optional-platforms)) +- Docker/Local: Configure in `.env` file (remote storage is optional) + +--- + +#### Data Cleanup Strategy + +v4.0.0 added automatic data cleanup feature, supporting scheduled cleanup of old data: + +**Configuration Items**: `LOCAL_RETENTION_DAYS` and `REMOTE_RETENTION_DAYS` + +| Configuration Value | Description | +|---------------------|-------------| +| `0` (default) | Disable cleanup, keep all data | +| Positive integer (e.g., `30`) | Only keep recent N days of data, auto-delete old data | + +**Configuration Method**: +```bash +# GitHub Actions: Configure in GitHub Secrets +LOCAL_RETENTION_DAYS=30 +REMOTE_RETENTION_DAYS=30 + +# Docker: Configure in .env file +LOCAL_RETENTION_DAYS=30 +REMOTE_RETENTION_DAYS=30 + +# Local: Add to environment variables +export LOCAL_RETENTION_DAYS=30 +``` + +**Cleanup Rules**: +- Cleanup executes during each crawl task +- Local: Deletes `output/YYYY-MM-DD/` directories older than N days +- Remote: Deletes cloud objects older than N days (e.g., `news/2025-11-10.db`) + +--- + +#### Timezone Configuration + +v4.0.0 added timezone configuration support, using IANA standard time zone names: + +**Configuration Item**: `TIMEZONE` + +| Configuration Value | Description | Example | +|---------------------|-------------|---------| +| Not set (default) | Use UTC+0 | - | +| IANA time zone name | Specify time zone | `Asia/Shanghai`, `America/New_York`, `Europe/London` | + +**Configuration Method**: +```bash +# GitHub Actions: Configure in GitHub Secrets +TIMEZONE=Asia/Shanghai + +# Docker: Configure in .env file +TIMEZONE=Asia/Shanghai + +# Local: Add to environment variables +export TIMEZONE=Asia/Shanghai +``` + +**Common IANA Time Zones**: +- China: `Asia/Shanghai` +- United States East: `America/New_York` +- United States West: `America/Los_Angeles` +- United Kingdom: `Europe/London` +- Japan: `Asia/Tokyo` + +--- + +#### Breaking Changes (v4.0.0) + +**⚠️ Important Notice**: v4.0.0 made breaking changes to database structure, **old databases are incompatible** + +**Impact**: +- Cannot directly read v3.x version data +- Need to re-crawl data to build new database +- **No automatic migration tool provided** + +**Recommendations**: +1. 
**Fresh Start**: Starting from scratch and accumulating new data is recommended +2. **Keep Historical Data**: If you need to preserve v3.x historical data, rename the old `output/` directory (e.g., `output_v3_backup/`) before running the new version + +**Data Format Comparison**: + +| Item | v3.x | v4.0.0 | +|------|------|--------| +| File path format | `output/2025年12月09日/` | `output/2025-12-09/` | +| Data file | Multiple `news_HH-MM.txt` files | Single `news.db` file | +| Database fields | Contains `source_name`, `crawl_date` | Removed redundant fields | +| Platform management | No independent table | Added `platforms` table | +| Crawl status | Comma-separated string | Normalized `crawl_source_status` table | + 
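Because crawl results now live in the normalized `crawl_source_status` table, per-platform statistics reduce to a single SQL query. The sketch below is illustrative only; it relies solely on the two `CREATE TABLE` statements shown earlier, and the database path is the example per-day file from this section, so adjust it to a date that actually exists in your `output/` directory.

```python
# Per-platform crawl success rate from the normalized tables described above.
# Minimal sketch using only Python's standard sqlite3 module.
import sqlite3

conn = sqlite3.connect("output/2025-12-09/news.db")  # example path from this section
rows = conn.execute(
    """
    SELECT p.name,
           COUNT(*)                                     AS attempts,
           SUM(c.success)                               AS successes,
           ROUND(100.0 * SUM(c.success) / COUNT(*), 1)  AS success_rate
    FROM crawl_source_status AS c
    JOIN platforms AS p ON p.id = c.platform_id
    GROUP BY p.id
    ORDER BY success_rate DESC
    """
).fetchall()
conn.close()

for name, attempts, successes, rate in rows:
    print(f"{name}: {successes}/{attempts} crawls succeeded ({rate}%)")
```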
+ +
+ ## 🤖 AI Analysis TrendRadar v3.0.0 added **MCP (Model Context Protocol)** based AI analysis feature, allowing natural language conversations with news data for deep analysis. diff --git a/README-MCP-FAQ-EN.md b/README-MCP-FAQ-EN.md index 7ef24d4..0fb95d6 100644 --- a/README-MCP-FAQ-EN.md +++ b/README-MCP-FAQ-EN.md @@ -450,7 +450,89 @@ AI: (date_range={"start": "2024-12-01", "end": "2024-12-31"}) --- -### Q14: How to parse natural language date expressions? (Recommended to use first) +## Storage Sync + +### Q14: How to sync data from remote storage to local? + +**You can ask like this:** + +- "Sync last 7 days data from remote" +- "Pull data from remote storage to local" +- "Sync last 30 days of news data" + +**Tool called:** `sync_from_remote` + +**Use cases:** + +- Crawler deployed in the cloud (e.g., GitHub Actions), data stored remotely (e.g., Cloudflare R2) +- MCP Server deployed locally, needs to pull data from remote for analysis + +**Return information:** + +- synced_files: Number of successfully synced files +- synced_dates: List of successfully synced dates +- skipped_dates: Skipped dates (already exist locally) +- failed_dates: Failed dates and error information + +**Prerequisites:** + +Need to configure remote storage in `config/config.yaml` or set environment variables: +- `S3_ENDPOINT_URL`: Service endpoint +- `S3_BUCKET_NAME`: Bucket name +- `S3_ACCESS_KEY_ID`: Access key ID +- `S3_SECRET_ACCESS_KEY`: Secret access key + +--- + +### Q15: How to view storage status? + +**You can ask like this:** + +- "View current storage status" +- "What's the storage configuration" +- "How much data is stored locally" +- "Is remote storage configured" + +**Tool called:** `get_storage_status` + +**Return information:** + +| Category | Information | +|----------|-------------| +| **Local Storage** | Data directory, total size, date count, date range | +| **Remote Storage** | Whether configured, endpoint URL, bucket name, date count | +| **Pull Config** | Whether auto-pull enabled, pull days | + +--- + +### Q16: How to view available data dates? + +**You can ask like this:** + +- "What dates are available locally" +- "What dates are in remote storage" +- "Compare local and remote data dates" +- "Which dates only exist remotely" + +**Tool called:** `list_available_dates` + +**Three query modes:** + +| Mode | Description | Example Question | +|------|-------------|------------------| +| **local** | View local only | "What dates are available locally" | +| **remote** | View remote only | "What dates are in remote" | +| **both** | Compare both (default) | "Compare local and remote data" | + +**Return information (both mode):** + +- only_local: Dates only existing locally +- only_remote: Dates only existing remotely (useful for deciding which dates to sync) +- both: Dates existing in both places + +--- + +### Q17: How to parse natural language date expressions? (Recommended to use first) **You can ask like this:** diff --git a/README-MCP-FAQ.md b/README-MCP-FAQ.md index 8892b08..e1ae36b 100644 --- a/README-MCP-FAQ.md +++ b/README-MCP-FAQ.md @@ -450,7 +450,89 @@ AI:(date_range={"start": "2024-12-01", "end": "2024-12-31"}) --- -### Q14: 如何解析自然语言日期表达式?(推荐优先使用) +## 存储同步 + +### Q14: 如何从远程存储同步数据到本地? 
+ +**你可以这样问:** + +- "从远程同步最近 7 天的数据" +- "拉取远程存储的数据到本地" +- "同步最近 30 天的新闻数据" + +**调用的工具:** `sync_from_remote` + +**使用场景:** + +- 爬虫部署在云端(如 GitHub Actions),数据存储到远程(如 Cloudflare R2) +- MCP Server 部署在本地,需要从远程拉取数据进行分析 + +**返回信息:** + +- synced_files: 成功同步的文件数量 +- synced_dates: 成功同步的日期列表 +- skipped_dates: 跳过的日期(本地已存在) +- failed_dates: 失败的日期及错误信息 + +**前提条件:** + +需要在 `config/config.yaml` 中配置远程存储或设置环境变量: +- `S3_ENDPOINT_URL`: 服务端点 +- `S3_BUCKET_NAME`: 存储桶名称 +- `S3_ACCESS_KEY_ID`: 访问密钥 ID +- `S3_SECRET_ACCESS_KEY`: 访问密钥 + +--- + +### Q15: 如何查看存储状态? + +**你可以这样问:** + +- "查看当前存储状态" +- "存储配置是什么" +- "本地有多少数据" +- "远程存储配置了吗" + +**调用的工具:** `get_storage_status` + +**返回信息:** + +| 类别 | 信息 | +|------|------| +| **本地存储** | 数据目录、总大小、日期数量、日期范围 | +| **远程存储** | 是否配置、端点地址、存储桶名称、日期数量 | +| **拉取配置** | 是否启用自动拉取、拉取天数 | + +--- + +### Q16: 如何查看可用的数据日期? + +**你可以这样问:** + +- "本地有哪些日期的数据" +- "远程存储有哪些日期" +- "对比本地和远程的数据日期" +- "哪些日期只在远程有" + +**调用的工具:** `list_available_dates` + +**三种查询模式:** + +| 模式 | 说明 | 示例问法 | +|------|------|---------| +| **local** | 仅查看本地 | "本地有哪些日期" | +| **remote** | 仅查看远程 | "远程有哪些日期" | +| **both** | 对比两者(默认) | "对比本地和远程的数据" | + +**返回信息(both 模式):** + +- only_local: 仅本地存在的日期 +- only_remote: 仅远程存在的日期(可用于决定同步哪些日期) +- both: 两边都存在的日期 + +--- + +### Q17: 如何解析自然语言日期表达式?(推荐优先使用) **你可以这样问:** diff --git a/README.md b/README.md index cbeb802..735d220 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@
-> **📢 公告:** 经与 GitHub 官方沟通,完成合规调整后将恢复"一键 Fork 部署",请关注 **v4.0.0** 版本的更新 +> **📢 公告:** **v4.0.0** 版本已发布!包含存储架构重构、数据库优化、模块化改进等重大更新 TrendRadar Banner @@ -16,8 +16,8 @@ [![GitHub Stars](https://img.shields.io/github/stars/sansan0/TrendRadar?style=flat-square&logo=github&color=yellow)](https://github.com/sansan0/TrendRadar/stargazers) [![GitHub Forks](https://img.shields.io/github/forks/sansan0/TrendRadar?style=flat-square&logo=github&color=blue)](https://github.com/sansan0/TrendRadar/network/members) [![License](https://img.shields.io/badge/license-GPL--3.0-blue.svg?style=flat-square)](LICENSE) -[![Version](https://img.shields.io/badge/version-v3.5.0-blue.svg)](https://github.com/sansan0/TrendRadar) -[![MCP](https://img.shields.io/badge/MCP-v1.0.3-green.svg)](https://github.com/sansan0/TrendRadar) +[![Version](https://img.shields.io/badge/version-v4.0.0-blue.svg)](https://github.com/sansan0/TrendRadar) +[![MCP](https://img.shields.io/badge/MCP-v1.1.0-green.svg)](https://github.com/sansan0/TrendRadar) [![企业微信通知](https://img.shields.io/badge/企业微信-通知-00D4AA?style=flat-square)](https://work.weixin.qq.com/) [![个人微信通知](https://img.shields.io/badge/个人微信-通知-00D4AA?style=flat-square)](https://weixin.qq.com/) @@ -48,62 +48,61 @@
-🚨 【必读】重要公告:本项目的正确部署姿势 +🚨 【必读】重要公告:v4.0.0 部署方式与存储架构变更
-> **⚠️ 2025年12月紧急通知** -> -> 由于 Fork 数量激增导致 GitHub 服务器压力过大,**GitHub Actions 及 GitHub Pages 部署目前已受限**。为确保顺利部署,请务必阅读以下说明。 +### 🛠️ 请选择适合你的部署方式 -### 1. ✅ 唯一推荐部署方式:Docker +#### 🅰️ 方案一:Docker 部署(推荐 🔥) -**这是目前最稳定、不受 GitHub 限制的方案。** 数据存储在本地,不会因为 GitHub 策略调整而失效。 +* **特点**:最稳定、最简单,数据存储在 **本地 SQLite**,完全自主可控。 + +* **适用**:有自己的服务器、NAS 或长期运行的电脑。 * 👉 [跳转到 Docker 部署教程](#6-docker-部署) --- -### 2. 如果你本打算 Fork 本项目... +#### 🅱️ 方案二:GitHub Actions 部署(已恢复 ✅) -为了减少对 GitHub 服务器的压力,**请千万不要直接点击 "Fork" 按钮!** +* **特点**:数据不再直接写入仓库(Git Commit),而是存储在 **远程云存储**(支持 S3 兼容协议:Cloudflare R2、阿里云 OSS、腾讯云 COS 等)。 -请务必使用 **"Use this template"** 功能来替代 Fork: +* **门槛**:**必须**配置一个 S3 兼容的对象存储服务(推荐免费的 Cloudflare R2)。 + +> **⚠️ 注意**:选择此方案,请务必执行以下两步配置: + +#### 1. 🚀 推荐的开始方式:Use this template + +为了保持仓库整洁,避免继承冗余的历史记录,我**建议**你使用 Template 模式: + +1. **点击**原仓库页面右上角的绿色 **[Use this template]** 按钮。 -1. **点击**原仓库页面右上角的绿色的 **[Use this template]** 按钮。 2. **选择** "Create a new repository"。 -**为什么要这样做?** -* **❌ Fork**:复制完整历史记录,大量 Fork 同时运行会触发 GitHub 风控。 -* **✅ Use this template**:创建的是一个全新的独立仓库,没有历史包袱,对服务器更友好。 +> **💡 为什么要这样做?** +> * **Use this template**:创建一个全新的、干净的仓库,没有历史包袱。 +> * **Fork**:会保留完整的提交历史和关联关系,占用 GitHub 更多资源。 ---- +#### 2. ☁️ 关于 GitHub Actions 必配的远程存储 -### 3. 关于新版数据存储的说明 +如果你选择 **方案二 (GitHub Actions)**,则必须配置一个 S3 兼容的对象存储服务。 -新版将使用 **Cloudflare R2** 存储新闻数据,以保证持久化。 +**支持的存储服务:** +- **Cloudflare R2**(推荐,免费额度充足) +- 其他 S3 兼容服务 -**⚠️ 配置前置条件:** +**⚠️ 以 Cloudflare R2 为例的配置前置条件:** 根据 Cloudflare 平台规则,开通 R2 需绑定支付方式。 -- **目的:** 仅作身份验证(Verify Only),不产生扣费。 -- **支付:** 支持信用卡或国区 PayPal。 -- **用量:** R2 的免费额度足以覆盖本项目日常运行,无需付费。 +* **目的**:仅作身份验证(Verify Only),**不产生扣费**。 ---- +* **支付**:支持双币信用卡或国区 PayPal。 -### 4. 📅 后续计划与文档阅读说明 +* **用量**:R2 的免费额度(10GB存储/月)足以覆盖本项目日常运行,无需担心付费。 -> **后续计划:** -> - 探索新方案:保留 Actions 用于抓取和推送,但不再将数据保存到仓库,改用外部存储。 - -**⚠️ 阅读注意:** -鉴于上述计划意味着 **Fork 部署模式未来可能会以新形式回归**,且当前全面修改文档工作量巨大,我们暂时保留了旧版描述。 - -**在当前阶段,若后续教程中仍出现 "Fork" 相关表述,请一律忽略或将其理解为 "Use this template"**。 - -👉 **[点击此处查看 TrendRadar 最新官方文档](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)** +👉 **[点击查看详细配置教程](#-快速开始)**
@@ -335,10 +334,30 @@ - ⚠️ **配对配置**:Telegram 和 ntfy 需要保证配对参数数量一致(如 token 和 chat_id 都是 2 个) - ⚠️ **数量限制**:默认每个渠道最多 3 个账号,超出会被截断 -### **多端适配** -- **GitHub Pages**:自动生成精美网页报告,PC/移动端适配 -- **Docker部署**:支持多架构容器化运行 -- **数据持久化**:HTML/TXT多格式历史记录保存 +### **灵活存储架构**(v4.0.0 重大更新) + +**多存储后端支持**: +- ☁️ **远程云存储**:GitHub Actions 环境默认,支持 S3 兼容协议(R2/OSS/COS 等),数据存储在云端,不污染仓库 +- 💾 **本地 SQLite 数据库**:Docker/本地环境默认,数据完全可控 +- 🔄 **自动后端选择**:根据运行环境智能切换存储方式 + +**数据格式**: +| 格式 | 用途 | 说明 | +|------|------|------| +| **SQLite** | 主存储 | 单文件数据库,查询快速,支持 MCP AI 分析 | +| **TXT** | 可选快照 | 可读文本格式,方便直接查看 | +| **HTML** | 报告展示 | 精美可视化页面,PC/移动端适配 | + +**数据管理**: +- ✅ 自动清理过期数据(可配置保留天数) +- ✅ 时区配置支持(全球时区) + +> 💡 详细说明见 [配置详解 - 存储配置](#9-存储配置) + +### **多端部署** +- **GitHub Actions**:定时自动爬取 + 远程云存储(需签到续期) +- **Docker 部署**:支持多架构容器化运行,数据本地存储 +- **本地运行**:Windows/Mac/Linux 直接运行 ### **AI 智能分析(v3.0.0 新增)** @@ -389,10 +408,34 @@ GitHub 一键 Fork 即可使用,无需编程基础。 >**升级说明**: - **📌 查看最新更新**:**[原仓库更新日志](https://github.com/sansan0/TrendRadar?tab=readme-ov-file#-更新日志)** - **提示**:不要通过 **Sync fork** 更新本项目,建议查看【历史更新】,明确具体的【升级方式】和【功能内容】 -- **小版本更新**:从 v2.x 升级到 v2.y,用本项目的 `main.py` 代码替换你 fork 仓库中的对应文件 - **大版本升级**:从 v1.x 升级到 v2.y,建议删除现有 fork 后重新 fork,这样更省力且避免配置冲突 +### 2025/12/13 - v4.0.0 + +**🎉 重大更新:全面重构存储和核心架构** + +- **多存储后端支持**:引入全新的存储模块,支持本地 SQLite 和远程云存储(S3 兼容协议,推荐免费的 Cloudflare R2),适应 GitHub Actions、Docker 和本地环境。 +- **数据库结构优化**:重构 SQLite 数据库表结构,提升数据效率和查询能力。 +- **核心代码模块化**:将主程序逻辑拆分为 trendradar 包的多个模块,显著提升代码可维护性。 +- **增强功能**:实现日期格式标准化、数据保留策略、时区配置支持、时间显示优化,并修复远程存储数据持久化问题,确保数据合并的准确性。 +- **清理和兼容**:移除了大部分历史兼容代码,统一了数据存储和读取方式。 + + +### 2025/12/13 - mcp-v1.1.0 + + **MCP 模块更新:** + - 适配 v4.0.0,同时也兼容 v3.x 的数据 + - 新增存储同步工具: + - `sync_from_remote`: 从远程存储拉取数据到本地 + - `get_storage_status`: 获取存储配置和状态 + - `list_available_dates`: 列出本地/远程可用日期范围 + + +
+👉 点击展开:历史更新 + + ### 2025/12/03 - v3.5.0 **🎉 核心功能增强** @@ -456,10 +499,6 @@ GitHub 一键 Fork 即可使用,无需编程基础。 - 工具总数从 13 个增加到 14 个 -
-👉 点击展开:历史更新 - - ### 2025/11/28 - v3.4.1 **🔧 格式优化** @@ -857,11 +896,44 @@ frequency_words.txt 文件增加了一个【必须词】功能,使用 + 号 > **📖 提醒**:Fork 用户建议先 **[查看最新官方文档](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)**,确保配置步骤是最新的。 +### ⚠️ GitHub Actions 使用说明 + +**v4.0.0 重要变更**:引入「活跃度检测」机制,GitHub Actions 需定期签到以维持运行。 + +#### 🔄 签到续期机制 + +- **运行周期**:有效期为 **7 天**,倒计时结束后服务将自动挂起。 +- **续期方式**:在 Actions 页面手动触发 "Check In" workflow,即可重置 7 天有效期。 +- **操作路径**:`Actions` → `Check In` → `Run workflow` +- **设计理念**: + - 如果 7 天都忘了签到,或许这些资讯对你来说并非刚需。适时的暂停,能帮你从信息流中抽离,给大脑留出喘息的空间。 + - GitHub Actions 是宝贵的公共计算资源。引入签到机制旨在避免算力的无效空转,确保资源能分配给真正活跃且需要的用户。感谢你的理解与支持。 + +#### 📦 数据存储(必需配置) + +GitHub Actions 环境下,数据存储在 **远程云存储**(支持 S3 兼容协议,推荐免费的 Cloudflare R2),不会污染仓库(见下方 **必需配置:远程云存储**) + +#### 🚀 推荐:Docker 部署 + +如需长期稳定运行,建议使用 [Docker 部署](#6-docker-部署),数据存储在本地,无需签到,不过需要额外付费购买云服务器。 + +
+ +> 🎉 **已支持:多云存储方案** +> +> 本项目现已支持 S3 兼容协议,你可以选择: +> - **Cloudflare R2**(推荐,免费额度充足) +> - 其他 S3 兼容存储服务 +> +> 只需配置对应的 `S3_ENDPOINT_URL`、`S3_BUCKET_NAME` 等环境变量即可切换。 + +--- + 1. **Fork 本项目**到你的 GitHub 账户 - 点击本页面右上角的"Fork"按钮 -2. **设置 GitHub Secrets(选择你需要的平台)**: +2. **设置 GitHub Secrets(必需 + 可选平台)**: 在你 Fork 后的仓库中,进入 `Settings` > `Secrets and variables` > `Actions` > `New repository secret` @@ -900,6 +972,53 @@ frequency_words.txt 文件增加了一个【必须词】功能,使用 + 号
+
+ ⚠️ 必需配置:远程云存储(GitHub Actions 环境必需,推荐 Cloudflare R2) +
+ + **GitHub Secret 配置(⚠️ 以下 4 个配置项都是必需的):** + + | Name(名称) | Secret(值)说明 | + |-------------|-----------------| + | `S3_BUCKET_NAME` | 存储桶名称(如 `trendradar-data`) | + | `S3_ACCESS_KEY_ID` | 访问密钥 ID(Access Key ID) | + | `S3_SECRET_ACCESS_KEY` | 访问密钥(Secret Access Key) | + | `S3_ENDPOINT_URL` | S3 API 端点(如 R2:`https://.r2.cloudflarestorage.com`) | + +
+ + **如何获取凭据(以 Cloudflare R2 为例):** + + 1. **进入 R2 概览**: + - 登录 [Cloudflare Dashboard](https://dash.cloudflare.com/)。 + - 在左侧侧边栏找到并点击 `R2对象存储`。 + +
+ + 2. **创建存储桶**: + - 点击`概述` + - 点击右上角的 `创建存储桶` (Create bucket)。 + - 输入名称(例如 `trendradar-data`),点击 `创建存储桶`。 + +
+ + 3. **创建 API 令牌**: + - 回到 **概述**页面。 + - 点击**右下角** `Account Details` 找到并点击 `Manage`(Manage R2 API Tokens)。 + - 同时你会看到 `S3 API`:`https://.r2.cloudflarestorage.com`(这就是 S3_ENDPOINT_URL) + - 点击 `创建 Account API 令牌`。 + - **⚠️ 关键设置**: + - **令牌名称**:随意填写(如 `github-action-write`)。 + - **权限**:选择 `管理员读和写`。 + - **指定存储桶**:为了安全,建议选择 `仅适用于指定存储桶` 并选中你的桶(如 `trendradar-data`)。 + - 点击 `创建 API 令牌`,**立即复制** 显示的 `Access Key ID` 和 `Secret Access Key`(只显示一次!)。 + +
+ + - **R2 免费额度**:每月 10GB 存储 + 100万次读取,对本项目来说非常充足。 + - **支付验证**:开通 R2 即使是免费额度,Cloudflare 也要求绑定 PayPal 或信用卡进行身份验证(不会实际扣费,除非超过额度)。 + +
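在把上述 4 个值填入 GitHub Secrets 之前,可以先在本地做一次连通性验证。下面的 Python 片段仅作演示(并非本项目代码,需先 `pip install boto3`):它从同名环境变量读取这 4 个值,对存储桶做一次写入、列举、删除,任何一项配置有误都会立刻报错。

```python
# 在本地快速验证 R2 / S3 兼容存储的 4 项凭据(仅作演示,非项目代码)
# 依赖:pip install boto3
import os

import boto3

client = boto3.client(
    "s3",
    endpoint_url=os.environ["S3_ENDPOINT_URL"],  # 存储桶详情页显示的 S3 API 端点
    aws_access_key_id=os.environ["S3_ACCESS_KEY_ID"],
    aws_secret_access_key=os.environ["S3_SECRET_ACCESS_KEY"],
    region_name=os.environ.get("S3_REGION", "auto"),
)
bucket = os.environ["S3_BUCKET_NAME"]  # 例如 trendradar-data

# 写入 / 列举 / 删除各一次,确认端点、密钥和桶名相互匹配
client.put_object(Bucket=bucket, Key="connectivity-check.txt", Body=b"ok")
count = client.list_objects_v2(Bucket=bucket, MaxKeys=5).get("KeyCount", 0)
client.delete_object(Bucket=bucket, Key="connectivity-check.txt")
print(f"✅ 凭据有效,{bucket} 中当前可见 {count} 个对象")
```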
👉 点击展开:企业微信机器人(配置最简单最迅速) @@ -1489,10 +1608,11 @@ frequency_words.txt 文件增加了一个【必须词】功能,使用 + 号 **测试步骤**: 1. 进入你项目的 Actions 页面 - 2. 找到 **"Hot News Crawler"** 点进去 + 2. 找到 **"Get Hot News"**(必须得是这个字)点进去,点击右侧的 **"Run workflow"** 按钮运行 - 如果看不到该字样,参照 [#109](https://github.com/sansan0/TrendRadar/issues/109) 解决 - 3. 点击右侧的 **"Run workflow"** 按钮运行 - 4. 等待 1 分钟左右,消息会推送到你配置的平台 + 3. 3 分钟左右,消息会推送到你配置的平台 + +
> ⏱️ **测试提示**: > - 手动测试不要太频繁,避免触发 GitHub Actions 限制 @@ -2069,7 +2189,7 @@ TrendRadar 提供两个独立的 Docker 镜像,可根据需求选择部署: # 下载 docker compose 配置 wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/.env -P docker/ - wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker compose.yml -P docker/ + wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker-compose.yml -P docker/ ``` > 💡 **说明**:Docker 部署需要的关键目录结构如下: @@ -2080,7 +2200,7 @@ TrendRadar 提供两个独立的 Docker 镜像,可根据需求选择部署: │ └── frequency_words.txt └── docker/ ├── .env - └── docker compose.yml + └── docker-compose.yml ``` 2. **配置文件说明**: @@ -2174,7 +2294,7 @@ vim config/frequency_words.txt # 使用构建版本的 docker compose cd docker -cp docker compose-build.yml docker compose.yml +cp docker-compose-build.yml docker-compose.yml ``` **构建并启动服务**: @@ -2260,7 +2380,7 @@ docker rm trend-radar > 💡 **Web 服务器说明**: > - 启动后可通过浏览器访问 `http://localhost:8080` 查看最新报告 -> - 通过目录导航访问历史报告(如:`http://localhost:8080/2025年xx月xx日/`) +> - 通过目录导航访问历史报告(如:`http://localhost:8080/2025-xx-xx/`) > - 端口可在 `.env` 文件中配置 `WEBSERVER_PORT` 参数 > - 自动启动:在 `.env` 中设置 `ENABLE_WEBSERVER=true` > - 安全提示:仅提供静态文件访问,限制在 output 目录,只绑定本地访问 @@ -2277,7 +2397,7 @@ TrendRadar 生成的当日汇总 HTML 报告会同时保存到两个位置: |---------|---------|---------| | `output/index.html` | 宿主机直接访问 | **Docker 部署**(通过 Volume 挂载,宿主机可见) | | `index.html` | 根目录访问 | **GitHub Pages**(仓库根目录,Pages 自动识别) | -| `output/YYYY年MM月DD日/html/当日汇总.html` | 历史报告访问 | 所有环境(按日期归档) | +| `output/YYYY-MM-DD/html/当日汇总.html` | 历史报告访问 | 所有环境(按日期归档) | **本地访问示例**: ```bash @@ -2286,8 +2406,8 @@ TrendRadar 生成的当日汇总 HTML 报告会同时保存到两个位置: docker exec -it trend-radar python manage.py start_webserver # 2. 在浏览器访问 http://localhost:8080 # 访问最新报告(默认 index.html) -http://localhost:8080/2025年xx月xx日/ # 访问指定日期的报告 -http://localhost:8080/2025年xx月xx日/html/ # 浏览该日期下的所有 HTML 文件 +http://localhost:8080/2025-xx-xx/ # 访问指定日期的报告 +http://localhost:8080/2025-xx-xx/html/ # 浏览该日期下的所有 HTML 文件 # 方式 2:直接打开文件(本地环境) open ./output/index.html # macOS @@ -2295,7 +2415,7 @@ start ./output/index.html # Windows xdg-open ./output/index.html # Linux # 方式 3:访问历史归档 -open ./output/2025年xx月xx日/html/当日汇总.html +open ./output/2025-xx-xx/html/当日汇总.html ``` **为什么有两个 index.html?** @@ -2349,34 +2469,42 @@ flowchart TB **快速启动**: -使用 docker compose 同时启动新闻推送和 MCP 服务: +如果已按照 [方式一:使用 docker compose](#方式一使用-docker-compose推荐) 完成部署,只需启动 MCP 服务: ```bash -# 下载最新的 docker compose.yml(已包含 MCP 服务配置) -wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker compose.yml - -# 启动所有服务 -docker compose up -d +cd TrendRadar/docker +docker compose up -d trend-radar-mcp # 查看运行状态 -docker ps | grep trend-radar +docker ps | grep trend-radar-mcp ``` -**单独启动 MCP 服务**: +**单独启动 MCP 服务**(不使用 docker compose): ```bash +# Linux/Mac docker run -d --name trend-radar-mcp \ -p 127.0.0.1:3333:3333 \ - -v ./config:/app/config:ro \ - -v ./output:/app/output:ro \ + -v $(pwd)/config:/app/config:ro \ + -v $(pwd)/output:/app/output:ro \ -e TZ=Asia/Shanghai \ wantcat/trendradar-mcp:latest + +# Windows PowerShell +docker run -d --name trend-radar-mcp ` + -p 127.0.0.1:3333:3333 ` + -v ${PWD}/config:/app/config:ro ` + -v ${PWD}/output:/app/output:ro ` + -e TZ=Asia/Shanghai ` + wantcat/trendradar-mcp:latest ``` +> ⚠️ **注意**:单独运行时,确保当前目录下有 `config/` 和 `output/` 文件夹,且包含配置文件和新闻数据。 + **验证服务**: ```bash -# 检查 MCP 服务是否正常运行 +# 检查 MCP 服务健康状态 curl http://127.0.0.1:3333/mcp # 查看 MCP 服务日志 @@ -2385,14 +2513,20 @@ docker logs -f trend-radar-mcp **在 AI 客户端中配置**: -MCP 服务启动后,在 Claude Desktop、Cherry Studio、Cursor 
等客户端中配置: +MCP 服务启动后,根据不同客户端进行配置: +**Cherry Studio**(推荐,GUI 配置): +- 设置 → MCP 服务器 → 添加 +- 类型:`streamableHttp` +- URL:`http://127.0.0.1:3333/mcp` + +**Claude Desktop / Cline**(JSON 配置): ```json { "mcpServers": { "trendradar": { "url": "http://127.0.0.1:3333/mcp", - "description": "TrendRadar 新闻热点分析" + "type": "streamableHttp" } } } @@ -2480,7 +2614,6 @@ notification: start: "20:00" # 开始时间(北京时间) end: "22:00" # 结束时间(北京时间) once_per_day: true # 每天只推送一次 - push_record_retention_days: 7 # 推送记录保留天数 ``` #### 配置项详解 @@ -2491,7 +2624,6 @@ notification: | `time_range.start` | string | `"20:00"` | 推送时间窗口开始时间(北京时间,HH:MM 格式) | | `time_range.end` | string | `"22:00"` | 推送时间窗口结束时间(北京时间,HH:MM 格式) | | `once_per_day` | bool | `true` | `true`=每天在窗口内只推送一次,`false`=窗口内每次执行都推送 | -| `push_record_retention_days` | int | `7` | 推送记录保留天数(用于判断是否已推送) | #### 使用场景 @@ -2515,7 +2647,6 @@ PUSH_WINDOW_ENABLED=true PUSH_WINDOW_START=09:00 PUSH_WINDOW_END=18:00 PUSH_WINDOW_ONCE_PER_DAY=false -PUSH_WINDOW_RETENTION_DAYS=7 ``` #### 完整配置示例 @@ -2530,7 +2661,6 @@ notification: start: "20:00" end: "22:00" once_per_day: true - push_record_retention_days: 7 ``` **场景:工作时间内每小时推送** @@ -2543,7 +2673,6 @@ notification: start: "09:00" end: "18:00" once_per_day: false - push_record_retention_days: 7 ```
@@ -2829,6 +2958,123 @@ notification:
+### 11. 存储配置 + +
+👉 点击展开:存储架构配置详解 +
+ +#### 存储后端选择 + +**配置位置**:`config/config.yaml` 的 `storage` 部分 + +v4.0.0 版本重构了存储架构,支持多种存储后端: + +```yaml +storage: + backend: auto # 存储后端:auto(自动选择)/ local(本地SQLite)/ remote(远程云存储) + + formats: + sqlite: true # 是否启用SQLite存储 + txt: true # 是否生成TXT快照 + html: true # 是否生成HTML报告 + + local: + data_dir: "output" # 本地存储目录 + retention_days: 0 # 本地数据保留天数,0表示永久保留 + + remote: + endpoint_url: "" # S3 API 端点 + bucket_name: "" # 存储桶名称 + access_key_id: "" # 访问密钥ID + secret_access_key: "" # 访问密钥 + region: "" # 区域(可选) + retention_days: 0 # 远程数据保留天数,0表示永久保留 + + pull: + enabled: false # 是否启用启动时从远程拉取数据 + days: 7 # 拉取最近N天的数据 +``` + +#### 后端选择策略 + +| backend 值 | 说明 | 适用场景 | +|-----------|------|---------| +| `auto` | **自动选择**(推荐) | 根据运行环境智能选择:
• GitHub Actions → Remote
• Docker/本地 → Local | +| `local` | 本地 SQLite 数据库 | Docker 部署、本地开发 | +| `remote` | 远程云存储(S3 兼容,如 Cloudflare R2) | GitHub Actions、多机器同步 | + + +#### 远程云存储配置 + +**环境变量**(推荐方式): + +```bash +# GitHub Actions / Docker 环境变量 +STORAGE_BACKEND=remote # 或 auto + +# 本地/远程数据保留天数(0 表示永久保留) +LOCAL_RETENTION_DAYS=0 +REMOTE_RETENTION_DAYS=0 + +# S3 兼容存储配置(以 Cloudflare R2 为例) +S3_BUCKET_NAME=your-bucket-name +S3_ACCESS_KEY_ID=your-access-key-id +S3_SECRET_ACCESS_KEY=your-secret-access-key +S3_ENDPOINT_URL=https://.r2.cloudflarestorage.com +S3_REGION=auto + +# 数据拉取配置(可选,从远程同步到本地) +PULL_ENABLED=false +PULL_DAYS=7 +``` + +**获取凭据**:参见 [快速开始 - 远程存储配置](#-快速开始) + +#### 数据清理策略 + +**自动清理**:每次运行结束时检查并删除超过保留天数的数据。 + +```yaml +storage: + local: + retention_days: 30 # 本地保留最近30天数据 + remote: + retention_days: 30 # 远程保留最近30天数据 +``` + +**清理逻辑**: +- 本地存储:删除过期日期的文件夹(如 `output/2025-11-10/`) +- 远程存储:批量删除过期的云端对象(如 `news/2025-11-10.db`) + +#### 时区配置(v4.0.0 新增) + +**全球时区支持**:解决非中国用户推送时间窗口问题。 + +```yaml +app: + timezone: "Asia/Shanghai" # 默认中国时区 + # 其他示例: + # timezone: "America/Los_Angeles" # 美西时间 + # timezone: "Europe/London" # 英国时间 +``` + +**支持所有 IANA 时区名称**:[时区列表](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) + + +#### 不兼容变更 + +⚠️ **v4.0.0 不兼容 v3.x 数据**: + +1. 数据库结构完全重构,无法读取旧数据 +2. 文件路径格式变更(ISO 格式) + +**迁移建议**: +- 从 v4.0.0 开始重新收集数据 +- 旧数据如需保留,请手动重命名目录格式(不推荐) + +
+
## 🤖 AI 智能分析 @@ -2846,7 +3092,7 @@ AI 分析功能**不是**直接查询网络实时数据,而是分析你**本 #### 使用说明: -1. **项目自带测试数据**:`output` 目录默认包含 **2025年11月1日~11月15日** 的新闻数据,可用于快速体验 AI 功能 +1. **项目自带测试数据**:`output` 目录默认包含 **2025-11-01~2025-11-15** 的新闻数据,可用于快速体验 AI 功能 2. **查询限制**: - ✅ 只能查询已有日期范围内的数据(11月1-15日) diff --git a/config/config.yaml b/config/config.yaml index 81d7cbf..d5064d4 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,12 +1,60 @@ app: version_check_url: "https://raw.githubusercontent.com/sansan0/TrendRadar/refs/heads/master/version" show_version_update: true # 控制显示版本更新提示,如果 false,则不接受新版本提示 + # 时区配置(影响所有时间显示、推送窗口判断、数据存储) + # 常用时区: + # - Asia/Shanghai (北京时间 UTC+8) + # - America/New_York (美东时间 UTC-5/-4) + # - Europe/London (伦敦时间 UTC+0/+1) + # 完整时区列表: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones + timezone: "Asia/Shanghai" + +# 存储配置 +storage: + # 存储后端选择: local / remote / auto + # - local: 本地 SQLite + TXT/HTML 文件 + # - remote: 远程云存储(S3 兼容协议,支持 R2/OSS/COS 等) + # - auto: 自动选择(GitHub Actions 环境且配置了远程存储则用 remote,否则用 local) + backend: "auto" + + # 数据格式选项 + formats: + sqlite: true # 主存储(必须启用) + txt: false # 是否生成 TXT 快照 + html: false # 是否生成 HTML 报告 + + # 本地存储配置 + local: + data_dir: "output" # 数据目录 + retention_days: 0 # 本地数据保留天数(0 = 不清理) + + # 远程存储配置(S3 兼容协议) + # 支持: Cloudflare R2, 阿里云 OSS, 腾讯云 COS, AWS S3, MinIO 等 + # 建议将敏感信息配置在 GitHub Secrets 或环境变量中 + remote: + # 数据保留天数(0 = 不清理远程数据) + retention_days: 0 + # S3 兼容配置 + endpoint_url: "" # 服务端点(或环境变量 S3_ENDPOINT_URL) + # Cloudflare R2: https://.r2.cloudflarestorage.com + # 阿里云 OSS: https://oss-cn-hangzhou.aliyuncs.com + # 腾讯云 COS: https://cos.ap-guangzhou.myqcloud.com + bucket_name: "" # 存储桶名称(或环境变量 S3_BUCKET_NAME) + access_key_id: "" # 访问密钥 ID(或环境变量 S3_ACCESS_KEY_ID) + secret_access_key: "" # 访问密钥(或环境变量 S3_SECRET_ACCESS_KEY) + region: "" # 区域(可选,部分服务商需要,或环境变量 S3_REGION) + + # 数据拉取配置(从远程同步到本地) + # 用于 MCP Server 等场景:爬虫存到远程,MCP 拉取到本地分析 + pull: + enabled: false # 是否启用启动时自动拉取 + days: 7 # 拉取最近 N 天的数据(0 = 不拉取) crawler: request_interval: 1000 # 请求间隔(毫秒) enable_crawler: true # 是否启用爬取新闻功能,如果 false,则直接停止程序 use_proxy: false # 是否启用代理,false 时为关闭 - default_proxy: "http://127.0.0.1:10086" + default_proxy: "http://127.0.0.1:10801" # 🔸 daily(当日汇总模式) # • 推送时机:按时推送(默认每小时推送一次) @@ -55,7 +103,6 @@ notification: start: "20:00" # 推送时间窗口开始(北京时间) end: "22:00" # 推送时间窗口结束(北京时间) once_per_day: true # 每天在时间窗口内只推送一次,如果 false,则窗口内每次执行都推送 - push_record_retention_days: 7 # 推送记录保留天数 # ⚠️⚠️⚠️ 重要安全警告 / IMPORTANT SECURITY WARNING ⚠️⚠️⚠️ # diff --git a/docker/.env b/docker/.env index 9946b30..03a8524 100644 --- a/docker/.env +++ b/docker/.env @@ -40,8 +40,6 @@ PUSH_WINDOW_START= PUSH_WINDOW_END= # 每天只推送一次 (true/false) PUSH_WINDOW_ONCE_PER_DAY= -# 推送记录保留天数 (数字,如 7) -PUSH_WINDOW_RETENTION_DAYS= # ============================================ # 多账号配置 @@ -87,6 +85,39 @@ BARK_URL= # Slack 推送配置(多账号用 ; 分隔) SLACK_WEBHOOK_URL= +# ============================================ +# 存储配置 +# ============================================ + +# 存储后端选择 (local/remote/auto) +# - local: 本地 SQLite + TXT/HTML 文件 +# - remote: 远程云存储(S3 兼容协议) +# - auto: 自动选择(GitHub Actions 用 remote,其他用 local) +STORAGE_BACKEND=auto + +# 本地数据保留天数(0 = 无限制,不清理历史数据) +LOCAL_RETENTION_DAYS=0 + +# 远程数据保留天数(0 = 无限制,不清理历史数据) +REMOTE_RETENTION_DAYS=0 + +# 是否生成 TXT 快照 (true/false) +STORAGE_TXT_ENABLED= + +# 是否生成 HTML 报告 (true/false) +STORAGE_HTML_ENABLED= + +# 远程存储配置(S3 兼容协议,支持 R2/OSS/COS/S3 等) +S3_ENDPOINT_URL= +S3_BUCKET_NAME= +S3_ACCESS_KEY_ID= +S3_SECRET_ACCESS_KEY= +S3_REGION= + +# 数据拉取配置(从远程同步到本地) +PULL_ENABLED=false +PULL_DAYS=7 + # 
============================================ # 运行配置 # ============================================ diff --git a/docker/Dockerfile b/docker/Dockerfile index 574eb11..8f32098 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -53,8 +53,8 @@ RUN set -ex && \ COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt -COPY main.py . COPY docker/manage.py . +COPY trendradar/ ./trendradar/ # 复制 entrypoint.sh 并强制转换为 LF 格式 COPY docker/entrypoint.sh /entrypoint.sh.tmp diff --git a/docker/Dockerfile.mcp b/docker/Dockerfile.mcp index 6aacfb6..364042d 100644 --- a/docker/Dockerfile.mcp +++ b/docker/Dockerfile.mcp @@ -8,6 +8,8 @@ RUN pip install --no-cache-dir -r requirements.txt # 复制 MCP 服务器代码 COPY mcp_server/ ./mcp_server/ +# 复制 trendradar 模块(MCP 服务需要读取 SQLite 数据) +COPY trendradar/ ./trendradar/ # 创建必要目录 RUN mkdir -p /app/config /app/output diff --git a/docker/docker-compose-build.yml b/docker/docker-compose-build.yml index 955c4c9..b0af9cb 100644 --- a/docker/docker-compose-build.yml +++ b/docker/docker-compose-build.yml @@ -32,7 +32,6 @@ services: - PUSH_WINDOW_START=${PUSH_WINDOW_START:-} - PUSH_WINDOW_END=${PUSH_WINDOW_END:-} - PUSH_WINDOW_ONCE_PER_DAY=${PUSH_WINDOW_ONCE_PER_DAY:-} - - PUSH_WINDOW_RETENTION_DAYS=${PUSH_WINDOW_RETENTION_DAYS:-} # 通知渠道 - FEISHU_WEBHOOK_URL=${FEISHU_WEBHOOK_URL:-} - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN:-} @@ -54,6 +53,21 @@ services: - BARK_URL=${BARK_URL:-} # Slack配置 - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL:-} + # 存储配置 + - STORAGE_BACKEND=${STORAGE_BACKEND:-auto} + - LOCAL_RETENTION_DAYS=${LOCAL_RETENTION_DAYS:-0} + - REMOTE_RETENTION_DAYS=${REMOTE_RETENTION_DAYS:-0} + - STORAGE_TXT_ENABLED=${STORAGE_TXT_ENABLED:-true} + - STORAGE_HTML_ENABLED=${STORAGE_HTML_ENABLED:-true} + # 远程存储配置(S3 兼容协议) + - S3_ENDPOINT_URL=${S3_ENDPOINT_URL:-} + - S3_BUCKET_NAME=${S3_BUCKET_NAME:-} + - S3_ACCESS_KEY_ID=${S3_ACCESS_KEY_ID:-} + - S3_SECRET_ACCESS_KEY=${S3_SECRET_ACCESS_KEY:-} + - S3_REGION=${S3_REGION:-} + # 数据拉取配置 + - PULL_ENABLED=${PULL_ENABLED:-false} + - PULL_DAYS=${PULL_DAYS:-7} # 运行模式 - CRON_SCHEDULE=${CRON_SCHEDULE:-*/5 * * * *} - RUN_MODE=${RUN_MODE:-cron} @@ -71,7 +85,7 @@ services: volumes: - ../config:/app/config:ro - - ../output:/app/output:ro + - ../output:/app/output environment: - TZ=Asia/Shanghai diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index c7115b3..fc8b78f 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -30,7 +30,6 @@ services: - PUSH_WINDOW_START=${PUSH_WINDOW_START:-} - PUSH_WINDOW_END=${PUSH_WINDOW_END:-} - PUSH_WINDOW_ONCE_PER_DAY=${PUSH_WINDOW_ONCE_PER_DAY:-} - - PUSH_WINDOW_RETENTION_DAYS=${PUSH_WINDOW_RETENTION_DAYS:-} # 通知渠道 - FEISHU_WEBHOOK_URL=${FEISHU_WEBHOOK_URL:-} - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN:-} @@ -52,6 +51,21 @@ services: - BARK_URL=${BARK_URL:-} # Slack配置 - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL:-} + # 存储配置 + - STORAGE_BACKEND=${STORAGE_BACKEND:-auto} + - LOCAL_RETENTION_DAYS=${LOCAL_RETENTION_DAYS:-0} + - REMOTE_RETENTION_DAYS=${REMOTE_RETENTION_DAYS:-0} + - STORAGE_TXT_ENABLED=${STORAGE_TXT_ENABLED:-true} + - STORAGE_HTML_ENABLED=${STORAGE_HTML_ENABLED:-true} + # 远程存储配置(S3 兼容协议) + - S3_ENDPOINT_URL=${S3_ENDPOINT_URL:-} + - S3_BUCKET_NAME=${S3_BUCKET_NAME:-} + - S3_ACCESS_KEY_ID=${S3_ACCESS_KEY_ID:-} + - S3_SECRET_ACCESS_KEY=${S3_SECRET_ACCESS_KEY:-} + - S3_REGION=${S3_REGION:-} + # 数据拉取配置 + - PULL_ENABLED=${PULL_ENABLED:-false} + - PULL_DAYS=${PULL_DAYS:-7} # 运行模式 - CRON_SCHEDULE=${CRON_SCHEDULE:-*/5 * * * *} - RUN_MODE=${RUN_MODE:-cron} @@ -67,7 +81,7 
@@ services: volumes: - ../config:/app/config:ro - - ../output:/app/output:ro + - ../output:/app/output environment: - TZ=Asia/Shanghai diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index d33bb1c..e68f0aa 100644 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -13,11 +13,11 @@ env >> /etc/environment case "${RUN_MODE:-cron}" in "once") echo "🔄 单次执行" - exec /usr/local/bin/python main.py + exec /usr/local/bin/python -m trendradar ;; "cron") # 生成 crontab - echo "${CRON_SCHEDULE:-*/30 * * * *} cd /app && /usr/local/bin/python main.py" > /tmp/crontab + echo "${CRON_SCHEDULE:-*/30 * * * *} cd /app && /usr/local/bin/python -m trendradar" > /tmp/crontab echo "📅 生成的crontab内容:" cat /tmp/crontab @@ -30,7 +30,7 @@ case "${RUN_MODE:-cron}" in # 立即执行一次(如果配置了) if [ "${IMMEDIATE_RUN:-false}" = "true" ]; then echo "▶️ 立即执行一次" - /usr/local/bin/python main.py + /usr/local/bin/python -m trendradar fi # 启动 Web 服务器(如果配置了) diff --git a/docker/manage.py b/docker/manage.py index 944cccf..e326931 100644 --- a/docker/manage.py +++ b/docker/manage.py @@ -33,7 +33,7 @@ def manual_run(): print("🔄 手动执行爬虫...") try: result = subprocess.run( - ["python", "main.py"], cwd="/app", capture_output=False, text=True + ["python", "-m", "trendradar"], cwd="/app", capture_output=False, text=True ) if result.returncode == 0: print("✅ 执行完成") @@ -285,12 +285,24 @@ def show_config(): "TELEGRAM_CHAT_ID", "CONFIG_PATH", "FREQUENCY_WORDS_PATH", + # 存储配置 + "STORAGE_BACKEND", + "LOCAL_RETENTION_DAYS", + "REMOTE_RETENTION_DAYS", + "STORAGE_TXT_ENABLED", + "STORAGE_HTML_ENABLED", + "S3_BUCKET_NAME", + "S3_ACCESS_KEY_ID", + "S3_ENDPOINT_URL", + "S3_REGION", + "PULL_ENABLED", + "PULL_DAYS", ] for var in env_vars: value = os.environ.get(var, "未设置") # 隐藏敏感信息 - if any(sensitive in var for sensitive in ["WEBHOOK", "TOKEN", "KEY"]): + if any(sensitive in var for sensitive in ["WEBHOOK", "TOKEN", "KEY", "SECRET"]): if value and value != "未设置": masked_value = value[:10] + "***" if len(value) > 10 else "***" print(f" {var}: {masked_value}") @@ -331,6 +343,17 @@ def show_files(): # 显示最近2天的文件 for date_dir in date_dirs[:2]: print(f" 📅 {date_dir.name}:") + + # 检查 SQLite 数据库文件 + db_files = list(date_dir.glob("*.db")) + if db_files: + print(f" 💾 SQLite: {len(db_files)} 个数据库") + for db_file in db_files[:3]: + mtime = time.ctime(db_file.stat().st_mtime) + size_kb = db_file.stat().st_size // 1024 + print(f" 📀 {db_file.name} ({size_kb}KB, {mtime.split()[3][:5]})") + + # 检查子目录(html, txt) for subdir in ["html", "txt"]: sub_path = date_dir / subdir if sub_path.exists(): diff --git a/main.py b/main.py deleted file mode 100644 index df9f73a..0000000 --- a/main.py +++ /dev/null @@ -1,5431 +0,0 @@ -# coding=utf-8 - -import json -import os -import random -import re -import time -import webbrowser -import smtplib -from email.mime.text import MIMEText -from email.mime.multipart import MIMEMultipart -from email.header import Header -from email.utils import formataddr, formatdate, make_msgid -from datetime import datetime -from pathlib import Path -from typing import Dict, List, Tuple, Optional, Union - -import pytz -import requests -import yaml - - -VERSION = "3.5.0" - - -# === SMTP邮件配置 === -SMTP_CONFIGS = { - # Gmail(使用 STARTTLS) - "gmail.com": {"server": "smtp.gmail.com", "port": 587, "encryption": "TLS"}, - # QQ邮箱(使用 SSL,更稳定) - "qq.com": {"server": "smtp.qq.com", "port": 465, "encryption": "SSL"}, - # Outlook(使用 STARTTLS) - "outlook.com": { - "server": "smtp-mail.outlook.com", - "port": 587, - "encryption": "TLS", - }, - "hotmail.com": { - "server": 
"smtp-mail.outlook.com", - "port": 587, - "encryption": "TLS", - }, - "live.com": {"server": "smtp-mail.outlook.com", "port": 587, "encryption": "TLS"}, - # 网易邮箱(使用 SSL,更稳定) - "163.com": {"server": "smtp.163.com", "port": 465, "encryption": "SSL"}, - "126.com": {"server": "smtp.126.com", "port": 465, "encryption": "SSL"}, - # 新浪邮箱(使用 SSL) - "sina.com": {"server": "smtp.sina.com", "port": 465, "encryption": "SSL"}, - # 搜狐邮箱(使用 SSL) - "sohu.com": {"server": "smtp.sohu.com", "port": 465, "encryption": "SSL"}, - # 天翼邮箱(使用 SSL) - "189.cn": {"server": "smtp.189.cn", "port": 465, "encryption": "SSL"}, - # 阿里云邮箱(使用 TLS) - "aliyun.com": {"server": "smtp.aliyun.com", "port": 465, "encryption": "TLS"}, -} - - -# === 多账号推送工具函数 === -def parse_multi_account_config(config_value: str, separator: str = ";") -> List[str]: - """ - 解析多账号配置,返回账号列表 - - Args: - config_value: 配置值字符串,多个账号用分隔符分隔 - separator: 分隔符,默认为 ; - - Returns: - 账号列表,空字符串会被保留(用于占位) - """ - if not config_value: - return [] - # 保留空字符串用于占位(如 ";token2" 表示第一个账号无token) - accounts = [acc.strip() for acc in config_value.split(separator)] - # 过滤掉全部为空的情况 - if all(not acc for acc in accounts): - return [] - return accounts - - -def validate_paired_configs( - configs: Dict[str, List[str]], - channel_name: str, - required_keys: Optional[List[str]] = None -) -> Tuple[bool, int]: - """ - 验证配对配置的数量是否一致 - - Args: - configs: 配置字典,key 为配置名,value 为账号列表 - channel_name: 渠道名称,用于日志输出 - required_keys: 必须有值的配置项列表 - - Returns: - (是否验证通过, 账号数量) - """ - # 过滤掉空列表 - non_empty_configs = {k: v for k, v in configs.items() if v} - - if not non_empty_configs: - return True, 0 - - # 检查必须项 - if required_keys: - for key in required_keys: - if key not in non_empty_configs or not non_empty_configs[key]: - return True, 0 # 必须项为空,视为未配置 - - # 获取所有非空配置的长度 - lengths = {k: len(v) for k, v in non_empty_configs.items()} - unique_lengths = set(lengths.values()) - - if len(unique_lengths) > 1: - print(f"❌ {channel_name} 配置错误:配对配置数量不一致,将跳过该渠道推送") - for key, length in lengths.items(): - print(f" - {key}: {length} 个") - return False, 0 - - return True, list(unique_lengths)[0] if unique_lengths else 0 - - -def limit_accounts( - accounts: List[str], - max_count: int, - channel_name: str -) -> List[str]: - """ - 限制账号数量 - - Args: - accounts: 账号列表 - max_count: 最大账号数量 - channel_name: 渠道名称,用于日志输出 - - Returns: - 限制后的账号列表 - """ - if len(accounts) > max_count: - print(f"⚠️ {channel_name} 配置了 {len(accounts)} 个账号,超过最大限制 {max_count},只使用前 {max_count} 个") - print(f" ⚠️ 警告:如果您是 fork 用户,过多账号可能导致 GitHub Actions 运行时间过长,存在账号风险") - return accounts[:max_count] - return accounts - - -def get_account_at_index(accounts: List[str], index: int, default: str = "") -> str: - """ - 安全获取指定索引的账号值 - - Args: - accounts: 账号列表 - index: 索引 - default: 默认值 - - Returns: - 账号值或默认值 - """ - if index < len(accounts): - return accounts[index] if accounts[index] else default - return default - - -# === 配置管理 === -def load_config(): - """加载配置文件""" - config_path = os.environ.get("CONFIG_PATH", "config/config.yaml") - - if not Path(config_path).exists(): - raise FileNotFoundError(f"配置文件 {config_path} 不存在") - - with open(config_path, "r", encoding="utf-8") as f: - config_data = yaml.safe_load(f) - - print(f"配置文件加载成功: {config_path}") - - # 构建配置 - config = { - "VERSION_CHECK_URL": config_data["app"]["version_check_url"], - "SHOW_VERSION_UPDATE": config_data["app"]["show_version_update"], - "REQUEST_INTERVAL": config_data["crawler"]["request_interval"], - "REPORT_MODE": os.environ.get("REPORT_MODE", "").strip() - or config_data["report"]["mode"], - 
"RANK_THRESHOLD": config_data["report"]["rank_threshold"], - "SORT_BY_POSITION_FIRST": os.environ.get("SORT_BY_POSITION_FIRST", "").strip().lower() - in ("true", "1") - if os.environ.get("SORT_BY_POSITION_FIRST", "").strip() - else config_data["report"].get("sort_by_position_first", False), - "MAX_NEWS_PER_KEYWORD": int( - os.environ.get("MAX_NEWS_PER_KEYWORD", "").strip() or "0" - ) - or config_data["report"].get("max_news_per_keyword", 0), - "REVERSE_CONTENT_ORDER": os.environ.get("REVERSE_CONTENT_ORDER", "").strip().lower() - in ("true", "1") - if os.environ.get("REVERSE_CONTENT_ORDER", "").strip() - else config_data["report"].get("reverse_content_order", False), - "USE_PROXY": config_data["crawler"]["use_proxy"], - "DEFAULT_PROXY": config_data["crawler"]["default_proxy"], - "ENABLE_CRAWLER": os.environ.get("ENABLE_CRAWLER", "").strip().lower() - in ("true", "1") - if os.environ.get("ENABLE_CRAWLER", "").strip() - else config_data["crawler"]["enable_crawler"], - "ENABLE_NOTIFICATION": os.environ.get("ENABLE_NOTIFICATION", "").strip().lower() - in ("true", "1") - if os.environ.get("ENABLE_NOTIFICATION", "").strip() - else config_data["notification"]["enable_notification"], - "MESSAGE_BATCH_SIZE": config_data["notification"]["message_batch_size"], - "DINGTALK_BATCH_SIZE": config_data["notification"].get( - "dingtalk_batch_size", 20000 - ), - "FEISHU_BATCH_SIZE": config_data["notification"].get("feishu_batch_size", 29000), - "BARK_BATCH_SIZE": config_data["notification"].get("bark_batch_size", 3600), - "SLACK_BATCH_SIZE": config_data["notification"].get("slack_batch_size", 4000), - "BATCH_SEND_INTERVAL": config_data["notification"]["batch_send_interval"], - "FEISHU_MESSAGE_SEPARATOR": config_data["notification"][ - "feishu_message_separator" - ], - # 多账号配置 - "MAX_ACCOUNTS_PER_CHANNEL": int( - os.environ.get("MAX_ACCOUNTS_PER_CHANNEL", "").strip() or "0" - ) - or config_data["notification"].get("max_accounts_per_channel", 3), - "PUSH_WINDOW": { - "ENABLED": os.environ.get("PUSH_WINDOW_ENABLED", "").strip().lower() - in ("true", "1") - if os.environ.get("PUSH_WINDOW_ENABLED", "").strip() - else config_data["notification"] - .get("push_window", {}) - .get("enabled", False), - "TIME_RANGE": { - "START": os.environ.get("PUSH_WINDOW_START", "").strip() - or config_data["notification"] - .get("push_window", {}) - .get("time_range", {}) - .get("start", "08:00"), - "END": os.environ.get("PUSH_WINDOW_END", "").strip() - or config_data["notification"] - .get("push_window", {}) - .get("time_range", {}) - .get("end", "22:00"), - }, - "ONCE_PER_DAY": os.environ.get("PUSH_WINDOW_ONCE_PER_DAY", "").strip().lower() - in ("true", "1") - if os.environ.get("PUSH_WINDOW_ONCE_PER_DAY", "").strip() - else config_data["notification"] - .get("push_window", {}) - .get("once_per_day", True), - "RECORD_RETENTION_DAYS": int( - os.environ.get("PUSH_WINDOW_RETENTION_DAYS", "").strip() or "0" - ) - or config_data["notification"] - .get("push_window", {}) - .get("push_record_retention_days", 7), - }, - "WEIGHT_CONFIG": { - "RANK_WEIGHT": config_data["weight"]["rank_weight"], - "FREQUENCY_WEIGHT": config_data["weight"]["frequency_weight"], - "HOTNESS_WEIGHT": config_data["weight"]["hotness_weight"], - }, - "PLATFORMS": config_data["platforms"], - } - - # 通知渠道配置(环境变量优先) - notification = config_data.get("notification", {}) - webhooks = notification.get("webhooks", {}) - - config["FEISHU_WEBHOOK_URL"] = os.environ.get( - "FEISHU_WEBHOOK_URL", "" - ).strip() or webhooks.get("feishu_url", "") - config["DINGTALK_WEBHOOK_URL"] = 
os.environ.get( - "DINGTALK_WEBHOOK_URL", "" - ).strip() or webhooks.get("dingtalk_url", "") - config["WEWORK_WEBHOOK_URL"] = os.environ.get( - "WEWORK_WEBHOOK_URL", "" - ).strip() or webhooks.get("wework_url", "") - config["WEWORK_MSG_TYPE"] = os.environ.get( - "WEWORK_MSG_TYPE", "" - ).strip() or webhooks.get("wework_msg_type", "markdown") - config["TELEGRAM_BOT_TOKEN"] = os.environ.get( - "TELEGRAM_BOT_TOKEN", "" - ).strip() or webhooks.get("telegram_bot_token", "") - config["TELEGRAM_CHAT_ID"] = os.environ.get( - "TELEGRAM_CHAT_ID", "" - ).strip() or webhooks.get("telegram_chat_id", "") - - # 邮件配置 - config["EMAIL_FROM"] = os.environ.get("EMAIL_FROM", "").strip() or webhooks.get( - "email_from", "" - ) - config["EMAIL_PASSWORD"] = os.environ.get( - "EMAIL_PASSWORD", "" - ).strip() or webhooks.get("email_password", "") - config["EMAIL_TO"] = os.environ.get("EMAIL_TO", "").strip() or webhooks.get( - "email_to", "" - ) - config["EMAIL_SMTP_SERVER"] = os.environ.get( - "EMAIL_SMTP_SERVER", "" - ).strip() or webhooks.get("email_smtp_server", "") - config["EMAIL_SMTP_PORT"] = os.environ.get( - "EMAIL_SMTP_PORT", "" - ).strip() or webhooks.get("email_smtp_port", "") - - # ntfy配置 - config["NTFY_SERVER_URL"] = ( - os.environ.get("NTFY_SERVER_URL", "").strip() - or webhooks.get("ntfy_server_url") - or "https://ntfy.sh" - ) - config["NTFY_TOPIC"] = os.environ.get("NTFY_TOPIC", "").strip() or webhooks.get( - "ntfy_topic", "" - ) - config["NTFY_TOKEN"] = os.environ.get("NTFY_TOKEN", "").strip() or webhooks.get( - "ntfy_token", "" - ) - - # Bark配置 - config["BARK_URL"] = os.environ.get("BARK_URL", "").strip() or webhooks.get( - "bark_url", "" - ) - - # Slack配置 - config["SLACK_WEBHOOK_URL"] = os.environ.get("SLACK_WEBHOOK_URL", "").strip() or webhooks.get( - "slack_webhook_url", "" - ) - - # 输出配置来源信息 - notification_sources = [] - max_accounts = config["MAX_ACCOUNTS_PER_CHANNEL"] - - if config["FEISHU_WEBHOOK_URL"]: - accounts = parse_multi_account_config(config["FEISHU_WEBHOOK_URL"]) - count = min(len(accounts), max_accounts) - source = "环境变量" if os.environ.get("FEISHU_WEBHOOK_URL") else "配置文件" - notification_sources.append(f"飞书({source}, {count}个账号)") - if config["DINGTALK_WEBHOOK_URL"]: - accounts = parse_multi_account_config(config["DINGTALK_WEBHOOK_URL"]) - count = min(len(accounts), max_accounts) - source = "环境变量" if os.environ.get("DINGTALK_WEBHOOK_URL") else "配置文件" - notification_sources.append(f"钉钉({source}, {count}个账号)") - if config["WEWORK_WEBHOOK_URL"]: - accounts = parse_multi_account_config(config["WEWORK_WEBHOOK_URL"]) - count = min(len(accounts), max_accounts) - source = "环境变量" if os.environ.get("WEWORK_WEBHOOK_URL") else "配置文件" - notification_sources.append(f"企业微信({source}, {count}个账号)") - if config["TELEGRAM_BOT_TOKEN"] and config["TELEGRAM_CHAT_ID"]: - tokens = parse_multi_account_config(config["TELEGRAM_BOT_TOKEN"]) - chat_ids = parse_multi_account_config(config["TELEGRAM_CHAT_ID"]) - # 验证数量一致性 - valid, count = validate_paired_configs( - {"bot_token": tokens, "chat_id": chat_ids}, - "Telegram", - required_keys=["bot_token", "chat_id"] - ) - if valid and count > 0: - count = min(count, max_accounts) - token_source = "环境变量" if os.environ.get("TELEGRAM_BOT_TOKEN") else "配置文件" - notification_sources.append(f"Telegram({token_source}, {count}个账号)") - if config["EMAIL_FROM"] and config["EMAIL_PASSWORD"] and config["EMAIL_TO"]: - from_source = "环境变量" if os.environ.get("EMAIL_FROM") else "配置文件" - notification_sources.append(f"邮件({from_source})") - - if config["NTFY_SERVER_URL"] and 
config["NTFY_TOPIC"]: - topics = parse_multi_account_config(config["NTFY_TOPIC"]) - tokens = parse_multi_account_config(config["NTFY_TOKEN"]) - # ntfy 的 token 是可选的,但如果配置了,数量必须与 topic 一致 - if tokens: - valid, count = validate_paired_configs( - {"topic": topics, "token": tokens}, - "ntfy" - ) - if valid and count > 0: - count = min(count, max_accounts) - server_source = "环境变量" if os.environ.get("NTFY_SERVER_URL") else "配置文件" - notification_sources.append(f"ntfy({server_source}, {count}个账号)") - else: - count = min(len(topics), max_accounts) - server_source = "环境变量" if os.environ.get("NTFY_SERVER_URL") else "配置文件" - notification_sources.append(f"ntfy({server_source}, {count}个账号)") - - if config["BARK_URL"]: - accounts = parse_multi_account_config(config["BARK_URL"]) - count = min(len(accounts), max_accounts) - bark_source = "环境变量" if os.environ.get("BARK_URL") else "配置文件" - notification_sources.append(f"Bark({bark_source}, {count}个账号)") - - if config["SLACK_WEBHOOK_URL"]: - accounts = parse_multi_account_config(config["SLACK_WEBHOOK_URL"]) - count = min(len(accounts), max_accounts) - slack_source = "环境变量" if os.environ.get("SLACK_WEBHOOK_URL") else "配置文件" - notification_sources.append(f"Slack({slack_source}, {count}个账号)") - - if notification_sources: - print(f"通知渠道配置来源: {', '.join(notification_sources)}") - print(f"每个渠道最大账号数: {max_accounts}") - else: - print("未配置任何通知渠道") - - return config - - -print("正在加载配置...") -CONFIG = load_config() -print(f"TrendRadar v{VERSION} 配置加载完成") -print(f"监控平台数量: {len(CONFIG['PLATFORMS'])}") - - -# === 工具函数 === -def get_beijing_time(): - """获取北京时间""" - return datetime.now(pytz.timezone("Asia/Shanghai")) - - -def format_date_folder(): - """格式化日期文件夹""" - return get_beijing_time().strftime("%Y年%m月%d日") - - -def format_time_filename(): - """格式化时间文件名""" - return get_beijing_time().strftime("%H时%M分") - - -def clean_title(title: str) -> str: - """清理标题中的特殊字符""" - if not isinstance(title, str): - title = str(title) - cleaned_title = title.replace("\n", " ").replace("\r", " ") - cleaned_title = re.sub(r"\s+", " ", cleaned_title) - cleaned_title = cleaned_title.strip() - return cleaned_title - - -def ensure_directory_exists(directory: str): - """确保目录存在""" - Path(directory).mkdir(parents=True, exist_ok=True) - - -def get_output_path(subfolder: str, filename: str) -> str: - """获取输出路径""" - date_folder = format_date_folder() - output_dir = Path("output") / date_folder / subfolder - ensure_directory_exists(str(output_dir)) - return str(output_dir / filename) - - -def check_version_update( - current_version: str, version_url: str, proxy_url: Optional[str] = None -) -> Tuple[bool, Optional[str]]: - """检查版本更新""" - try: - proxies = None - if proxy_url: - proxies = {"http": proxy_url, "https": proxy_url} - - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", - "Accept": "text/plain, */*", - "Cache-Control": "no-cache", - } - - response = requests.get( - version_url, proxies=proxies, headers=headers, timeout=10 - ) - response.raise_for_status() - - remote_version = response.text.strip() - print(f"当前版本: {current_version}, 远程版本: {remote_version}") - - # 比较版本 - def parse_version(version_str): - try: - parts = version_str.strip().split(".") - if len(parts) != 3: - raise ValueError("版本号格式不正确") - return int(parts[0]), int(parts[1]), int(parts[2]) - except: - return 0, 0, 0 - - current_tuple = parse_version(current_version) - remote_tuple = parse_version(remote_version) - - need_update = current_tuple < remote_tuple - return need_update, remote_version 
if need_update else None - - except Exception as e: - print(f"版本检查失败: {e}") - return False, None - - -def is_first_crawl_today() -> bool: - """检测是否是当天第一次爬取""" - date_folder = format_date_folder() - txt_dir = Path("output") / date_folder / "txt" - - if not txt_dir.exists(): - return True - - files = sorted([f for f in txt_dir.iterdir() if f.suffix == ".txt"]) - return len(files) <= 1 - - -def html_escape(text: str) -> str: - """HTML转义""" - if not isinstance(text, str): - text = str(text) - - return ( - text.replace("&", "&") - .replace("<", "<") - .replace(">", ">") - .replace('"', """) - .replace("'", "'") - ) - - -# === 推送记录管理 === -class PushRecordManager: - """推送记录管理器""" - - def __init__(self): - self.record_dir = Path("output") / ".push_records" - self.ensure_record_dir() - self.cleanup_old_records() - - def ensure_record_dir(self): - """确保记录目录存在""" - self.record_dir.mkdir(parents=True, exist_ok=True) - - def get_today_record_file(self) -> Path: - """获取今天的记录文件路径""" - today = get_beijing_time().strftime("%Y%m%d") - return self.record_dir / f"push_record_{today}.json" - - def cleanup_old_records(self): - """清理过期的推送记录""" - retention_days = CONFIG["PUSH_WINDOW"]["RECORD_RETENTION_DAYS"] - current_time = get_beijing_time() - - for record_file in self.record_dir.glob("push_record_*.json"): - try: - date_str = record_file.stem.replace("push_record_", "") - file_date = datetime.strptime(date_str, "%Y%m%d") - file_date = pytz.timezone("Asia/Shanghai").localize(file_date) - - if (current_time - file_date).days > retention_days: - record_file.unlink() - print(f"清理过期推送记录: {record_file.name}") - except Exception as e: - print(f"清理记录文件失败 {record_file}: {e}") - - def has_pushed_today(self) -> bool: - """检查今天是否已经推送过""" - record_file = self.get_today_record_file() - - if not record_file.exists(): - return False - - try: - with open(record_file, "r", encoding="utf-8") as f: - record = json.load(f) - return record.get("pushed", False) - except Exception as e: - print(f"读取推送记录失败: {e}") - return False - - def record_push(self, report_type: str): - """记录推送""" - record_file = self.get_today_record_file() - now = get_beijing_time() - - record = { - "pushed": True, - "push_time": now.strftime("%Y-%m-%d %H:%M:%S"), - "report_type": report_type, - } - - try: - with open(record_file, "w", encoding="utf-8") as f: - json.dump(record, f, ensure_ascii=False, indent=2) - print(f"推送记录已保存: {report_type} at {now.strftime('%H:%M:%S')}") - except Exception as e: - print(f"保存推送记录失败: {e}") - - def is_in_time_range(self, start_time: str, end_time: str) -> bool: - """检查当前时间是否在指定时间范围内""" - now = get_beijing_time() - current_time = now.strftime("%H:%M") - - def normalize_time(time_str: str) -> str: - """将时间字符串标准化为 HH:MM 格式""" - try: - parts = time_str.strip().split(":") - if len(parts) != 2: - raise ValueError(f"时间格式错误: {time_str}") - - hour = int(parts[0]) - minute = int(parts[1]) - - if not (0 <= hour <= 23 and 0 <= minute <= 59): - raise ValueError(f"时间范围错误: {time_str}") - - return f"{hour:02d}:{minute:02d}" - except Exception as e: - print(f"时间格式化错误 '{time_str}': {e}") - return time_str - - normalized_start = normalize_time(start_time) - normalized_end = normalize_time(end_time) - normalized_current = normalize_time(current_time) - - result = normalized_start <= normalized_current <= normalized_end - - if not result: - print(f"时间窗口判断:当前 {normalized_current},窗口 {normalized_start}-{normalized_end}") - - return result - - -# === 数据获取 === -class DataFetcher: - """数据获取器""" - - def __init__(self, proxy_url: Optional[str] = None): - 
self.proxy_url = proxy_url - - def fetch_data( - self, - id_info: Union[str, Tuple[str, str]], - max_retries: int = 2, - min_retry_wait: int = 3, - max_retry_wait: int = 5, - ) -> Tuple[Optional[str], str, str]: - """获取指定ID数据,支持重试""" - if isinstance(id_info, tuple): - id_value, alias = id_info - else: - id_value = id_info - alias = id_value - - url = f"https://newsnow.busiyi.world/api/s?id={id_value}&latest" - - proxies = None - if self.proxy_url: - proxies = {"http": self.proxy_url, "https": self.proxy_url} - - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", - "Accept": "application/json, text/plain, */*", - "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", - "Connection": "keep-alive", - "Cache-Control": "no-cache", - } - - retries = 0 - while retries <= max_retries: - try: - response = requests.get( - url, proxies=proxies, headers=headers, timeout=10 - ) - response.raise_for_status() - - data_text = response.text - data_json = json.loads(data_text) - - status = data_json.get("status", "未知") - if status not in ["success", "cache"]: - raise ValueError(f"响应状态异常: {status}") - - status_info = "最新数据" if status == "success" else "缓存数据" - print(f"获取 {id_value} 成功({status_info})") - return data_text, id_value, alias - - except Exception as e: - retries += 1 - if retries <= max_retries: - base_wait = random.uniform(min_retry_wait, max_retry_wait) - additional_wait = (retries - 1) * random.uniform(1, 2) - wait_time = base_wait + additional_wait - print(f"请求 {id_value} 失败: {e}. {wait_time:.2f}秒后重试...") - time.sleep(wait_time) - else: - print(f"请求 {id_value} 失败: {e}") - return None, id_value, alias - return None, id_value, alias - - def crawl_websites( - self, - ids_list: List[Union[str, Tuple[str, str]]], - request_interval: int = CONFIG["REQUEST_INTERVAL"], - ) -> Tuple[Dict, Dict, List]: - """爬取多个网站数据""" - results = {} - id_to_name = {} - failed_ids = [] - - for i, id_info in enumerate(ids_list): - if isinstance(id_info, tuple): - id_value, name = id_info - else: - id_value = id_info - name = id_value - - id_to_name[id_value] = name - response, _, _ = self.fetch_data(id_info) - - if response: - try: - data = json.loads(response) - results[id_value] = {} - for index, item in enumerate(data.get("items", []), 1): - title = item.get("title") - # 跳过无效标题(None、float、空字符串) - if title is None or isinstance(title, float) or not str(title).strip(): - continue - title = str(title).strip() - url = item.get("url", "") - mobile_url = item.get("mobileUrl", "") - - if title in results[id_value]: - results[id_value][title]["ranks"].append(index) - else: - results[id_value][title] = { - "ranks": [index], - "url": url, - "mobileUrl": mobile_url, - } - except json.JSONDecodeError: - print(f"解析 {id_value} 响应失败") - failed_ids.append(id_value) - except Exception as e: - print(f"处理 {id_value} 数据出错: {e}") - failed_ids.append(id_value) - else: - failed_ids.append(id_value) - - if i < len(ids_list) - 1: - actual_interval = request_interval + random.randint(-10, 20) - actual_interval = max(50, actual_interval) - time.sleep(actual_interval / 1000) - - print(f"成功: {list(results.keys())}, 失败: {failed_ids}") - return results, id_to_name, failed_ids - - -# === 数据处理 === -def save_titles_to_file(results: Dict, id_to_name: Dict, failed_ids: List) -> str: - """保存标题到文件""" - file_path = get_output_path("txt", f"{format_time_filename()}.txt") - - with open(file_path, "w", encoding="utf-8") as f: - for id_value, title_data in results.items(): - 
# id | name 或 id - name = id_to_name.get(id_value) - if name and name != id_value: - f.write(f"{id_value} | {name}\n") - else: - f.write(f"{id_value}\n") - - # 按排名排序标题 - sorted_titles = [] - for title, info in title_data.items(): - cleaned_title = clean_title(title) - if isinstance(info, dict): - ranks = info.get("ranks", []) - url = info.get("url", "") - mobile_url = info.get("mobileUrl", "") - else: - ranks = info if isinstance(info, list) else [] - url = "" - mobile_url = "" - - rank = ranks[0] if ranks else 1 - sorted_titles.append((rank, cleaned_title, url, mobile_url)) - - sorted_titles.sort(key=lambda x: x[0]) - - for rank, cleaned_title, url, mobile_url in sorted_titles: - line = f"{rank}. {cleaned_title}" - - if url: - line += f" [URL:{url}]" - if mobile_url: - line += f" [MOBILE:{mobile_url}]" - f.write(line + "\n") - - f.write("\n") - - if failed_ids: - f.write("==== 以下ID请求失败 ====\n") - for id_value in failed_ids: - f.write(f"{id_value}\n") - - return file_path - - -def load_frequency_words( - frequency_file: Optional[str] = None, -) -> Tuple[List[Dict], List[str], List[str]]: - """ - 加载频率词配置 - - Returns: - (词组列表, 词组内过滤词, 全局过滤词) - """ - if frequency_file is None: - frequency_file = os.environ.get( - "FREQUENCY_WORDS_PATH", "config/frequency_words.txt" - ) - - frequency_path = Path(frequency_file) - if not frequency_path.exists(): - raise FileNotFoundError(f"频率词文件 {frequency_file} 不存在") - - with open(frequency_path, "r", encoding="utf-8") as f: - content = f.read() - - word_groups = [group.strip() for group in content.split("\n\n") if group.strip()] - - processed_groups = [] - filter_words = [] - global_filters = [] # 新增:全局过滤词列表 - - # 默认区域(向后兼容) - current_section = "WORD_GROUPS" - - for group in word_groups: - lines = [line.strip() for line in group.split("\n") if line.strip()] - - if not lines: - continue - - # 检查是否为区域标记 - if lines[0].startswith("[") and lines[0].endswith("]"): - section_name = lines[0][1:-1].upper() - if section_name in ("GLOBAL_FILTER", "WORD_GROUPS"): - current_section = section_name - lines = lines[1:] # 移除标记行 - - # 处理全局过滤区域 - if current_section == "GLOBAL_FILTER": - # 直接添加所有非空行到全局过滤列表 - for line in lines: - # 忽略特殊语法前缀,只提取纯文本 - if line.startswith(("!", "+", "@")): - continue # 全局过滤区不支持特殊语法 - if line: - global_filters.append(line) - continue - - # 处理词组区域(保持现有逻辑) - words = lines - - group_required_words = [] - group_normal_words = [] - group_filter_words = [] - group_max_count = 0 # 默认不限制 - - for word in words: - if word.startswith("@"): - # 解析最大显示数量(只接受正整数) - try: - count = int(word[1:]) - if count > 0: - group_max_count = count - except (ValueError, IndexError): - pass # 忽略无效的@数字格式 - elif word.startswith("!"): - filter_words.append(word[1:]) - group_filter_words.append(word[1:]) - elif word.startswith("+"): - group_required_words.append(word[1:]) - else: - group_normal_words.append(word) - - if group_required_words or group_normal_words: - if group_normal_words: - group_key = " ".join(group_normal_words) - else: - group_key = " ".join(group_required_words) - - processed_groups.append( - { - "required": group_required_words, - "normal": group_normal_words, - "group_key": group_key, - "max_count": group_max_count, # 新增字段 - } - ) - - return processed_groups, filter_words, global_filters - - -def parse_file_titles(file_path: Path) -> Tuple[Dict, Dict]: - """解析单个txt文件的标题数据,返回(titles_by_id, id_to_name)""" - titles_by_id = {} - id_to_name = {} - - with open(file_path, "r", encoding="utf-8") as f: - content = f.read() - sections = content.split("\n\n") - - for 
section in sections: - if not section.strip() or "==== 以下ID请求失败 ====" in section: - continue - - lines = section.strip().split("\n") - if len(lines) < 2: - continue - - # id | name 或 id - header_line = lines[0].strip() - if " | " in header_line: - parts = header_line.split(" | ", 1) - source_id = parts[0].strip() - name = parts[1].strip() - id_to_name[source_id] = name - else: - source_id = header_line - id_to_name[source_id] = source_id - - titles_by_id[source_id] = {} - - for line in lines[1:]: - if line.strip(): - try: - title_part = line.strip() - rank = None - - # 提取排名 - if ". " in title_part and title_part.split(". ")[0].isdigit(): - rank_str, title_part = title_part.split(". ", 1) - rank = int(rank_str) - - # 提取 MOBILE URL - mobile_url = "" - if " [MOBILE:" in title_part: - title_part, mobile_part = title_part.rsplit(" [MOBILE:", 1) - if mobile_part.endswith("]"): - mobile_url = mobile_part[:-1] - - # 提取 URL - url = "" - if " [URL:" in title_part: - title_part, url_part = title_part.rsplit(" [URL:", 1) - if url_part.endswith("]"): - url = url_part[:-1] - - title = clean_title(title_part.strip()) - ranks = [rank] if rank is not None else [1] - - titles_by_id[source_id][title] = { - "ranks": ranks, - "url": url, - "mobileUrl": mobile_url, - } - - except Exception as e: - print(f"解析标题行出错: {line}, 错误: {e}") - - return titles_by_id, id_to_name - - -def read_all_today_titles( - current_platform_ids: Optional[List[str]] = None, -) -> Tuple[Dict, Dict, Dict]: - """读取当天所有标题文件,支持按当前监控平台过滤""" - date_folder = format_date_folder() - txt_dir = Path("output") / date_folder / "txt" - - if not txt_dir.exists(): - return {}, {}, {} - - all_results = {} - final_id_to_name = {} - title_info = {} - - files = sorted([f for f in txt_dir.iterdir() if f.suffix == ".txt"]) - - for file_path in files: - time_info = file_path.stem - - titles_by_id, file_id_to_name = parse_file_titles(file_path) - - if current_platform_ids is not None: - filtered_titles_by_id = {} - filtered_id_to_name = {} - - for source_id, title_data in titles_by_id.items(): - if source_id in current_platform_ids: - filtered_titles_by_id[source_id] = title_data - if source_id in file_id_to_name: - filtered_id_to_name[source_id] = file_id_to_name[source_id] - - titles_by_id = filtered_titles_by_id - file_id_to_name = filtered_id_to_name - - final_id_to_name.update(file_id_to_name) - - for source_id, title_data in titles_by_id.items(): - process_source_data( - source_id, title_data, time_info, all_results, title_info - ) - - return all_results, final_id_to_name, title_info - - -def process_source_data( - source_id: str, - title_data: Dict, - time_info: str, - all_results: Dict, - title_info: Dict, -) -> None: - """处理来源数据,合并重复标题""" - if source_id not in all_results: - all_results[source_id] = title_data - - if source_id not in title_info: - title_info[source_id] = {} - - for title, data in title_data.items(): - ranks = data.get("ranks", []) - url = data.get("url", "") - mobile_url = data.get("mobileUrl", "") - - title_info[source_id][title] = { - "first_time": time_info, - "last_time": time_info, - "count": 1, - "ranks": ranks, - "url": url, - "mobileUrl": mobile_url, - } - else: - for title, data in title_data.items(): - ranks = data.get("ranks", []) - url = data.get("url", "") - mobile_url = data.get("mobileUrl", "") - - if title not in all_results[source_id]: - all_results[source_id][title] = { - "ranks": ranks, - "url": url, - "mobileUrl": mobile_url, - } - title_info[source_id][title] = { - "first_time": time_info, - "last_time": 
time_info, - "count": 1, - "ranks": ranks, - "url": url, - "mobileUrl": mobile_url, - } - else: - existing_data = all_results[source_id][title] - existing_ranks = existing_data.get("ranks", []) - existing_url = existing_data.get("url", "") - existing_mobile_url = existing_data.get("mobileUrl", "") - - merged_ranks = existing_ranks.copy() - for rank in ranks: - if rank not in merged_ranks: - merged_ranks.append(rank) - - all_results[source_id][title] = { - "ranks": merged_ranks, - "url": existing_url or url, - "mobileUrl": existing_mobile_url or mobile_url, - } - - title_info[source_id][title]["last_time"] = time_info - title_info[source_id][title]["ranks"] = merged_ranks - title_info[source_id][title]["count"] += 1 - if not title_info[source_id][title].get("url"): - title_info[source_id][title]["url"] = url - if not title_info[source_id][title].get("mobileUrl"): - title_info[source_id][title]["mobileUrl"] = mobile_url - - -def detect_latest_new_titles(current_platform_ids: Optional[List[str]] = None) -> Dict: - """检测当日最新批次的新增标题,支持按当前监控平台过滤""" - date_folder = format_date_folder() - txt_dir = Path("output") / date_folder / "txt" - - if not txt_dir.exists(): - return {} - - files = sorted([f for f in txt_dir.iterdir() if f.suffix == ".txt"]) - if len(files) < 2: - return {} - - # 解析最新文件 - latest_file = files[-1] - latest_titles, _ = parse_file_titles(latest_file) - - # 如果指定了当前平台列表,过滤最新文件数据 - if current_platform_ids is not None: - filtered_latest_titles = {} - for source_id, title_data in latest_titles.items(): - if source_id in current_platform_ids: - filtered_latest_titles[source_id] = title_data - latest_titles = filtered_latest_titles - - # 汇总历史标题(按平台过滤) - historical_titles = {} - for file_path in files[:-1]: - historical_data, _ = parse_file_titles(file_path) - - # 过滤历史数据 - if current_platform_ids is not None: - filtered_historical_data = {} - for source_id, title_data in historical_data.items(): - if source_id in current_platform_ids: - filtered_historical_data[source_id] = title_data - historical_data = filtered_historical_data - - for source_id, titles_data in historical_data.items(): - if source_id not in historical_titles: - historical_titles[source_id] = set() - for title in titles_data.keys(): - historical_titles[source_id].add(title) - - # 找出新增标题 - new_titles = {} - for source_id, latest_source_titles in latest_titles.items(): - historical_set = historical_titles.get(source_id, set()) - source_new_titles = {} - - for title, title_data in latest_source_titles.items(): - if title not in historical_set: - source_new_titles[title] = title_data - - if source_new_titles: - new_titles[source_id] = source_new_titles - - return new_titles - - -# === 统计和分析 === -def calculate_news_weight( - title_data: Dict, rank_threshold: int = CONFIG["RANK_THRESHOLD"] -) -> float: - """计算新闻权重,用于排序""" - ranks = title_data.get("ranks", []) - if not ranks: - return 0.0 - - count = title_data.get("count", len(ranks)) - weight_config = CONFIG["WEIGHT_CONFIG"] - - # 排名权重:Σ(11 - min(rank, 10)) / 出现次数 - rank_scores = [] - for rank in ranks: - score = 11 - min(rank, 10) - rank_scores.append(score) - - rank_weight = sum(rank_scores) / len(ranks) if ranks else 0 - - # 频次权重:min(出现次数, 10) × 10 - frequency_weight = min(count, 10) * 10 - - # 热度加成:高排名次数 / 总出现次数 × 100 - high_rank_count = sum(1 for rank in ranks if rank <= rank_threshold) - hotness_ratio = high_rank_count / len(ranks) if ranks else 0 - hotness_weight = hotness_ratio * 100 - - total_weight = ( - rank_weight * weight_config["RANK_WEIGHT"] - + 
frequency_weight * weight_config["FREQUENCY_WEIGHT"] - + hotness_weight * weight_config["HOTNESS_WEIGHT"] - ) - - return total_weight - - -def matches_word_groups( - title: str, word_groups: List[Dict], filter_words: List[str], global_filters: Optional[List[str]] = None -) -> bool: - """检查标题是否匹配词组规则""" - # 防御性类型检查:确保 title 是有效字符串 - if not isinstance(title, str): - title = str(title) if title is not None else "" - if not title.strip(): - return False - - title_lower = title.lower() - - # 全局过滤检查(优先级最高) - if global_filters: - if any(global_word.lower() in title_lower for global_word in global_filters): - return False - - # 如果没有配置词组,则匹配所有标题(支持显示全部新闻) - if not word_groups: - return True - - # 过滤词检查 - if any(filter_word.lower() in title_lower for filter_word in filter_words): - return False - - # 词组匹配检查 - for group in word_groups: - required_words = group["required"] - normal_words = group["normal"] - - # 必须词检查 - if required_words: - all_required_present = all( - req_word.lower() in title_lower for req_word in required_words - ) - if not all_required_present: - continue - - # 普通词检查 - if normal_words: - any_normal_present = any( - normal_word.lower() in title_lower for normal_word in normal_words - ) - if not any_normal_present: - continue - - return True - - return False - - -def format_time_display(first_time: str, last_time: str) -> str: - """格式化时间显示""" - if not first_time: - return "" - if first_time == last_time or not last_time: - return first_time - else: - return f"[{first_time} ~ {last_time}]" - - -def format_rank_display(ranks: List[int], rank_threshold: int, format_type: str) -> str: - """统一的排名格式化方法""" - if not ranks: - return "" - - unique_ranks = sorted(set(ranks)) - min_rank = unique_ranks[0] - max_rank = unique_ranks[-1] - - if format_type == "html": - highlight_start = "" - highlight_end = "" - elif format_type == "feishu": - highlight_start = "**" - highlight_end = "**" - elif format_type == "dingtalk": - highlight_start = "**" - highlight_end = "**" - elif format_type == "wework": - highlight_start = "**" - highlight_end = "**" - elif format_type == "telegram": - highlight_start = "" - highlight_end = "" - elif format_type == "slack": - highlight_start = "*" - highlight_end = "*" - else: - highlight_start = "**" - highlight_end = "**" - - if min_rank <= rank_threshold: - if min_rank == max_rank: - return f"{highlight_start}[{min_rank}]{highlight_end}" - else: - return f"{highlight_start}[{min_rank} - {max_rank}]{highlight_end}" - else: - if min_rank == max_rank: - return f"[{min_rank}]" - else: - return f"[{min_rank} - {max_rank}]" - - -def count_word_frequency( - results: Dict, - word_groups: List[Dict], - filter_words: List[str], - id_to_name: Dict, - title_info: Optional[Dict] = None, - rank_threshold: int = CONFIG["RANK_THRESHOLD"], - new_titles: Optional[Dict] = None, - mode: str = "daily", - global_filters: Optional[List[str]] = None, -) -> Tuple[List[Dict], int]: - """统计词频,支持必须词、频率词、过滤词、全局过滤词,并标记新增标题""" - - # 如果没有配置词组,创建一个包含所有新闻的虚拟词组 - if not word_groups: - print("频率词配置为空,将显示所有新闻") - word_groups = [{"required": [], "normal": [], "group_key": "全部新闻"}] - filter_words = [] # 清空过滤词,显示所有新闻 - - is_first_today = is_first_crawl_today() - - # 确定处理的数据源和新增标记逻辑 - if mode == "incremental": - if is_first_today: - # 增量模式 + 当天第一次:处理所有新闻,都标记为新增 - results_to_process = results - all_news_are_new = True - else: - # 增量模式 + 当天非第一次:只处理新增的新闻 - results_to_process = new_titles if new_titles else {} - all_news_are_new = True - elif mode == "current": - # current 模式:只处理当前时间批次的新闻,但统计信息来自全部历史 - if 
title_info: - latest_time = None - for source_titles in title_info.values(): - for title_data in source_titles.values(): - last_time = title_data.get("last_time", "") - if last_time: - if latest_time is None or last_time > latest_time: - latest_time = last_time - - # 只处理 last_time 等于最新时间的新闻 - if latest_time: - results_to_process = {} - for source_id, source_titles in results.items(): - if source_id in title_info: - filtered_titles = {} - for title, title_data in source_titles.items(): - if title in title_info[source_id]: - info = title_info[source_id][title] - if info.get("last_time") == latest_time: - filtered_titles[title] = title_data - if filtered_titles: - results_to_process[source_id] = filtered_titles - - print( - f"当前榜单模式:最新时间 {latest_time},筛选出 {sum(len(titles) for titles in results_to_process.values())} 条当前榜单新闻" - ) - else: - results_to_process = results - else: - results_to_process = results - all_news_are_new = False - else: - # 当日汇总模式:处理所有新闻 - results_to_process = results - all_news_are_new = False - total_input_news = sum(len(titles) for titles in results.values()) - filter_status = ( - "全部显示" - if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻" - else "频率词过滤" - ) - print(f"当日汇总模式:处理 {total_input_news} 条新闻,模式:{filter_status}") - - word_stats = {} - total_titles = 0 - processed_titles = {} - matched_new_count = 0 - - if title_info is None: - title_info = {} - if new_titles is None: - new_titles = {} - - for group in word_groups: - group_key = group["group_key"] - word_stats[group_key] = {"count": 0, "titles": {}} - - for source_id, titles_data in results_to_process.items(): - total_titles += len(titles_data) - - if source_id not in processed_titles: - processed_titles[source_id] = {} - - for title, title_data in titles_data.items(): - if title in processed_titles.get(source_id, {}): - continue - - # 使用统一的匹配逻辑 - matches_frequency_words = matches_word_groups( - title, word_groups, filter_words, global_filters - ) - - if not matches_frequency_words: - continue - - # 如果是增量模式或 current 模式第一次,统计匹配的新增新闻数量 - if (mode == "incremental" and all_news_are_new) or ( - mode == "current" and is_first_today - ): - matched_new_count += 1 - - source_ranks = title_data.get("ranks", []) - source_url = title_data.get("url", "") - source_mobile_url = title_data.get("mobileUrl", "") - - # 找到匹配的词组(防御性转换确保类型安全) - title_lower = str(title).lower() if not isinstance(title, str) else title.lower() - for group in word_groups: - required_words = group["required"] - normal_words = group["normal"] - - # 如果是"全部新闻"模式,所有标题都匹配第一个(唯一的)词组 - if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻": - group_key = group["group_key"] - word_stats[group_key]["count"] += 1 - if source_id not in word_stats[group_key]["titles"]: - word_stats[group_key]["titles"][source_id] = [] - else: - # 原有的匹配逻辑 - if required_words: - all_required_present = all( - req_word.lower() in title_lower - for req_word in required_words - ) - if not all_required_present: - continue - - if normal_words: - any_normal_present = any( - normal_word.lower() in title_lower - for normal_word in normal_words - ) - if not any_normal_present: - continue - - group_key = group["group_key"] - word_stats[group_key]["count"] += 1 - if source_id not in word_stats[group_key]["titles"]: - word_stats[group_key]["titles"][source_id] = [] - - first_time = "" - last_time = "" - count_info = 1 - ranks = source_ranks if source_ranks else [] - url = source_url - mobile_url = source_mobile_url - - # 对于 current 模式,从历史统计信息中获取完整数据 - if ( - mode == 
"current" - and title_info - and source_id in title_info - and title in title_info[source_id] - ): - info = title_info[source_id][title] - first_time = info.get("first_time", "") - last_time = info.get("last_time", "") - count_info = info.get("count", 1) - if "ranks" in info and info["ranks"]: - ranks = info["ranks"] - url = info.get("url", source_url) - mobile_url = info.get("mobileUrl", source_mobile_url) - elif ( - title_info - and source_id in title_info - and title in title_info[source_id] - ): - info = title_info[source_id][title] - first_time = info.get("first_time", "") - last_time = info.get("last_time", "") - count_info = info.get("count", 1) - if "ranks" in info and info["ranks"]: - ranks = info["ranks"] - url = info.get("url", source_url) - mobile_url = info.get("mobileUrl", source_mobile_url) - - if not ranks: - ranks = [99] - - time_display = format_time_display(first_time, last_time) - - source_name = id_to_name.get(source_id, source_id) - - # 判断是否为新增 - is_new = False - if all_news_are_new: - # 增量模式下所有处理的新闻都是新增,或者当天第一次的所有新闻都是新增 - is_new = True - elif new_titles and source_id in new_titles: - # 检查是否在新增列表中 - new_titles_for_source = new_titles[source_id] - is_new = title in new_titles_for_source - - word_stats[group_key]["titles"][source_id].append( - { - "title": title, - "source_name": source_name, - "first_time": first_time, - "last_time": last_time, - "time_display": time_display, - "count": count_info, - "ranks": ranks, - "rank_threshold": rank_threshold, - "url": url, - "mobileUrl": mobile_url, - "is_new": is_new, - } - ) - - if source_id not in processed_titles: - processed_titles[source_id] = {} - processed_titles[source_id][title] = True - - break - - # 最后统一打印汇总信息 - if mode == "incremental": - if is_first_today: - total_input_news = sum(len(titles) for titles in results.values()) - filter_status = ( - "全部显示" - if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻" - else "频率词匹配" - ) - print( - f"增量模式:当天第一次爬取,{total_input_news} 条新闻中有 {matched_new_count} 条{filter_status}" - ) - else: - if new_titles: - total_new_count = sum(len(titles) for titles in new_titles.values()) - filter_status = ( - "全部显示" - if len(word_groups) == 1 - and word_groups[0]["group_key"] == "全部新闻" - else "匹配频率词" - ) - print( - f"增量模式:{total_new_count} 条新增新闻中,有 {matched_new_count} 条{filter_status}" - ) - if matched_new_count == 0 and len(word_groups) > 1: - print("增量模式:没有新增新闻匹配频率词,将不会发送通知") - else: - print("增量模式:未检测到新增新闻") - elif mode == "current": - total_input_news = sum(len(titles) for titles in results_to_process.values()) - if is_first_today: - filter_status = ( - "全部显示" - if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻" - else "频率词匹配" - ) - print( - f"当前榜单模式:当天第一次爬取,{total_input_news} 条当前榜单新闻中有 {matched_new_count} 条{filter_status}" - ) - else: - matched_count = sum(stat["count"] for stat in word_stats.values()) - filter_status = ( - "全部显示" - if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻" - else "频率词匹配" - ) - print( - f"当前榜单模式:{total_input_news} 条当前榜单新闻中有 {matched_count} 条{filter_status}" - ) - - stats = [] - # 创建 group_key 到位置和最大数量的映射 - group_key_to_position = { - group["group_key"]: idx for idx, group in enumerate(word_groups) - } - group_key_to_max_count = { - group["group_key"]: group.get("max_count", 0) for group in word_groups - } - - for group_key, data in word_stats.items(): - all_titles = [] - for source_id, title_list in data["titles"].items(): - all_titles.extend(title_list) - - # 按权重排序 - sorted_titles = sorted( - all_titles, - 
key=lambda x: ( - -calculate_news_weight(x, rank_threshold), - min(x["ranks"]) if x["ranks"] else 999, - -x["count"], - ), - ) - - # 应用最大显示数量限制(优先级:单独配置 > 全局配置) - group_max_count = group_key_to_max_count.get(group_key, 0) - if group_max_count == 0: - # 使用全局配置 - group_max_count = CONFIG.get("MAX_NEWS_PER_KEYWORD", 0) - - if group_max_count > 0: - sorted_titles = sorted_titles[:group_max_count] - - stats.append( - { - "word": group_key, - "count": data["count"], - "position": group_key_to_position.get(group_key, 999), - "titles": sorted_titles, - "percentage": ( - round(data["count"] / total_titles * 100, 2) - if total_titles > 0 - else 0 - ), - } - ) - - # 根据配置选择排序优先级 - if CONFIG.get("SORT_BY_POSITION_FIRST", False): - # 先按配置位置,再按热点条数 - stats.sort(key=lambda x: (x["position"], -x["count"])) - else: - # 先按热点条数,再按配置位置(原逻辑) - stats.sort(key=lambda x: (-x["count"], x["position"])) - - return stats, total_titles - - -# === 报告生成 === -def prepare_report_data( - stats: List[Dict], - failed_ids: Optional[List] = None, - new_titles: Optional[Dict] = None, - id_to_name: Optional[Dict] = None, - mode: str = "daily", -) -> Dict: - """准备报告数据""" - processed_new_titles = [] - - # 在增量模式下隐藏新增新闻区域 - hide_new_section = mode == "incremental" - - # 只有在非隐藏模式下才处理新增新闻部分 - if not hide_new_section: - filtered_new_titles = {} - if new_titles and id_to_name: - word_groups, filter_words, global_filters = load_frequency_words() - for source_id, titles_data in new_titles.items(): - filtered_titles = {} - for title, title_data in titles_data.items(): - if matches_word_groups(title, word_groups, filter_words, global_filters): - filtered_titles[title] = title_data - if filtered_titles: - filtered_new_titles[source_id] = filtered_titles - - if filtered_new_titles and id_to_name: - for source_id, titles_data in filtered_new_titles.items(): - source_name = id_to_name.get(source_id, source_id) - source_titles = [] - - for title, title_data in titles_data.items(): - url = title_data.get("url", "") - mobile_url = title_data.get("mobileUrl", "") - ranks = title_data.get("ranks", []) - - processed_title = { - "title": title, - "source_name": source_name, - "time_display": "", - "count": 1, - "ranks": ranks, - "rank_threshold": CONFIG["RANK_THRESHOLD"], - "url": url, - "mobile_url": mobile_url, - "is_new": True, - } - source_titles.append(processed_title) - - if source_titles: - processed_new_titles.append( - { - "source_id": source_id, - "source_name": source_name, - "titles": source_titles, - } - ) - - processed_stats = [] - for stat in stats: - if stat["count"] <= 0: - continue - - processed_titles = [] - for title_data in stat["titles"]: - processed_title = { - "title": title_data["title"], - "source_name": title_data["source_name"], - "time_display": title_data["time_display"], - "count": title_data["count"], - "ranks": title_data["ranks"], - "rank_threshold": title_data["rank_threshold"], - "url": title_data.get("url", ""), - "mobile_url": title_data.get("mobileUrl", ""), - "is_new": title_data.get("is_new", False), - } - processed_titles.append(processed_title) - - processed_stats.append( - { - "word": stat["word"], - "count": stat["count"], - "percentage": stat.get("percentage", 0), - "titles": processed_titles, - } - ) - - return { - "stats": processed_stats, - "new_titles": processed_new_titles, - "failed_ids": failed_ids or [], - "total_new_count": sum( - len(source["titles"]) for source in processed_new_titles - ), - } - - -def format_title_for_platform( - platform: str, title_data: Dict, show_source: bool = True -) -> 
str: - """统一的标题格式化方法""" - rank_display = format_rank_display( - title_data["ranks"], title_data["rank_threshold"], platform - ) - - link_url = title_data["mobile_url"] or title_data["url"] - - cleaned_title = clean_title(title_data["title"]) - - if platform == "feishu": - if link_url: - formatted_title = f"[{cleaned_title}]({link_url})" - else: - formatted_title = cleaned_title - - title_prefix = "🆕 " if title_data.get("is_new") else "" - - if show_source: - result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}" - else: - result = f"{title_prefix}{formatted_title}" - - if rank_display: - result += f" {rank_display}" - if title_data["time_display"]: - result += f" - {title_data['time_display']}" - if title_data["count"] > 1: - result += f" ({title_data['count']}次)" - - return result - - elif platform == "dingtalk": - if link_url: - formatted_title = f"[{cleaned_title}]({link_url})" - else: - formatted_title = cleaned_title - - title_prefix = "🆕 " if title_data.get("is_new") else "" - - if show_source: - result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}" - else: - result = f"{title_prefix}{formatted_title}" - - if rank_display: - result += f" {rank_display}" - if title_data["time_display"]: - result += f" - {title_data['time_display']}" - if title_data["count"] > 1: - result += f" ({title_data['count']}次)" - - return result - - elif platform in ("wework", "bark"): - # WeWork 和 Bark 使用 markdown 格式 - if link_url: - formatted_title = f"[{cleaned_title}]({link_url})" - else: - formatted_title = cleaned_title - - title_prefix = "🆕 " if title_data.get("is_new") else "" - - if show_source: - result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}" - else: - result = f"{title_prefix}{formatted_title}" - - if rank_display: - result += f" {rank_display}" - if title_data["time_display"]: - result += f" - {title_data['time_display']}" - if title_data["count"] > 1: - result += f" ({title_data['count']}次)" - - return result - - elif platform == "telegram": - if link_url: - formatted_title = f'
{html_escape(cleaned_title)}' - else: - formatted_title = cleaned_title - - title_prefix = "🆕 " if title_data.get("is_new") else "" - - if show_source: - result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}" - else: - result = f"{title_prefix}{formatted_title}" - - if rank_display: - result += f" {rank_display}" - if title_data["time_display"]: - result += f" - {title_data['time_display']}" - if title_data["count"] > 1: - result += f" ({title_data['count']}次)" - - return result - - elif platform == "ntfy": - if link_url: - formatted_title = f"[{cleaned_title}]({link_url})" - else: - formatted_title = cleaned_title - - title_prefix = "🆕 " if title_data.get("is_new") else "" - - if show_source: - result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}" - else: - result = f"{title_prefix}{formatted_title}" - - if rank_display: - result += f" {rank_display}" - if title_data["time_display"]: - result += f" `- {title_data['time_display']}`" - if title_data["count"] > 1: - result += f" `({title_data['count']}次)`" - - return result - - elif platform == "slack": - # Slack 使用 mrkdwn 格式 - if link_url: - # Slack 链接格式: - formatted_title = f"<{link_url}|{cleaned_title}>" - else: - formatted_title = cleaned_title - - title_prefix = "🆕 " if title_data.get("is_new") else "" - - if show_source: - result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}" - else: - result = f"{title_prefix}{formatted_title}" - - # 排名(使用 * 加粗) - rank_display = format_rank_display( - title_data["ranks"], title_data["rank_threshold"], "slack" - ) - if rank_display: - result += f" {rank_display}" - if title_data["time_display"]: - result += f" `- {title_data['time_display']}`" - if title_data["count"] > 1: - result += f" `({title_data['count']}次)`" - - return result - - elif platform == "html": - rank_display = format_rank_display( - title_data["ranks"], title_data["rank_threshold"], "html" - ) - - link_url = title_data["mobile_url"] or title_data["url"] - - escaped_title = html_escape(cleaned_title) - escaped_source_name = html_escape(title_data["source_name"]) - - if link_url: - escaped_url = html_escape(link_url) - formatted_title = f'[{escaped_source_name}] {escaped_title}' - else: - formatted_title = ( - f'[{escaped_source_name}] {escaped_title}' - ) - - if rank_display: - formatted_title += f" {rank_display}" - if title_data["time_display"]: - escaped_time = html_escape(title_data["time_display"]) - formatted_title += f" - {escaped_time}" - if title_data["count"] > 1: - formatted_title += f" ({title_data['count']}次)" - - if title_data.get("is_new"): - formatted_title = f"
🆕 {formatted_title}
" - - return formatted_title - - else: - return cleaned_title - - -def generate_html_report( - stats: List[Dict], - total_titles: int, - failed_ids: Optional[List] = None, - new_titles: Optional[Dict] = None, - id_to_name: Optional[Dict] = None, - mode: str = "daily", - is_daily_summary: bool = False, - update_info: Optional[Dict] = None, -) -> str: - """生成HTML报告""" - if is_daily_summary: - if mode == "current": - filename = "当前榜单汇总.html" - elif mode == "incremental": - filename = "当日增量.html" - else: - filename = "当日汇总.html" - else: - filename = f"{format_time_filename()}.html" - - file_path = get_output_path("html", filename) - - report_data = prepare_report_data(stats, failed_ids, new_titles, id_to_name, mode) - - html_content = render_html_content( - report_data, total_titles, is_daily_summary, mode, update_info - ) - - with open(file_path, "w", encoding="utf-8") as f: - f.write(html_content) - - if is_daily_summary: - # 生成到根目录(供 GitHub Pages 访问) - root_index_path = Path("index.html") - with open(root_index_path, "w", encoding="utf-8") as f: - f.write(html_content) - - # 同时生成到 output 目录(供 Docker Volume 挂载访问) - output_index_path = Path("output") / "index.html" - ensure_directory_exists("output") - with open(output_index_path, "w", encoding="utf-8") as f: - f.write(html_content) - - return file_path - - -def render_html_content( - report_data: Dict, - total_titles: int, - is_daily_summary: bool = False, - mode: str = "daily", - update_info: Optional[Dict] = None, -) -> str: - """渲染HTML内容""" - html = """ - - - - - - 热点新闻分析 - - - - -
-
-
- - -
-
热点新闻分析
-
-
- 报告类型 - """ - - # 处理报告类型显示 - if is_daily_summary: - if mode == "current": - html += "当前榜单" - elif mode == "incremental": - html += "增量模式" - else: - html += "当日汇总" - else: - html += "实时分析" - - html += """ -
-
- 新闻总数 - """ - - html += f"{total_titles} 条" - - # 计算筛选后的热点新闻数量 - hot_news_count = sum(len(stat["titles"]) for stat in report_data["stats"]) - - html += """ -
-
- 热点新闻 - """ - - html += f"{hot_news_count} 条" - - html += """ -
-
- 生成时间 - """ - - now = get_beijing_time() - html += now.strftime("%m-%d %H:%M") - - html += """ -
-
-
- -
""" - - # 处理失败ID错误信息 - if report_data["failed_ids"]: - html += """ -
-
⚠️ 请求失败的平台
-
    """ - for id_value in report_data["failed_ids"]: - html += f'
  • {html_escape(id_value)}
  • ' - html += """ -
-
""" - - # 生成热点词汇统计部分的HTML - stats_html = "" - if report_data["stats"]: - total_count = len(report_data["stats"]) - - for i, stat in enumerate(report_data["stats"], 1): - count = stat["count"] - - # 确定热度等级 - if count >= 10: - count_class = "hot" - elif count >= 5: - count_class = "warm" - else: - count_class = "" - - escaped_word = html_escape(stat["word"]) - - stats_html += f""" -
-
-
-
{escaped_word}
-
{count} 条
-
-
{i}/{total_count}
-
""" - - # 处理每个词组下的新闻标题,给每条新闻标上序号 - for j, title_data in enumerate(stat["titles"], 1): - is_new = title_data.get("is_new", False) - new_class = "new" if is_new else "" - - stats_html += f""" -
-
{j}
-
-
- {html_escape(title_data["source_name"])}""" - - # 处理排名显示 - ranks = title_data.get("ranks", []) - if ranks: - min_rank = min(ranks) - max_rank = max(ranks) - rank_threshold = title_data.get("rank_threshold", 10) - - # 确定排名等级 - if min_rank <= 3: - rank_class = "top" - elif min_rank <= rank_threshold: - rank_class = "high" - else: - rank_class = "" - - if min_rank == max_rank: - rank_text = str(min_rank) - else: - rank_text = f"{min_rank}-{max_rank}" - - stats_html += f'{rank_text}' - - # 处理时间显示 - time_display = title_data.get("time_display", "") - if time_display: - # 简化时间显示格式,将波浪线替换为~ - simplified_time = ( - time_display.replace(" ~ ", "~") - .replace("[", "") - .replace("]", "") - ) - stats_html += ( - f'{html_escape(simplified_time)}' - ) - - # 处理出现次数 - count_info = title_data.get("count", 1) - if count_info > 1: - stats_html += f'{count_info}次' - - stats_html += """ -
-
""" - - # 处理标题和链接 - escaped_title = html_escape(title_data["title"]) - link_url = title_data.get("mobile_url") or title_data.get("url", "") - - if link_url: - escaped_url = html_escape(link_url) - stats_html += f'{escaped_title}' - else: - stats_html += escaped_title - - stats_html += """ -
-
-
""" - - stats_html += """ -
""" - - # 生成新增新闻区域的HTML - new_titles_html = "" - if report_data["new_titles"]: - new_titles_html += f""" -
-
本次新增热点 (共 {report_data['total_new_count']} 条)
""" - - for source_data in report_data["new_titles"]: - escaped_source = html_escape(source_data["source_name"]) - titles_count = len(source_data["titles"]) - - new_titles_html += f""" -
-
{escaped_source} · {titles_count}条
""" - - # 为新增新闻也添加序号 - for idx, title_data in enumerate(source_data["titles"], 1): - ranks = title_data.get("ranks", []) - - # 处理新增新闻的排名显示 - rank_class = "" - if ranks: - min_rank = min(ranks) - if min_rank <= 3: - rank_class = "top" - elif min_rank <= title_data.get("rank_threshold", 10): - rank_class = "high" - - if len(ranks) == 1: - rank_text = str(ranks[0]) - else: - rank_text = f"{min(ranks)}-{max(ranks)}" - else: - rank_text = "?" - - new_titles_html += f""" -
-
{idx}
-
{rank_text}
-
-
""" - - # 处理新增新闻的链接 - escaped_title = html_escape(title_data["title"]) - link_url = title_data.get("mobile_url") or title_data.get("url", "") - - if link_url: - escaped_url = html_escape(link_url) - new_titles_html += f'{escaped_title}' - else: - new_titles_html += escaped_title - - new_titles_html += """ -
-
-
""" - - new_titles_html += """ -
""" - - new_titles_html += """ -
""" - - # 根据配置决定内容顺序 - if CONFIG.get("REVERSE_CONTENT_ORDER", False): - # 新增热点在前,热点词汇统计在后 - html += new_titles_html + stats_html - else: - # 默认:热点词汇统计在前,新增热点在后 - html += stats_html + new_titles_html - - html += """ -
- - -
- - - - - """ - - return html - - -def render_feishu_content( - report_data: Dict, update_info: Optional[Dict] = None, mode: str = "daily" -) -> str: - """渲染飞书内容""" - # 生成热点词汇统计部分 - stats_content = "" - if report_data["stats"]: - stats_content += f"📊 **热点词汇统计**\n\n" - - total_count = len(report_data["stats"]) - - for i, stat in enumerate(report_data["stats"]): - word = stat["word"] - count = stat["count"] - - sequence_display = f"[{i + 1}/{total_count}]" - - if count >= 10: - stats_content += f"🔥 {sequence_display} **{word}** : {count} 条\n\n" - elif count >= 5: - stats_content += f"📈 {sequence_display} **{word}** : {count} 条\n\n" - else: - stats_content += f"📌 {sequence_display} **{word}** : {count} 条\n\n" - - for j, title_data in enumerate(stat["titles"], 1): - formatted_title = format_title_for_platform( - "feishu", title_data, show_source=True - ) - stats_content += f" {j}. {formatted_title}\n" - - if j < len(stat["titles"]): - stats_content += "\n" - - if i < len(report_data["stats"]) - 1: - stats_content += f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n" - - # 生成新增新闻部分 - new_titles_content = "" - if report_data["new_titles"]: - new_titles_content += ( - f"🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n" - ) - - for source_data in report_data["new_titles"]: - new_titles_content += ( - f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n" - ) - - for j, title_data in enumerate(source_data["titles"], 1): - title_data_copy = title_data.copy() - title_data_copy["is_new"] = False - formatted_title = format_title_for_platform( - "feishu", title_data_copy, show_source=False - ) - new_titles_content += f" {j}. {formatted_title}\n" - - new_titles_content += "\n" - - # 根据配置决定内容顺序 - text_content = "" - if CONFIG.get("REVERSE_CONTENT_ORDER", False): - # 新增热点在前,热点词汇统计在后 - if new_titles_content: - text_content += new_titles_content - if stats_content: - text_content += f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n" - if stats_content: - text_content += stats_content - else: - # 默认:热点词汇统计在前,新增热点在后 - if stats_content: - text_content += stats_content - if new_titles_content: - text_content += f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n" - if new_titles_content: - text_content += new_titles_content - - if not text_content: - if mode == "incremental": - mode_text = "增量模式下暂无新增匹配的热点词汇" - elif mode == "current": - mode_text = "当前榜单模式下暂无匹配的热点词汇" - else: - mode_text = "暂无匹配的热点词汇" - text_content = f"📭 {mode_text}\n\n" - - if report_data["failed_ids"]: - if text_content and "暂无匹配" not in text_content: - text_content += f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n" - - text_content += "⚠️ **数据获取失败的平台:**\n\n" - for i, id_value in enumerate(report_data["failed_ids"], 1): - text_content += f" • {id_value}\n" - - now = get_beijing_time() - text_content += ( - f"\n\n更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}" - ) - - if update_info: - text_content += f"\nTrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}" - - return text_content - - -def render_dingtalk_content( - report_data: Dict, update_info: Optional[Dict] = None, mode: str = "daily" -) -> str: - """渲染钉钉内容""" - total_titles = sum( - len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0 - ) - now = get_beijing_time() - - # 头部信息 - header_content = f"**总新闻数:** {total_titles}\n\n" - header_content += f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n" - header_content += f"**类型:** 热点分析报告\n\n" - header_content += "---\n\n" - - # 生成热点词汇统计部分 - stats_content = "" - if report_data["stats"]: - 
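-        # Heat levels used in the loop below: 🔥 marks words matched by 10+ titles,
-        # 📈 marks 5-9, 📌 everything else; e.g. a word matched by 15 titles in the
-        # first of 12 groups renders as "🔥 [1/12] **word** : **15** 条".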
stats_content += f"📊 **热点词汇统计**\n\n" - - total_count = len(report_data["stats"]) - - for i, stat in enumerate(report_data["stats"]): - word = stat["word"] - count = stat["count"] - - sequence_display = f"[{i + 1}/{total_count}]" - - if count >= 10: - stats_content += f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n" - elif count >= 5: - stats_content += f"📈 {sequence_display} **{word}** : **{count}** 条\n\n" - else: - stats_content += f"📌 {sequence_display} **{word}** : {count} 条\n\n" - - for j, title_data in enumerate(stat["titles"], 1): - formatted_title = format_title_for_platform( - "dingtalk", title_data, show_source=True - ) - stats_content += f" {j}. {formatted_title}\n" - - if j < len(stat["titles"]): - stats_content += "\n" - - if i < len(report_data["stats"]) - 1: - stats_content += f"\n---\n\n" - - # 生成新增新闻部分 - new_titles_content = "" - if report_data["new_titles"]: - new_titles_content += ( - f"🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n" - ) - - for source_data in report_data["new_titles"]: - new_titles_content += f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n" - - for j, title_data in enumerate(source_data["titles"], 1): - title_data_copy = title_data.copy() - title_data_copy["is_new"] = False - formatted_title = format_title_for_platform( - "dingtalk", title_data_copy, show_source=False - ) - new_titles_content += f" {j}. {formatted_title}\n" - - new_titles_content += "\n" - - # 根据配置决定内容顺序 - text_content = header_content - if CONFIG.get("REVERSE_CONTENT_ORDER", False): - # 新增热点在前,热点词汇统计在后 - if new_titles_content: - text_content += new_titles_content - if stats_content: - text_content += f"\n---\n\n" - if stats_content: - text_content += stats_content - else: - # 默认:热点词汇统计在前,新增热点在后 - if stats_content: - text_content += stats_content - if new_titles_content: - text_content += f"\n---\n\n" - if new_titles_content: - text_content += new_titles_content - - if not stats_content and not new_titles_content: - if mode == "incremental": - mode_text = "增量模式下暂无新增匹配的热点词汇" - elif mode == "current": - mode_text = "当前榜单模式下暂无匹配的热点词汇" - else: - mode_text = "暂无匹配的热点词汇" - text_content += f"📭 {mode_text}\n\n" - - if report_data["failed_ids"]: - if "暂无匹配" not in text_content: - text_content += f"\n---\n\n" - - text_content += "⚠️ **数据获取失败的平台:**\n\n" - for i, id_value in enumerate(report_data["failed_ids"], 1): - text_content += f" • **{id_value}**\n" - - text_content += f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}" - - if update_info: - text_content += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**" - - return text_content - - -def _get_batch_header(format_type: str, batch_num: int, total_batches: int) -> str: - """根据 format_type 生成对应格式的批次头部""" - if format_type == "telegram": - return f"[第 {batch_num}/{total_batches} 批次]\n\n" - elif format_type == "slack": - return f"*[第 {batch_num}/{total_batches} 批次]*\n\n" - elif format_type in ("wework_text", "bark"): - # 企业微信文本模式和 Bark 使用纯文本格式 - return f"[第 {batch_num}/{total_batches} 批次]\n\n" - else: - # 飞书、钉钉、ntfy、企业微信 markdown 模式 - return f"**[第 {batch_num}/{total_batches} 批次]**\n\n" - - -def _get_max_batch_header_size(format_type: str) -> int: - """估算批次头部的最大字节数(假设最多 99 批次) - - 用于在分批时预留空间,避免事后截断破坏内容完整性。 - """ - # 生成最坏情况的头部(99/99 批次) - max_header = _get_batch_header(format_type, 99, 99) - return len(max_header.encode("utf-8")) - - -def _truncate_to_bytes(text: str, max_bytes: int) -> str: - """安全截断字符串到指定字节数,避免截断多字节字符""" - text_bytes = text.encode("utf-8") - 
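-    # UTF-8 encodes each character in 1-4 bytes, so a raw byte-level slice can end
-    # in the middle of a multi-byte character. After slicing below, the function
-    # backs off up to 3 bytes and retries decode() until it reaches a valid character
-    # boundary, e.g. _truncate_to_bytes("热点新闻", 8) returns "热点" (6 bytes)
-    # rather than 8 bytes of undecodable data.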
if len(text_bytes) <= max_bytes: - return text - - # 截断到指定字节数 - truncated = text_bytes[:max_bytes] - - # 处理可能的不完整 UTF-8 字符 - for i in range(min(4, len(truncated))): - try: - return truncated[: len(truncated) - i].decode("utf-8") - except UnicodeDecodeError: - continue - - # 极端情况:返回空字符串 - return "" - - -def add_batch_headers( - batches: List[str], format_type: str, max_bytes: int -) -> List[str]: - """为批次添加头部,动态计算确保总大小不超过限制 - - Args: - batches: 原始批次列表 - format_type: 推送类型(bark, telegram, feishu 等) - max_bytes: 该推送类型的最大字节限制 - - Returns: - 添加头部后的批次列表 - """ - if len(batches) <= 1: - return batches - - total = len(batches) - result = [] - - for i, content in enumerate(batches, 1): - # 生成批次头部 - header = _get_batch_header(format_type, i, total) - header_size = len(header.encode("utf-8")) - - # 动态计算允许的最大内容大小 - max_content_size = max_bytes - header_size - content_size = len(content.encode("utf-8")) - - # 如果超出,截断到安全大小 - if content_size > max_content_size: - print( - f"警告:{format_type} 第 {i}/{total} 批次内容({content_size}字节) + 头部({header_size}字节) 超出限制({max_bytes}字节),截断到 {max_content_size} 字节" - ) - content = _truncate_to_bytes(content, max_content_size) - - result.append(header + content) - - return result - - -def split_content_into_batches( - report_data: Dict, - format_type: str, - update_info: Optional[Dict] = None, - max_bytes: int = None, - mode: str = "daily", -) -> List[str]: - """分批处理消息内容,确保词组标题+至少第一条新闻的完整性""" - if max_bytes is None: - if format_type == "dingtalk": - max_bytes = CONFIG.get("DINGTALK_BATCH_SIZE", 20000) - elif format_type == "feishu": - max_bytes = CONFIG.get("FEISHU_BATCH_SIZE", 29000) - elif format_type == "ntfy": - max_bytes = 3800 - else: - max_bytes = CONFIG.get("MESSAGE_BATCH_SIZE", 4000) - - batches = [] - - total_titles = sum( - len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0 - ) - now = get_beijing_time() - - base_header = "" - if format_type in ("wework", "bark"): - base_header = f"**总新闻数:** {total_titles}\n\n\n\n" - elif format_type == "telegram": - base_header = f"总新闻数: {total_titles}\n\n" - elif format_type == "ntfy": - base_header = f"**总新闻数:** {total_titles}\n\n" - elif format_type == "feishu": - base_header = "" - elif format_type == "dingtalk": - base_header = f"**总新闻数:** {total_titles}\n\n" - base_header += f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n" - base_header += f"**类型:** 热点分析报告\n\n" - base_header += "---\n\n" - elif format_type == "slack": - base_header = f"*总新闻数:* {total_titles}\n\n" - - base_footer = "" - if format_type in ("wework", "bark"): - base_footer = f"\n\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}" - if update_info: - base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**" - elif format_type == "telegram": - base_footer = f"\n\n更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}" - if update_info: - base_footer += f"\nTrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}" - elif format_type == "ntfy": - base_footer = f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}" - if update_info: - base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**" - elif format_type == "feishu": - base_footer = f"\n\n更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}" - if update_info: - base_footer += f"\nTrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}" - elif format_type == "dingtalk": - base_footer = f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}" - if update_info: - 
base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**" - elif format_type == "slack": - base_footer = f"\n\n_更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}_" - if update_info: - base_footer += f"\n_TrendRadar 发现新版本 *{update_info['remote_version']}*,当前 *{update_info['current_version']}_" - - stats_header = "" - if report_data["stats"]: - if format_type in ("wework", "bark"): - stats_header = f"📊 **热点词汇统计**\n\n" - elif format_type == "telegram": - stats_header = f"📊 热点词汇统计\n\n" - elif format_type == "ntfy": - stats_header = f"📊 **热点词汇统计**\n\n" - elif format_type == "feishu": - stats_header = f"📊 **热点词汇统计**\n\n" - elif format_type == "dingtalk": - stats_header = f"📊 **热点词汇统计**\n\n" - elif format_type == "slack": - stats_header = f"📊 *热点词汇统计*\n\n" - - current_batch = base_header - current_batch_has_content = False - - if ( - not report_data["stats"] - and not report_data["new_titles"] - and not report_data["failed_ids"] - ): - if mode == "incremental": - mode_text = "增量模式下暂无新增匹配的热点词汇" - elif mode == "current": - mode_text = "当前榜单模式下暂无匹配的热点词汇" - else: - mode_text = "暂无匹配的热点词汇" - simple_content = f"📭 {mode_text}\n\n" - final_content = base_header + simple_content + base_footer - batches.append(final_content) - return batches - - # 定义处理热点词汇统计的函数 - def process_stats_section(current_batch, current_batch_has_content, batches): - """处理热点词汇统计""" - if not report_data["stats"]: - return current_batch, current_batch_has_content, batches - - total_count = len(report_data["stats"]) - - # 添加统计标题 - test_content = current_batch + stats_header - if ( - len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) - < max_bytes - ): - current_batch = test_content - current_batch_has_content = True - else: - if current_batch_has_content: - batches.append(current_batch + base_footer) - current_batch = base_header + stats_header - current_batch_has_content = True - - # 逐个处理词组(确保词组标题+第一条新闻的原子性) - for i, stat in enumerate(report_data["stats"]): - word = stat["word"] - count = stat["count"] - sequence_display = f"[{i + 1}/{total_count}]" - - # 构建词组标题 - word_header = "" - if format_type in ("wework", "bark"): - if count >= 10: - word_header = ( - f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n" - ) - elif count >= 5: - word_header = ( - f"📈 {sequence_display} **{word}** : **{count}** 条\n\n" - ) - else: - word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n" - elif format_type == "telegram": - if count >= 10: - word_header = f"🔥 {sequence_display} {word} : {count} 条\n\n" - elif count >= 5: - word_header = f"📈 {sequence_display} {word} : {count} 条\n\n" - else: - word_header = f"📌 {sequence_display} {word} : {count} 条\n\n" - elif format_type == "ntfy": - if count >= 10: - word_header = ( - f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n" - ) - elif count >= 5: - word_header = ( - f"📈 {sequence_display} **{word}** : **{count}** 条\n\n" - ) - else: - word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n" - elif format_type == "feishu": - if count >= 10: - word_header = f"🔥 {sequence_display} **{word}** : {count} 条\n\n" - elif count >= 5: - word_header = f"📈 {sequence_display} **{word}** : {count} 条\n\n" - else: - word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n" - elif format_type == "dingtalk": - if count >= 10: - word_header = ( - f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n" - ) - elif count >= 5: - word_header = ( - f"📈 {sequence_display} **{word}** : **{count}** 条\n\n" - ) - else: - word_header = f"📌 
{sequence_display} **{word}** : {count} 条\n\n" - elif format_type == "slack": - if count >= 10: - word_header = ( - f"🔥 {sequence_display} *{word}* : *{count}* 条\n\n" - ) - elif count >= 5: - word_header = ( - f"📈 {sequence_display} *{word}* : *{count}* 条\n\n" - ) - else: - word_header = f"📌 {sequence_display} *{word}* : {count} 条\n\n" - - # 构建第一条新闻 - first_news_line = "" - if stat["titles"]: - first_title_data = stat["titles"][0] - if format_type in ("wework", "bark"): - formatted_title = format_title_for_platform( - "wework", first_title_data, show_source=True - ) - elif format_type == "telegram": - formatted_title = format_title_for_platform( - "telegram", first_title_data, show_source=True - ) - elif format_type == "ntfy": - formatted_title = format_title_for_platform( - "ntfy", first_title_data, show_source=True - ) - elif format_type == "feishu": - formatted_title = format_title_for_platform( - "feishu", first_title_data, show_source=True - ) - elif format_type == "dingtalk": - formatted_title = format_title_for_platform( - "dingtalk", first_title_data, show_source=True - ) - elif format_type == "slack": - formatted_title = format_title_for_platform( - "slack", first_title_data, show_source=True - ) - else: - formatted_title = f"{first_title_data['title']}" - - first_news_line = f" 1. {formatted_title}\n" - if len(stat["titles"]) > 1: - first_news_line += "\n" - - # 原子性检查:词组标题+第一条新闻必须一起处理 - word_with_first_news = word_header + first_news_line - test_content = current_batch + word_with_first_news - - if ( - len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) - >= max_bytes - ): - # 当前批次容纳不下,开启新批次 - if current_batch_has_content: - batches.append(current_batch + base_footer) - current_batch = base_header + stats_header + word_with_first_news - current_batch_has_content = True - start_index = 1 - else: - current_batch = test_content - current_batch_has_content = True - start_index = 1 - - # 处理剩余新闻条目 - for j in range(start_index, len(stat["titles"])): - title_data = stat["titles"][j] - if format_type in ("wework", "bark"): - formatted_title = format_title_for_platform( - "wework", title_data, show_source=True - ) - elif format_type == "telegram": - formatted_title = format_title_for_platform( - "telegram", title_data, show_source=True - ) - elif format_type == "ntfy": - formatted_title = format_title_for_platform( - "ntfy", title_data, show_source=True - ) - elif format_type == "feishu": - formatted_title = format_title_for_platform( - "feishu", title_data, show_source=True - ) - elif format_type == "dingtalk": - formatted_title = format_title_for_platform( - "dingtalk", title_data, show_source=True - ) - elif format_type == "slack": - formatted_title = format_title_for_platform( - "slack", title_data, show_source=True - ) - else: - formatted_title = f"{title_data['title']}" - - news_line = f" {j + 1}. 
{formatted_title}\n" - if j < len(stat["titles"]) - 1: - news_line += "\n" - - test_content = current_batch + news_line - if ( - len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) - >= max_bytes - ): - if current_batch_has_content: - batches.append(current_batch + base_footer) - current_batch = base_header + stats_header + word_header + news_line - current_batch_has_content = True - else: - current_batch = test_content - current_batch_has_content = True - - # 词组间分隔符 - if i < len(report_data["stats"]) - 1: - separator = "" - if format_type in ("wework", "bark"): - separator = f"\n\n\n\n" - elif format_type == "telegram": - separator = f"\n\n" - elif format_type == "ntfy": - separator = f"\n\n" - elif format_type == "feishu": - separator = f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n" - elif format_type == "dingtalk": - separator = f"\n---\n\n" - elif format_type == "slack": - separator = f"\n\n" - - test_content = current_batch + separator - if ( - len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) - < max_bytes - ): - current_batch = test_content - - return current_batch, current_batch_has_content, batches - - # 定义处理新增新闻的函数 - def process_new_titles_section(current_batch, current_batch_has_content, batches): - """处理新增新闻""" - if not report_data["new_titles"]: - return current_batch, current_batch_has_content, batches - - new_header = "" - if format_type in ("wework", "bark"): - new_header = f"\n\n\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n" - elif format_type == "telegram": - new_header = ( - f"\n\n🆕 本次新增热点新闻 (共 {report_data['total_new_count']} 条)\n\n" - ) - elif format_type == "ntfy": - new_header = f"\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n" - elif format_type == "feishu": - new_header = f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n" - elif format_type == "dingtalk": - new_header = f"\n---\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n" - elif format_type == "slack": - new_header = f"\n\n🆕 *本次新增热点新闻* (共 {report_data['total_new_count']} 条)\n\n" - - test_content = current_batch + new_header - if ( - len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) - >= max_bytes - ): - if current_batch_has_content: - batches.append(current_batch + base_footer) - current_batch = base_header + new_header - current_batch_has_content = True - else: - current_batch = test_content - current_batch_has_content = True - - # 逐个处理新增新闻来源 - for source_data in report_data["new_titles"]: - source_header = "" - if format_type in ("wework", "bark"): - source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n" - elif format_type == "telegram": - source_header = f"{source_data['source_name']} ({len(source_data['titles'])} 条):\n\n" - elif format_type == "ntfy": - source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n" - elif format_type == "feishu": - source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n" - elif format_type == "dingtalk": - source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n" - elif format_type == "slack": - source_header = f"*{source_data['source_name']}* ({len(source_data['titles'])} 条):\n\n" - - # 构建第一条新增新闻 - first_news_line = "" - if source_data["titles"]: - first_title_data = source_data["titles"][0] - title_data_copy = first_title_data.copy() - title_data_copy["is_new"] = False - - if format_type in 
("wework", "bark"): - formatted_title = format_title_for_platform( - "wework", title_data_copy, show_source=False - ) - elif format_type == "telegram": - formatted_title = format_title_for_platform( - "telegram", title_data_copy, show_source=False - ) - elif format_type == "feishu": - formatted_title = format_title_for_platform( - "feishu", title_data_copy, show_source=False - ) - elif format_type == "dingtalk": - formatted_title = format_title_for_platform( - "dingtalk", title_data_copy, show_source=False - ) - elif format_type == "slack": - formatted_title = format_title_for_platform( - "slack", title_data_copy, show_source=False - ) - else: - formatted_title = f"{title_data_copy['title']}" - - first_news_line = f" 1. {formatted_title}\n" - - # 原子性检查:来源标题+第一条新闻 - source_with_first_news = source_header + first_news_line - test_content = current_batch + source_with_first_news - - if ( - len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) - >= max_bytes - ): - if current_batch_has_content: - batches.append(current_batch + base_footer) - current_batch = base_header + new_header + source_with_first_news - current_batch_has_content = True - start_index = 1 - else: - current_batch = test_content - current_batch_has_content = True - start_index = 1 - - # 处理剩余新增新闻 - for j in range(start_index, len(source_data["titles"])): - title_data = source_data["titles"][j] - title_data_copy = title_data.copy() - title_data_copy["is_new"] = False - - if format_type == "wework": - formatted_title = format_title_for_platform( - "wework", title_data_copy, show_source=False - ) - elif format_type == "telegram": - formatted_title = format_title_for_platform( - "telegram", title_data_copy, show_source=False - ) - elif format_type == "feishu": - formatted_title = format_title_for_platform( - "feishu", title_data_copy, show_source=False - ) - elif format_type == "dingtalk": - formatted_title = format_title_for_platform( - "dingtalk", title_data_copy, show_source=False - ) - elif format_type == "slack": - formatted_title = format_title_for_platform( - "slack", title_data_copy, show_source=False - ) - else: - formatted_title = f"{title_data_copy['title']}" - - news_line = f" {j + 1}. 
{formatted_title}\n" - - test_content = current_batch + news_line - if ( - len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) - >= max_bytes - ): - if current_batch_has_content: - batches.append(current_batch + base_footer) - current_batch = base_header + new_header + source_header + news_line - current_batch_has_content = True - else: - current_batch = test_content - current_batch_has_content = True - - current_batch += "\n" - - return current_batch, current_batch_has_content, batches - - # 根据配置决定处理顺序 - if CONFIG.get("REVERSE_CONTENT_ORDER", False): - # 新增热点在前,热点词汇统计在后 - current_batch, current_batch_has_content, batches = process_new_titles_section( - current_batch, current_batch_has_content, batches - ) - current_batch, current_batch_has_content, batches = process_stats_section( - current_batch, current_batch_has_content, batches - ) - else: - # 默认:热点词汇统计在前,新增热点在后 - current_batch, current_batch_has_content, batches = process_stats_section( - current_batch, current_batch_has_content, batches - ) - current_batch, current_batch_has_content, batches = process_new_titles_section( - current_batch, current_batch_has_content, batches - ) - - if report_data["failed_ids"]: - failed_header = "" - if format_type == "wework": - failed_header = f"\n\n\n\n⚠️ **数据获取失败的平台:**\n\n" - elif format_type == "telegram": - failed_header = f"\n\n⚠️ 数据获取失败的平台:\n\n" - elif format_type == "ntfy": - failed_header = f"\n\n⚠️ **数据获取失败的平台:**\n\n" - elif format_type == "feishu": - failed_header = f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n⚠️ **数据获取失败的平台:**\n\n" - elif format_type == "dingtalk": - failed_header = f"\n---\n\n⚠️ **数据获取失败的平台:**\n\n" - - test_content = current_batch + failed_header - if ( - len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) - >= max_bytes - ): - if current_batch_has_content: - batches.append(current_batch + base_footer) - current_batch = base_header + failed_header - current_batch_has_content = True - else: - current_batch = test_content - current_batch_has_content = True - - for i, id_value in enumerate(report_data["failed_ids"], 1): - if format_type == "feishu": - failed_line = f" • {id_value}\n" - elif format_type == "dingtalk": - failed_line = f" • **{id_value}**\n" - else: - failed_line = f" • {id_value}\n" - - test_content = current_batch + failed_line - if ( - len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) - >= max_bytes - ): - if current_batch_has_content: - batches.append(current_batch + base_footer) - current_batch = base_header + failed_header + failed_line - current_batch_has_content = True - else: - current_batch = test_content - current_batch_has_content = True - - # 完成最后批次 - if current_batch_has_content: - batches.append(current_batch + base_footer) - - return batches - - -def send_to_notifications( - stats: List[Dict], - failed_ids: Optional[List] = None, - report_type: str = "当日汇总", - new_titles: Optional[Dict] = None, - id_to_name: Optional[Dict] = None, - update_info: Optional[Dict] = None, - proxy_url: Optional[str] = None, - mode: str = "daily", - html_file_path: Optional[str] = None, -) -> Dict[str, bool]: - """发送数据到多个通知平台(支持多账号)""" - results = {} - max_accounts = CONFIG["MAX_ACCOUNTS_PER_CHANNEL"] - - if CONFIG["PUSH_WINDOW"]["ENABLED"]: - push_manager = PushRecordManager() - time_range_start = CONFIG["PUSH_WINDOW"]["TIME_RANGE"]["START"] - time_range_end = CONFIG["PUSH_WINDOW"]["TIME_RANGE"]["END"] - - if not push_manager.is_in_time_range(time_range_start, time_range_end): - now = get_beijing_time() - print( - 
f"推送窗口控制:当前时间 {now.strftime('%H:%M')} 不在推送时间窗口 {time_range_start}-{time_range_end} 内,跳过推送" - ) - return results - - if CONFIG["PUSH_WINDOW"]["ONCE_PER_DAY"]: - if push_manager.has_pushed_today(): - print(f"推送窗口控制:今天已推送过,跳过本次推送") - return results - else: - print(f"推送窗口控制:今天首次推送") - - report_data = prepare_report_data(stats, failed_ids, new_titles, id_to_name, mode) - - update_info_to_send = update_info if CONFIG["SHOW_VERSION_UPDATE"] else None - - # 发送到飞书(多账号) - feishu_urls = parse_multi_account_config(CONFIG["FEISHU_WEBHOOK_URL"]) - if feishu_urls: - feishu_urls = limit_accounts(feishu_urls, max_accounts, "飞书") - feishu_results = [] - for i, url in enumerate(feishu_urls): - if url: # 跳过空值 - account_label = f"账号{i+1}" if len(feishu_urls) > 1 else "" - result = send_to_feishu( - url, report_data, report_type, update_info_to_send, proxy_url, mode, account_label - ) - feishu_results.append(result) - results["feishu"] = any(feishu_results) if feishu_results else False - - # 发送到钉钉(多账号) - dingtalk_urls = parse_multi_account_config(CONFIG["DINGTALK_WEBHOOK_URL"]) - if dingtalk_urls: - dingtalk_urls = limit_accounts(dingtalk_urls, max_accounts, "钉钉") - dingtalk_results = [] - for i, url in enumerate(dingtalk_urls): - if url: - account_label = f"账号{i+1}" if len(dingtalk_urls) > 1 else "" - result = send_to_dingtalk( - url, report_data, report_type, update_info_to_send, proxy_url, mode, account_label - ) - dingtalk_results.append(result) - results["dingtalk"] = any(dingtalk_results) if dingtalk_results else False - - # 发送到企业微信(多账号) - wework_urls = parse_multi_account_config(CONFIG["WEWORK_WEBHOOK_URL"]) - if wework_urls: - wework_urls = limit_accounts(wework_urls, max_accounts, "企业微信") - wework_results = [] - for i, url in enumerate(wework_urls): - if url: - account_label = f"账号{i+1}" if len(wework_urls) > 1 else "" - result = send_to_wework( - url, report_data, report_type, update_info_to_send, proxy_url, mode, account_label - ) - wework_results.append(result) - results["wework"] = any(wework_results) if wework_results else False - - # 发送到 Telegram(多账号,需验证配对) - telegram_tokens = parse_multi_account_config(CONFIG["TELEGRAM_BOT_TOKEN"]) - telegram_chat_ids = parse_multi_account_config(CONFIG["TELEGRAM_CHAT_ID"]) - if telegram_tokens and telegram_chat_ids: - valid, count = validate_paired_configs( - {"bot_token": telegram_tokens, "chat_id": telegram_chat_ids}, - "Telegram", - required_keys=["bot_token", "chat_id"] - ) - if valid and count > 0: - telegram_tokens = limit_accounts(telegram_tokens, max_accounts, "Telegram") - telegram_chat_ids = telegram_chat_ids[:len(telegram_tokens)] # 保持数量一致 - telegram_results = [] - for i in range(len(telegram_tokens)): - token = telegram_tokens[i] - chat_id = telegram_chat_ids[i] - if token and chat_id: - account_label = f"账号{i+1}" if len(telegram_tokens) > 1 else "" - result = send_to_telegram( - token, chat_id, report_data, report_type, - update_info_to_send, proxy_url, mode, account_label - ) - telegram_results.append(result) - results["telegram"] = any(telegram_results) if telegram_results else False - - # 发送到 ntfy(多账号,需验证配对) - ntfy_server_url = CONFIG["NTFY_SERVER_URL"] - ntfy_topics = parse_multi_account_config(CONFIG["NTFY_TOPIC"]) - ntfy_tokens = parse_multi_account_config(CONFIG["NTFY_TOKEN"]) - if ntfy_server_url and ntfy_topics: - # 验证 token 和 topic 数量一致(如果配置了 token) - if ntfy_tokens and len(ntfy_tokens) != len(ntfy_topics): - print(f"❌ ntfy 配置错误:topic 数量({len(ntfy_topics)})与 token 数量({len(ntfy_tokens)})不一致,跳过 ntfy 推送") - else: - ntfy_topics = 
limit_accounts(ntfy_topics, max_accounts, "ntfy") - if ntfy_tokens: - ntfy_tokens = ntfy_tokens[:len(ntfy_topics)] - ntfy_results = [] - for i, topic in enumerate(ntfy_topics): - if topic: - token = get_account_at_index(ntfy_tokens, i, "") if ntfy_tokens else "" - account_label = f"账号{i+1}" if len(ntfy_topics) > 1 else "" - result = send_to_ntfy( - ntfy_server_url, topic, token, report_data, report_type, - update_info_to_send, proxy_url, mode, account_label - ) - ntfy_results.append(result) - results["ntfy"] = any(ntfy_results) if ntfy_results else False - - # 发送到 Bark(多账号) - bark_urls = parse_multi_account_config(CONFIG["BARK_URL"]) - if bark_urls: - bark_urls = limit_accounts(bark_urls, max_accounts, "Bark") - bark_results = [] - for i, url in enumerate(bark_urls): - if url: - account_label = f"账号{i+1}" if len(bark_urls) > 1 else "" - result = send_to_bark( - url, report_data, report_type, update_info_to_send, proxy_url, mode, account_label - ) - bark_results.append(result) - results["bark"] = any(bark_results) if bark_results else False - - # 发送到 Slack(多账号) - slack_urls = parse_multi_account_config(CONFIG["SLACK_WEBHOOK_URL"]) - if slack_urls: - slack_urls = limit_accounts(slack_urls, max_accounts, "Slack") - slack_results = [] - for i, url in enumerate(slack_urls): - if url: - account_label = f"账号{i+1}" if len(slack_urls) > 1 else "" - result = send_to_slack( - url, report_data, report_type, update_info_to_send, proxy_url, mode, account_label - ) - slack_results.append(result) - results["slack"] = any(slack_results) if slack_results else False - - # 发送邮件(保持原有逻辑,已支持多收件人) - email_from = CONFIG["EMAIL_FROM"] - email_password = CONFIG["EMAIL_PASSWORD"] - email_to = CONFIG["EMAIL_TO"] - email_smtp_server = CONFIG.get("EMAIL_SMTP_SERVER", "") - email_smtp_port = CONFIG.get("EMAIL_SMTP_PORT", "") - if email_from and email_password and email_to: - results["email"] = send_to_email( - email_from, - email_password, - email_to, - report_type, - html_file_path, - email_smtp_server, - email_smtp_port, - ) - - if not results: - print("未配置任何通知渠道,跳过通知发送") - - # 如果成功发送了任何通知,且启用了每天只推一次,则记录推送 - if ( - CONFIG["PUSH_WINDOW"]["ENABLED"] - and CONFIG["PUSH_WINDOW"]["ONCE_PER_DAY"] - and any(results.values()) - ): - push_manager = PushRecordManager() - push_manager.record_push(report_type) - - return results - - -def send_to_feishu( - webhook_url: str, - report_data: Dict, - report_type: str, - update_info: Optional[Dict] = None, - proxy_url: Optional[str] = None, - mode: str = "daily", - account_label: str = "", -) -> bool: - """发送到飞书(支持分批发送)""" - headers = {"Content-Type": "application/json"} - proxies = None - if proxy_url: - proxies = {"http": proxy_url, "https": proxy_url} - - # 日志前缀 - log_prefix = f"飞书{account_label}" if account_label else "飞书" - - # 获取分批内容,使用飞书专用的批次大小 - feishu_batch_size = CONFIG.get("FEISHU_BATCH_SIZE", 29000) - # 预留批次头部空间,避免添加头部后超限 - header_reserve = _get_max_batch_header_size("feishu") - batches = split_content_into_batches( - report_data, - "feishu", - update_info, - max_bytes=feishu_batch_size - header_reserve, - mode=mode, - ) - - # 统一添加批次头部(已预留空间,不会超限) - batches = add_batch_headers(batches, "feishu", feishu_batch_size) - - print(f"{log_prefix}消息分为 {len(batches)} 批次发送 [{report_type}]") - - # 逐批发送 - for i, batch_content in enumerate(batches, 1): - batch_size = len(batch_content.encode("utf-8")) - print( - f"发送{log_prefix}第 {i}/{len(batches)} 批次,大小:{batch_size} 字节 [{report_type}]" - ) - - total_titles = sum( - len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0 
- ) - now = get_beijing_time() - - payload = { - "msg_type": "text", - "content": { - "total_titles": total_titles, - "timestamp": now.strftime("%Y-%m-%d %H:%M:%S"), - "report_type": report_type, - "text": batch_content, - }, - } - - try: - response = requests.post( - webhook_url, headers=headers, json=payload, proxies=proxies, timeout=30 - ) - if response.status_code == 200: - result = response.json() - # 检查飞书的响应状态 - if result.get("StatusCode") == 0 or result.get("code") == 0: - print(f"{log_prefix}第 {i}/{len(batches)} 批次发送成功 [{report_type}]") - # 批次间间隔 - if i < len(batches): - time.sleep(CONFIG["BATCH_SEND_INTERVAL"]) - else: - error_msg = result.get("msg") or result.get("StatusMessage", "未知错误") - print( - f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{error_msg}" - ) - return False - else: - print( - f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],状态码:{response.status_code}" - ) - return False - except Exception as e: - print(f"{log_prefix}第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}") - return False - - print(f"{log_prefix}所有 {len(batches)} 批次发送完成 [{report_type}]") - return True - - -def send_to_dingtalk( - webhook_url: str, - report_data: Dict, - report_type: str, - update_info: Optional[Dict] = None, - proxy_url: Optional[str] = None, - mode: str = "daily", - account_label: str = "", -) -> bool: - """发送到钉钉(支持分批发送)""" - headers = {"Content-Type": "application/json"} - proxies = None - if proxy_url: - proxies = {"http": proxy_url, "https": proxy_url} - - # 日志前缀 - log_prefix = f"钉钉{account_label}" if account_label else "钉钉" - - # 获取分批内容,使用钉钉专用的批次大小 - dingtalk_batch_size = CONFIG.get("DINGTALK_BATCH_SIZE", 20000) - # 预留批次头部空间,避免添加头部后超限 - header_reserve = _get_max_batch_header_size("dingtalk") - batches = split_content_into_batches( - report_data, - "dingtalk", - update_info, - max_bytes=dingtalk_batch_size - header_reserve, - mode=mode, - ) - - # 统一添加批次头部(已预留空间,不会超限) - batches = add_batch_headers(batches, "dingtalk", dingtalk_batch_size) - - print(f"{log_prefix}消息分为 {len(batches)} 批次发送 [{report_type}]") - - # 逐批发送 - for i, batch_content in enumerate(batches, 1): - batch_size = len(batch_content.encode("utf-8")) - print( - f"发送{log_prefix}第 {i}/{len(batches)} 批次,大小:{batch_size} 字节 [{report_type}]" - ) - - payload = { - "msgtype": "markdown", - "markdown": { - "title": f"TrendRadar 热点分析报告 - {report_type}", - "text": batch_content, - }, - } - - try: - response = requests.post( - webhook_url, headers=headers, json=payload, proxies=proxies, timeout=30 - ) - if response.status_code == 200: - result = response.json() - if result.get("errcode") == 0: - print(f"{log_prefix}第 {i}/{len(batches)} 批次发送成功 [{report_type}]") - # 批次间间隔 - if i < len(batches): - time.sleep(CONFIG["BATCH_SEND_INTERVAL"]) - else: - print( - f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{result.get('errmsg')}" - ) - return False - else: - print( - f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],状态码:{response.status_code}" - ) - return False - except Exception as e: - print(f"{log_prefix}第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}") - return False - - print(f"{log_prefix}所有 {len(batches)} 批次发送完成 [{report_type}]") - return True - - -def strip_markdown(text: str) -> str: - """去除文本中的 markdown 语法格式,用于个人微信推送""" - - # 去除粗体 **text** 或 __text__ - text = re.sub(r'\*\*(.+?)\*\*', r'\1', text) - text = re.sub(r'__(.+?)__', r'\1', text) - - # 去除斜体 *text* 或 _text_ - text = re.sub(r'\*(.+?)\*', r'\1', text) - text = re.sub(r'_(.+?)_', r'\1', text) - - # 去除删除线 ~~text~~ - text = 
re.sub(r'~~(.+?)~~', r'\1', text) - - # 转换链接 [text](url) -> text url(保留 URL) - text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1 \2', text) - # 如果不需要保留 URL,可以使用下面这行(只保留标题文本): - # text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text) - - # 去除图片 ![alt](url) -> alt - text = re.sub(r'!\[(.+?)\]\(.+?\)', r'\1', text) - - # 去除行内代码 `code` - text = re.sub(r'`(.+?)`', r'\1', text) - - # 去除引用符号 > - text = re.sub(r'^>\s*', '', text, flags=re.MULTILINE) - - # 去除标题符号 # ## ### 等 - text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE) - - # 去除水平分割线 --- 或 *** - text = re.sub(r'^[\-\*]{3,}\s*$', '', text, flags=re.MULTILINE) - - # 去除 HTML 标签 text -> text - text = re.sub(r']*>(.+?)', r'\1', text) - text = re.sub(r'<[^>]+>', '', text) - - # 清理多余的空行(保留最多两个连续空行) - text = re.sub(r'\n{3,}', '\n\n', text) - - return text.strip() - - -def send_to_wework( - webhook_url: str, - report_data: Dict, - report_type: str, - update_info: Optional[Dict] = None, - proxy_url: Optional[str] = None, - mode: str = "daily", - account_label: str = "", -) -> bool: - """发送到企业微信(支持分批发送,支持 markdown 和 text 两种格式)""" - headers = {"Content-Type": "application/json"} - proxies = None - if proxy_url: - proxies = {"http": proxy_url, "https": proxy_url} - - # 日志前缀 - log_prefix = f"企业微信{account_label}" if account_label else "企业微信" - - # 获取消息类型配置(markdown 或 text) - msg_type = CONFIG.get("WEWORK_MSG_TYPE", "markdown").lower() - is_text_mode = msg_type == "text" - - if is_text_mode: - print(f"{log_prefix}使用 text 格式(个人微信模式)[{report_type}]") - else: - print(f"{log_prefix}使用 markdown 格式(群机器人模式)[{report_type}]") - - # text 模式使用 wework_text,markdown 模式使用 wework - header_format_type = "wework_text" if is_text_mode else "wework" - - # 获取分批内容,预留批次头部空间 - wework_batch_size = CONFIG.get("MESSAGE_BATCH_SIZE", 4000) - header_reserve = _get_max_batch_header_size(header_format_type) - batches = split_content_into_batches( - report_data, "wework", update_info, max_bytes=wework_batch_size - header_reserve, mode=mode - ) - - # 统一添加批次头部(已预留空间,不会超限) - batches = add_batch_headers(batches, header_format_type, wework_batch_size) - - print(f"{log_prefix}消息分为 {len(batches)} 批次发送 [{report_type}]") - - # 逐批发送 - for i, batch_content in enumerate(batches, 1): - # 根据消息类型构建 payload - if is_text_mode: - # text 格式:去除 markdown 语法 - plain_content = strip_markdown(batch_content) - payload = {"msgtype": "text", "text": {"content": plain_content}} - batch_size = len(plain_content.encode("utf-8")) - else: - # markdown 格式:保持原样 - payload = {"msgtype": "markdown", "markdown": {"content": batch_content}} - batch_size = len(batch_content.encode("utf-8")) - - print( - f"发送{log_prefix}第 {i}/{len(batches)} 批次,大小:{batch_size} 字节 [{report_type}]" - ) - - try: - response = requests.post( - webhook_url, headers=headers, json=payload, proxies=proxies, timeout=30 - ) - if response.status_code == 200: - result = response.json() - if result.get("errcode") == 0: - print(f"{log_prefix}第 {i}/{len(batches)} 批次发送成功 [{report_type}]") - # 批次间间隔 - if i < len(batches): - time.sleep(CONFIG["BATCH_SEND_INTERVAL"]) - else: - print( - f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{result.get('errmsg')}" - ) - return False - else: - print( - f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],状态码:{response.status_code}" - ) - return False - except Exception as e: - print(f"{log_prefix}第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}") - return False - - print(f"{log_prefix}所有 {len(batches)} 批次发送完成 [{report_type}]") - return True - - -def send_to_telegram( - bot_token: str, - chat_id: str, - 
report_data: Dict, - report_type: str, - update_info: Optional[Dict] = None, - proxy_url: Optional[str] = None, - mode: str = "daily", - account_label: str = "", -) -> bool: - """发送到Telegram(支持分批发送)""" - headers = {"Content-Type": "application/json"} - url = f"https://api.telegram.org/bot{bot_token}/sendMessage" - - proxies = None - if proxy_url: - proxies = {"http": proxy_url, "https": proxy_url} - - # 日志前缀 - log_prefix = f"Telegram{account_label}" if account_label else "Telegram" - - # 获取分批内容,预留批次头部空间 - telegram_batch_size = CONFIG.get("MESSAGE_BATCH_SIZE", 4000) - header_reserve = _get_max_batch_header_size("telegram") - batches = split_content_into_batches( - report_data, "telegram", update_info, max_bytes=telegram_batch_size - header_reserve, mode=mode - ) - - # 统一添加批次头部(已预留空间,不会超限) - batches = add_batch_headers(batches, "telegram", telegram_batch_size) - - print(f"{log_prefix}消息分为 {len(batches)} 批次发送 [{report_type}]") - - # 逐批发送 - for i, batch_content in enumerate(batches, 1): - batch_size = len(batch_content.encode("utf-8")) - print( - f"发送{log_prefix}第 {i}/{len(batches)} 批次,大小:{batch_size} 字节 [{report_type}]" - ) - - payload = { - "chat_id": chat_id, - "text": batch_content, - "parse_mode": "HTML", - "disable_web_page_preview": True, - } - - try: - response = requests.post( - url, headers=headers, json=payload, proxies=proxies, timeout=30 - ) - if response.status_code == 200: - result = response.json() - if result.get("ok"): - print(f"{log_prefix}第 {i}/{len(batches)} 批次发送成功 [{report_type}]") - # 批次间间隔 - if i < len(batches): - time.sleep(CONFIG["BATCH_SEND_INTERVAL"]) - else: - print( - f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{result.get('description')}" - ) - return False - else: - print( - f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],状态码:{response.status_code}" - ) - return False - except Exception as e: - print(f"{log_prefix}第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}") - return False - - print(f"{log_prefix}所有 {len(batches)} 批次发送完成 [{report_type}]") - return True - - -def send_to_email( - from_email: str, - password: str, - to_email: str, - report_type: str, - html_file_path: str, - custom_smtp_server: Optional[str] = None, - custom_smtp_port: Optional[int] = None, -) -> bool: - """发送邮件通知""" - try: - if not html_file_path or not Path(html_file_path).exists(): - print(f"错误:HTML文件不存在或未提供: {html_file_path}") - return False - - print(f"使用HTML文件: {html_file_path}") - with open(html_file_path, "r", encoding="utf-8") as f: - html_content = f.read() - - domain = from_email.split("@")[-1].lower() - - if custom_smtp_server and custom_smtp_port: - # 使用自定义 SMTP 配置 - smtp_server = custom_smtp_server - smtp_port = int(custom_smtp_port) - # 根据端口判断加密方式:465=SSL, 587=TLS - if smtp_port == 465: - use_tls = False # SSL 模式(SMTP_SSL) - elif smtp_port == 587: - use_tls = True # TLS 模式(STARTTLS) - else: - # 其他端口优先尝试 TLS(更安全,更广泛支持) - use_tls = True - elif domain in SMTP_CONFIGS: - # 使用预设配置 - config = SMTP_CONFIGS[domain] - smtp_server = config["server"] - smtp_port = config["port"] - use_tls = config["encryption"] == "TLS" - else: - print(f"未识别的邮箱服务商: {domain},使用通用 SMTP 配置") - smtp_server = f"smtp.{domain}" - smtp_port = 587 - use_tls = True - - msg = MIMEMultipart("alternative") - - # 严格按照 RFC 标准设置 From header - sender_name = "TrendRadar" - msg["From"] = formataddr((sender_name, from_email)) - - # 设置收件人 - recipients = [addr.strip() for addr in to_email.split(",")] - if len(recipients) == 1: - msg["To"] = recipients[0] - else: - msg["To"] = ", ".join(recipients) - - 
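-        # All addresses end up in a single To header; server.send_message(msg) below
-        # resolves its recipient list from the To/Cc/Bcc headers when no explicit
-        # to_addrs is given, so an EMAIL_TO such as "a@example.com, b@example.com"
-        # (illustrative addresses) delivers to every listed recipient.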
# 设置邮件主题 - now = get_beijing_time() - subject = f"TrendRadar 热点分析报告 - {report_type} - {now.strftime('%m月%d日 %H:%M')}" - msg["Subject"] = Header(subject, "utf-8") - - # 设置其他标准 header - msg["MIME-Version"] = "1.0" - msg["Date"] = formatdate(localtime=True) - msg["Message-ID"] = make_msgid() - - # 添加纯文本部分(作为备选) - text_content = f""" -TrendRadar 热点分析报告 -======================== -报告类型:{report_type} -生成时间:{now.strftime('%Y-%m-%d %H:%M:%S')} - -请使用支持HTML的邮件客户端查看完整报告内容。 - """ - text_part = MIMEText(text_content, "plain", "utf-8") - msg.attach(text_part) - - html_part = MIMEText(html_content, "html", "utf-8") - msg.attach(html_part) - - print(f"正在发送邮件到 {to_email}...") - print(f"SMTP 服务器: {smtp_server}:{smtp_port}") - print(f"发件人: {from_email}") - - try: - if use_tls: - # TLS 模式 - server = smtplib.SMTP(smtp_server, smtp_port, timeout=30) - server.set_debuglevel(0) # 设为1可以查看详细调试信息 - server.ehlo() - server.starttls() - server.ehlo() - else: - # SSL 模式 - server = smtplib.SMTP_SSL(smtp_server, smtp_port, timeout=30) - server.set_debuglevel(0) - server.ehlo() - - # 登录 - server.login(from_email, password) - - # 发送邮件 - server.send_message(msg) - server.quit() - - print(f"邮件发送成功 [{report_type}] -> {to_email}") - return True - - except smtplib.SMTPServerDisconnected: - print(f"邮件发送失败:服务器意外断开连接,请检查网络或稍后重试") - return False - - except smtplib.SMTPAuthenticationError as e: - print(f"邮件发送失败:认证错误,请检查邮箱和密码/授权码") - print(f"详细错误: {str(e)}") - return False - except smtplib.SMTPRecipientsRefused as e: - print(f"邮件发送失败:收件人地址被拒绝 {e}") - return False - except smtplib.SMTPSenderRefused as e: - print(f"邮件发送失败:发件人地址被拒绝 {e}") - return False - except smtplib.SMTPDataError as e: - print(f"邮件发送失败:邮件数据错误 {e}") - return False - except smtplib.SMTPConnectError as e: - print(f"邮件发送失败:无法连接到 SMTP 服务器 {smtp_server}:{smtp_port}") - print(f"详细错误: {str(e)}") - return False - except Exception as e: - print(f"邮件发送失败 [{report_type}]:{e}") - import traceback - - traceback.print_exc() - return False - - -def send_to_ntfy( - server_url: str, - topic: str, - token: Optional[str], - report_data: Dict, - report_type: str, - update_info: Optional[Dict] = None, - proxy_url: Optional[str] = None, - mode: str = "daily", - account_label: str = "", -) -> bool: - """发送到ntfy(支持分批发送,严格遵守4KB限制)""" - # 日志前缀 - log_prefix = f"ntfy{account_label}" if account_label else "ntfy" - - # 避免 HTTP header 编码问题 - report_type_en_map = { - "当日汇总": "Daily Summary", - "当前榜单汇总": "Current Ranking", - "增量更新": "Incremental Update", - "实时增量": "Realtime Incremental", - "实时当前榜单": "Realtime Current Ranking", - } - report_type_en = report_type_en_map.get(report_type, "News Report") - - headers = { - "Content-Type": "text/plain; charset=utf-8", - "Markdown": "yes", - "Title": report_type_en, - "Priority": "default", - "Tags": "news", - } - - if token: - headers["Authorization"] = f"Bearer {token}" - - # 构建完整URL,确保格式正确 - base_url = server_url.rstrip("/") - if not base_url.startswith(("http://", "https://")): - base_url = f"https://{base_url}" - url = f"{base_url}/{topic}" - - proxies = None - if proxy_url: - proxies = {"http": proxy_url, "https": proxy_url} - - # 获取分批内容,使用ntfy专用的4KB限制,预留批次头部空间 - ntfy_batch_size = 3800 - header_reserve = _get_max_batch_header_size("ntfy") - batches = split_content_into_batches( - report_data, "ntfy", update_info, max_bytes=ntfy_batch_size - header_reserve, mode=mode - ) - - # 统一添加批次头部(已预留空间,不会超限) - batches = add_batch_headers(batches, "ntfy", ntfy_batch_size) - - total_batches = len(batches) - print(f"{log_prefix}消息分为 {total_batches} 批次发送 
[{report_type}]") - - # 反转批次顺序,使得在ntfy客户端显示时顺序正确 - # ntfy显示最新消息在上面,所以我们从最后一批开始推送 - reversed_batches = list(reversed(batches)) - - print(f"{log_prefix}将按反向顺序推送(最后批次先推送),确保客户端显示顺序正确") - - # 逐批发送(反向顺序) - success_count = 0 - for idx, batch_content in enumerate(reversed_batches, 1): - # 计算正确的批次编号(用户视角的编号) - actual_batch_num = total_batches - idx + 1 - - batch_size = len(batch_content.encode("utf-8")) - print( - f"发送{log_prefix}第 {actual_batch_num}/{total_batches} 批次(推送顺序: {idx}/{total_batches}),大小:{batch_size} 字节 [{report_type}]" - ) - - # 检查消息大小,确保不超过4KB - if batch_size > 4096: - print(f"警告:{log_prefix}第 {actual_batch_num} 批次消息过大({batch_size} 字节),可能被拒绝") - - # 更新 headers 的批次标识 - current_headers = headers.copy() - if total_batches > 1: - current_headers["Title"] = ( - f"{report_type_en} ({actual_batch_num}/{total_batches})" - ) - - try: - response = requests.post( - url, - headers=current_headers, - data=batch_content.encode("utf-8"), - proxies=proxies, - timeout=30, - ) - - if response.status_code == 200: - print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送成功 [{report_type}]") - success_count += 1 - if idx < total_batches: - # 公共服务器建议 2-3 秒,自托管可以更短 - interval = 2 if "ntfy.sh" in server_url else 1 - time.sleep(interval) - elif response.status_code == 429: - print( - f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次速率限制 [{report_type}],等待后重试" - ) - time.sleep(10) # 等待10秒后重试 - # 重试一次 - retry_response = requests.post( - url, - headers=current_headers, - data=batch_content.encode("utf-8"), - proxies=proxies, - timeout=30, - ) - if retry_response.status_code == 200: - print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次重试成功 [{report_type}]") - success_count += 1 - else: - print( - f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次重试失败,状态码:{retry_response.status_code}" - ) - elif response.status_code == 413: - print( - f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次消息过大被拒绝 [{report_type}],消息大小:{batch_size} 字节" - ) - else: - print( - f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送失败 [{report_type}],状态码:{response.status_code}" - ) - try: - print(f"错误详情:{response.text}") - except: - pass - - except requests.exceptions.ConnectTimeout: - print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次连接超时 [{report_type}]") - except requests.exceptions.ReadTimeout: - print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次读取超时 [{report_type}]") - except requests.exceptions.ConnectionError as e: - print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次连接错误 [{report_type}]:{e}") - except Exception as e: - print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送异常 [{report_type}]:{e}") - - # 判断整体发送是否成功 - if success_count == total_batches: - print(f"{log_prefix}所有 {total_batches} 批次发送完成 [{report_type}]") - return True - elif success_count > 0: - print(f"{log_prefix}部分发送成功:{success_count}/{total_batches} 批次 [{report_type}]") - return True # 部分成功也视为成功 - else: - print(f"{log_prefix}发送完全失败 [{report_type}]") - return False - - -def send_to_bark( - bark_url: str, - report_data: Dict, - report_type: str, - update_info: Optional[Dict] = None, - proxy_url: Optional[str] = None, - mode: str = "daily", - account_label: str = "", -) -> bool: - """发送到Bark(支持分批发送,使用 markdown 格式)""" - # 日志前缀 - log_prefix = f"Bark{account_label}" if account_label else "Bark" - - proxies = None - if proxy_url: - proxies = {"http": proxy_url, "https": proxy_url} - - # 解析 Bark URL,提取 device_key 和 API 端点 - # Bark URL 格式: https://api.day.app/device_key 或 https://bark.day.app/device_key - from 
urllib.parse import urlparse - - parsed_url = urlparse(bark_url) - device_key = parsed_url.path.strip('/').split('/')[0] if parsed_url.path else None - - if not device_key: - print(f"{log_prefix} URL 格式错误,无法提取 device_key: {bark_url}") - return False - - # 构建正确的 API 端点 - api_endpoint = f"{parsed_url.scheme}://{parsed_url.netloc}/push" - - # 获取分批内容(Bark 限制为 3600 字节以避免 413 错误),预留批次头部空间 - bark_batch_size = CONFIG["BARK_BATCH_SIZE"] - header_reserve = _get_max_batch_header_size("bark") - batches = split_content_into_batches( - report_data, "bark", update_info, max_bytes=bark_batch_size - header_reserve, mode=mode - ) - - # 统一添加批次头部(已预留空间,不会超限) - batches = add_batch_headers(batches, "bark", bark_batch_size) - - total_batches = len(batches) - print(f"{log_prefix}消息分为 {total_batches} 批次发送 [{report_type}]") - - # 反转批次顺序,使得在Bark客户端显示时顺序正确 - # Bark显示最新消息在上面,所以我们从最后一批开始推送 - reversed_batches = list(reversed(batches)) - - print(f"{log_prefix}将按反向顺序推送(最后批次先推送),确保客户端显示顺序正确") - - # 逐批发送(反向顺序) - success_count = 0 - for idx, batch_content in enumerate(reversed_batches, 1): - # 计算正确的批次编号(用户视角的编号) - actual_batch_num = total_batches - idx + 1 - - batch_size = len(batch_content.encode("utf-8")) - print( - f"发送{log_prefix}第 {actual_batch_num}/{total_batches} 批次(推送顺序: {idx}/{total_batches}),大小:{batch_size} 字节 [{report_type}]" - ) - - # 检查消息大小(Bark使用APNs,限制4KB) - if batch_size > 4096: - print( - f"警告:{log_prefix}第 {actual_batch_num}/{total_batches} 批次消息过大({batch_size} 字节),可能被拒绝" - ) - - # 构建JSON payload - payload = { - "title": report_type, - "markdown": batch_content, - "device_key": device_key, - "sound": "default", - "group": "TrendRadar", - "action": "none", # 点击推送跳到 APP 不弹出弹框,方便阅读 - } - - try: - response = requests.post( - api_endpoint, - json=payload, - proxies=proxies, - timeout=30, - ) - - if response.status_code == 200: - result = response.json() - if result.get("code") == 200: - print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送成功 [{report_type}]") - success_count += 1 - # 批次间间隔 - if idx < total_batches: - time.sleep(CONFIG["BATCH_SEND_INTERVAL"]) - else: - print( - f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送失败 [{report_type}],错误:{result.get('message', '未知错误')}" - ) - else: - print( - f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送失败 [{report_type}],状态码:{response.status_code}" - ) - try: - print(f"错误详情:{response.text}") - except: - pass - - except requests.exceptions.ConnectTimeout: - print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次连接超时 [{report_type}]") - except requests.exceptions.ReadTimeout: - print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次读取超时 [{report_type}]") - except requests.exceptions.ConnectionError as e: - print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次连接错误 [{report_type}]:{e}") - except Exception as e: - print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送异常 [{report_type}]:{e}") - - # 判断整体发送是否成功 - if success_count == total_batches: - print(f"{log_prefix}所有 {total_batches} 批次发送完成 [{report_type}]") - return True - elif success_count > 0: - print(f"{log_prefix}部分发送成功:{success_count}/{total_batches} 批次 [{report_type}]") - return True # 部分成功也视为成功 - else: - print(f"{log_prefix}发送完全失败 [{report_type}]") - return False - - -def convert_markdown_to_mrkdwn(content: str) -> str: - """ - 将标准 Markdown 转换为 Slack 的 mrkdwn 格式 - - 转换规则: - - **粗体** → *粗体* - - [文本](url) → - - 保留其他格式(代码块、列表等) - """ - # 1. 转换链接格式: [文本](url) → - content = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<\2|\1>', content) - - # 2. 
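# Illustrative sketch (not part of the patch): deriving device_key and the /push endpoint
# from a Bark URL, mirroring the parsing above. The device key below is a placeholder.
from urllib.parse import urlparse

bark_url = "https://api.day.app/AbCdEfGh123"
parsed = urlparse(bark_url)
device_key = parsed.path.strip("/").split("/")[0] if parsed.path else None
api_endpoint = f"{parsed.scheme}://{parsed.netloc}/push"

assert device_key == "AbCdEfGh123"
assert api_endpoint == "https://api.day.app/push"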
转换粗体: **文本** → *文本* - content = re.sub(r'\*\*([^*]+)\*\*', r'*\1*', content) - - return content - - -def send_to_slack( - webhook_url: str, - report_data: Dict, - report_type: str, - update_info: Optional[Dict] = None, - proxy_url: Optional[str] = None, - mode: str = "daily", - account_label: str = "", -) -> bool: - """发送到Slack(支持分批发送,使用 mrkdwn 格式)""" - headers = {"Content-Type": "application/json"} - proxies = None - if proxy_url: - proxies = {"http": proxy_url, "https": proxy_url} - - # 日志前缀 - log_prefix = f"Slack{account_label}" if account_label else "Slack" - - # 获取分批内容(使用 Slack 批次大小),预留批次头部空间 - slack_batch_size = CONFIG["SLACK_BATCH_SIZE"] - header_reserve = _get_max_batch_header_size("slack") - batches = split_content_into_batches( - report_data, "slack", update_info, max_bytes=slack_batch_size - header_reserve, mode=mode - ) - - # 统一添加批次头部(已预留空间,不会超限) - batches = add_batch_headers(batches, "slack", slack_batch_size) - - print(f"{log_prefix}消息分为 {len(batches)} 批次发送 [{report_type}]") - - # 逐批发送 - for i, batch_content in enumerate(batches, 1): - # 转换 Markdown 到 mrkdwn 格式 - mrkdwn_content = convert_markdown_to_mrkdwn(batch_content) - - batch_size = len(mrkdwn_content.encode("utf-8")) - print( - f"发送{log_prefix}第 {i}/{len(batches)} 批次,大小:{batch_size} 字节 [{report_type}]" - ) - - # 构建 Slack payload(使用简单的 text 字段,支持 mrkdwn) - payload = { - "text": mrkdwn_content - } - - try: - response = requests.post( - webhook_url, headers=headers, json=payload, proxies=proxies, timeout=30 - ) - - # Slack Incoming Webhooks 成功时返回 "ok" 文本 - if response.status_code == 200 and response.text == "ok": - print(f"{log_prefix}第 {i}/{len(batches)} 批次发送成功 [{report_type}]") - # 批次间间隔 - if i < len(batches): - time.sleep(CONFIG["BATCH_SEND_INTERVAL"]) - else: - error_msg = response.text if response.text else f"状态码:{response.status_code}" - print( - f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{error_msg}" - ) - return False - except Exception as e: - print(f"{log_prefix}第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}") - return False - - print(f"{log_prefix}所有 {len(batches)} 批次发送完成 [{report_type}]") - return True - - -# === 主分析器 === -class NewsAnalyzer: - """新闻分析器""" - - # 模式策略定义 - MODE_STRATEGIES = { - "incremental": { - "mode_name": "增量模式", - "description": "增量模式(只关注新增新闻,无新增时不推送)", - "realtime_report_type": "实时增量", - "summary_report_type": "当日汇总", - "should_send_realtime": True, - "should_generate_summary": True, - "summary_mode": "daily", - }, - "current": { - "mode_name": "当前榜单模式", - "description": "当前榜单模式(当前榜单匹配新闻 + 新增新闻区域 + 按时推送)", - "realtime_report_type": "实时当前榜单", - "summary_report_type": "当前榜单汇总", - "should_send_realtime": True, - "should_generate_summary": True, - "summary_mode": "current", - }, - "daily": { - "mode_name": "当日汇总模式", - "description": "当日汇总模式(所有匹配新闻 + 新增新闻区域 + 按时推送)", - "realtime_report_type": "", - "summary_report_type": "当日汇总", - "should_send_realtime": False, - "should_generate_summary": True, - "summary_mode": "daily", - }, - } - - def __init__(self): - self.request_interval = CONFIG["REQUEST_INTERVAL"] - self.report_mode = CONFIG["REPORT_MODE"] - self.rank_threshold = CONFIG["RANK_THRESHOLD"] - self.is_github_actions = os.environ.get("GITHUB_ACTIONS") == "true" - self.is_docker_container = self._detect_docker_environment() - self.update_info = None - self.proxy_url = None - self._setup_proxy() - self.data_fetcher = DataFetcher(self.proxy_url) - - if self.is_github_actions: - self._check_version_update() - - def _detect_docker_environment(self) -> bool: - """检测是否运行在 Docker 
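# Illustrative sketch (not part of the patch): the two rewrites performed by
# convert_markdown_to_mrkdwn above, applied to a sample line.
import re

text = "**热点** 详见 [报告](https://example.com/report)"
text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<\2|\1>', text)   # [text](url) -> <url|text>
text = re.sub(r'\*\*([^*]+)\*\*', r'*\1*', text)              # **bold** -> *bold*

assert text == "*热点* 详见 <https://example.com/report|报告>"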
容器中""" - try: - if os.environ.get("DOCKER_CONTAINER") == "true": - return True - - if os.path.exists("/.dockerenv"): - return True - - return False - except Exception: - return False - - def _should_open_browser(self) -> bool: - """判断是否应该打开浏览器""" - return not self.is_github_actions and not self.is_docker_container - - def _setup_proxy(self) -> None: - """设置代理配置""" - if not self.is_github_actions and CONFIG["USE_PROXY"]: - self.proxy_url = CONFIG["DEFAULT_PROXY"] - print("本地环境,使用代理") - elif not self.is_github_actions and not CONFIG["USE_PROXY"]: - print("本地环境,未启用代理") - else: - print("GitHub Actions环境,不使用代理") - - def _check_version_update(self) -> None: - """检查版本更新""" - try: - need_update, remote_version = check_version_update( - VERSION, CONFIG["VERSION_CHECK_URL"], self.proxy_url - ) - - if need_update and remote_version: - self.update_info = { - "current_version": VERSION, - "remote_version": remote_version, - } - print(f"发现新版本: {remote_version} (当前: {VERSION})") - else: - print("版本检查完成,当前为最新版本") - except Exception as e: - print(f"版本检查出错: {e}") - - def _get_mode_strategy(self) -> Dict: - """获取当前模式的策略配置""" - return self.MODE_STRATEGIES.get(self.report_mode, self.MODE_STRATEGIES["daily"]) - - def _has_notification_configured(self) -> bool: - """检查是否配置了任何通知渠道""" - return any( - [ - CONFIG["FEISHU_WEBHOOK_URL"], - CONFIG["DINGTALK_WEBHOOK_URL"], - CONFIG["WEWORK_WEBHOOK_URL"], - (CONFIG["TELEGRAM_BOT_TOKEN"] and CONFIG["TELEGRAM_CHAT_ID"]), - ( - CONFIG["EMAIL_FROM"] - and CONFIG["EMAIL_PASSWORD"] - and CONFIG["EMAIL_TO"] - ), - (CONFIG["NTFY_SERVER_URL"] and CONFIG["NTFY_TOPIC"]), - CONFIG["BARK_URL"], - CONFIG["SLACK_WEBHOOK_URL"], - ] - ) - - def _has_valid_content( - self, stats: List[Dict], new_titles: Optional[Dict] = None - ) -> bool: - """检查是否有有效的新闻内容""" - if self.report_mode in ["incremental", "current"]: - # 增量模式和current模式下,只要stats有内容就说明有匹配的新闻 - return any(stat["count"] > 0 for stat in stats) - else: - # 当日汇总模式下,检查是否有匹配的频率词新闻或新增新闻 - has_matched_news = any(stat["count"] > 0 for stat in stats) - has_new_news = bool( - new_titles and any(len(titles) > 0 for titles in new_titles.values()) - ) - return has_matched_news or has_new_news - - def _load_analysis_data( - self, - ) -> Optional[Tuple[Dict, Dict, Dict, Dict, List, List]]: - """统一的数据加载和预处理,使用当前监控平台列表过滤历史数据""" - try: - # 获取当前配置的监控平台ID列表 - current_platform_ids = [] - for platform in CONFIG["PLATFORMS"]: - current_platform_ids.append(platform["id"]) - - print(f"当前监控平台: {current_platform_ids}") - - all_results, id_to_name, title_info = read_all_today_titles( - current_platform_ids - ) - - if not all_results: - print("没有找到当天的数据") - return None - - total_titles = sum(len(titles) for titles in all_results.values()) - print(f"读取到 {total_titles} 个标题(已按当前监控平台过滤)") - - new_titles = detect_latest_new_titles(current_platform_ids) - word_groups, filter_words, global_filters = load_frequency_words() - - return ( - all_results, - id_to_name, - title_info, - new_titles, - word_groups, - filter_words, - global_filters, - ) - except Exception as e: - print(f"数据加载失败: {e}") - return None - - def _prepare_current_title_info(self, results: Dict, time_info: str) -> Dict: - """从当前抓取结果构建标题信息""" - title_info = {} - for source_id, titles_data in results.items(): - title_info[source_id] = {} - for title, title_data in titles_data.items(): - ranks = title_data.get("ranks", []) - url = title_data.get("url", "") - mobile_url = title_data.get("mobileUrl", "") - - title_info[source_id][title] = { - "first_time": time_info, - "last_time": time_info, - "count": 1, - 
"ranks": ranks, - "url": url, - "mobileUrl": mobile_url, - } - return title_info - - def _run_analysis_pipeline( - self, - data_source: Dict, - mode: str, - title_info: Dict, - new_titles: Dict, - word_groups: List[Dict], - filter_words: List[str], - id_to_name: Dict, - failed_ids: Optional[List] = None, - is_daily_summary: bool = False, - global_filters: Optional[List[str]] = None, - ) -> Tuple[List[Dict], str]: - """统一的分析流水线:数据处理 → 统计计算 → HTML生成""" - - # 统计计算 - stats, total_titles = count_word_frequency( - data_source, - word_groups, - filter_words, - id_to_name, - title_info, - self.rank_threshold, - new_titles, - mode=mode, - global_filters=global_filters, - ) - - # HTML生成 - html_file = generate_html_report( - stats, - total_titles, - failed_ids=failed_ids, - new_titles=new_titles, - id_to_name=id_to_name, - mode=mode, - is_daily_summary=is_daily_summary, - update_info=self.update_info if CONFIG["SHOW_VERSION_UPDATE"] else None, - ) - - return stats, html_file - - def _send_notification_if_needed( - self, - stats: List[Dict], - report_type: str, - mode: str, - failed_ids: Optional[List] = None, - new_titles: Optional[Dict] = None, - id_to_name: Optional[Dict] = None, - html_file_path: Optional[str] = None, - ) -> bool: - """统一的通知发送逻辑,包含所有判断条件""" - has_notification = self._has_notification_configured() - - if ( - CONFIG["ENABLE_NOTIFICATION"] - and has_notification - and self._has_valid_content(stats, new_titles) - ): - send_to_notifications( - stats, - failed_ids or [], - report_type, - new_titles, - id_to_name, - self.update_info, - self.proxy_url, - mode=mode, - html_file_path=html_file_path, - ) - return True - elif CONFIG["ENABLE_NOTIFICATION"] and not has_notification: - print("⚠️ 警告:通知功能已启用但未配置任何通知渠道,将跳过通知发送") - elif not CONFIG["ENABLE_NOTIFICATION"]: - print(f"跳过{report_type}通知:通知功能已禁用") - elif ( - CONFIG["ENABLE_NOTIFICATION"] - and has_notification - and not self._has_valid_content(stats, new_titles) - ): - mode_strategy = self._get_mode_strategy() - if "实时" in report_type: - print( - f"跳过实时推送通知:{mode_strategy['mode_name']}下未检测到匹配的新闻" - ) - else: - print( - f"跳过{mode_strategy['summary_report_type']}通知:未匹配到有效的新闻内容" - ) - - return False - - def _generate_summary_report(self, mode_strategy: Dict) -> Optional[str]: - """生成汇总报告(带通知)""" - summary_type = ( - "当前榜单汇总" if mode_strategy["summary_mode"] == "current" else "当日汇总" - ) - print(f"生成{summary_type}报告...") - - # 加载分析数据 - analysis_data = self._load_analysis_data() - if not analysis_data: - return None - - all_results, id_to_name, title_info, new_titles, word_groups, filter_words, global_filters = ( - analysis_data - ) - - # 运行分析流水线 - stats, html_file = self._run_analysis_pipeline( - all_results, - mode_strategy["summary_mode"], - title_info, - new_titles, - word_groups, - filter_words, - id_to_name, - is_daily_summary=True, - global_filters=global_filters, - ) - - print(f"{summary_type}报告已生成: {html_file}") - - # 发送通知 - self._send_notification_if_needed( - stats, - mode_strategy["summary_report_type"], - mode_strategy["summary_mode"], - failed_ids=[], - new_titles=new_titles, - id_to_name=id_to_name, - html_file_path=html_file, - ) - - return html_file - - def _generate_summary_html(self, mode: str = "daily") -> Optional[str]: - """生成汇总HTML""" - summary_type = "当前榜单汇总" if mode == "current" else "当日汇总" - print(f"生成{summary_type}HTML...") - - # 加载分析数据 - analysis_data = self._load_analysis_data() - if not analysis_data: - return None - - all_results, id_to_name, title_info, new_titles, word_groups, filter_words, global_filters = ( - 
analysis_data - ) - - # 运行分析流水线 - _, html_file = self._run_analysis_pipeline( - all_results, - mode, - title_info, - new_titles, - word_groups, - filter_words, - id_to_name, - is_daily_summary=True, - global_filters=global_filters, - ) - - print(f"{summary_type}HTML已生成: {html_file}") - return html_file - - def _initialize_and_check_config(self) -> None: - """通用初始化和配置检查""" - now = get_beijing_time() - print(f"当前北京时间: {now.strftime('%Y-%m-%d %H:%M:%S')}") - - if not CONFIG["ENABLE_CRAWLER"]: - print("爬虫功能已禁用(ENABLE_CRAWLER=False),程序退出") - return - - has_notification = self._has_notification_configured() - if not CONFIG["ENABLE_NOTIFICATION"]: - print("通知功能已禁用(ENABLE_NOTIFICATION=False),将只进行数据抓取") - elif not has_notification: - print("未配置任何通知渠道,将只进行数据抓取,不发送通知") - else: - print("通知功能已启用,将发送通知") - - mode_strategy = self._get_mode_strategy() - print(f"报告模式: {self.report_mode}") - print(f"运行模式: {mode_strategy['description']}") - - def _crawl_data(self) -> Tuple[Dict, Dict, List]: - """执行数据爬取""" - ids = [] - for platform in CONFIG["PLATFORMS"]: - if "name" in platform: - ids.append((platform["id"], platform["name"])) - else: - ids.append(platform["id"]) - - print( - f"配置的监控平台: {[p.get('name', p['id']) for p in CONFIG['PLATFORMS']]}" - ) - print(f"开始爬取数据,请求间隔 {self.request_interval} 毫秒") - ensure_directory_exists("output") - - results, id_to_name, failed_ids = self.data_fetcher.crawl_websites( - ids, self.request_interval - ) - - title_file = save_titles_to_file(results, id_to_name, failed_ids) - print(f"标题已保存到: {title_file}") - - return results, id_to_name, failed_ids - - def _execute_mode_strategy( - self, mode_strategy: Dict, results: Dict, id_to_name: Dict, failed_ids: List - ) -> Optional[str]: - """执行模式特定逻辑""" - # 获取当前监控平台ID列表 - current_platform_ids = [platform["id"] for platform in CONFIG["PLATFORMS"]] - - new_titles = detect_latest_new_titles(current_platform_ids) - time_info = Path(save_titles_to_file(results, id_to_name, failed_ids)).stem - word_groups, filter_words, global_filters = load_frequency_words() - - # current模式下,实时推送需要使用完整的历史数据来保证统计信息的完整性 - if self.report_mode == "current": - # 加载完整的历史数据(已按当前平台过滤) - analysis_data = self._load_analysis_data() - if analysis_data: - ( - all_results, - historical_id_to_name, - historical_title_info, - historical_new_titles, - _, - _, - _, - ) = analysis_data - - print( - f"current模式:使用过滤后的历史数据,包含平台:{list(all_results.keys())}" - ) - - stats, html_file = self._run_analysis_pipeline( - all_results, - self.report_mode, - historical_title_info, - historical_new_titles, - word_groups, - filter_words, - historical_id_to_name, - failed_ids=failed_ids, - global_filters=global_filters, - ) - - combined_id_to_name = {**historical_id_to_name, **id_to_name} - - print(f"HTML报告已生成: {html_file}") - - # 发送实时通知(使用完整历史数据的统计结果) - summary_html = None - if mode_strategy["should_send_realtime"]: - self._send_notification_if_needed( - stats, - mode_strategy["realtime_report_type"], - self.report_mode, - failed_ids=failed_ids, - new_titles=historical_new_titles, - id_to_name=combined_id_to_name, - html_file_path=html_file, - ) - else: - print("❌ 严重错误:无法读取刚保存的数据文件") - raise RuntimeError("数据一致性检查失败:保存后立即读取失败") - else: - title_info = self._prepare_current_title_info(results, time_info) - stats, html_file = self._run_analysis_pipeline( - results, - self.report_mode, - title_info, - new_titles, - word_groups, - filter_words, - id_to_name, - failed_ids=failed_ids, - global_filters=global_filters, - ) - print(f"HTML报告已生成: {html_file}") - - # 发送实时通知(如果需要) - summary_html = None - 
if mode_strategy["should_send_realtime"]: - self._send_notification_if_needed( - stats, - mode_strategy["realtime_report_type"], - self.report_mode, - failed_ids=failed_ids, - new_titles=new_titles, - id_to_name=id_to_name, - html_file_path=html_file, - ) - - # 生成汇总报告(如果需要) - summary_html = None - if mode_strategy["should_generate_summary"]: - if mode_strategy["should_send_realtime"]: - # 如果已经发送了实时通知,汇总只生成HTML不发送通知 - summary_html = self._generate_summary_html( - mode_strategy["summary_mode"] - ) - else: - # daily模式:直接生成汇总报告并发送通知 - summary_html = self._generate_summary_report(mode_strategy) - - # 打开浏览器(仅在非容器环境) - if self._should_open_browser() and html_file: - if summary_html: - summary_url = "file://" + str(Path(summary_html).resolve()) - print(f"正在打开汇总报告: {summary_url}") - webbrowser.open(summary_url) - else: - file_url = "file://" + str(Path(html_file).resolve()) - print(f"正在打开HTML报告: {file_url}") - webbrowser.open(file_url) - elif self.is_docker_container and html_file: - if summary_html: - print(f"汇总报告已生成(Docker环境): {summary_html}") - else: - print(f"HTML报告已生成(Docker环境): {html_file}") - - return summary_html - - def run(self) -> None: - """执行分析流程""" - try: - self._initialize_and_check_config() - - mode_strategy = self._get_mode_strategy() - - results, id_to_name, failed_ids = self._crawl_data() - - self._execute_mode_strategy(mode_strategy, results, id_to_name, failed_ids) - - except Exception as e: - print(f"分析流程执行出错: {e}") - raise - - -def main(): - try: - analyzer = NewsAnalyzer() - analyzer.run() - except FileNotFoundError as e: - print(f"❌ 配置文件错误: {e}") - print("\n请确保以下文件存在:") - print(" • config/config.yaml") - print(" • config/frequency_words.txt") - print("\n参考项目文档进行正确配置") - except Exception as e: - print(f"❌ 程序运行错误: {e}") - raise - - -if __name__ == "__main__": - main() diff --git a/mcp_server/__init__.py b/mcp_server/__init__.py index 352560e..4396d96 100644 --- a/mcp_server/__init__.py +++ b/mcp_server/__init__.py @@ -4,4 +4,4 @@ TrendRadar MCP Server 提供基于MCP协议的新闻聚合数据查询和系统管理接口。 """ -__version__ = "1.0.0" +__version__ = "1.1.0" diff --git a/mcp_server/server.py b/mcp_server/server.py index d1bbe04..8f9bb72 100644 --- a/mcp_server/server.py +++ b/mcp_server/server.py @@ -15,6 +15,7 @@ from .tools.analytics import AnalyticsTools from .tools.search_tools import SearchTools from .tools.config_mgmt import ConfigManagementTools from .tools.system import SystemManagementTools +from .tools.storage_sync import StorageSyncTools from .utils.date_parser import DateParser from .utils.errors import MCPError @@ -34,6 +35,7 @@ def _get_tools(project_root: Optional[str] = None): _tools_instances['search'] = SearchTools(project_root) _tools_instances['config'] = ConfigManagementTools(project_root) _tools_instances['system'] = SystemManagementTools(project_root) + _tools_instances['storage'] = StorageSyncTools(project_root) return _tools_instances @@ -657,6 +659,127 @@ async def trigger_crawl( return json.dumps(result, ensure_ascii=False, indent=2) +# ==================== 存储同步工具 ==================== + +@mcp.tool +async def sync_from_remote( + days: int = 7 +) -> str: + """ + 从远程存储拉取数据到本地 + + 用于 MCP Server 等场景:爬虫存到远程云存储(如 Cloudflare R2), + MCP Server 拉取到本地进行分析查询。 + + Args: + days: 拉取最近 N 天的数据,默认 7 天 + - 0: 不拉取 + - 7: 拉取最近一周的数据 + - 30: 拉取最近一个月的数据 + + Returns: + JSON格式的同步结果,包含: + - success: 是否成功 + - synced_files: 成功同步的文件数量 + - synced_dates: 成功同步的日期列表 + - skipped_dates: 跳过的日期(本地已存在) + - failed_dates: 失败的日期及错误信息 + - message: 操作结果描述 + + Examples: + - sync_from_remote() # 拉取最近7天 + - 
sync_from_remote(days=30) # 拉取最近30天 + + Note: + 需要在 config/config.yaml 中配置远程存储(storage.remote)或设置环境变量: + - S3_ENDPOINT_URL: 服务端点 + - S3_BUCKET_NAME: 存储桶名称 + - S3_ACCESS_KEY_ID: 访问密钥 ID + - S3_SECRET_ACCESS_KEY: 访问密钥 + """ + tools = _get_tools() + result = tools['storage'].sync_from_remote(days=days) + return json.dumps(result, ensure_ascii=False, indent=2) + + +@mcp.tool +async def get_storage_status() -> str: + """ + 获取存储配置和状态 + + 查看当前存储后端配置、本地和远程存储的状态信息。 + + Returns: + JSON格式的存储状态信息,包含: + - backend: 当前使用的后端类型(local/remote/auto) + - local: 本地存储状态 + - data_dir: 数据目录 + - retention_days: 保留天数 + - total_size: 总大小 + - date_count: 日期数量 + - earliest_date: 最早日期 + - latest_date: 最新日期 + - remote: 远程存储状态 + - configured: 是否已配置 + - endpoint_url: 服务端点 + - bucket_name: 存储桶名称 + - date_count: 远程日期数量 + - pull: 拉取配置 + - enabled: 是否启用自动拉取 + - days: 自动拉取天数 + + Examples: + - get_storage_status() # 查看所有存储状态 + """ + tools = _get_tools() + result = tools['storage'].get_storage_status() + return json.dumps(result, ensure_ascii=False, indent=2) + + +@mcp.tool +async def list_available_dates( + source: str = "both" +) -> str: + """ + 列出本地/远程可用的日期范围 + + 查看本地和远程存储中有哪些日期的数据可用, + 帮助了解数据覆盖范围和同步状态。 + + Args: + source: 数据来源,可选值: + - "local": 仅列出本地可用日期 + - "remote": 仅列出远程可用日期 + - "both": 同时列出两者并进行对比(默认) + + Returns: + JSON格式的日期列表,包含: + - local: 本地日期信息(如果 source 包含 local) + - dates: 日期列表(按时间倒序) + - count: 日期数量 + - earliest: 最早日期 + - latest: 最新日期 + - remote: 远程日期信息(如果 source 包含 remote) + - configured: 是否已配置远程存储 + - dates: 日期列表 + - count: 日期数量 + - earliest: 最早日期 + - latest: 最新日期 + - comparison: 对比结果(仅当 source="both" 时) + - only_local: 仅本地存在的日期 + - only_remote: 仅远程存在的日期 + - both: 两边都存在的日期 + + Examples: + - list_available_dates() # 查看本地和远程的对比 + - list_available_dates(source="local") # 仅查看本地 + - list_available_dates(source="remote") # 仅查看远程 + """ + tools = _get_tools() + result = tools['storage'].list_available_dates(source=source) + return json.dumps(result, ensure_ascii=False, indent=2) + + # ==================== 启动入口 ==================== def run_server( @@ -721,6 +844,11 @@ def run_server( print(" 11. get_current_config - 获取当前系统配置") print(" 12. get_system_status - 获取系统运行状态") print(" 13. trigger_crawl - 手动触发爬取任务") + print() + print(" === 存储同步工具 ===") + print(" 14. sync_from_remote - 从远程存储拉取数据到本地") + print(" 15. get_storage_status - 获取存储配置和状态") + print(" 16. 
list_available_dates - 列出本地/远程可用日期") print("=" * 60) print() diff --git a/mcp_server/services/data_service.py b/mcp_server/services/data_service.py index 9e409a1..bb31231 100644 --- a/mcp_server/services/data_service.py +++ b/mcp_server/services/data_service.py @@ -517,24 +517,55 @@ class DataService: # 遍历日期文件夹 for date_folder in output_dir.iterdir(): if date_folder.is_dir() and not date_folder.name.startswith('.'): - # 解析日期(格式: YYYY年MM月DD日) - try: - date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_folder.name) - if date_match: - folder_date = datetime( - int(date_match.group(1)), - int(date_match.group(2)), - int(date_match.group(3)) - ) - available_dates.append(folder_date) - except Exception: - pass + folder_date = self._parse_date_folder_name(date_folder.name) + if folder_date: + available_dates.append(folder_date) if not available_dates: return (None, None) return (min(available_dates), max(available_dates)) + def _parse_date_folder_name(self, folder_name: str) -> Optional[datetime]: + """ + 解析日期文件夹名称(兼容中文和ISO格式) + + 支持两种格式: + - 中文格式:YYYY年MM月DD日 + - ISO格式:YYYY-MM-DD + + Args: + folder_name: 文件夹名称 + + Returns: + datetime 对象,解析失败返回 None + """ + # 尝试中文格式:YYYY年MM月DD日 + chinese_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', folder_name) + if chinese_match: + try: + return datetime( + int(chinese_match.group(1)), + int(chinese_match.group(2)), + int(chinese_match.group(3)) + ) + except ValueError: + pass + + # 尝试 ISO 格式:YYYY-MM-DD + iso_match = re.match(r'(\d{4})-(\d{2})-(\d{2})', folder_name) + if iso_match: + try: + return datetime( + int(iso_match.group(1)), + int(iso_match.group(2)), + int(iso_match.group(3)) + ) + except ValueError: + pass + + return None + def get_system_status(self) -> Dict: """ 获取系统运行状态 @@ -553,26 +584,14 @@ class DataService: if output_dir.exists(): # 遍历日期文件夹 for date_folder in output_dir.iterdir(): - if date_folder.is_dir(): - # 解析日期 - try: - date_str = date_folder.name - # 格式: YYYY年MM月DD日 - date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_str) - if date_match: - folder_date = datetime( - int(date_match.group(1)), - int(date_match.group(2)), - int(date_match.group(3)) - ) - - if oldest_record is None or folder_date < oldest_record: - oldest_record = folder_date - if latest_record is None or folder_date > latest_record: - latest_record = folder_date - - except: - pass + if date_folder.is_dir() and not date_folder.name.startswith('.'): + # 解析日期(兼容中文和ISO格式) + folder_date = self._parse_date_folder_name(date_folder.name) + if folder_date: + if oldest_record is None or folder_date < oldest_record: + oldest_record = folder_date + if latest_record is None or folder_date > latest_record: + latest_record = folder_date # 计算存储大小 for item in date_folder.rglob("*"): diff --git a/mcp_server/services/parser_service.py b/mcp_server/services/parser_service.py index 6bd2969..d50fb17 100644 --- a/mcp_server/services/parser_service.py +++ b/mcp_server/services/parser_service.py @@ -2,9 +2,12 @@ 文件解析服务 提供txt格式新闻数据和YAML配置文件的解析功能。 +支持从 SQLite 数据库和 TXT 文件两种数据源读取。 """ +import json import re +import sqlite3 from pathlib import Path from typing import Dict, List, Tuple, Optional from datetime import datetime @@ -145,17 +148,310 @@ class ParserService: def get_date_folder_name(self, date: datetime = None) -> str: """ - 获取日期文件夹名称 + 获取日期文件夹名称(兼容中文和ISO格式) Args: date: 日期对象,默认为今天 Returns: - 文件夹名称,格式: YYYY年MM月DD日 + 实际存在的文件夹名称,优先返回中文格式(YYYY年MM月DD日), + 若不存在则返回 ISO 格式(YYYY-MM-DD) """ if date is None: date = datetime.now() - return date.strftime("%Y年%m月%d日") + return 
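# Illustrative sketch (not part of the patch): expected results of _parse_date_folder_name
# above for the two supported folder-name formats; anything else is skipped.
from datetime import datetime

expected = {
    "2025年10月11日": datetime(2025, 10, 11),  # legacy Chinese folder names
    "2025-10-11": datetime(2025, 10, 11),      # ISO folder names
    "misc": None,                              # non-date folders are ignored
}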
self._find_date_folder(date) + + def _get_date_folder_name(self, date: datetime = None) -> str: + """ + 获取日期文件夹名称(兼容中文和ISO格式) + + Args: + date: 日期对象,默认为今天 + + Returns: + 实际存在的文件夹名称,优先返回中文格式(YYYY年MM月DD日), + 若不存在则返回 ISO 格式(YYYY-MM-DD) + """ + if date is None: + date = datetime.now() + return self._find_date_folder(date) + + def _find_date_folder(self, date: datetime) -> str: + """ + 查找实际存在的日期文件夹 + + 支持两种格式: + - 中文格式:YYYY年MM月DD日(优先) + - ISO格式:YYYY-MM-DD + + Args: + date: 日期对象 + + Returns: + 实际存在的文件夹名称,若都不存在则返回中文格式 + """ + output_dir = self.project_root / "output" + + # 中文格式:YYYY年MM月DD日 + chinese_format = date.strftime("%Y年%m月%d日") + # ISO格式:YYYY-MM-DD + iso_format = date.strftime("%Y-%m-%d") + + # 优先检查中文格式 + if (output_dir / chinese_format).exists(): + return chinese_format + # 其次检查 ISO 格式 + if (output_dir / iso_format).exists(): + return iso_format + + # 都不存在,返回中文格式(与项目现有风格一致) + return chinese_format + + def _get_sqlite_db_path(self, date: datetime = None) -> Optional[Path]: + """ + 获取 SQLite 数据库文件路径 + + Args: + date: 日期对象,默认为今天 + + Returns: + 数据库文件路径,如果不存在则返回 None + """ + date_folder = self._get_date_folder_name(date) + db_path = self.project_root / "output" / date_folder / "news.db" + if db_path.exists(): + return db_path + return None + + def _get_txt_folder_path(self, date: datetime = None) -> Optional[Path]: + """ + 获取 TXT 文件夹路径 + + Args: + date: 日期对象,默认为今天 + + Returns: + TXT 文件夹路径,如果不存在则返回 None + """ + date_folder = self._get_date_folder_name(date) + txt_path = self.project_root / "output" / date_folder / "txt" + if txt_path.exists() and txt_path.is_dir(): + return txt_path + return None + + def _read_from_txt( + self, + date: datetime = None, + platform_ids: Optional[List[str]] = None + ) -> Optional[Tuple[Dict, Dict, Dict]]: + """ + 从 TXT 文件夹读取新闻数据 + + Args: + date: 日期对象,默认为今天 + platform_ids: 平台ID列表,None表示所有平台 + + Returns: + (all_titles, id_to_name, all_timestamps) 元组,如果不存在返回 None + """ + txt_folder = self._get_txt_folder_path(date) + if txt_folder is None: + return None + + # 获取所有 TXT 文件并按时间排序 + txt_files = sorted(txt_folder.glob("*.txt")) + if not txt_files: + return None + + all_titles = {} + id_to_name = {} + all_timestamps = {} + + for txt_file in txt_files: + try: + titles_by_id, file_id_to_name = self.parse_txt_file(txt_file) + + # 记录时间戳 + all_timestamps[txt_file.name] = txt_file.stat().st_mtime + + # 合并 id_to_name + id_to_name.update(file_id_to_name) + + # 合并标题数据 + for source_id, titles in titles_by_id.items(): + # 如果指定了 platform_ids,过滤 + if platform_ids and source_id not in platform_ids: + continue + + if source_id not in all_titles: + all_titles[source_id] = {} + + for title, data in titles.items(): + if title not in all_titles[source_id]: + # 新标题 + all_titles[source_id][title] = { + "ranks": data.get("ranks", []), + "url": data.get("url", ""), + "mobileUrl": data.get("mobileUrl", ""), + "first_time": txt_file.stem, # 使用文件名作为时间 + "last_time": txt_file.stem, + "count": 1, + } + else: + # 合并已存在的标题 + existing = all_titles[source_id][title] + # 合并排名 + for rank in data.get("ranks", []): + if rank not in existing["ranks"]: + existing["ranks"].append(rank) + # 更新 last_time + existing["last_time"] = txt_file.stem + existing["count"] += 1 + # 保留 URL + if not existing["url"] and data.get("url"): + existing["url"] = data["url"] + if not existing["mobileUrl"] and data.get("mobileUrl"): + existing["mobileUrl"] = data["mobileUrl"] + + except Exception as e: + print(f"Warning: 解析 TXT 文件失败 {txt_file}: {e}") + continue + + if not all_titles: + return None + + return (all_titles, id_to_name, 
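# Illustrative sketch (not part of the patch): how repeated appearances of the same title
# across TXT snapshots are folded together in _read_from_txt above — ranks are merged
# without duplicates, count is bumped, and last_time tracks the newest snapshot stem.
entry = {"ranks": [3], "first_time": "08时00分", "last_time": "08时00分", "count": 1}

new_ranks, snapshot_stem = [3, 5], "09时00分"
for rank in new_ranks:
    if rank not in entry["ranks"]:
        entry["ranks"].append(rank)
entry["last_time"] = snapshot_stem
entry["count"] += 1

assert entry == {"ranks": [3, 5], "first_time": "08时00分",
                 "last_time": "09时00分", "count": 2}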
all_timestamps) + + def _read_from_sqlite( + self, + date: datetime = None, + platform_ids: Optional[List[str]] = None + ) -> Optional[Tuple[Dict, Dict, Dict]]: + """ + 从 SQLite 数据库读取新闻数据 + + 新表结构数据已按 URL 去重,包含: + - first_crawl_time: 首次抓取时间 + - last_crawl_time: 最后抓取时间 + - crawl_count: 抓取次数 + + Args: + date: 日期对象,默认为今天 + platform_ids: 平台ID列表,None表示所有平台 + + Returns: + (all_titles, id_to_name, all_timestamps) 元组,如果数据库不存在返回 None + """ + db_path = self._get_sqlite_db_path(date) + if db_path is None: + return None + + all_titles = {} + id_to_name = {} + all_timestamps = {} + + try: + conn = sqlite3.connect(str(db_path)) + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + + # 检查表是否存在 + cursor.execute(""" + SELECT name FROM sqlite_master + WHERE type='table' AND name='news_items' + """) + if not cursor.fetchone(): + conn.close() + return None + + # 构建查询 + if platform_ids: + placeholders = ','.join(['?' for _ in platform_ids]) + query = f""" + SELECT n.id, n.platform_id, p.name as platform_name, n.title, + n.rank, n.url, n.mobile_url, + n.first_crawl_time, n.last_crawl_time, n.crawl_count + FROM news_items n + LEFT JOIN platforms p ON n.platform_id = p.id + WHERE n.platform_id IN ({placeholders}) + """ + cursor.execute(query, platform_ids) + else: + cursor.execute(""" + SELECT n.id, n.platform_id, p.name as platform_name, n.title, + n.rank, n.url, n.mobile_url, + n.first_crawl_time, n.last_crawl_time, n.crawl_count + FROM news_items n + LEFT JOIN platforms p ON n.platform_id = p.id + """) + + rows = cursor.fetchall() + + # 收集所有 news_item_id 用于查询历史排名 + news_ids = [row['id'] for row in rows] + rank_history_map = {} + + if news_ids: + placeholders = ",".join("?" * len(news_ids)) + cursor.execute(f""" + SELECT news_item_id, rank FROM rank_history + WHERE news_item_id IN ({placeholders}) + ORDER BY news_item_id, crawl_time + """, news_ids) + + for rh_row in cursor.fetchall(): + news_id = rh_row['news_item_id'] + rank = rh_row['rank'] + if news_id not in rank_history_map: + rank_history_map[news_id] = [] + rank_history_map[news_id].append(rank) + + for row in rows: + news_id = row['id'] + platform_id = row['platform_id'] + platform_name = row['platform_name'] or platform_id + title = row['title'] + + # 更新 id_to_name + if platform_id not in id_to_name: + id_to_name[platform_id] = platform_name + + # 初始化平台字典 + if platform_id not in all_titles: + all_titles[platform_id] = {} + + # 获取排名历史,如果为空则使用当前排名 + ranks = rank_history_map.get(news_id, [row['rank']]) + + # 直接使用数据(已去重) + all_titles[platform_id][title] = { + "ranks": ranks, + "url": row['url'] or "", + "mobileUrl": row['mobile_url'] or "", + "first_time": row['first_crawl_time'] or "", + "last_time": row['last_crawl_time'] or "", + "count": row['crawl_count'] or 1, + } + + # 获取抓取时间作为 timestamps + cursor.execute(""" + SELECT crawl_time FROM crawl_records + ORDER BY crawl_time + """) + for row in cursor.fetchall(): + crawl_time = row['crawl_time'] + all_timestamps[f"{crawl_time}.db"] = 0 # 用虚拟时间戳 + + conn.close() + + if not all_titles: + return None + + return (all_titles, id_to_name, all_timestamps) + + except Exception as e: + print(f"Warning: 从 SQLite 读取数据失败: {e}") + return None def read_all_titles_for_date( self, @@ -163,7 +459,7 @@ class ParserService: platform_ids: Optional[List[str]] = None ) -> Tuple[Dict, Dict, Dict]: """ - 读取指定日期的所有标题文件(带缓存) + 读取指定日期的所有标题(带缓存) Args: date: 日期对象,默认为今天 @@ -193,71 +489,23 @@ class ParserService: if cached: return cached - # 缓存未命中,读取文件 - date_folder = self.get_date_folder_name(date) - txt_dir = self.project_root 
/ "output" / date_folder / "txt" + # 优先从 SQLite 读取 + sqlite_result = self._read_from_sqlite(date, platform_ids) + if sqlite_result: + self.cache.set(cache_key, sqlite_result) + return sqlite_result - if not txt_dir.exists(): - raise DataNotFoundError( - f"未找到 {date_folder} 的数据目录", - suggestion="请先运行爬虫或检查日期是否正确" - ) + # SQLite 不存在,尝试从 TXT 读取 + txt_result = self._read_from_txt(date, platform_ids) + if txt_result: + self.cache.set(cache_key, txt_result) + return txt_result - all_titles = {} - id_to_name = {} - all_timestamps = {} - - # 读取所有txt文件 - txt_files = sorted(txt_dir.glob("*.txt")) - - if not txt_files: - raise DataNotFoundError( - f"{date_folder} 没有数据文件", - suggestion="请等待爬虫任务完成" - ) - - for txt_file in txt_files: - try: - titles_by_id, file_id_to_name = self.parse_txt_file(txt_file) - - # 更新id_to_name - id_to_name.update(file_id_to_name) - - # 合并标题数据 - for platform_id, titles in titles_by_id.items(): - # 如果指定了平台过滤 - if platform_ids and platform_id not in platform_ids: - continue - - if platform_id not in all_titles: - all_titles[platform_id] = {} - - for title, info in titles.items(): - if title in all_titles[platform_id]: - # 合并排名 - all_titles[platform_id][title]["ranks"].extend(info["ranks"]) - else: - all_titles[platform_id][title] = info.copy() - - # 记录文件时间戳 - all_timestamps[txt_file.name] = txt_file.stat().st_mtime - - except Exception as e: - # 忽略单个文件的解析错误,继续处理其他文件 - print(f"Warning: 解析文件 {txt_file} 失败: {e}") - continue - - if not all_titles: - raise DataNotFoundError( - f"{date_folder} 没有有效的数据", - suggestion="请检查数据文件格式或重新运行爬虫" - ) - - # 缓存结果 - result = (all_titles, id_to_name, all_timestamps) - self.cache.set(cache_key, result) - - return result + # 两种数据源都不存在 + raise DataNotFoundError( + f"未找到 {date_str} 的数据", + suggestion="请先运行爬虫或检查日期是否正确" + ) def parse_yaml_config(self, config_path: str = None) -> dict: """ diff --git a/mcp_server/tools/analytics.py b/mcp_server/tools/analytics.py index 4601e68..88ecc80 100644 --- a/mcp_server/tools/analytics.py +++ b/mcp_server/tools/analytics.py @@ -25,7 +25,6 @@ def calculate_news_weight(news_data: Dict, rank_threshold: int = 5) -> float: """ 计算新闻权重(用于排序) - 基于 main.py 的权重算法实现,综合考虑: - 排名权重 (60%):新闻在榜单中的排名 - 频次权重 (30%):新闻出现的次数 - 热度权重 (10%):高排名出现的比例 diff --git a/mcp_server/tools/storage_sync.py b/mcp_server/tools/storage_sync.py new file mode 100644 index 0000000..f5403f0 --- /dev/null +++ b/mcp_server/tools/storage_sync.py @@ -0,0 +1,468 @@ +# coding=utf-8 +""" +存储同步工具 + +实现从远程存储拉取数据到本地、获取存储状态、列出可用日期等功能。 +""" + +import os +import re +from pathlib import Path +from datetime import datetime, timedelta +from typing import Dict, List, Optional + +import yaml + +from ..utils.errors import MCPError + + +class StorageSyncTools: + """存储同步工具类""" + + def __init__(self, project_root: str = None): + """ + 初始化存储同步工具 + + Args: + project_root: 项目根目录 + """ + if project_root: + self.project_root = Path(project_root) + else: + current_file = Path(__file__) + self.project_root = current_file.parent.parent.parent + + self._config = None + self._remote_backend = None + + def _load_config(self) -> dict: + """加载配置文件""" + if self._config is None: + config_path = self.project_root / "config" / "config.yaml" + if config_path.exists(): + with open(config_path, "r", encoding="utf-8") as f: + self._config = yaml.safe_load(f) + else: + self._config = {} + return self._config + + def _get_storage_config(self) -> dict: + """获取存储配置""" + config = self._load_config() + return config.get("storage", {}) + + def _get_remote_config(self) -> dict: + """ + 获取远程存储配置(合并配置文件和环境变量) + 
""" + storage_config = self._get_storage_config() + remote_config = storage_config.get("remote", {}) + + return { + "endpoint_url": remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL", ""), + "bucket_name": remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME", ""), + "access_key_id": remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID", ""), + "secret_access_key": remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY", ""), + "region": remote_config.get("region") or os.environ.get("S3_REGION", ""), + } + + def _has_remote_config(self) -> bool: + """检查是否有有效的远程存储配置""" + config = self._get_remote_config() + return bool( + config.get("bucket_name") and + config.get("access_key_id") and + config.get("secret_access_key") and + config.get("endpoint_url") + ) + + def _get_remote_backend(self): + """获取远程存储后端实例""" + if self._remote_backend is not None: + return self._remote_backend + + if not self._has_remote_config(): + return None + + try: + from trendradar.storage.remote import RemoteStorageBackend + + remote_config = self._get_remote_config() + config = self._load_config() + timezone = config.get("app", {}).get("timezone", "Asia/Shanghai") + + self._remote_backend = RemoteStorageBackend( + bucket_name=remote_config["bucket_name"], + access_key_id=remote_config["access_key_id"], + secret_access_key=remote_config["secret_access_key"], + endpoint_url=remote_config["endpoint_url"], + region=remote_config.get("region", ""), + timezone=timezone, + ) + return self._remote_backend + except ImportError: + print("[存储同步] 远程存储后端需要安装 boto3: pip install boto3") + return None + except Exception as e: + print(f"[存储同步] 创建远程后端失败: {e}") + return None + + def _get_local_data_dir(self) -> Path: + """获取本地数据目录""" + storage_config = self._get_storage_config() + local_config = storage_config.get("local", {}) + data_dir = local_config.get("data_dir", "output") + return self.project_root / data_dir + + def _parse_date_folder_name(self, folder_name: str) -> Optional[datetime]: + """ + 解析日期文件夹名称(兼容中文和 ISO 格式) + + 支持两种格式: + - 中文格式:YYYY年MM月DD日 + - ISO 格式:YYYY-MM-DD + """ + # 尝试 ISO 格式 + iso_match = re.match(r'(\d{4})-(\d{2})-(\d{2})', folder_name) + if iso_match: + try: + return datetime( + int(iso_match.group(1)), + int(iso_match.group(2)), + int(iso_match.group(3)) + ) + except ValueError: + pass + + # 尝试中文格式 + chinese_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', folder_name) + if chinese_match: + try: + return datetime( + int(chinese_match.group(1)), + int(chinese_match.group(2)), + int(chinese_match.group(3)) + ) + except ValueError: + pass + + return None + + def _get_local_dates(self) -> List[str]: + """获取本地可用的日期列表""" + local_dir = self._get_local_data_dir() + dates = [] + + if not local_dir.exists(): + return dates + + for item in local_dir.iterdir(): + if item.is_dir() and not item.name.startswith('.'): + folder_date = self._parse_date_folder_name(item.name) + if folder_date: + dates.append(folder_date.strftime("%Y-%m-%d")) + + return sorted(dates, reverse=True) + + def _calculate_dir_size(self, path: Path) -> int: + """计算目录大小(字节)""" + total_size = 0 + if path.exists(): + for item in path.rglob("*"): + if item.is_file(): + total_size += item.stat().st_size + return total_size + + def sync_from_remote(self, days: int = 7) -> Dict: + """ + 从远程存储拉取数据到本地 + + Args: + days: 拉取最近 N 天的数据,默认 7 天 + + Returns: + 同步结果字典 + """ + try: + # 检查远程配置 + if not self._has_remote_config(): + return { + "success": False, + "error": { + "code": 
"REMOTE_NOT_CONFIGURED", + "message": "未配置远程存储", + "suggestion": "请在 config/config.yaml 中配置 storage.remote 或设置环境变量" + } + } + + # 获取远程后端 + remote_backend = self._get_remote_backend() + if remote_backend is None: + return { + "success": False, + "error": { + "code": "REMOTE_BACKEND_FAILED", + "message": "无法创建远程存储后端", + "suggestion": "请检查远程存储配置和 boto3 是否已安装" + } + } + + # 获取本地数据目录 + local_dir = self._get_local_data_dir() + local_dir.mkdir(parents=True, exist_ok=True) + + # 获取远程可用日期 + remote_dates = remote_backend.list_remote_dates() + + # 获取本地已有日期 + local_dates = set(self._get_local_dates()) + + # 计算需要拉取的日期(最近 N 天) + from trendradar.utils.time import get_configured_time + config = self._load_config() + timezone = config.get("app", {}).get("timezone", "Asia/Shanghai") + now = get_configured_time(timezone) + + target_dates = [] + for i in range(days): + date = now - timedelta(days=i) + date_str = date.strftime("%Y-%m-%d") + if date_str in remote_dates: + target_dates.append(date_str) + + # 执行拉取 + synced_dates = [] + skipped_dates = [] + failed_dates = [] + + for date_str in target_dates: + # 检查本地是否已存在 + if date_str in local_dates: + skipped_dates.append(date_str) + continue + + # 拉取单个日期 + try: + local_date_dir = local_dir / date_str + local_db_path = local_date_dir / "news.db" + remote_key = f"news/{date_str}.db" + + local_date_dir.mkdir(parents=True, exist_ok=True) + remote_backend.s3_client.download_file( + remote_backend.bucket_name, + remote_key, + str(local_db_path) + ) + synced_dates.append(date_str) + print(f"[存储同步] 已拉取: {date_str}") + except Exception as e: + failed_dates.append({"date": date_str, "error": str(e)}) + print(f"[存储同步] 拉取失败 ({date_str}): {e}") + + return { + "success": True, + "synced_files": len(synced_dates), + "synced_dates": synced_dates, + "skipped_dates": skipped_dates, + "failed_dates": failed_dates, + "message": f"成功同步 {len(synced_dates)} 天数据" + ( + f",跳过 {len(skipped_dates)} 天(本地已存在)" if skipped_dates else "" + ) + ( + f",失败 {len(failed_dates)} 天" if failed_dates else "" + ) + } + + except MCPError as e: + return { + "success": False, + "error": e.to_dict() + } + except Exception as e: + return { + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": str(e) + } + } + + def get_storage_status(self) -> Dict: + """ + 获取存储配置和状态 + + Returns: + 存储状态字典 + """ + try: + storage_config = self._get_storage_config() + config = self._load_config() + + # 本地存储状态 + local_config = storage_config.get("local", {}) + local_dir = self._get_local_data_dir() + local_size = self._calculate_dir_size(local_dir) + local_dates = self._get_local_dates() + + local_status = { + "data_dir": local_config.get("data_dir", "output"), + "retention_days": local_config.get("retention_days", 0), + "total_size": f"{local_size / 1024 / 1024:.2f} MB", + "total_size_bytes": local_size, + "date_count": len(local_dates), + "earliest_date": local_dates[-1] if local_dates else None, + "latest_date": local_dates[0] if local_dates else None, + } + + # 远程存储状态 + remote_config = storage_config.get("remote", {}) + has_remote = self._has_remote_config() + + remote_status = { + "configured": has_remote, + "retention_days": remote_config.get("retention_days", 0), + } + + if has_remote: + merged_config = self._get_remote_config() + # 脱敏显示 + endpoint = merged_config.get("endpoint_url", "") + bucket = merged_config.get("bucket_name", "") + remote_status["endpoint_url"] = endpoint + remote_status["bucket_name"] = bucket + + # 尝试获取远程日期列表 + remote_backend = self._get_remote_backend() + if remote_backend: + 
try: + remote_dates = remote_backend.list_remote_dates() + remote_status["date_count"] = len(remote_dates) + remote_status["earliest_date"] = remote_dates[-1] if remote_dates else None + remote_status["latest_date"] = remote_dates[0] if remote_dates else None + except Exception as e: + remote_status["error"] = str(e) + + # 拉取配置状态 + pull_config = storage_config.get("pull", {}) + pull_status = { + "enabled": pull_config.get("enabled", False), + "days": pull_config.get("days", 7), + } + + return { + "success": True, + "backend": storage_config.get("backend", "auto"), + "local": local_status, + "remote": remote_status, + "pull": pull_status, + } + + except MCPError as e: + return { + "success": False, + "error": e.to_dict() + } + except Exception as e: + return { + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": str(e) + } + } + + def list_available_dates(self, source: str = "both") -> Dict: + """ + 列出可用的日期范围 + + Args: + source: 数据来源 + - "local": 仅本地 + - "remote": 仅远程 + - "both": 两者都列出(默认) + + Returns: + 日期列表字典 + """ + try: + result = { + "success": True, + } + + # 本地日期 + if source in ("local", "both"): + local_dates = self._get_local_dates() + result["local"] = { + "dates": local_dates, + "count": len(local_dates), + "earliest": local_dates[-1] if local_dates else None, + "latest": local_dates[0] if local_dates else None, + } + + # 远程日期 + if source in ("remote", "both"): + if not self._has_remote_config(): + result["remote"] = { + "configured": False, + "dates": [], + "count": 0, + "earliest": None, + "latest": None, + "error": "未配置远程存储" + } + else: + remote_backend = self._get_remote_backend() + if remote_backend: + try: + remote_dates = remote_backend.list_remote_dates() + result["remote"] = { + "configured": True, + "dates": remote_dates, + "count": len(remote_dates), + "earliest": remote_dates[-1] if remote_dates else None, + "latest": remote_dates[0] if remote_dates else None, + } + except Exception as e: + result["remote"] = { + "configured": True, + "dates": [], + "count": 0, + "earliest": None, + "latest": None, + "error": str(e) + } + else: + result["remote"] = { + "configured": True, + "dates": [], + "count": 0, + "earliest": None, + "latest": None, + "error": "无法创建远程存储后端" + } + + # 如果同时查询两者,计算差异 + if source == "both" and "local" in result and "remote" in result: + local_set = set(result["local"]["dates"]) + remote_set = set(result["remote"].get("dates", [])) + + result["comparison"] = { + "only_local": sorted(list(local_set - remote_set), reverse=True), + "only_remote": sorted(list(remote_set - local_set), reverse=True), + "both": sorted(list(local_set & remote_set), reverse=True), + } + + return result + + except MCPError as e: + return { + "success": False, + "error": e.to_dict() + } + except Exception as e: + return { + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": str(e) + } + } diff --git a/mcp_server/tools/system.py b/mcp_server/tools/system.py index 2cf2248..af15a7b 100644 --- a/mcp_server/tools/system.py +++ b/mcp_server/tools/system.py @@ -87,13 +87,13 @@ class SystemManagementTools: >>> print(result['saved_files']) """ try: - import json import time - import random - import requests - from datetime import datetime - import pytz import yaml + from trendradar.crawler.fetcher import DataFetcher + from trendradar.storage.local import LocalStorageBackend + from trendradar.storage.base import convert_crawl_results_to_news_data + from trendradar.utils.time import get_configured_time, format_date_folder, format_time_filename + 
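# Illustrative sketch (not part of the patch): the remote-to-local path mapping used by
# sync_from_remote above for a single date — one SQLite file per day under a "news/" prefix.
from pathlib import Path

date_str = "2025-10-11"
remote_key = f"news/{date_str}.db"                      # object key in the bucket
local_db_path = Path("output") / date_str / "news.db"   # local destination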
from ..services.cache_service import get_cache # 参数验证 platforms = validate_platforms(platforms) @@ -129,9 +129,6 @@ class SystemManagementTools: else: target_platforms = all_platforms - # 获取请求间隔 - request_interval = config_data.get("crawler", {}).get("request_interval", 100) - # 构建平台ID列表 ids = [] for platform in target_platforms: @@ -142,87 +139,82 @@ class SystemManagementTools: print(f"开始临时爬取,平台: {[p.get('name', p['id']) for p in target_platforms]}") - # 爬取数据 - results = {} - id_to_name = {} - failed_ids = [] + # 初始化数据获取器 + crawler_config = config_data.get("crawler", {}) + proxy_url = None + if crawler_config.get("use_proxy"): + proxy_url = crawler_config.get("proxy_url") + + fetcher = DataFetcher(proxy_url=proxy_url) + request_interval = crawler_config.get("request_interval", 100) - for i, id_info in enumerate(ids): - if isinstance(id_info, tuple): - id_value, name = id_info - else: - id_value = id_info - name = id_value + # 执行爬取 + results, id_to_name, failed_ids = fetcher.crawl_websites( + ids_list=ids, + request_interval=request_interval + ) - id_to_name[id_value] = name + # 获取当前时间(统一使用 trendradar 的时间工具) + # 从配置中读取时区,默认为 Asia/Shanghai + timezone = config_data.get("app", {}).get("timezone", "Asia/Shanghai") + current_time = get_configured_time(timezone) + crawl_date = format_date_folder(None, timezone) + crawl_time_str = format_time_filename(timezone) - # 构建请求URL - url = f"https://newsnow.busiyi.world/api/s?id={id_value}&latest" + # 转换为标准数据模型 + news_data = convert_crawl_results_to_news_data( + results=results, + id_to_name=id_to_name, + failed_ids=failed_ids, + crawl_time=crawl_time_str, + crawl_date=crawl_date + ) - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", - "Accept": "application/json, text/plain, */*", - "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", - "Connection": "keep-alive", - "Cache-Control": "no-cache", - } + # 初始化存储后端 + storage = LocalStorageBackend( + data_dir=str(self.project_root / "output"), + enable_txt=True, + enable_html=True, + timezone=timezone + ) - # 重试机制 - max_retries = 2 - retries = 0 - success = False + # 尝试持久化数据 + save_success = False + save_error_msg = "" + saved_files = {} - while retries <= max_retries and not success: - try: - response = requests.get(url, headers=headers, timeout=10) - response.raise_for_status() + try: + # 1. 保存到 SQLite (核心持久化) + if storage.save_news_data(news_data): + save_success = True + + # 2. 如果请求保存到本地,生成 TXT/HTML 快照 + if save_to_local: + # 保存 TXT + txt_path = storage.save_txt_snapshot(news_data) + if txt_path: + saved_files["txt"] = txt_path - data_text = response.text - data_json = json.loads(data_text) + # 保存 HTML (使用简化版生成器) + html_content = self._generate_simple_html(results, id_to_name, failed_ids, current_time) + html_filename = f"{crawl_time_str}.html" + html_path = storage.save_html_report(html_content, html_filename) + if html_path: + saved_files["html"] = html_path - status = data_json.get("status", "未知") - if status not in ["success", "cache"]: - raise ValueError(f"响应状态异常: {status}") + except Exception as e: + # 捕获所有保存错误(特别是 Docker 只读卷导致的 PermissionError) + print(f"[System] 数据保存失败: {e}") + save_success = False + save_error_msg = str(e) - status_info = "最新数据" if status == "success" else "缓存数据" - print(f"获取 {id_value} 成功({status_info})") + # 3. 
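# Illustrative sketch (not part of the patch): the save-and-report pattern used by
# trigger_crawl above — a failed write (e.g. a read-only Docker volume) is surfaced in the
# result instead of aborting the crawl. save_fn is a stand-in for the storage backend call.
def persist(save_fn, data) -> dict:
    try:
        save_fn(data)
        return {"saved_to_local": True}
    except OSError as e:  # PermissionError / "Read-only file system" land here
        return {"saved_to_local": False, "save_error": str(e)}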
清除缓存,确保下次查询获取最新数据 + # 即使保存失败,内存中的数据可能已经通过其他方式更新,或者是临时的 + get_cache().clear() + print("[System] 缓存已清除") - # 解析数据 - results[id_value] = {} - for index, item in enumerate(data_json.get("items", []), 1): - title = item["title"] - url_link = item.get("url", "") - mobile_url = item.get("mobileUrl", "") - - if title in results[id_value]: - results[id_value][title]["ranks"].append(index) - else: - results[id_value][title] = { - "ranks": [index], - "url": url_link, - "mobileUrl": mobile_url, - } - - success = True - - except Exception as e: - retries += 1 - if retries <= max_retries: - wait_time = random.uniform(3, 5) - print(f"请求 {id_value} 失败: {e}. {wait_time:.2f}秒后重试...") - time.sleep(wait_time) - else: - print(f"请求 {id_value} 失败: {e}") - failed_ids.append(id_value) - - # 请求间隔 - if i < len(ids) - 1: - actual_interval = request_interval + random.randint(-10, 20) - actual_interval = max(50, actual_interval) - time.sleep(actual_interval / 1000) - - # 格式化返回数据 - news_data = [] + # 构建返回结果 + news_response_data = [] for platform_id, titles_data in results.items(): platform_name = id_to_name.get(platform_id, platform_id) for title, info in titles_data.items(): @@ -230,131 +222,42 @@ class SystemManagementTools: "platform_id": platform_id, "platform_name": platform_name, "title": title, - "ranks": info["ranks"] + "ranks": info.get("ranks", []) } - - # 条件性添加 URL 字段 if include_url: news_item["url"] = info.get("url", "") news_item["mobile_url"] = info.get("mobileUrl", "") + news_response_data.append(news_item) - news_data.append(news_item) - - # 获取北京时间 - beijing_tz = pytz.timezone("Asia/Shanghai") - now = datetime.now(beijing_tz) - - # 构建返回结果 result = { "success": True, "task_id": f"crawl_{int(time.time())}", "status": "completed", - "crawl_time": now.strftime("%Y-%m-%d %H:%M:%S"), + "crawl_time": current_time.strftime("%Y-%m-%d %H:%M:%S"), "platforms": list(results.keys()), - "total_news": len(news_data), + "total_news": len(news_response_data), "failed_platforms": failed_ids, - "data": news_data, - "saved_to_local": save_to_local + "data": news_response_data, + "saved_to_local": save_success and save_to_local } - # 如果需要持久化,调用保存逻辑 - if save_to_local: - try: - import re - - # 辅助函数:清理标题 - def clean_title(title: str) -> str: - """清理标题中的特殊字符""" - if not isinstance(title, str): - title = str(title) - cleaned_title = title.replace("\n", " ").replace("\r", " ") - cleaned_title = re.sub(r"\s+", " ", cleaned_title) - cleaned_title = cleaned_title.strip() - return cleaned_title - - # 辅助函数:创建目录 - def ensure_directory_exists(directory: str): - """确保目录存在""" - Path(directory).mkdir(parents=True, exist_ok=True) - - # 格式化日期和时间 - date_folder = now.strftime("%Y年%m月%d日") - time_filename = now.strftime("%H时%M分") - - # 创建 txt 文件路径 - txt_dir = self.project_root / "output" / date_folder / "txt" - ensure_directory_exists(str(txt_dir)) - txt_file_path = txt_dir / f"{time_filename}.txt" - - # 创建 html 文件路径 - html_dir = self.project_root / "output" / date_folder / "html" - ensure_directory_exists(str(html_dir)) - html_file_path = html_dir / f"{time_filename}.html" - - # 保存 txt 文件(按照 main.py 的格式) - with open(txt_file_path, "w", encoding="utf-8") as f: - for id_value, title_data in results.items(): - # id | name 或 id - name = id_to_name.get(id_value) - if name and name != id_value: - f.write(f"{id_value} | {name}\n") - else: - f.write(f"{id_value}\n") - - # 按排名排序标题 - sorted_titles = [] - for title, info in title_data.items(): - cleaned = clean_title(title) - if isinstance(info, dict): - ranks = info.get("ranks", []) - url = 
info.get("url", "") - mobile_url = info.get("mobileUrl", "") - else: - ranks = info if isinstance(info, list) else [] - url = "" - mobile_url = "" - - rank = ranks[0] if ranks else 1 - sorted_titles.append((rank, cleaned, url, mobile_url)) - - sorted_titles.sort(key=lambda x: x[0]) - - for rank, cleaned, url, mobile_url in sorted_titles: - line = f"{rank}. {cleaned}" - if url: - line += f" [URL:{url}]" - if mobile_url: - line += f" [MOBILE:{mobile_url}]" - f.write(line + "\n") - - f.write("\n") - - if failed_ids: - f.write("==== 以下ID请求失败 ====\n") - for id_value in failed_ids: - f.write(f"{id_value}\n") - - # 保存 html 文件(简化版) - html_content = self._generate_simple_html(results, id_to_name, failed_ids, now) - with open(html_file_path, "w", encoding="utf-8") as f: - f.write(html_content) - - print(f"数据已保存到:") - print(f" TXT: {txt_file_path}") - print(f" HTML: {html_file_path}") - - result["saved_files"] = { - "txt": str(txt_file_path), - "html": str(html_file_path) - } - result["note"] = "数据已持久化到 output 文件夹" - - except Exception as e: - print(f"保存文件失败: {e}") - result["save_error"] = str(e) - result["note"] = "爬取成功但保存失败,数据仅在内存中" + if save_success: + if save_to_local: + result["saved_files"] = saved_files + result["note"] = "数据已保存到 SQLite 数据库及 output 文件夹" + else: + result["note"] = "数据已保存到 SQLite 数据库 (仅内存中返回结果,未生成TXT快照)" else: - result["note"] = "临时爬取结果,未持久化到output文件夹" + # 明确告知用户保存失败 + result["saved_to_local"] = False + result["save_error"] = save_error_msg + if "Read-only file system" in save_error_msg or "Permission denied" in save_error_msg: + result["note"] = "爬取成功,但无法写入数据库(Docker只读模式)。数据仅在本次返回中有效。" + else: + result["note"] = f"爬取成功但保存失败: {save_error_msg}" + + # 清理资源 + storage.cleanup() return result diff --git a/mcp_server/utils/date_parser.py b/mcp_server/utils/date_parser.py index 3b28ebd..d50e7c9 100644 --- a/mcp_server/utils/date_parser.py +++ b/mcp_server/utils/date_parser.py @@ -283,13 +283,13 @@ class DateParser: date: datetime对象 Returns: - 文件夹名称,格式: YYYY年MM月DD日 + 文件夹名称,格式: YYYY-MM-DD Examples: >>> DateParser.format_date_folder(datetime(2025, 10, 11)) - '2025年10月11日' + '2025-10-11' """ - return date.strftime("%Y年%m月%d日") + return date.strftime("%Y-%m-%d") @staticmethod def validate_date_not_future(date: datetime) -> None: diff --git a/pyproject.toml b/pyproject.toml index 15f66a4..b19344c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "trendradar-mcp" -version = "1.0.3" +version = "1.1.0" description = "TrendRadar MCP Server - 新闻热点聚合工具" requires-python = ">=3.10" dependencies = [ diff --git a/requirements.txt b/requirements.txt index 7d66038..ca69a91 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ pytz>=2025.2,<2026.0 PyYAML>=6.0.3,<7.0.0 fastmcp>=2.12.0,<2.14.0 websockets>=13.0,<14.0 +boto3>=1.35.0,<2.0.0 diff --git a/trendradar/__init__.py b/trendradar/__init__.py new file mode 100644 index 0000000..3135177 --- /dev/null +++ b/trendradar/__init__.py @@ -0,0 +1,13 @@ +# coding=utf-8 +""" +TrendRadar - 热点新闻聚合与分析工具 + +使用方式: + python -m trendradar # 模块执行 + trendradar # 安装后执行 +""" + +from trendradar.context import AppContext + +__version__ = "4.0.0" +__all__ = ["AppContext", "__version__"] diff --git a/trendradar/__main__.py b/trendradar/__main__.py new file mode 100644 index 0000000..6c9682a --- /dev/null +++ b/trendradar/__main__.py @@ -0,0 +1,719 @@ +# coding=utf-8 +""" +TrendRadar 主程序 + +热点新闻聚合与分析工具 +支持: python -m trendradar +""" + +import os +import webbrowser +from pathlib import Path +from typing import Dict, List, Tuple, 
Optional + +import requests + +from trendradar.context import AppContext + +# 版本号直接定义,避免循环导入 +VERSION = "4.0.0" +from trendradar.core import load_config +from trendradar.crawler import DataFetcher +from trendradar.storage import convert_crawl_results_to_news_data + + +def check_version_update( + current_version: str, version_url: str, proxy_url: Optional[str] = None +) -> Tuple[bool, Optional[str]]: + """检查版本更新""" + try: + proxies = None + if proxy_url: + proxies = {"http": proxy_url, "https": proxy_url} + + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + "Accept": "text/plain, */*", + "Cache-Control": "no-cache", + } + + response = requests.get( + version_url, proxies=proxies, headers=headers, timeout=10 + ) + response.raise_for_status() + + remote_version = response.text.strip() + print(f"当前版本: {current_version}, 远程版本: {remote_version}") + + # 比较版本 + def parse_version(version_str): + try: + parts = version_str.strip().split(".") + if len(parts) != 3: + raise ValueError("版本号格式不正确") + return int(parts[0]), int(parts[1]), int(parts[2]) + except: + return 0, 0, 0 + + current_tuple = parse_version(current_version) + remote_tuple = parse_version(remote_version) + + need_update = current_tuple < remote_tuple + return need_update, remote_version if need_update else None + + except Exception as e: + print(f"版本检查失败: {e}") + return False, None + + +# === 主分析器 === +class NewsAnalyzer: + """新闻分析器""" + + # 模式策略定义 + MODE_STRATEGIES = { + "incremental": { + "mode_name": "增量模式", + "description": "增量模式(只关注新增新闻,无新增时不推送)", + "realtime_report_type": "实时增量", + "summary_report_type": "当日汇总", + "should_send_realtime": True, + "should_generate_summary": True, + "summary_mode": "daily", + }, + "current": { + "mode_name": "当前榜单模式", + "description": "当前榜单模式(当前榜单匹配新闻 + 新增新闻区域 + 按时推送)", + "realtime_report_type": "实时当前榜单", + "summary_report_type": "当前榜单汇总", + "should_send_realtime": True, + "should_generate_summary": True, + "summary_mode": "current", + }, + "daily": { + "mode_name": "当日汇总模式", + "description": "当日汇总模式(所有匹配新闻 + 新增新闻区域 + 按时推送)", + "realtime_report_type": "", + "summary_report_type": "当日汇总", + "should_send_realtime": False, + "should_generate_summary": True, + "summary_mode": "daily", + }, + } + + def __init__(self): + # 加载配置 + print("正在加载配置...") + config = load_config() + print(f"TrendRadar v{VERSION} 配置加载完成") + print(f"监控平台数量: {len(config['PLATFORMS'])}") + print(f"时区: {config.get('TIMEZONE', 'Asia/Shanghai')}") + + # 创建应用上下文 + self.ctx = AppContext(config) + + self.request_interval = self.ctx.config["REQUEST_INTERVAL"] + self.report_mode = self.ctx.config["REPORT_MODE"] + self.rank_threshold = self.ctx.rank_threshold + self.is_github_actions = os.environ.get("GITHUB_ACTIONS") == "true" + self.is_docker_container = self._detect_docker_environment() + self.update_info = None + self.proxy_url = None + self._setup_proxy() + self.data_fetcher = DataFetcher(self.proxy_url) + + # 初始化存储管理器(使用 AppContext) + self._init_storage_manager() + + if self.is_github_actions: + self._check_version_update() + + def _init_storage_manager(self) -> None: + """初始化存储管理器(使用 AppContext)""" + # 获取数据保留天数(支持环境变量覆盖) + env_retention = os.environ.get("STORAGE_RETENTION_DAYS", "").strip() + if env_retention: + # 环境变量覆盖配置 + self.ctx.config["STORAGE"]["RETENTION_DAYS"] = int(env_retention) + + self.storage_manager = self.ctx.get_storage_manager() + print(f"存储后端: {self.storage_manager.backend_name}") + + retention_days = self.ctx.config.get("STORAGE", {}).get("RETENTION_DAYS", 0) + if 
retention_days > 0: + print(f"数据保留天数: {retention_days} 天") + + def _detect_docker_environment(self) -> bool: + """检测是否运行在 Docker 容器中""" + try: + if os.environ.get("DOCKER_CONTAINER") == "true": + return True + + if os.path.exists("/.dockerenv"): + return True + + return False + except Exception: + return False + + def _should_open_browser(self) -> bool: + """判断是否应该打开浏览器""" + return not self.is_github_actions and not self.is_docker_container + + def _setup_proxy(self) -> None: + """设置代理配置""" + if not self.is_github_actions and self.ctx.config["USE_PROXY"]: + self.proxy_url = self.ctx.config["DEFAULT_PROXY"] + print("本地环境,使用代理") + elif not self.is_github_actions and not self.ctx.config["USE_PROXY"]: + print("本地环境,未启用代理") + else: + print("GitHub Actions环境,不使用代理") + + def _check_version_update(self) -> None: + """检查版本更新""" + try: + need_update, remote_version = check_version_update( + VERSION, self.ctx.config["VERSION_CHECK_URL"], self.proxy_url + ) + + if need_update and remote_version: + self.update_info = { + "current_version": VERSION, + "remote_version": remote_version, + } + print(f"发现新版本: {remote_version} (当前: {VERSION})") + else: + print("版本检查完成,当前为最新版本") + except Exception as e: + print(f"版本检查出错: {e}") + + def _get_mode_strategy(self) -> Dict: + """获取当前模式的策略配置""" + return self.MODE_STRATEGIES.get(self.report_mode, self.MODE_STRATEGIES["daily"]) + + def _has_notification_configured(self) -> bool: + """检查是否配置了任何通知渠道""" + cfg = self.ctx.config + return any( + [ + cfg["FEISHU_WEBHOOK_URL"], + cfg["DINGTALK_WEBHOOK_URL"], + cfg["WEWORK_WEBHOOK_URL"], + (cfg["TELEGRAM_BOT_TOKEN"] and cfg["TELEGRAM_CHAT_ID"]), + ( + cfg["EMAIL_FROM"] + and cfg["EMAIL_PASSWORD"] + and cfg["EMAIL_TO"] + ), + (cfg["NTFY_SERVER_URL"] and cfg["NTFY_TOPIC"]), + cfg["BARK_URL"], + cfg["SLACK_WEBHOOK_URL"], + ] + ) + + def _has_valid_content( + self, stats: List[Dict], new_titles: Optional[Dict] = None + ) -> bool: + """检查是否有有效的新闻内容""" + if self.report_mode in ["incremental", "current"]: + # 增量模式和current模式下,只要stats有内容就说明有匹配的新闻 + return any(stat["count"] > 0 for stat in stats) + else: + # 当日汇总模式下,检查是否有匹配的频率词新闻或新增新闻 + has_matched_news = any(stat["count"] > 0 for stat in stats) + has_new_news = bool( + new_titles and any(len(titles) > 0 for titles in new_titles.values()) + ) + return has_matched_news or has_new_news + + def _load_analysis_data( + self, + ) -> Optional[Tuple[Dict, Dict, Dict, Dict, List, List]]: + """统一的数据加载和预处理,使用当前监控平台列表过滤历史数据""" + try: + # 获取当前配置的监控平台ID列表 + current_platform_ids = self.ctx.platform_ids + print(f"当前监控平台: {current_platform_ids}") + + all_results, id_to_name, title_info = self.ctx.read_today_titles( + current_platform_ids + ) + + if not all_results: + print("没有找到当天的数据") + return None + + total_titles = sum(len(titles) for titles in all_results.values()) + print(f"读取到 {total_titles} 个标题(已按当前监控平台过滤)") + + new_titles = self.ctx.detect_new_titles(current_platform_ids) + word_groups, filter_words, global_filters = self.ctx.load_frequency_words() + + return ( + all_results, + id_to_name, + title_info, + new_titles, + word_groups, + filter_words, + global_filters, + ) + except Exception as e: + print(f"数据加载失败: {e}") + return None + + def _prepare_current_title_info(self, results: Dict, time_info: str) -> Dict: + """从当前抓取结果构建标题信息""" + title_info = {} + for source_id, titles_data in results.items(): + title_info[source_id] = {} + for title, title_data in titles_data.items(): + ranks = title_data.get("ranks", []) + url = title_data.get("url", "") + mobile_url = title_data.get("mobileUrl", "") + + 
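+                # Each entry is seeded from the current crawl batch only:
+                # first_time == last_time == time_info and count == 1, e.g.
+                # {"first_time": "08-30", "last_time": "08-30", "count": 1, "ranks": [2], ...}
+                # (times use the HH-MM filename format; the values shown are illustrative)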
title_info[source_id][title] = { + "first_time": time_info, + "last_time": time_info, + "count": 1, + "ranks": ranks, + "url": url, + "mobileUrl": mobile_url, + } + return title_info + + def _run_analysis_pipeline( + self, + data_source: Dict, + mode: str, + title_info: Dict, + new_titles: Dict, + word_groups: List[Dict], + filter_words: List[str], + id_to_name: Dict, + failed_ids: Optional[List] = None, + is_daily_summary: bool = False, + global_filters: Optional[List[str]] = None, + ) -> Tuple[List[Dict], Optional[str]]: + """统一的分析流水线:数据处理 → 统计计算 → HTML生成""" + + # 统计计算(使用 AppContext) + stats, total_titles = self.ctx.count_frequency( + data_source, + word_groups, + filter_words, + id_to_name, + title_info, + new_titles, + mode=mode, + global_filters=global_filters, + ) + + # HTML生成(如果启用) + html_file = None + if self.ctx.config["STORAGE"]["FORMATS"]["HTML"]: + html_file = self.ctx.generate_html( + stats, + total_titles, + failed_ids=failed_ids, + new_titles=new_titles, + id_to_name=id_to_name, + mode=mode, + is_daily_summary=is_daily_summary, + update_info=self.update_info if self.ctx.config["SHOW_VERSION_UPDATE"] else None, + ) + + return stats, html_file + + def _send_notification_if_needed( + self, + stats: List[Dict], + report_type: str, + mode: str, + failed_ids: Optional[List] = None, + new_titles: Optional[Dict] = None, + id_to_name: Optional[Dict] = None, + html_file_path: Optional[str] = None, + ) -> bool: + """统一的通知发送逻辑,包含所有判断条件""" + has_notification = self._has_notification_configured() + cfg = self.ctx.config + + if ( + cfg["ENABLE_NOTIFICATION"] + and has_notification + and self._has_valid_content(stats, new_titles) + ): + # 推送窗口控制 + if cfg["PUSH_WINDOW"]["ENABLED"]: + push_manager = self.ctx.create_push_manager() + time_range_start = cfg["PUSH_WINDOW"]["TIME_RANGE"]["START"] + time_range_end = cfg["PUSH_WINDOW"]["TIME_RANGE"]["END"] + + if not push_manager.is_in_time_range(time_range_start, time_range_end): + now = self.ctx.get_time() + print( + f"推送窗口控制:当前时间 {now.strftime('%H:%M')} 不在推送时间窗口 {time_range_start}-{time_range_end} 内,跳过推送" + ) + return False + + if cfg["PUSH_WINDOW"]["ONCE_PER_DAY"]: + if push_manager.has_pushed_today(): + print(f"推送窗口控制:今天已推送过,跳过本次推送") + return False + else: + print(f"推送窗口控制:今天首次推送") + + # 准备报告数据 + report_data = self.ctx.prepare_report(stats, failed_ids, new_titles, id_to_name, mode) + + # 是否发送版本更新信息 + update_info_to_send = self.update_info if cfg["SHOW_VERSION_UPDATE"] else None + + # 使用 NotificationDispatcher 发送到所有渠道 + dispatcher = self.ctx.create_notification_dispatcher() + results = dispatcher.dispatch_all( + report_data=report_data, + report_type=report_type, + update_info=update_info_to_send, + proxy_url=self.proxy_url, + mode=mode, + html_file_path=html_file_path, + ) + + if not results: + print("未配置任何通知渠道,跳过通知发送") + return False + + # 如果成功发送了任何通知,且启用了每天只推一次,则记录推送 + if ( + cfg["PUSH_WINDOW"]["ENABLED"] + and cfg["PUSH_WINDOW"]["ONCE_PER_DAY"] + and any(results.values()) + ): + push_manager = self.ctx.create_push_manager() + push_manager.record_push(report_type) + + return True + + elif cfg["ENABLE_NOTIFICATION"] and not has_notification: + print("⚠️ 警告:通知功能已启用但未配置任何通知渠道,将跳过通知发送") + elif not cfg["ENABLE_NOTIFICATION"]: + print(f"跳过{report_type}通知:通知功能已禁用") + elif ( + cfg["ENABLE_NOTIFICATION"] + and has_notification + and not self._has_valid_content(stats, new_titles) + ): + mode_strategy = self._get_mode_strategy() + if "实时" in report_type: + print( + f"跳过实时推送通知:{mode_strategy['mode_name']}下未检测到匹配的新闻" + ) + else: + print( + 
f"跳过{mode_strategy['summary_report_type']}通知:未匹配到有效的新闻内容" + ) + + return False + + def _generate_summary_report(self, mode_strategy: Dict) -> Optional[str]: + """生成汇总报告(带通知)""" + summary_type = ( + "当前榜单汇总" if mode_strategy["summary_mode"] == "current" else "当日汇总" + ) + print(f"生成{summary_type}报告...") + + # 加载分析数据 + analysis_data = self._load_analysis_data() + if not analysis_data: + return None + + all_results, id_to_name, title_info, new_titles, word_groups, filter_words, global_filters = ( + analysis_data + ) + + # 运行分析流水线 + stats, html_file = self._run_analysis_pipeline( + all_results, + mode_strategy["summary_mode"], + title_info, + new_titles, + word_groups, + filter_words, + id_to_name, + is_daily_summary=True, + global_filters=global_filters, + ) + + if html_file: + print(f"{summary_type}报告已生成: {html_file}") + + # 发送通知 + self._send_notification_if_needed( + stats, + mode_strategy["summary_report_type"], + mode_strategy["summary_mode"], + failed_ids=[], + new_titles=new_titles, + id_to_name=id_to_name, + html_file_path=html_file, + ) + + return html_file + + def _generate_summary_html(self, mode: str = "daily") -> Optional[str]: + """生成汇总HTML""" + summary_type = "当前榜单汇总" if mode == "current" else "当日汇总" + print(f"生成{summary_type}HTML...") + + # 加载分析数据 + analysis_data = self._load_analysis_data() + if not analysis_data: + return None + + all_results, id_to_name, title_info, new_titles, word_groups, filter_words, global_filters = ( + analysis_data + ) + + # 运行分析流水线 + _, html_file = self._run_analysis_pipeline( + all_results, + mode, + title_info, + new_titles, + word_groups, + filter_words, + id_to_name, + is_daily_summary=True, + global_filters=global_filters, + ) + + if html_file: + print(f"{summary_type}HTML已生成: {html_file}") + return html_file + + def _initialize_and_check_config(self) -> None: + """通用初始化和配置检查""" + now = self.ctx.get_time() + print(f"当前北京时间: {now.strftime('%Y-%m-%d %H:%M:%S')}") + + if not self.ctx.config["ENABLE_CRAWLER"]: + print("爬虫功能已禁用(ENABLE_CRAWLER=False),程序退出") + return + + has_notification = self._has_notification_configured() + if not self.ctx.config["ENABLE_NOTIFICATION"]: + print("通知功能已禁用(ENABLE_NOTIFICATION=False),将只进行数据抓取") + elif not has_notification: + print("未配置任何通知渠道,将只进行数据抓取,不发送通知") + else: + print("通知功能已启用,将发送通知") + + mode_strategy = self._get_mode_strategy() + print(f"报告模式: {self.report_mode}") + print(f"运行模式: {mode_strategy['description']}") + + def _crawl_data(self) -> Tuple[Dict, Dict, List]: + """执行数据爬取""" + ids = [] + for platform in self.ctx.platforms: + if "name" in platform: + ids.append((platform["id"], platform["name"])) + else: + ids.append(platform["id"]) + + print( + f"配置的监控平台: {[p.get('name', p['id']) for p in self.ctx.platforms]}" + ) + print(f"开始爬取数据,请求间隔 {self.request_interval} 毫秒") + Path("output").mkdir(parents=True, exist_ok=True) + + results, id_to_name, failed_ids = self.data_fetcher.crawl_websites( + ids, self.request_interval + ) + + # 转换为 NewsData 格式并保存到存储后端 + crawl_time = self.ctx.format_time() + crawl_date = self.ctx.format_date() + news_data = convert_crawl_results_to_news_data( + results, id_to_name, failed_ids, crawl_time, crawl_date + ) + + # 保存到存储后端(SQLite) + if self.storage_manager.save_news_data(news_data): + print(f"数据已保存到存储后端: {self.storage_manager.backend_name}") + + # 保存 TXT 快照(如果启用) + txt_file = self.storage_manager.save_txt_snapshot(news_data) + if txt_file: + print(f"TXT 快照已保存: {txt_file}") + + # 兼容:同时保存到原有 TXT 格式(确保向后兼容) + if self.ctx.config["STORAGE"]["FORMATS"]["TXT"]: + title_file = 
self.ctx.save_titles(results, id_to_name, failed_ids) + print(f"标题已保存到: {title_file}") + + return results, id_to_name, failed_ids + + def _execute_mode_strategy( + self, mode_strategy: Dict, results: Dict, id_to_name: Dict, failed_ids: List + ) -> Optional[str]: + """执行模式特定逻辑""" + # 获取当前监控平台ID列表 + current_platform_ids = self.ctx.platform_ids + + new_titles = self.ctx.detect_new_titles(current_platform_ids) + time_info = self.ctx.format_time() + if self.ctx.config["STORAGE"]["FORMATS"]["TXT"]: + self.ctx.save_titles(results, id_to_name, failed_ids) + word_groups, filter_words, global_filters = self.ctx.load_frequency_words() + + # current模式下,实时推送需要使用完整的历史数据来保证统计信息的完整性 + if self.report_mode == "current": + # 加载完整的历史数据(已按当前平台过滤) + analysis_data = self._load_analysis_data() + if analysis_data: + ( + all_results, + historical_id_to_name, + historical_title_info, + historical_new_titles, + _, + _, + _, + ) = analysis_data + + print( + f"current模式:使用过滤后的历史数据,包含平台:{list(all_results.keys())}" + ) + + stats, html_file = self._run_analysis_pipeline( + all_results, + self.report_mode, + historical_title_info, + historical_new_titles, + word_groups, + filter_words, + historical_id_to_name, + failed_ids=failed_ids, + global_filters=global_filters, + ) + + combined_id_to_name = {**historical_id_to_name, **id_to_name} + + if html_file: + print(f"HTML报告已生成: {html_file}") + + # 发送实时通知(使用完整历史数据的统计结果) + summary_html = None + if mode_strategy["should_send_realtime"]: + self._send_notification_if_needed( + stats, + mode_strategy["realtime_report_type"], + self.report_mode, + failed_ids=failed_ids, + new_titles=historical_new_titles, + id_to_name=combined_id_to_name, + html_file_path=html_file, + ) + else: + print("❌ 严重错误:无法读取刚保存的数据文件") + raise RuntimeError("数据一致性检查失败:保存后立即读取失败") + else: + title_info = self._prepare_current_title_info(results, time_info) + stats, html_file = self._run_analysis_pipeline( + results, + self.report_mode, + title_info, + new_titles, + word_groups, + filter_words, + id_to_name, + failed_ids=failed_ids, + global_filters=global_filters, + ) + if html_file: + print(f"HTML报告已生成: {html_file}") + + # 发送实时通知(如果需要) + summary_html = None + if mode_strategy["should_send_realtime"]: + self._send_notification_if_needed( + stats, + mode_strategy["realtime_report_type"], + self.report_mode, + failed_ids=failed_ids, + new_titles=new_titles, + id_to_name=id_to_name, + html_file_path=html_file, + ) + + # 生成汇总报告(如果需要) + summary_html = None + if mode_strategy["should_generate_summary"]: + if mode_strategy["should_send_realtime"]: + # 如果已经发送了实时通知,汇总只生成HTML不发送通知 + summary_html = self._generate_summary_html( + mode_strategy["summary_mode"] + ) + else: + # daily模式:直接生成汇总报告并发送通知 + summary_html = self._generate_summary_report(mode_strategy) + + # 打开浏览器(仅在非容器环境) + if self._should_open_browser() and html_file: + if summary_html: + summary_url = "file://" + str(Path(summary_html).resolve()) + print(f"正在打开汇总报告: {summary_url}") + webbrowser.open(summary_url) + else: + file_url = "file://" + str(Path(html_file).resolve()) + print(f"正在打开HTML报告: {file_url}") + webbrowser.open(file_url) + elif self.is_docker_container and html_file: + if summary_html: + print(f"汇总报告已生成(Docker环境): {summary_html}") + else: + print(f"HTML报告已生成(Docker环境): {html_file}") + + return summary_html + + def run(self) -> None: + """执行分析流程""" + try: + self._initialize_and_check_config() + + mode_strategy = self._get_mode_strategy() + + results, id_to_name, failed_ids = self._crawl_data() + + self._execute_mode_strategy(mode_strategy, results, 
id_to_name, failed_ids) + + except Exception as e: + print(f"分析流程执行出错: {e}") + raise + finally: + # 清理资源(包括过期数据清理和数据库连接关闭) + self.ctx.cleanup() + + +def main(): + """主程序入口""" + try: + analyzer = NewsAnalyzer() + analyzer.run() + except FileNotFoundError as e: + print(f"❌ 配置文件错误: {e}") + print("\n请确保以下文件存在:") + print(" • config/config.yaml") + print(" • config/frequency_words.txt") + print("\n参考项目文档进行正确配置") + except Exception as e: + print(f"❌ 程序运行错误: {e}") + raise + + +if __name__ == "__main__": + main() diff --git a/trendradar/context.py b/trendradar/context.py new file mode 100644 index 0000000..3fc7d15 --- /dev/null +++ b/trendradar/context.py @@ -0,0 +1,388 @@ +# coding=utf-8 +""" +应用上下文模块 + +提供配置上下文类,封装所有依赖配置的操作,消除全局状态和包装函数。 +""" + +from datetime import datetime +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Tuple + +from trendradar.utils.time import ( + get_configured_time, + format_date_folder, + format_time_filename, + get_current_time_display, + convert_time_for_display, +) +from trendradar.core import ( + load_frequency_words, + matches_word_groups, + save_titles_to_file, + read_all_today_titles, + detect_latest_new_titles, + is_first_crawl_today, + count_word_frequency, +) +from trendradar.report import ( + clean_title, + prepare_report_data, + generate_html_report, + render_html_content, +) +from trendradar.notification import ( + render_feishu_content, + render_dingtalk_content, + split_content_into_batches, + NotificationDispatcher, + PushRecordManager, +) +from trendradar.storage import get_storage_manager + + +class AppContext: + """ + 应用上下文类 + + 封装所有依赖配置的操作,提供统一的接口。 + 消除对全局 CONFIG 的依赖,提高可测试性。 + + 使用示例: + config = load_config() + ctx = AppContext(config) + + # 时间操作 + now = ctx.get_time() + date_folder = ctx.format_date() + + # 存储操作 + storage = ctx.get_storage_manager() + + # 报告生成 + html = ctx.generate_html_report(stats, total_titles, ...) 
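+
+        # 通知发送(示意)/ notification dispatch (illustrative)
+        dispatcher = ctx.create_notification_dispatcher()
+        dispatcher.dispatch_all(report_data=report_data, report_type="当日汇总", ...)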
+ """ + + def __init__(self, config: Dict[str, Any]): + """ + 初始化应用上下文 + + Args: + config: 完整的配置字典 + """ + self.config = config + self._storage_manager = None + + # === 配置访问 === + + @property + def timezone(self) -> str: + """获取配置的时区""" + return self.config.get("TIMEZONE", "Asia/Shanghai") + + @property + def rank_threshold(self) -> int: + """获取排名阈值""" + return self.config.get("RANK_THRESHOLD", 50) + + @property + def weight_config(self) -> Dict: + """获取权重配置""" + return self.config.get("WEIGHT_CONFIG", {}) + + @property + def platforms(self) -> List[Dict]: + """获取平台配置列表""" + return self.config.get("PLATFORMS", []) + + @property + def platform_ids(self) -> List[str]: + """获取平台ID列表""" + return [p["id"] for p in self.platforms] + + # === 时间操作 === + + def get_time(self) -> datetime: + """获取当前配置时区的时间""" + return get_configured_time(self.timezone) + + def format_date(self) -> str: + """格式化日期文件夹 (YYYY-MM-DD)""" + return format_date_folder(timezone=self.timezone) + + def format_time(self) -> str: + """格式化时间文件名 (HH-MM)""" + return format_time_filename(self.timezone) + + def get_time_display(self) -> str: + """获取时间显示 (HH:MM)""" + return get_current_time_display(self.timezone) + + @staticmethod + def convert_time_display(time_str: str) -> str: + """将 HH-MM 转换为 HH:MM""" + return convert_time_for_display(time_str) + + # === 存储操作 === + + def get_storage_manager(self): + """获取存储管理器(延迟初始化,单例)""" + if self._storage_manager is None: + storage_config = self.config.get("STORAGE", {}) + remote_config = storage_config.get("REMOTE", {}) + local_config = storage_config.get("LOCAL", {}) + pull_config = storage_config.get("PULL", {}) + + self._storage_manager = get_storage_manager( + backend_type=storage_config.get("BACKEND", "auto"), + data_dir=local_config.get("DATA_DIR", "output"), + enable_txt=storage_config.get("FORMATS", {}).get("TXT", True), + enable_html=storage_config.get("FORMATS", {}).get("HTML", True), + remote_config={ + "bucket_name": remote_config.get("BUCKET_NAME", ""), + "access_key_id": remote_config.get("ACCESS_KEY_ID", ""), + "secret_access_key": remote_config.get("SECRET_ACCESS_KEY", ""), + "endpoint_url": remote_config.get("ENDPOINT_URL", ""), + "region": remote_config.get("REGION", ""), + }, + local_retention_days=local_config.get("RETENTION_DAYS", 0), + remote_retention_days=remote_config.get("RETENTION_DAYS", 0), + pull_enabled=pull_config.get("ENABLED", False), + pull_days=pull_config.get("DAYS", 7), + timezone=self.timezone, + ) + return self._storage_manager + + def get_output_path(self, subfolder: str, filename: str) -> str: + """获取输出路径""" + output_dir = Path("output") / self.format_date() / subfolder + output_dir.mkdir(parents=True, exist_ok=True) + return str(output_dir / filename) + + # === 数据处理 === + + def save_titles(self, results: Dict, id_to_name: Dict, failed_ids: List) -> str: + """保存标题到文件""" + output_path = self.get_output_path("txt", f"{self.format_time()}.txt") + return save_titles_to_file(results, id_to_name, failed_ids, output_path, clean_title) + + def read_today_titles( + self, platform_ids: Optional[List[str]] = None + ) -> Tuple[Dict, Dict, Dict]: + """读取当天所有标题""" + return read_all_today_titles(self.get_storage_manager(), platform_ids) + + def detect_new_titles( + self, platform_ids: Optional[List[str]] = None + ) -> Dict: + """检测最新批次的新增标题""" + return detect_latest_new_titles(self.get_storage_manager(), platform_ids) + + def is_first_crawl(self) -> bool: + """检测是否是当天第一次爬取""" + return is_first_crawl_today("output", self.format_date()) + + # === 频率词处理 === + + def 
load_frequency_words( + self, frequency_file: Optional[str] = None + ) -> Tuple[List[Dict], List[str], List[str]]: + """加载频率词配置""" + return load_frequency_words(frequency_file) + + def matches_word_groups( + self, + title: str, + word_groups: List[Dict], + filter_words: List[str], + global_filters: Optional[List[str]] = None, + ) -> bool: + """检查标题是否匹配词组规则""" + return matches_word_groups(title, word_groups, filter_words, global_filters) + + # === 统计分析 === + + def count_frequency( + self, + results: Dict, + word_groups: List[Dict], + filter_words: List[str], + id_to_name: Dict, + title_info: Optional[Dict] = None, + new_titles: Optional[Dict] = None, + mode: str = "daily", + global_filters: Optional[List[str]] = None, + ) -> Tuple[List[Dict], int]: + """统计词频""" + return count_word_frequency( + results=results, + word_groups=word_groups, + filter_words=filter_words, + id_to_name=id_to_name, + title_info=title_info, + rank_threshold=self.rank_threshold, + new_titles=new_titles, + mode=mode, + global_filters=global_filters, + weight_config=self.weight_config, + max_news_per_keyword=self.config.get("MAX_NEWS_PER_KEYWORD", 0), + sort_by_position_first=self.config.get("SORT_BY_POSITION_FIRST", False), + is_first_crawl_func=self.is_first_crawl, + convert_time_func=self.convert_time_display, + ) + + # === 报告生成 === + + def prepare_report( + self, + stats: List[Dict], + failed_ids: Optional[List] = None, + new_titles: Optional[Dict] = None, + id_to_name: Optional[Dict] = None, + mode: str = "daily", + ) -> Dict: + """准备报告数据""" + return prepare_report_data( + stats=stats, + failed_ids=failed_ids, + new_titles=new_titles, + id_to_name=id_to_name, + mode=mode, + rank_threshold=self.rank_threshold, + matches_word_groups_func=self.matches_word_groups, + load_frequency_words_func=self.load_frequency_words, + ) + + def generate_html( + self, + stats: List[Dict], + total_titles: int, + failed_ids: Optional[List] = None, + new_titles: Optional[Dict] = None, + id_to_name: Optional[Dict] = None, + mode: str = "daily", + is_daily_summary: bool = False, + update_info: Optional[Dict] = None, + ) -> str: + """生成HTML报告""" + return generate_html_report( + stats=stats, + total_titles=total_titles, + failed_ids=failed_ids, + new_titles=new_titles, + id_to_name=id_to_name, + mode=mode, + is_daily_summary=is_daily_summary, + update_info=update_info, + rank_threshold=self.rank_threshold, + output_dir="output", + date_folder=self.format_date(), + time_filename=self.format_time(), + render_html_func=lambda *args, **kwargs: self.render_html(*args, **kwargs), + matches_word_groups_func=self.matches_word_groups, + load_frequency_words_func=self.load_frequency_words, + enable_index_copy=True, + ) + + def render_html( + self, + report_data: Dict, + total_titles: int, + is_daily_summary: bool = False, + mode: str = "daily", + update_info: Optional[Dict] = None, + ) -> str: + """渲染HTML内容""" + return render_html_content( + report_data=report_data, + total_titles=total_titles, + is_daily_summary=is_daily_summary, + mode=mode, + update_info=update_info, + reverse_content_order=self.config.get("REVERSE_CONTENT_ORDER", False), + get_time_func=self.get_time, + ) + + # === 通知内容渲染 === + + def render_feishu( + self, + report_data: Dict, + update_info: Optional[Dict] = None, + mode: str = "daily", + ) -> str: + """渲染飞书内容""" + return render_feishu_content( + report_data=report_data, + update_info=update_info, + mode=mode, + separator=self.config.get("FEISHU_MESSAGE_SEPARATOR", "---"), + 
reverse_content_order=self.config.get("REVERSE_CONTENT_ORDER", False), + get_time_func=self.get_time, + ) + + def render_dingtalk( + self, + report_data: Dict, + update_info: Optional[Dict] = None, + mode: str = "daily", + ) -> str: + """渲染钉钉内容""" + return render_dingtalk_content( + report_data=report_data, + update_info=update_info, + mode=mode, + reverse_content_order=self.config.get("REVERSE_CONTENT_ORDER", False), + get_time_func=self.get_time, + ) + + def split_content( + self, + report_data: Dict, + format_type: str, + update_info: Optional[Dict] = None, + max_bytes: Optional[int] = None, + mode: str = "daily", + ) -> List[str]: + """分批处理消息内容""" + return split_content_into_batches( + report_data=report_data, + format_type=format_type, + update_info=update_info, + max_bytes=max_bytes, + mode=mode, + batch_sizes={ + "dingtalk": self.config.get("DINGTALK_BATCH_SIZE", 20000), + "feishu": self.config.get("FEISHU_BATCH_SIZE", 29000), + "default": self.config.get("MESSAGE_BATCH_SIZE", 4000), + }, + feishu_separator=self.config.get("FEISHU_MESSAGE_SEPARATOR", "---"), + reverse_content_order=self.config.get("REVERSE_CONTENT_ORDER", False), + get_time_func=self.get_time, + ) + + # === 通知发送 === + + def create_notification_dispatcher(self) -> NotificationDispatcher: + """创建通知调度器""" + return NotificationDispatcher( + config=self.config, + get_time_func=self.get_time, + split_content_func=self.split_content, + ) + + def create_push_manager(self) -> PushRecordManager: + """创建推送记录管理器""" + return PushRecordManager( + storage_backend=self.get_storage_manager(), + get_time_func=self.get_time, + ) + + # === 资源清理 === + + def cleanup(self): + """清理资源""" + if self._storage_manager: + self._storage_manager.cleanup_old_data() + self._storage_manager.cleanup() + self._storage_manager = None diff --git a/trendradar/core/__init__.py b/trendradar/core/__init__.py new file mode 100644 index 0000000..a97f0a0 --- /dev/null +++ b/trendradar/core/__init__.py @@ -0,0 +1,47 @@ +# coding=utf-8 +""" +核心模块 - 配置管理和核心工具 +""" + +from trendradar.core.config import ( + parse_multi_account_config, + validate_paired_configs, + limit_accounts, + get_account_at_index, +) +from trendradar.core.loader import load_config +from trendradar.core.frequency import load_frequency_words, matches_word_groups +from trendradar.core.data import ( + save_titles_to_file, + read_all_today_titles_from_storage, + read_all_today_titles, + detect_latest_new_titles_from_storage, + detect_latest_new_titles, + is_first_crawl_today, +) +from trendradar.core.analyzer import ( + calculate_news_weight, + format_time_display, + count_word_frequency, +) + +__all__ = [ + "parse_multi_account_config", + "validate_paired_configs", + "limit_accounts", + "get_account_at_index", + "load_config", + "load_frequency_words", + "matches_word_groups", + # 数据处理 + "save_titles_to_file", + "read_all_today_titles_from_storage", + "read_all_today_titles", + "detect_latest_new_titles_from_storage", + "detect_latest_new_titles", + "is_first_crawl_today", + # 统计分析 + "calculate_news_weight", + "format_time_display", + "count_word_frequency", +] diff --git a/trendradar/core/analyzer.py b/trendradar/core/analyzer.py new file mode 100644 index 0000000..7cb1913 --- /dev/null +++ b/trendradar/core/analyzer.py @@ -0,0 +1,469 @@ +# coding=utf-8 +""" +统计分析模块 + +提供新闻统计和分析功能: +- calculate_news_weight: 计算新闻权重 +- format_time_display: 格式化时间显示 +- count_word_frequency: 统计词频 +""" + +from typing import Dict, List, Tuple, Optional, Callable + +from trendradar.core.frequency import 
matches_word_groups + + +def calculate_news_weight( + title_data: Dict, + rank_threshold: int, + weight_config: Dict, +) -> float: + """ + 计算新闻权重,用于排序 + + Args: + title_data: 标题数据,包含 ranks 和 count + rank_threshold: 排名阈值 + weight_config: 权重配置 {RANK_WEIGHT, FREQUENCY_WEIGHT, HOTNESS_WEIGHT} + + Returns: + float: 计算出的权重值 + """ + ranks = title_data.get("ranks", []) + if not ranks: + return 0.0 + + count = title_data.get("count", len(ranks)) + + # 排名权重:Σ(11 - min(rank, 10)) / 出现次数 + rank_scores = [] + for rank in ranks: + score = 11 - min(rank, 10) + rank_scores.append(score) + + rank_weight = sum(rank_scores) / len(ranks) if ranks else 0 + + # 频次权重:min(出现次数, 10) × 10 + frequency_weight = min(count, 10) * 10 + + # 热度加成:高排名次数 / 总出现次数 × 100 + high_rank_count = sum(1 for rank in ranks if rank <= rank_threshold) + hotness_ratio = high_rank_count / len(ranks) if ranks else 0 + hotness_weight = hotness_ratio * 100 + + total_weight = ( + rank_weight * weight_config["RANK_WEIGHT"] + + frequency_weight * weight_config["FREQUENCY_WEIGHT"] + + hotness_weight * weight_config["HOTNESS_WEIGHT"] + ) + + return total_weight + + +def format_time_display( + first_time: str, + last_time: str, + convert_time_func: Callable[[str], str], +) -> str: + """ + 格式化时间显示(将 HH-MM 转换为 HH:MM) + + Args: + first_time: 首次出现时间 + last_time: 最后出现时间 + convert_time_func: 时间格式转换函数 + + Returns: + str: 格式化后的时间显示字符串 + """ + if not first_time: + return "" + # 转换为显示格式 + first_display = convert_time_func(first_time) + last_display = convert_time_func(last_time) + if first_display == last_display or not last_display: + return first_display + else: + return f"[{first_display} ~ {last_display}]" + + +def count_word_frequency( + results: Dict, + word_groups: List[Dict], + filter_words: List[str], + id_to_name: Dict, + title_info: Optional[Dict] = None, + rank_threshold: int = 3, + new_titles: Optional[Dict] = None, + mode: str = "daily", + global_filters: Optional[List[str]] = None, + weight_config: Optional[Dict] = None, + max_news_per_keyword: int = 0, + sort_by_position_first: bool = False, + is_first_crawl_func: Optional[Callable[[], bool]] = None, + convert_time_func: Optional[Callable[[str], str]] = None, +) -> Tuple[List[Dict], int]: + """ + 统计词频,支持必须词、频率词、过滤词、全局过滤词,并标记新增标题 + + Args: + results: 抓取结果 {source_id: {title: title_data}} + word_groups: 词组配置列表 + filter_words: 过滤词列表 + id_to_name: ID 到名称的映射 + title_info: 标题统计信息(可选) + rank_threshold: 排名阈值 + new_titles: 新增标题(可选) + mode: 报告模式 (daily/incremental/current) + global_filters: 全局过滤词(可选) + weight_config: 权重配置 + max_news_per_keyword: 每个关键词最大显示数量 + sort_by_position_first: 是否优先按配置位置排序 + is_first_crawl_func: 检测是否是当天第一次爬取的函数 + convert_time_func: 时间格式转换函数 + + Returns: + Tuple[List[Dict], int]: (统计结果列表, 总标题数) + """ + # 默认权重配置 + if weight_config is None: + weight_config = { + "RANK_WEIGHT": 0.4, + "FREQUENCY_WEIGHT": 0.3, + "HOTNESS_WEIGHT": 0.3, + } + + # 默认时间转换函数 + if convert_time_func is None: + convert_time_func = lambda x: x + + # 默认首次爬取检测函数 + if is_first_crawl_func is None: + is_first_crawl_func = lambda: True + + # 如果没有配置词组,创建一个包含所有新闻的虚拟词组 + if not word_groups: + print("频率词配置为空,将显示所有新闻") + word_groups = [{"required": [], "normal": [], "group_key": "全部新闻"}] + filter_words = [] # 清空过滤词,显示所有新闻 + + is_first_today = is_first_crawl_func() + + # 确定处理的数据源和新增标记逻辑 + if mode == "incremental": + if is_first_today: + # 增量模式 + 当天第一次:处理所有新闻,都标记为新增 + results_to_process = results + all_news_are_new = True + else: + # 增量模式 + 当天非第一次:只处理新增的新闻 + results_to_process = new_titles if new_titles else {} + 
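+            # Everything kept in results_to_process here is newly seen by definition,
+            # so matched items are flagged is_new=True downstream without re-checking new_titles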
all_news_are_new = True + elif mode == "current": + # current 模式:只处理当前时间批次的新闻,但统计信息来自全部历史 + if title_info: + latest_time = None + for source_titles in title_info.values(): + for title_data in source_titles.values(): + last_time = title_data.get("last_time", "") + if last_time: + if latest_time is None or last_time > latest_time: + latest_time = last_time + + # 只处理 last_time 等于最新时间的新闻 + if latest_time: + results_to_process = {} + for source_id, source_titles in results.items(): + if source_id in title_info: + filtered_titles = {} + for title, title_data in source_titles.items(): + if title in title_info[source_id]: + info = title_info[source_id][title] + if info.get("last_time") == latest_time: + filtered_titles[title] = title_data + if filtered_titles: + results_to_process[source_id] = filtered_titles + + print( + f"当前榜单模式:最新时间 {latest_time},筛选出 {sum(len(titles) for titles in results_to_process.values())} 条当前榜单新闻" + ) + else: + results_to_process = results + else: + results_to_process = results + all_news_are_new = False + else: + # 当日汇总模式:处理所有新闻 + results_to_process = results + all_news_are_new = False + total_input_news = sum(len(titles) for titles in results.values()) + filter_status = ( + "全部显示" + if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻" + else "频率词过滤" + ) + print(f"当日汇总模式:处理 {total_input_news} 条新闻,模式:{filter_status}") + + word_stats = {} + total_titles = 0 + processed_titles = {} + matched_new_count = 0 + + if title_info is None: + title_info = {} + if new_titles is None: + new_titles = {} + + for group in word_groups: + group_key = group["group_key"] + word_stats[group_key] = {"count": 0, "titles": {}} + + for source_id, titles_data in results_to_process.items(): + total_titles += len(titles_data) + + if source_id not in processed_titles: + processed_titles[source_id] = {} + + for title, title_data in titles_data.items(): + if title in processed_titles.get(source_id, {}): + continue + + # 使用统一的匹配逻辑 + matches_frequency_words = matches_word_groups( + title, word_groups, filter_words, global_filters + ) + + if not matches_frequency_words: + continue + + # 如果是增量模式或 current 模式第一次,统计匹配的新增新闻数量 + if (mode == "incremental" and all_news_are_new) or ( + mode == "current" and is_first_today + ): + matched_new_count += 1 + + source_ranks = title_data.get("ranks", []) + source_url = title_data.get("url", "") + source_mobile_url = title_data.get("mobileUrl", "") + + # 找到匹配的词组(防御性转换确保类型安全) + title_lower = str(title).lower() if not isinstance(title, str) else title.lower() + for group in word_groups: + required_words = group["required"] + normal_words = group["normal"] + + # 如果是"全部新闻"模式,所有标题都匹配第一个(唯一的)词组 + if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻": + group_key = group["group_key"] + word_stats[group_key]["count"] += 1 + if source_id not in word_stats[group_key]["titles"]: + word_stats[group_key]["titles"][source_id] = [] + else: + # 原有的匹配逻辑 + if required_words: + all_required_present = all( + req_word.lower() in title_lower + for req_word in required_words + ) + if not all_required_present: + continue + + if normal_words: + any_normal_present = any( + normal_word.lower() in title_lower + for normal_word in normal_words + ) + if not any_normal_present: + continue + + group_key = group["group_key"] + word_stats[group_key]["count"] += 1 + if source_id not in word_stats[group_key]["titles"]: + word_stats[group_key]["titles"][source_id] = [] + + first_time = "" + last_time = "" + count_info = 1 + ranks = source_ranks if source_ranks else [] + url = 
source_url + mobile_url = source_mobile_url + + # 对于 current 模式,从历史统计信息中获取完整数据 + if ( + mode == "current" + and title_info + and source_id in title_info + and title in title_info[source_id] + ): + info = title_info[source_id][title] + first_time = info.get("first_time", "") + last_time = info.get("last_time", "") + count_info = info.get("count", 1) + if "ranks" in info and info["ranks"]: + ranks = info["ranks"] + url = info.get("url", source_url) + mobile_url = info.get("mobileUrl", source_mobile_url) + elif ( + title_info + and source_id in title_info + and title in title_info[source_id] + ): + info = title_info[source_id][title] + first_time = info.get("first_time", "") + last_time = info.get("last_time", "") + count_info = info.get("count", 1) + if "ranks" in info and info["ranks"]: + ranks = info["ranks"] + url = info.get("url", source_url) + mobile_url = info.get("mobileUrl", source_mobile_url) + + if not ranks: + ranks = [99] + + time_display = format_time_display(first_time, last_time, convert_time_func) + + source_name = id_to_name.get(source_id, source_id) + + # 判断是否为新增 + is_new = False + if all_news_are_new: + # 增量模式下所有处理的新闻都是新增,或者当天第一次的所有新闻都是新增 + is_new = True + elif new_titles and source_id in new_titles: + # 检查是否在新增列表中 + new_titles_for_source = new_titles[source_id] + is_new = title in new_titles_for_source + + word_stats[group_key]["titles"][source_id].append( + { + "title": title, + "source_name": source_name, + "first_time": first_time, + "last_time": last_time, + "time_display": time_display, + "count": count_info, + "ranks": ranks, + "rank_threshold": rank_threshold, + "url": url, + "mobileUrl": mobile_url, + "is_new": is_new, + } + ) + + if source_id not in processed_titles: + processed_titles[source_id] = {} + processed_titles[source_id][title] = True + + break + + # 最后统一打印汇总信息 + if mode == "incremental": + if is_first_today: + total_input_news = sum(len(titles) for titles in results.values()) + filter_status = ( + "全部显示" + if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻" + else "频率词匹配" + ) + print( + f"增量模式:当天第一次爬取,{total_input_news} 条新闻中有 {matched_new_count} 条{filter_status}" + ) + else: + if new_titles: + total_new_count = sum(len(titles) for titles in new_titles.values()) + filter_status = ( + "全部显示" + if len(word_groups) == 1 + and word_groups[0]["group_key"] == "全部新闻" + else "匹配频率词" + ) + print( + f"增量模式:{total_new_count} 条新增新闻中,有 {matched_new_count} 条{filter_status}" + ) + if matched_new_count == 0 and len(word_groups) > 1: + print("增量模式:没有新增新闻匹配频率词,将不会发送通知") + else: + print("增量模式:未检测到新增新闻") + elif mode == "current": + total_input_news = sum(len(titles) for titles in results_to_process.values()) + if is_first_today: + filter_status = ( + "全部显示" + if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻" + else "频率词匹配" + ) + print( + f"当前榜单模式:当天第一次爬取,{total_input_news} 条当前榜单新闻中有 {matched_new_count} 条{filter_status}" + ) + else: + matched_count = sum(stat["count"] for stat in word_stats.values()) + filter_status = ( + "全部显示" + if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻" + else "频率词匹配" + ) + print( + f"当前榜单模式:{total_input_news} 条当前榜单新闻中有 {matched_count} 条{filter_status}" + ) + + stats = [] + # 创建 group_key 到位置和最大数量的映射 + group_key_to_position = { + group["group_key"]: idx for idx, group in enumerate(word_groups) + } + group_key_to_max_count = { + group["group_key"]: group.get("max_count", 0) for group in word_groups + } + + for group_key, data in word_stats.items(): + all_titles = [] + for source_id, title_list in 
data["titles"].items(): + all_titles.extend(title_list) + + # 按权重排序 + sorted_titles = sorted( + all_titles, + key=lambda x: ( + -calculate_news_weight(x, rank_threshold, weight_config), + min(x["ranks"]) if x["ranks"] else 999, + -x["count"], + ), + ) + + # 应用最大显示数量限制(优先级:单独配置 > 全局配置) + group_max_count = group_key_to_max_count.get(group_key, 0) + if group_max_count == 0: + # 使用全局配置 + group_max_count = max_news_per_keyword + + if group_max_count > 0: + sorted_titles = sorted_titles[:group_max_count] + + stats.append( + { + "word": group_key, + "count": data["count"], + "position": group_key_to_position.get(group_key, 999), + "titles": sorted_titles, + "percentage": ( + round(data["count"] / total_titles * 100, 2) + if total_titles > 0 + else 0 + ), + } + ) + + # 根据配置选择排序优先级 + if sort_by_position_first: + # 先按配置位置,再按热点条数 + stats.sort(key=lambda x: (x["position"], -x["count"])) + else: + # 先按热点条数,再按配置位置(原逻辑) + stats.sort(key=lambda x: (-x["count"], x["position"])) + + # 打印过滤后的匹配新闻数(与推送显示一致) + matched_news_count = sum(len(stat["titles"]) for stat in stats if stat["count"] > 0) + if mode == "daily": + print(f"频率词过滤后:{matched_news_count} 条新闻匹配(将显示在推送中)") + + return stats, total_titles diff --git a/trendradar/core/config.py b/trendradar/core/config.py new file mode 100644 index 0000000..8aca2cf --- /dev/null +++ b/trendradar/core/config.py @@ -0,0 +1,152 @@ +# coding=utf-8 +""" +配置工具模块 - 多账号配置解析和验证 + +提供多账号推送配置的解析、验证和限制功能 +""" + +from typing import Dict, List, Optional, Tuple + + +def parse_multi_account_config(config_value: str, separator: str = ";") -> List[str]: + """ + 解析多账号配置,返回账号列表 + + Args: + config_value: 配置值字符串,多个账号用分隔符分隔 + separator: 分隔符,默认为 ; + + Returns: + 账号列表,空字符串会被保留(用于占位) + + Examples: + >>> parse_multi_account_config("url1;url2;url3") + ['url1', 'url2', 'url3'] + >>> parse_multi_account_config(";token2") # 第一个账号无token + ['', 'token2'] + >>> parse_multi_account_config("") + [] + """ + if not config_value: + return [] + # 保留空字符串用于占位(如 ";token2" 表示第一个账号无token) + accounts = [acc.strip() for acc in config_value.split(separator)] + # 过滤掉全部为空的情况 + if all(not acc for acc in accounts): + return [] + return accounts + + +def validate_paired_configs( + configs: Dict[str, List[str]], + channel_name: str, + required_keys: Optional[List[str]] = None +) -> Tuple[bool, int]: + """ + 验证配对配置的数量是否一致 + + 对于需要多个配置项配对的渠道(如 Telegram 的 token 和 chat_id), + 验证所有配置项的账号数量是否一致。 + + Args: + configs: 配置字典,key 为配置名,value 为账号列表 + channel_name: 渠道名称,用于日志输出 + required_keys: 必须有值的配置项列表 + + Returns: + (是否验证通过, 账号数量) + + Examples: + >>> validate_paired_configs({ + ... "token": ["t1", "t2"], + ... "chat_id": ["c1", "c2"] + ... }, "Telegram", ["token", "chat_id"]) + (True, 2) + + >>> validate_paired_configs({ + ... "token": ["t1", "t2"], + ... "chat_id": ["c1"] # 数量不匹配 + ... 
}, "Telegram", ["token", "chat_id"]) + (False, 0) + """ + # 过滤掉空列表 + non_empty_configs = {k: v for k, v in configs.items() if v} + + if not non_empty_configs: + return True, 0 + + # 检查必须项 + if required_keys: + for key in required_keys: + if key not in non_empty_configs or not non_empty_configs[key]: + return True, 0 # 必须项为空,视为未配置 + + # 获取所有非空配置的长度 + lengths = {k: len(v) for k, v in non_empty_configs.items()} + unique_lengths = set(lengths.values()) + + if len(unique_lengths) > 1: + print(f"❌ {channel_name} 配置错误:配对配置数量不一致,将跳过该渠道推送") + for key, length in lengths.items(): + print(f" - {key}: {length} 个") + return False, 0 + + return True, list(unique_lengths)[0] if unique_lengths else 0 + + +def limit_accounts( + accounts: List[str], + max_count: int, + channel_name: str +) -> List[str]: + """ + 限制账号数量 + + 当配置的账号数量超过最大限制时,只使用前 N 个账号, + 并输出警告信息。 + + Args: + accounts: 账号列表 + max_count: 最大账号数量 + channel_name: 渠道名称,用于日志输出 + + Returns: + 限制后的账号列表 + + Examples: + >>> limit_accounts(["a1", "a2", "a3"], 2, "飞书") + ⚠️ 飞书 配置了 3 个账号,超过最大限制 2,只使用前 2 个 + ['a1', 'a2'] + """ + if len(accounts) > max_count: + print(f"⚠️ {channel_name} 配置了 {len(accounts)} 个账号,超过最大限制 {max_count},只使用前 {max_count} 个") + print(f" ⚠️ 警告:如果您是 fork 用户,过多账号可能导致 GitHub Actions 运行时间过长,存在账号风险") + return accounts[:max_count] + return accounts + + +def get_account_at_index(accounts: List[str], index: int, default: str = "") -> str: + """ + 安全获取指定索引的账号值 + + 当索引超出范围或账号值为空时,返回默认值。 + + Args: + accounts: 账号列表 + index: 索引 + default: 默认值 + + Returns: + 账号值或默认值 + + Examples: + >>> get_account_at_index(["a", "b", "c"], 1) + 'b' + >>> get_account_at_index(["a", "", "c"], 1, "default") + 'default' + >>> get_account_at_index(["a"], 5, "default") + 'default' + """ + if index < len(accounts): + return accounts[index] if accounts[index] else default + return default diff --git a/trendradar/core/data.py b/trendradar/core/data.py new file mode 100644 index 0000000..f56eabe --- /dev/null +++ b/trendradar/core/data.py @@ -0,0 +1,291 @@ +# coding=utf-8 +""" +数据处理模块 + +提供数据读取、保存和检测功能: +- save_titles_to_file: 保存标题到 TXT 文件 +- read_all_today_titles: 从存储后端读取当天所有标题 +- detect_latest_new_titles: 检测最新批次的新增标题 + +Author: TrendRadar Team +""" + +from pathlib import Path +from typing import Dict, List, Tuple, Optional, Callable + + +def save_titles_to_file( + results: Dict, + id_to_name: Dict, + failed_ids: List, + output_path: str, + clean_title_func: Callable[[str], str], +) -> str: + """ + 保存标题到 TXT 文件 + + Args: + results: 抓取结果 {source_id: {title: title_data}} + id_to_name: ID 到名称的映射 + failed_ids: 失败的 ID 列表 + output_path: 输出文件路径 + clean_title_func: 标题清理函数 + + Returns: + str: 保存的文件路径 + """ + # 确保目录存在 + Path(output_path).parent.mkdir(parents=True, exist_ok=True) + + with open(output_path, "w", encoding="utf-8") as f: + for id_value, title_data in results.items(): + # id | name 或 id + name = id_to_name.get(id_value) + if name and name != id_value: + f.write(f"{id_value} | {name}\n") + else: + f.write(f"{id_value}\n") + + # 按排名排序标题 + sorted_titles = [] + for title, info in title_data.items(): + cleaned_title = clean_title_func(title) + if isinstance(info, dict): + ranks = info.get("ranks", []) + url = info.get("url", "") + mobile_url = info.get("mobileUrl", "") + else: + ranks = info if isinstance(info, list) else [] + url = "" + mobile_url = "" + + rank = ranks[0] if ranks else 1 + sorted_titles.append((rank, cleaned_title, url, mobile_url)) + + sorted_titles.sort(key=lambda x: x[0]) + + for rank, cleaned_title, url, mobile_url in sorted_titles: + line = f"{rank}. 
{cleaned_title}" + + if url: + line += f" [URL:{url}]" + if mobile_url: + line += f" [MOBILE:{mobile_url}]" + f.write(line + "\n") + + f.write("\n") + + if failed_ids: + f.write("==== 以下ID请求失败 ====\n") + for id_value in failed_ids: + f.write(f"{id_value}\n") + + return output_path + + +def read_all_today_titles_from_storage( + storage_manager, + current_platform_ids: Optional[List[str]] = None, +) -> Tuple[Dict, Dict, Dict]: + """ + 从存储后端读取当天所有标题(SQLite 数据) + + Args: + storage_manager: 存储管理器实例 + current_platform_ids: 当前监控的平台 ID 列表(用于过滤) + + Returns: + Tuple[Dict, Dict, Dict]: (all_results, id_to_name, title_info) + """ + try: + news_data = storage_manager.get_today_all_data() + + if not news_data or not news_data.items: + return {}, {}, {} + + all_results = {} + final_id_to_name = {} + title_info = {} + + for source_id, news_list in news_data.items.items(): + # 按平台过滤 + if current_platform_ids is not None and source_id not in current_platform_ids: + continue + + # 获取来源名称 + source_name = news_data.id_to_name.get(source_id, source_id) + final_id_to_name[source_id] = source_name + + if source_id not in all_results: + all_results[source_id] = {} + title_info[source_id] = {} + + for item in news_list: + title = item.title + ranks = getattr(item, 'ranks', [item.rank]) + first_time = getattr(item, 'first_time', item.crawl_time) + last_time = getattr(item, 'last_time', item.crawl_time) + count = getattr(item, 'count', 1) + + all_results[source_id][title] = { + "ranks": ranks, + "url": item.url or "", + "mobileUrl": item.mobile_url or "", + } + + title_info[source_id][title] = { + "first_time": first_time, + "last_time": last_time, + "count": count, + "ranks": ranks, + "url": item.url or "", + "mobileUrl": item.mobile_url or "", + } + + return all_results, final_id_to_name, title_info + + except Exception as e: + print(f"[存储] 从存储后端读取数据失败: {e}") + return {}, {}, {} + + +def read_all_today_titles( + storage_manager, + current_platform_ids: Optional[List[str]] = None, +) -> Tuple[Dict, Dict, Dict]: + """ + 读取当天所有标题(从存储后端) + + Args: + storage_manager: 存储管理器实例 + current_platform_ids: 当前监控的平台 ID 列表(用于过滤) + + Returns: + Tuple[Dict, Dict, Dict]: (all_results, id_to_name, title_info) + """ + all_results, final_id_to_name, title_info = read_all_today_titles_from_storage( + storage_manager, current_platform_ids + ) + + if all_results: + total_count = sum(len(titles) for titles in all_results.values()) + print(f"[存储] 已从存储后端读取 {total_count} 条标题") + else: + print("[存储] 当天暂无数据") + + return all_results, final_id_to_name, title_info + + +def detect_latest_new_titles_from_storage( + storage_manager, + current_platform_ids: Optional[List[str]] = None, +) -> Dict: + """ + 从存储后端检测最新批次的新增标题 + + Args: + storage_manager: 存储管理器实例 + current_platform_ids: 当前监控的平台 ID 列表(用于过滤) + + Returns: + Dict: 新增标题 {source_id: {title: title_data}} + """ + try: + # 获取最新抓取数据 + latest_data = storage_manager.get_latest_crawl_data() + if not latest_data or not latest_data.items: + return {} + + # 获取所有历史数据 + all_data = storage_manager.get_today_all_data() + if not all_data or not all_data.items: + # 没有历史数据(第一次抓取),不应该有"新增"标题 + return {} + + # 收集历史标题(不包括最新批次的时间) + latest_time = latest_data.crawl_time + historical_titles = {} + + for source_id, news_list in all_data.items.items(): + if current_platform_ids is not None and source_id not in current_platform_ids: + continue + + historical_titles[source_id] = set() + for item in news_list: + # 只统计非最新批次的标题 + first_time = getattr(item, 'first_time', item.crawl_time) + if first_time != latest_time: + 
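+                    # i.e. the title was first seen in an earlier batch; these titles
+                    # form the baseline that the latest batch is compared against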
historical_titles[source_id].add(item.title) + + # 检查是否是当天第一次抓取(没有任何历史标题) + # 如果所有平台的历史标题集合都为空,说明只有一个抓取批次,不应该有"新增"标题 + has_historical_data = any(len(titles) > 0 for titles in historical_titles.values()) + if not has_historical_data: + return {} + + # 找出新增标题 + new_titles = {} + for source_id, news_list in latest_data.items.items(): + if current_platform_ids is not None and source_id not in current_platform_ids: + continue + + historical_set = historical_titles.get(source_id, set()) + source_new_titles = {} + + for item in news_list: + if item.title not in historical_set: + source_new_titles[item.title] = { + "ranks": [item.rank], + "url": item.url or "", + "mobileUrl": item.mobile_url or "", + } + + if source_new_titles: + new_titles[source_id] = source_new_titles + + return new_titles + + except Exception as e: + print(f"[存储] 从存储后端检测新标题失败: {e}") + return {} + + +def detect_latest_new_titles( + storage_manager, + current_platform_ids: Optional[List[str]] = None, +) -> Dict: + """ + 检测当日最新批次的新增标题(从存储后端) + + Args: + storage_manager: 存储管理器实例 + current_platform_ids: 当前监控的平台 ID 列表(用于过滤) + + Returns: + Dict: 新增标题 {source_id: {title: title_data}} + """ + new_titles = detect_latest_new_titles_from_storage(storage_manager, current_platform_ids) + if new_titles: + total_new = sum(len(titles) for titles in new_titles.values()) + print(f"[存储] 从存储后端检测到 {total_new} 条新增标题") + return new_titles + + +def is_first_crawl_today(output_dir: str, date_folder: str) -> bool: + """ + 检测是否是当天第一次爬取 + + Args: + output_dir: 输出目录 + date_folder: 日期文件夹名称 + + Returns: + bool: 是否是当天第一次爬取 + """ + txt_dir = Path(output_dir) / date_folder / "txt" + + if not txt_dir.exists(): + return True + + files = sorted([f for f in txt_dir.iterdir() if f.suffix == ".txt"]) + return len(files) <= 1 diff --git a/trendradar/core/frequency.py b/trendradar/core/frequency.py new file mode 100644 index 0000000..1f983f4 --- /dev/null +++ b/trendradar/core/frequency.py @@ -0,0 +1,194 @@ +# coding=utf-8 +""" +频率词配置加载模块 + +负责从配置文件加载频率词规则,支持: +- 普通词组 +- 必须词(+前缀) +- 过滤词(!前缀) +- 全局过滤词([GLOBAL_FILTER] 区域) +- 最大显示数量(@前缀) +""" + +import os +from pathlib import Path +from typing import Dict, List, Tuple, Optional + + +def load_frequency_words( + frequency_file: Optional[str] = None, +) -> Tuple[List[Dict], List[str], List[str]]: + """ + 加载频率词配置 + + 配置文件格式说明: + - 每个词组由空行分隔 + - [GLOBAL_FILTER] 区域定义全局过滤词 + - [WORD_GROUPS] 区域定义词组(默认) + + 词组语法: + - 普通词:直接写入,任意匹配即可 + - +词:必须词,所有必须词都要匹配 + - !词:过滤词,匹配则排除 + - @数字:该词组最多显示的条数 + + Args: + frequency_file: 频率词配置文件路径,默认从环境变量 FREQUENCY_WORDS_PATH 获取或使用 config/frequency_words.txt + + Returns: + (词组列表, 词组内过滤词, 全局过滤词) + + Raises: + FileNotFoundError: 频率词文件不存在 + """ + if frequency_file is None: + frequency_file = os.environ.get( + "FREQUENCY_WORDS_PATH", "config/frequency_words.txt" + ) + + frequency_path = Path(frequency_file) + if not frequency_path.exists(): + raise FileNotFoundError(f"频率词文件 {frequency_file} 不存在") + + with open(frequency_path, "r", encoding="utf-8") as f: + content = f.read() + + word_groups = [group.strip() for group in content.split("\n\n") if group.strip()] + + processed_groups = [] + filter_words = [] + global_filters = [] + + # 默认区域(向后兼容) + current_section = "WORD_GROUPS" + + for group in word_groups: + lines = [line.strip() for line in group.split("\n") if line.strip()] + + if not lines: + continue + + # 检查是否为区域标记 + if lines[0].startswith("[") and lines[0].endswith("]"): + section_name = lines[0][1:-1].upper() + if section_name in ("GLOBAL_FILTER", "WORD_GROUPS"): + current_section = section_name + 
lines = lines[1:] # 移除标记行 + + # 处理全局过滤区域 + if current_section == "GLOBAL_FILTER": + # 直接添加所有非空行到全局过滤列表 + for line in lines: + # 忽略特殊语法前缀,只提取纯文本 + if line.startswith(("!", "+", "@")): + continue # 全局过滤区不支持特殊语法 + if line: + global_filters.append(line) + continue + + # 处理词组区域 + words = lines + + group_required_words = [] + group_normal_words = [] + group_filter_words = [] + group_max_count = 0 # 默认不限制 + + for word in words: + if word.startswith("@"): + # 解析最大显示数量(只接受正整数) + try: + count = int(word[1:]) + if count > 0: + group_max_count = count + except (ValueError, IndexError): + pass # 忽略无效的@数字格式 + elif word.startswith("!"): + filter_words.append(word[1:]) + group_filter_words.append(word[1:]) + elif word.startswith("+"): + group_required_words.append(word[1:]) + else: + group_normal_words.append(word) + + if group_required_words or group_normal_words: + if group_normal_words: + group_key = " ".join(group_normal_words) + else: + group_key = " ".join(group_required_words) + + processed_groups.append( + { + "required": group_required_words, + "normal": group_normal_words, + "group_key": group_key, + "max_count": group_max_count, + } + ) + + return processed_groups, filter_words, global_filters + + +def matches_word_groups( + title: str, + word_groups: List[Dict], + filter_words: List[str], + global_filters: Optional[List[str]] = None +) -> bool: + """ + 检查标题是否匹配词组规则 + + Args: + title: 标题文本 + word_groups: 词组列表 + filter_words: 过滤词列表 + global_filters: 全局过滤词列表 + + Returns: + 是否匹配 + """ + # 防御性类型检查:确保 title 是有效字符串 + if not isinstance(title, str): + title = str(title) if title is not None else "" + if not title.strip(): + return False + + title_lower = title.lower() + + # 全局过滤检查(优先级最高) + if global_filters: + if any(global_word.lower() in title_lower for global_word in global_filters): + return False + + # 如果没有配置词组,则匹配所有标题(支持显示全部新闻) + if not word_groups: + return True + + # 过滤词检查 + if any(filter_word.lower() in title_lower for filter_word in filter_words): + return False + + # 词组匹配检查 + for group in word_groups: + required_words = group["required"] + normal_words = group["normal"] + + # 必须词检查 + if required_words: + all_required_present = all( + req_word.lower() in title_lower for req_word in required_words + ) + if not all_required_present: + continue + + # 普通词检查 + if normal_words: + any_normal_present = any( + normal_word.lower() in title_lower for normal_word in normal_words + ) + if not any_normal_present: + continue + + return True + + return False diff --git a/trendradar/core/loader.py b/trendradar/core/loader.py new file mode 100644 index 0000000..677749f --- /dev/null +++ b/trendradar/core/loader.py @@ -0,0 +1,332 @@ +# coding=utf-8 +""" +配置加载模块 + +负责从 YAML 配置文件和环境变量加载配置。 +""" + +import os +from pathlib import Path +from typing import Dict, Any, Optional + +import yaml + +from .config import parse_multi_account_config, validate_paired_configs + + +def _get_env_bool(key: str, default: bool = False) -> Optional[bool]: + """从环境变量获取布尔值,如果未设置返回 None""" + value = os.environ.get(key, "").strip().lower() + if not value: + return None + return value in ("true", "1") + + +def _get_env_int(key: str, default: int = 0) -> int: + """从环境变量获取整数值""" + value = os.environ.get(key, "").strip() + if not value: + return default + try: + return int(value) + except ValueError: + return default + + +def _get_env_str(key: str, default: str = "") -> str: + """从环境变量获取字符串值""" + return os.environ.get(key, "").strip() or default + + +def _load_app_config(config_data: Dict) -> Dict: + """加载应用配置""" + app_config = 
config_data.get("app", {}) + return { + "VERSION_CHECK_URL": app_config.get("version_check_url", ""), + "SHOW_VERSION_UPDATE": app_config.get("show_version_update", True), + "TIMEZONE": _get_env_str("TIMEZONE") or app_config.get("timezone", "Asia/Shanghai"), + } + + +def _load_crawler_config(config_data: Dict) -> Dict: + """加载爬虫配置""" + crawler_config = config_data.get("crawler", {}) + enable_crawler_env = _get_env_bool("ENABLE_CRAWLER") + return { + "REQUEST_INTERVAL": crawler_config.get("request_interval", 100), + "USE_PROXY": crawler_config.get("use_proxy", False), + "DEFAULT_PROXY": crawler_config.get("default_proxy", ""), + "ENABLE_CRAWLER": enable_crawler_env if enable_crawler_env is not None else crawler_config.get("enable_crawler", True), + } + + +def _load_report_config(config_data: Dict) -> Dict: + """加载报告配置""" + report_config = config_data.get("report", {}) + + # 环境变量覆盖 + sort_by_position_env = _get_env_bool("SORT_BY_POSITION_FIRST") + reverse_content_env = _get_env_bool("REVERSE_CONTENT_ORDER") + max_news_env = _get_env_int("MAX_NEWS_PER_KEYWORD") + + return { + "REPORT_MODE": _get_env_str("REPORT_MODE") or report_config.get("mode", "daily"), + "RANK_THRESHOLD": report_config.get("rank_threshold", 10), + "SORT_BY_POSITION_FIRST": sort_by_position_env if sort_by_position_env is not None else report_config.get("sort_by_position_first", False), + "MAX_NEWS_PER_KEYWORD": max_news_env or report_config.get("max_news_per_keyword", 0), + "REVERSE_CONTENT_ORDER": reverse_content_env if reverse_content_env is not None else report_config.get("reverse_content_order", False), + } + + +def _load_notification_config(config_data: Dict) -> Dict: + """加载通知配置""" + notification = config_data.get("notification", {}) + enable_notification_env = _get_env_bool("ENABLE_NOTIFICATION") + + return { + "ENABLE_NOTIFICATION": enable_notification_env if enable_notification_env is not None else notification.get("enable_notification", True), + "MESSAGE_BATCH_SIZE": notification.get("message_batch_size", 4000), + "DINGTALK_BATCH_SIZE": notification.get("dingtalk_batch_size", 20000), + "FEISHU_BATCH_SIZE": notification.get("feishu_batch_size", 29000), + "BARK_BATCH_SIZE": notification.get("bark_batch_size", 3600), + "SLACK_BATCH_SIZE": notification.get("slack_batch_size", 4000), + "BATCH_SEND_INTERVAL": notification.get("batch_send_interval", 1.0), + "FEISHU_MESSAGE_SEPARATOR": notification.get("feishu_message_separator", "---"), + "MAX_ACCOUNTS_PER_CHANNEL": _get_env_int("MAX_ACCOUNTS_PER_CHANNEL") or notification.get("max_accounts_per_channel", 3), + } + + +def _load_push_window_config(config_data: Dict) -> Dict: + """加载推送窗口配置""" + notification = config_data.get("notification", {}) + push_window = notification.get("push_window", {}) + time_range = push_window.get("time_range", {}) + + enabled_env = _get_env_bool("PUSH_WINDOW_ENABLED") + once_per_day_env = _get_env_bool("PUSH_WINDOW_ONCE_PER_DAY") + + return { + "ENABLED": enabled_env if enabled_env is not None else push_window.get("enabled", False), + "TIME_RANGE": { + "START": _get_env_str("PUSH_WINDOW_START") or time_range.get("start", "08:00"), + "END": _get_env_str("PUSH_WINDOW_END") or time_range.get("end", "22:00"), + }, + "ONCE_PER_DAY": once_per_day_env if once_per_day_env is not None else push_window.get("once_per_day", True), + } + + +def _load_weight_config(config_data: Dict) -> Dict: + """加载权重配置""" + weight = config_data.get("weight", {}) + return { + "RANK_WEIGHT": weight.get("rank_weight", 1.0), + "FREQUENCY_WEIGHT": weight.get("frequency_weight", 
1.0), + "HOTNESS_WEIGHT": weight.get("hotness_weight", 1.0), + } + + +def _load_storage_config(config_data: Dict) -> Dict: + """加载存储配置""" + storage = config_data.get("storage", {}) + formats = storage.get("formats", {}) + local = storage.get("local", {}) + remote = storage.get("remote", {}) + pull = storage.get("pull", {}) + + txt_enabled_env = _get_env_bool("STORAGE_TXT_ENABLED") + html_enabled_env = _get_env_bool("STORAGE_HTML_ENABLED") + pull_enabled_env = _get_env_bool("PULL_ENABLED") + + return { + "BACKEND": _get_env_str("STORAGE_BACKEND") or storage.get("backend", "auto"), + "FORMATS": { + "SQLITE": formats.get("sqlite", True), + "TXT": txt_enabled_env if txt_enabled_env is not None else formats.get("txt", True), + "HTML": html_enabled_env if html_enabled_env is not None else formats.get("html", True), + }, + "LOCAL": { + "DATA_DIR": local.get("data_dir", "output"), + "RETENTION_DAYS": _get_env_int("LOCAL_RETENTION_DAYS") or local.get("retention_days", 0), + }, + "REMOTE": { + "ENDPOINT_URL": _get_env_str("S3_ENDPOINT_URL") or remote.get("endpoint_url", ""), + "BUCKET_NAME": _get_env_str("S3_BUCKET_NAME") or remote.get("bucket_name", ""), + "ACCESS_KEY_ID": _get_env_str("S3_ACCESS_KEY_ID") or remote.get("access_key_id", ""), + "SECRET_ACCESS_KEY": _get_env_str("S3_SECRET_ACCESS_KEY") or remote.get("secret_access_key", ""), + "REGION": _get_env_str("S3_REGION") or remote.get("region", ""), + "RETENTION_DAYS": _get_env_int("REMOTE_RETENTION_DAYS") or remote.get("retention_days", 0), + }, + "PULL": { + "ENABLED": pull_enabled_env if pull_enabled_env is not None else pull.get("enabled", False), + "DAYS": _get_env_int("PULL_DAYS") or pull.get("days", 7), + }, + } + + +def _load_webhook_config(config_data: Dict) -> Dict: + """加载 Webhook 配置""" + notification = config_data.get("notification", {}) + webhooks = notification.get("webhooks", {}) + + return { + # 飞书 + "FEISHU_WEBHOOK_URL": _get_env_str("FEISHU_WEBHOOK_URL") or webhooks.get("feishu_url", ""), + # 钉钉 + "DINGTALK_WEBHOOK_URL": _get_env_str("DINGTALK_WEBHOOK_URL") or webhooks.get("dingtalk_url", ""), + # 企业微信 + "WEWORK_WEBHOOK_URL": _get_env_str("WEWORK_WEBHOOK_URL") or webhooks.get("wework_url", ""), + "WEWORK_MSG_TYPE": _get_env_str("WEWORK_MSG_TYPE") or webhooks.get("wework_msg_type", "markdown"), + # Telegram + "TELEGRAM_BOT_TOKEN": _get_env_str("TELEGRAM_BOT_TOKEN") or webhooks.get("telegram_bot_token", ""), + "TELEGRAM_CHAT_ID": _get_env_str("TELEGRAM_CHAT_ID") or webhooks.get("telegram_chat_id", ""), + # 邮件 + "EMAIL_FROM": _get_env_str("EMAIL_FROM") or webhooks.get("email_from", ""), + "EMAIL_PASSWORD": _get_env_str("EMAIL_PASSWORD") or webhooks.get("email_password", ""), + "EMAIL_TO": _get_env_str("EMAIL_TO") or webhooks.get("email_to", ""), + "EMAIL_SMTP_SERVER": _get_env_str("EMAIL_SMTP_SERVER") or webhooks.get("email_smtp_server", ""), + "EMAIL_SMTP_PORT": _get_env_str("EMAIL_SMTP_PORT") or webhooks.get("email_smtp_port", ""), + # ntfy + "NTFY_SERVER_URL": _get_env_str("NTFY_SERVER_URL") or webhooks.get("ntfy_server_url") or "https://ntfy.sh", + "NTFY_TOPIC": _get_env_str("NTFY_TOPIC") or webhooks.get("ntfy_topic", ""), + "NTFY_TOKEN": _get_env_str("NTFY_TOKEN") or webhooks.get("ntfy_token", ""), + # Bark + "BARK_URL": _get_env_str("BARK_URL") or webhooks.get("bark_url", ""), + # Slack + "SLACK_WEBHOOK_URL": _get_env_str("SLACK_WEBHOOK_URL") or webhooks.get("slack_webhook_url", ""), + } + + +def _print_notification_sources(config: Dict) -> None: + """打印通知渠道配置来源信息""" + notification_sources = [] + max_accounts = 
config["MAX_ACCOUNTS_PER_CHANNEL"] + + if config["FEISHU_WEBHOOK_URL"]: + accounts = parse_multi_account_config(config["FEISHU_WEBHOOK_URL"]) + count = min(len(accounts), max_accounts) + source = "环境变量" if os.environ.get("FEISHU_WEBHOOK_URL") else "配置文件" + notification_sources.append(f"飞书({source}, {count}个账号)") + + if config["DINGTALK_WEBHOOK_URL"]: + accounts = parse_multi_account_config(config["DINGTALK_WEBHOOK_URL"]) + count = min(len(accounts), max_accounts) + source = "环境变量" if os.environ.get("DINGTALK_WEBHOOK_URL") else "配置文件" + notification_sources.append(f"钉钉({source}, {count}个账号)") + + if config["WEWORK_WEBHOOK_URL"]: + accounts = parse_multi_account_config(config["WEWORK_WEBHOOK_URL"]) + count = min(len(accounts), max_accounts) + source = "环境变量" if os.environ.get("WEWORK_WEBHOOK_URL") else "配置文件" + notification_sources.append(f"企业微信({source}, {count}个账号)") + + if config["TELEGRAM_BOT_TOKEN"] and config["TELEGRAM_CHAT_ID"]: + tokens = parse_multi_account_config(config["TELEGRAM_BOT_TOKEN"]) + chat_ids = parse_multi_account_config(config["TELEGRAM_CHAT_ID"]) + valid, count = validate_paired_configs( + {"bot_token": tokens, "chat_id": chat_ids}, + "Telegram", + required_keys=["bot_token", "chat_id"] + ) + if valid and count > 0: + count = min(count, max_accounts) + token_source = "环境变量" if os.environ.get("TELEGRAM_BOT_TOKEN") else "配置文件" + notification_sources.append(f"Telegram({token_source}, {count}个账号)") + + if config["EMAIL_FROM"] and config["EMAIL_PASSWORD"] and config["EMAIL_TO"]: + from_source = "环境变量" if os.environ.get("EMAIL_FROM") else "配置文件" + notification_sources.append(f"邮件({from_source})") + + if config["NTFY_SERVER_URL"] and config["NTFY_TOPIC"]: + topics = parse_multi_account_config(config["NTFY_TOPIC"]) + tokens = parse_multi_account_config(config["NTFY_TOKEN"]) + if tokens: + valid, count = validate_paired_configs( + {"topic": topics, "token": tokens}, + "ntfy" + ) + if valid and count > 0: + count = min(count, max_accounts) + server_source = "环境变量" if os.environ.get("NTFY_SERVER_URL") else "配置文件" + notification_sources.append(f"ntfy({server_source}, {count}个账号)") + else: + count = min(len(topics), max_accounts) + server_source = "环境变量" if os.environ.get("NTFY_SERVER_URL") else "配置文件" + notification_sources.append(f"ntfy({server_source}, {count}个账号)") + + if config["BARK_URL"]: + accounts = parse_multi_account_config(config["BARK_URL"]) + count = min(len(accounts), max_accounts) + bark_source = "环境变量" if os.environ.get("BARK_URL") else "配置文件" + notification_sources.append(f"Bark({bark_source}, {count}个账号)") + + if config["SLACK_WEBHOOK_URL"]: + accounts = parse_multi_account_config(config["SLACK_WEBHOOK_URL"]) + count = min(len(accounts), max_accounts) + slack_source = "环境变量" if os.environ.get("SLACK_WEBHOOK_URL") else "配置文件" + notification_sources.append(f"Slack({slack_source}, {count}个账号)") + + if notification_sources: + print(f"通知渠道配置来源: {', '.join(notification_sources)}") + print(f"每个渠道最大账号数: {max_accounts}") + else: + print("未配置任何通知渠道") + + +def load_config(config_path: Optional[str] = None) -> Dict[str, Any]: + """ + 加载配置文件 + + Args: + config_path: 配置文件路径,默认从环境变量 CONFIG_PATH 获取或使用 config/config.yaml + + Returns: + 包含所有配置的字典 + + Raises: + FileNotFoundError: 配置文件不存在 + """ + if config_path is None: + config_path = os.environ.get("CONFIG_PATH", "config/config.yaml") + + if not Path(config_path).exists(): + raise FileNotFoundError(f"配置文件 {config_path} 不存在") + + with open(config_path, "r", encoding="utf-8") as f: + config_data = yaml.safe_load(f) + + 
print(f"配置文件加载成功: {config_path}") + + # 合并所有配置 + config = {} + + # 应用配置 + config.update(_load_app_config(config_data)) + + # 爬虫配置 + config.update(_load_crawler_config(config_data)) + + # 报告配置 + config.update(_load_report_config(config_data)) + + # 通知配置 + config.update(_load_notification_config(config_data)) + + # 推送窗口配置 + config["PUSH_WINDOW"] = _load_push_window_config(config_data) + + # 权重配置 + config["WEIGHT_CONFIG"] = _load_weight_config(config_data) + + # 平台配置 + config["PLATFORMS"] = config_data.get("platforms", []) + + # 存储配置 + config["STORAGE"] = _load_storage_config(config_data) + + # Webhook 配置 + config.update(_load_webhook_config(config_data)) + + # 打印通知渠道配置来源 + _print_notification_sources(config) + + return config diff --git a/trendradar/crawler/__init__.py b/trendradar/crawler/__init__.py new file mode 100644 index 0000000..aab60fc --- /dev/null +++ b/trendradar/crawler/__init__.py @@ -0,0 +1,8 @@ +# coding=utf-8 +""" +爬虫模块 - 数据抓取功能 +""" + +from trendradar.crawler.fetcher import DataFetcher + +__all__ = ["DataFetcher"] diff --git a/trendradar/crawler/fetcher.py b/trendradar/crawler/fetcher.py new file mode 100644 index 0000000..45d0b10 --- /dev/null +++ b/trendradar/crawler/fetcher.py @@ -0,0 +1,184 @@ +# coding=utf-8 +""" +数据获取器模块 + +负责从 NewsNow API 抓取新闻数据,支持: +- 单个平台数据获取 +- 批量平台数据爬取 +- 自动重试机制 +- 代理支持 +""" + +import json +import random +import time +from typing import Dict, List, Tuple, Optional, Union + +import requests + + +class DataFetcher: + """数据获取器""" + + # 默认 API 地址 + DEFAULT_API_URL = "https://newsnow.busiyi.world/api/s" + + # 默认请求头 + DEFAULT_HEADERS = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", + "Accept": "application/json, text/plain, */*", + "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", + "Connection": "keep-alive", + "Cache-Control": "no-cache", + } + + def __init__( + self, + proxy_url: Optional[str] = None, + api_url: Optional[str] = None, + ): + """ + 初始化数据获取器 + + Args: + proxy_url: 代理服务器 URL(可选) + api_url: API 基础 URL(可选,默认使用 DEFAULT_API_URL) + """ + self.proxy_url = proxy_url + self.api_url = api_url or self.DEFAULT_API_URL + + def fetch_data( + self, + id_info: Union[str, Tuple[str, str]], + max_retries: int = 2, + min_retry_wait: int = 3, + max_retry_wait: int = 5, + ) -> Tuple[Optional[str], str, str]: + """ + 获取指定ID数据,支持重试 + + Args: + id_info: 平台ID 或 (平台ID, 别名) 元组 + max_retries: 最大重试次数 + min_retry_wait: 最小重试等待时间(秒) + max_retry_wait: 最大重试等待时间(秒) + + Returns: + (响应文本, 平台ID, 别名) 元组,失败时响应文本为 None + """ + if isinstance(id_info, tuple): + id_value, alias = id_info + else: + id_value = id_info + alias = id_value + + url = f"{self.api_url}?id={id_value}&latest" + + proxies = None + if self.proxy_url: + proxies = {"http": self.proxy_url, "https": self.proxy_url} + + retries = 0 + while retries <= max_retries: + try: + response = requests.get( + url, + proxies=proxies, + headers=self.DEFAULT_HEADERS, + timeout=10, + ) + response.raise_for_status() + + data_text = response.text + data_json = json.loads(data_text) + + status = data_json.get("status", "未知") + if status not in ["success", "cache"]: + raise ValueError(f"响应状态异常: {status}") + + status_info = "最新数据" if status == "success" else "缓存数据" + print(f"获取 {id_value} 成功({status_info})") + return data_text, id_value, alias + + except Exception as e: + retries += 1 + if retries <= max_retries: + base_wait = random.uniform(min_retry_wait, max_retry_wait) + additional_wait = (retries - 1) * random.uniform(1, 2) + wait_time = 
base_wait + additional_wait + print(f"请求 {id_value} 失败: {e}. {wait_time:.2f}秒后重试...") + time.sleep(wait_time) + else: + print(f"请求 {id_value} 失败: {e}") + return None, id_value, alias + + return None, id_value, alias + + def crawl_websites( + self, + ids_list: List[Union[str, Tuple[str, str]]], + request_interval: int = 100, + ) -> Tuple[Dict, Dict, List]: + """ + 爬取多个网站数据 + + Args: + ids_list: 平台ID列表,每个元素可以是字符串或 (平台ID, 别名) 元组 + request_interval: 请求间隔(毫秒) + + Returns: + (结果字典, ID到名称的映射, 失败ID列表) 元组 + """ + results = {} + id_to_name = {} + failed_ids = [] + + for i, id_info in enumerate(ids_list): + if isinstance(id_info, tuple): + id_value, name = id_info + else: + id_value = id_info + name = id_value + + id_to_name[id_value] = name + response, _, _ = self.fetch_data(id_info) + + if response: + try: + data = json.loads(response) + results[id_value] = {} + + for index, item in enumerate(data.get("items", []), 1): + title = item.get("title") + # 跳过无效标题(None、float、空字符串) + if title is None or isinstance(title, float) or not str(title).strip(): + continue + title = str(title).strip() + url = item.get("url", "") + mobile_url = item.get("mobileUrl", "") + + if title in results[id_value]: + results[id_value][title]["ranks"].append(index) + else: + results[id_value][title] = { + "ranks": [index], + "url": url, + "mobileUrl": mobile_url, + } + except json.JSONDecodeError: + print(f"解析 {id_value} 响应失败") + failed_ids.append(id_value) + except Exception as e: + print(f"处理 {id_value} 数据出错: {e}") + failed_ids.append(id_value) + else: + failed_ids.append(id_value) + + # 请求间隔(除了最后一个) + if i < len(ids_list) - 1: + actual_interval = request_interval + random.randint(-10, 20) + actual_interval = max(50, actual_interval) + time.sleep(actual_interval / 1000) + + print(f"成功: {list(results.keys())}, 失败: {failed_ids}") + return results, id_to_name, failed_ids diff --git a/trendradar/notification/__init__.py b/trendradar/notification/__init__.py new file mode 100644 index 0000000..2dbbbd6 --- /dev/null +++ b/trendradar/notification/__init__.py @@ -0,0 +1,81 @@ +# coding=utf-8 +""" +通知推送模块 + +提供多渠道通知推送功能,包括: +- 飞书、钉钉、企业微信 +- Telegram、Slack +- Email、ntfy、Bark + +模块结构: +- push_manager: 推送记录管理 +- formatters: 内容格式转换 +- batch: 批次处理工具 +- renderer: 通知内容渲染 +- splitter: 消息分批拆分 +- senders: 消息发送器(各渠道发送函数) +- dispatcher: 多账号通知调度器 +""" + +from trendradar.notification.push_manager import PushRecordManager +from trendradar.notification.formatters import ( + strip_markdown, + convert_markdown_to_mrkdwn, +) +from trendradar.notification.batch import ( + get_batch_header, + get_max_batch_header_size, + truncate_to_bytes, + add_batch_headers, +) +from trendradar.notification.renderer import ( + render_feishu_content, + render_dingtalk_content, +) +from trendradar.notification.splitter import ( + split_content_into_batches, + DEFAULT_BATCH_SIZES, +) +from trendradar.notification.senders import ( + send_to_feishu, + send_to_dingtalk, + send_to_wework, + send_to_telegram, + send_to_email, + send_to_ntfy, + send_to_bark, + send_to_slack, + SMTP_CONFIGS, +) +from trendradar.notification.dispatcher import NotificationDispatcher + +__all__ = [ + # 推送记录管理 + "PushRecordManager", + # 格式转换 + "strip_markdown", + "convert_markdown_to_mrkdwn", + # 批次处理 + "get_batch_header", + "get_max_batch_header_size", + "truncate_to_bytes", + "add_batch_headers", + # 内容渲染 + "render_feishu_content", + "render_dingtalk_content", + # 消息分批 + "split_content_into_batches", + "DEFAULT_BATCH_SIZES", + # 消息发送器 + "send_to_feishu", + "send_to_dingtalk", + "send_to_wework", + 
"send_to_telegram", + "send_to_email", + "send_to_ntfy", + "send_to_bark", + "send_to_slack", + "SMTP_CONFIGS", + # 通知调度器 + "NotificationDispatcher", +] diff --git a/trendradar/notification/batch.py b/trendradar/notification/batch.py new file mode 100644 index 0000000..889b394 --- /dev/null +++ b/trendradar/notification/batch.py @@ -0,0 +1,115 @@ +# coding=utf-8 +""" +批次处理模块 + +提供消息分批发送的辅助函数 +""" + +from typing import List + + +def get_batch_header(format_type: str, batch_num: int, total_batches: int) -> str: + """根据 format_type 生成对应格式的批次头部 + + Args: + format_type: 推送类型(telegram, slack, wework_text, bark, feishu, dingtalk, ntfy, wework) + batch_num: 当前批次编号 + total_batches: 总批次数 + + Returns: + 格式化的批次头部字符串 + """ + if format_type == "telegram": + return f"[第 {batch_num}/{total_batches} 批次]\n\n" + elif format_type == "slack": + return f"*[第 {batch_num}/{total_batches} 批次]*\n\n" + elif format_type in ("wework_text", "bark"): + # 企业微信文本模式和 Bark 使用纯文本格式 + return f"[第 {batch_num}/{total_batches} 批次]\n\n" + else: + # 飞书、钉钉、ntfy、企业微信 markdown 模式 + return f"**[第 {batch_num}/{total_batches} 批次]**\n\n" + + +def get_max_batch_header_size(format_type: str) -> int: + """估算批次头部的最大字节数(假设最多 99 批次) + + 用于在分批时预留空间,避免事后截断破坏内容完整性。 + + Args: + format_type: 推送类型 + + Returns: + 最大头部字节数 + """ + # 生成最坏情况的头部(99/99 批次) + max_header = get_batch_header(format_type, 99, 99) + return len(max_header.encode("utf-8")) + + +def truncate_to_bytes(text: str, max_bytes: int) -> str: + """安全截断字符串到指定字节数,避免截断多字节字符 + + Args: + text: 要截断的文本 + max_bytes: 最大字节数 + + Returns: + 截断后的文本 + """ + text_bytes = text.encode("utf-8") + if len(text_bytes) <= max_bytes: + return text + + # 截断到指定字节数 + truncated = text_bytes[:max_bytes] + + # 处理可能的不完整 UTF-8 字符 + for i in range(min(4, len(truncated))): + try: + return truncated[: len(truncated) - i].decode("utf-8") + except UnicodeDecodeError: + continue + + # 极端情况:返回空字符串 + return "" + + +def add_batch_headers( + batches: List[str], format_type: str, max_bytes: int +) -> List[str]: + """为批次添加头部,动态计算确保总大小不超过限制 + + Args: + batches: 原始批次列表 + format_type: 推送类型(bark, telegram, feishu 等) + max_bytes: 该推送类型的最大字节限制 + + Returns: + 添加头部后的批次列表 + """ + if len(batches) <= 1: + return batches + + total = len(batches) + result = [] + + for i, content in enumerate(batches, 1): + # 生成批次头部 + header = get_batch_header(format_type, i, total) + header_size = len(header.encode("utf-8")) + + # 动态计算允许的最大内容大小 + max_content_size = max_bytes - header_size + content_size = len(content.encode("utf-8")) + + # 如果超出,截断到安全大小 + if content_size > max_content_size: + print( + f"警告:{format_type} 第 {i}/{total} 批次内容({content_size}字节) + 头部({header_size}字节) 超出限制({max_bytes}字节),截断到 {max_content_size} 字节" + ) + content = truncate_to_bytes(content, max_content_size) + + result.append(header + content) + + return result diff --git a/trendradar/notification/dispatcher.py b/trendradar/notification/dispatcher.py new file mode 100644 index 0000000..ae1197f --- /dev/null +++ b/trendradar/notification/dispatcher.py @@ -0,0 +1,420 @@ +# coding=utf-8 +""" +通知调度器模块 + +提供统一的通知分发接口。 +支持所有通知渠道的多账号配置,使用 `;` 分隔多个账号。 + +使用示例: + dispatcher = NotificationDispatcher(config, get_time_func, split_content_func) + results = dispatcher.dispatch_all(report_data, report_type, ...) 
+""" + +from typing import Any, Callable, Dict, List, Optional + +from trendradar.core.config import ( + get_account_at_index, + limit_accounts, + parse_multi_account_config, + validate_paired_configs, +) + +from .senders import ( + send_to_bark, + send_to_dingtalk, + send_to_email, + send_to_feishu, + send_to_ntfy, + send_to_slack, + send_to_telegram, + send_to_wework, +) + + +class NotificationDispatcher: + """ + 统一的多账号通知调度器 + + 将多账号发送逻辑封装,提供简洁的 dispatch_all 接口。 + 内部处理账号解析、数量限制、配对验证等逻辑。 + """ + + def __init__( + self, + config: Dict[str, Any], + get_time_func: Callable, + split_content_func: Callable, + ): + """ + 初始化通知调度器 + + Args: + config: 完整的配置字典,包含所有通知渠道的配置 + get_time_func: 获取当前时间的函数 + split_content_func: 内容分批函数 + """ + self.config = config + self.get_time_func = get_time_func + self.split_content_func = split_content_func + self.max_accounts = config.get("MAX_ACCOUNTS_PER_CHANNEL", 3) + + def dispatch_all( + self, + report_data: Dict, + report_type: str, + update_info: Optional[Dict] = None, + proxy_url: Optional[str] = None, + mode: str = "daily", + html_file_path: Optional[str] = None, + ) -> Dict[str, bool]: + """ + 分发通知到所有已配置的渠道 + + Args: + report_data: 报告数据(由 prepare_report_data 生成) + report_type: 报告类型(如 "当日汇总"、"实时增量") + update_info: 版本更新信息(可选) + proxy_url: 代理 URL(可选) + mode: 报告模式 (daily/current/incremental) + html_file_path: HTML 报告文件路径(邮件使用) + + Returns: + Dict[str, bool]: 每个渠道的发送结果,key 为渠道名,value 为是否成功 + """ + results = {} + + # 飞书 + if self.config.get("FEISHU_WEBHOOK_URL"): + results["feishu"] = self._send_feishu( + report_data, report_type, update_info, proxy_url, mode + ) + + # 钉钉 + if self.config.get("DINGTALK_WEBHOOK_URL"): + results["dingtalk"] = self._send_dingtalk( + report_data, report_type, update_info, proxy_url, mode + ) + + # 企业微信 + if self.config.get("WEWORK_WEBHOOK_URL"): + results["wework"] = self._send_wework( + report_data, report_type, update_info, proxy_url, mode + ) + + # Telegram(需要配对验证) + if self.config.get("TELEGRAM_BOT_TOKEN") and self.config.get("TELEGRAM_CHAT_ID"): + results["telegram"] = self._send_telegram( + report_data, report_type, update_info, proxy_url, mode + ) + + # ntfy(需要配对验证) + if self.config.get("NTFY_SERVER_URL") and self.config.get("NTFY_TOPIC"): + results["ntfy"] = self._send_ntfy( + report_data, report_type, update_info, proxy_url, mode + ) + + # Bark + if self.config.get("BARK_URL"): + results["bark"] = self._send_bark( + report_data, report_type, update_info, proxy_url, mode + ) + + # Slack + if self.config.get("SLACK_WEBHOOK_URL"): + results["slack"] = self._send_slack( + report_data, report_type, update_info, proxy_url, mode + ) + + # 邮件(保持原有逻辑,已支持多收件人) + if ( + self.config.get("EMAIL_FROM") + and self.config.get("EMAIL_PASSWORD") + and self.config.get("EMAIL_TO") + ): + results["email"] = self._send_email(report_type, html_file_path) + + return results + + def _send_to_multi_accounts( + self, + channel_name: str, + config_value: str, + send_func: Callable[..., bool], + **kwargs, + ) -> bool: + """ + 通用多账号发送逻辑 + + Args: + channel_name: 渠道名称(用于日志和账号数量限制提示) + config_value: 配置值(可能包含多个账号,用 ; 分隔) + send_func: 发送函数,签名为 (account, account_label=..., **kwargs) -> bool + **kwargs: 传递给发送函数的其他参数 + + Returns: + bool: 任一账号发送成功则返回 True + """ + accounts = parse_multi_account_config(config_value) + if not accounts: + return False + + accounts = limit_accounts(accounts, self.max_accounts, channel_name) + results = [] + + for i, account in enumerate(accounts): + if account: + account_label = f"账号{i+1}" if len(accounts) > 1 else "" + result = 
send_func(account, account_label=account_label, **kwargs) + results.append(result) + + return any(results) if results else False + + def _send_feishu( + self, + report_data: Dict, + report_type: str, + update_info: Optional[Dict], + proxy_url: Optional[str], + mode: str, + ) -> bool: + """发送到飞书(多账号)""" + return self._send_to_multi_accounts( + channel_name="飞书", + config_value=self.config["FEISHU_WEBHOOK_URL"], + send_func=lambda url, account_label: send_to_feishu( + webhook_url=url, + report_data=report_data, + report_type=report_type, + update_info=update_info, + proxy_url=proxy_url, + mode=mode, + account_label=account_label, + batch_size=self.config.get("FEISHU_BATCH_SIZE", 29000), + batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0), + split_content_func=self.split_content_func, + get_time_func=self.get_time_func, + ), + ) + + def _send_dingtalk( + self, + report_data: Dict, + report_type: str, + update_info: Optional[Dict], + proxy_url: Optional[str], + mode: str, + ) -> bool: + """发送到钉钉(多账号)""" + return self._send_to_multi_accounts( + channel_name="钉钉", + config_value=self.config["DINGTALK_WEBHOOK_URL"], + send_func=lambda url, account_label: send_to_dingtalk( + webhook_url=url, + report_data=report_data, + report_type=report_type, + update_info=update_info, + proxy_url=proxy_url, + mode=mode, + account_label=account_label, + batch_size=self.config.get("DINGTALK_BATCH_SIZE", 20000), + batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0), + split_content_func=self.split_content_func, + ), + ) + + def _send_wework( + self, + report_data: Dict, + report_type: str, + update_info: Optional[Dict], + proxy_url: Optional[str], + mode: str, + ) -> bool: + """发送到企业微信(多账号)""" + return self._send_to_multi_accounts( + channel_name="企业微信", + config_value=self.config["WEWORK_WEBHOOK_URL"], + send_func=lambda url, account_label: send_to_wework( + webhook_url=url, + report_data=report_data, + report_type=report_type, + update_info=update_info, + proxy_url=proxy_url, + mode=mode, + account_label=account_label, + batch_size=self.config.get("MESSAGE_BATCH_SIZE", 4000), + batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0), + msg_type=self.config.get("WEWORK_MSG_TYPE", "markdown"), + split_content_func=self.split_content_func, + ), + ) + + def _send_telegram( + self, + report_data: Dict, + report_type: str, + update_info: Optional[Dict], + proxy_url: Optional[str], + mode: str, + ) -> bool: + """发送到 Telegram(多账号,需验证 token 和 chat_id 配对)""" + telegram_tokens = parse_multi_account_config(self.config["TELEGRAM_BOT_TOKEN"]) + telegram_chat_ids = parse_multi_account_config(self.config["TELEGRAM_CHAT_ID"]) + + if not telegram_tokens or not telegram_chat_ids: + return False + + # 验证配对 + valid, count = validate_paired_configs( + {"bot_token": telegram_tokens, "chat_id": telegram_chat_ids}, + "Telegram", + required_keys=["bot_token", "chat_id"], + ) + if not valid or count == 0: + return False + + # 限制账号数量 + telegram_tokens = limit_accounts(telegram_tokens, self.max_accounts, "Telegram") + telegram_chat_ids = telegram_chat_ids[: len(telegram_tokens)] + + results = [] + for i in range(len(telegram_tokens)): + token = telegram_tokens[i] + chat_id = telegram_chat_ids[i] + if token and chat_id: + account_label = f"账号{i+1}" if len(telegram_tokens) > 1 else "" + result = send_to_telegram( + bot_token=token, + chat_id=chat_id, + report_data=report_data, + report_type=report_type, + update_info=update_info, + proxy_url=proxy_url, + mode=mode, + account_label=account_label, + 
batch_size=self.config.get("MESSAGE_BATCH_SIZE", 4000), + batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0), + split_content_func=self.split_content_func, + ) + results.append(result) + + return any(results) if results else False + + def _send_ntfy( + self, + report_data: Dict, + report_type: str, + update_info: Optional[Dict], + proxy_url: Optional[str], + mode: str, + ) -> bool: + """发送到 ntfy(多账号,需验证 topic 和 token 配对)""" + ntfy_server_url = self.config["NTFY_SERVER_URL"] + ntfy_topics = parse_multi_account_config(self.config["NTFY_TOPIC"]) + ntfy_tokens = parse_multi_account_config(self.config.get("NTFY_TOKEN", "")) + + if not ntfy_server_url or not ntfy_topics: + return False + + # 验证 token 和 topic 数量一致(如果配置了 token) + if ntfy_tokens and len(ntfy_tokens) != len(ntfy_topics): + print( + f"❌ ntfy 配置错误:topic 数量({len(ntfy_topics)})与 token 数量({len(ntfy_tokens)})不一致,跳过 ntfy 推送" + ) + return False + + # 限制账号数量 + ntfy_topics = limit_accounts(ntfy_topics, self.max_accounts, "ntfy") + if ntfy_tokens: + ntfy_tokens = ntfy_tokens[: len(ntfy_topics)] + + results = [] + for i, topic in enumerate(ntfy_topics): + if topic: + token = get_account_at_index(ntfy_tokens, i, "") if ntfy_tokens else "" + account_label = f"账号{i+1}" if len(ntfy_topics) > 1 else "" + result = send_to_ntfy( + server_url=ntfy_server_url, + topic=topic, + token=token, + report_data=report_data, + report_type=report_type, + update_info=update_info, + proxy_url=proxy_url, + mode=mode, + account_label=account_label, + batch_size=3800, + split_content_func=self.split_content_func, + ) + results.append(result) + + return any(results) if results else False + + def _send_bark( + self, + report_data: Dict, + report_type: str, + update_info: Optional[Dict], + proxy_url: Optional[str], + mode: str, + ) -> bool: + """发送到 Bark(多账号)""" + return self._send_to_multi_accounts( + channel_name="Bark", + config_value=self.config["BARK_URL"], + send_func=lambda url, account_label: send_to_bark( + bark_url=url, + report_data=report_data, + report_type=report_type, + update_info=update_info, + proxy_url=proxy_url, + mode=mode, + account_label=account_label, + batch_size=self.config.get("BARK_BATCH_SIZE", 3600), + batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0), + split_content_func=self.split_content_func, + ), + ) + + def _send_slack( + self, + report_data: Dict, + report_type: str, + update_info: Optional[Dict], + proxy_url: Optional[str], + mode: str, + ) -> bool: + """发送到 Slack(多账号)""" + return self._send_to_multi_accounts( + channel_name="Slack", + config_value=self.config["SLACK_WEBHOOK_URL"], + send_func=lambda url, account_label: send_to_slack( + webhook_url=url, + report_data=report_data, + report_type=report_type, + update_info=update_info, + proxy_url=proxy_url, + mode=mode, + account_label=account_label, + batch_size=self.config.get("SLACK_BATCH_SIZE", 4000), + batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0), + split_content_func=self.split_content_func, + ), + ) + + def _send_email( + self, + report_type: str, + html_file_path: Optional[str], + ) -> bool: + """发送邮件(保持原有逻辑,已支持多收件人)""" + return send_to_email( + from_email=self.config["EMAIL_FROM"], + password=self.config["EMAIL_PASSWORD"], + to_email=self.config["EMAIL_TO"], + report_type=report_type, + html_file_path=html_file_path, + custom_smtp_server=self.config.get("EMAIL_SMTP_SERVER", ""), + custom_smtp_port=self.config.get("EMAIL_SMTP_PORT", ""), + get_time_func=self.get_time_func, + ) diff --git a/trendradar/notification/formatters.py 
b/trendradar/notification/formatters.py new file mode 100644 index 0000000..fa7d7e4 --- /dev/null +++ b/trendradar/notification/formatters.py @@ -0,0 +1,80 @@ +# coding=utf-8 +""" +通知内容格式转换模块 + +提供不同推送平台间的格式转换功能 +""" + +import re + + +def strip_markdown(text: str) -> str: + """去除文本中的 markdown 语法格式,用于个人微信推送 + + Args: + text: 包含 markdown 格式的文本 + + Returns: + 纯文本内容 + """ + # 去除粗体 **text** 或 __text__ + text = re.sub(r'\*\*(.+?)\*\*', r'\1', text) + text = re.sub(r'__(.+?)__', r'\1', text) + + # 去除斜体 *text* 或 _text_ + text = re.sub(r'\*(.+?)\*', r'\1', text) + text = re.sub(r'_(.+?)_', r'\1', text) + + # 去除删除线 ~~text~~ + text = re.sub(r'~~(.+?)~~', r'\1', text) + + # 转换链接 [text](url) -> text url(保留 URL) + text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1 \2', text) + + # 去除图片 ![alt](url) -> alt + text = re.sub(r'!\[(.+?)\]\(.+?\)', r'\1', text) + + # 去除行内代码 `code` + text = re.sub(r'`(.+?)`', r'\1', text) + + # 去除引用符号 > + text = re.sub(r'^>\s*', '', text, flags=re.MULTILINE) + + # 去除标题符号 # ## ### 等 + text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE) + + # 去除水平分割线 --- 或 *** + text = re.sub(r'^[\-\*]{3,}\s*$', '', text, flags=re.MULTILINE) + + # 去除 HTML 标签 text -> text + text = re.sub(r']*>(.+?)', r'\1', text) + text = re.sub(r'<[^>]+>', '', text) + + # 清理多余的空行(保留最多两个连续空行) + text = re.sub(r'\n{3,}', '\n\n', text) + + return text.strip() + + +def convert_markdown_to_mrkdwn(content: str) -> str: + """ + 将标准 Markdown 转换为 Slack 的 mrkdwn 格式 + + 转换规则: + - **粗体** → *粗体* + - [文本](url) → + - 保留其他格式(代码块、列表等) + + Args: + content: Markdown 格式的内容 + + Returns: + Slack mrkdwn 格式的内容 + """ + # 1. 转换链接格式: [文本](url) → + content = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<\2|\1>', content) + + # 2. 转换粗体: **文本** → *文本* + content = re.sub(r'\*\*([^*]+)\*\*', r'*\1*', content) + + return content diff --git a/trendradar/notification/push_manager.py b/trendradar/notification/push_manager.py new file mode 100644 index 0000000..26a3da5 --- /dev/null +++ b/trendradar/notification/push_manager.py @@ -0,0 +1,109 @@ +# coding=utf-8 +""" +推送记录管理模块 + +管理推送记录,支持每日只推送一次和时间窗口控制 +通过 storage_backend 统一存储,支持本地 SQLite 和远程云存储 +""" + +from datetime import datetime +from typing import Callable, Optional, Any + +import pytz + + +class PushRecordManager: + """ + 推送记录管理器 + + 通过 storage_backend 统一管理推送记录: + - 本地环境:使用 LocalStorageBackend,数据存储在本地 SQLite + - GitHub Actions:使用 RemoteStorageBackend,数据存储在云端 + + 这样 once_per_day 功能在 GitHub Actions 上也能正常工作。 + """ + + def __init__( + self, + storage_backend: Any, + get_time_func: Optional[Callable[[], datetime]] = None, + ): + """ + 初始化推送记录管理器 + + Args: + storage_backend: 存储后端实例(LocalStorageBackend 或 RemoteStorageBackend) + get_time_func: 获取当前时间的函数(应使用配置的时区) + """ + self.storage_backend = storage_backend + self.get_time = get_time_func or self._default_get_time + + print(f"[推送记录] 使用 {storage_backend.backend_name} 存储后端") + + def _default_get_time(self) -> datetime: + """默认时间获取函数(UTC+8)""" + return datetime.now(pytz.timezone("Asia/Shanghai")) + + def has_pushed_today(self) -> bool: + """ + 检查今天是否已经推送过 + + Returns: + 是否已推送 + """ + return self.storage_backend.has_pushed_today() + + def record_push(self, report_type: str) -> bool: + """ + 记录推送 + + Args: + report_type: 报告类型 + + Returns: + 是否记录成功 + """ + return self.storage_backend.record_push(report_type) + + def is_in_time_range(self, start_time: str, end_time: str) -> bool: + """ + 检查当前时间是否在指定时间范围内 + + Args: + start_time: 开始时间(格式:HH:MM) + end_time: 结束时间(格式:HH:MM) + + Returns: + 是否在时间范围内 + """ + now = self.get_time() + current_time = now.strftime("%H:%M") + + 
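+        # The window check compares zero-padded "HH:MM" strings, so lexicographic
+        # order matches clock order; normalize_time below pads inputs such as
+        # "8:5" to "08:05" before the comparison.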
def normalize_time(time_str: str) -> str: + """将时间字符串标准化为 HH:MM 格式""" + try: + parts = time_str.strip().split(":") + if len(parts) != 2: + raise ValueError(f"时间格式错误: {time_str}") + + hour = int(parts[0]) + minute = int(parts[1]) + + if not (0 <= hour <= 23 and 0 <= minute <= 59): + raise ValueError(f"时间范围错误: {time_str}") + + return f"{hour:02d}:{minute:02d}" + except Exception as e: + print(f"时间格式化错误 '{time_str}': {e}") + return time_str + + normalized_start = normalize_time(start_time) + normalized_end = normalize_time(end_time) + normalized_current = normalize_time(current_time) + + result = normalized_start <= normalized_current <= normalized_end + + if not result: + print(f"时间窗口判断:当前 {normalized_current},窗口 {normalized_start}-{normalized_end}") + + return result diff --git a/trendradar/notification/renderer.py b/trendradar/notification/renderer.py new file mode 100644 index 0000000..dcee02a --- /dev/null +++ b/trendradar/notification/renderer.py @@ -0,0 +1,260 @@ +# coding=utf-8 +""" +通知内容渲染模块 + +提供多平台通知内容渲染功能,生成格式化的推送消息 +""" + +from datetime import datetime +from typing import Dict, List, Optional, Callable + +from trendradar.report.formatter import format_title_for_platform + + +def render_feishu_content( + report_data: Dict, + update_info: Optional[Dict] = None, + mode: str = "daily", + separator: str = "---", + reverse_content_order: bool = False, + get_time_func: Optional[Callable[[], datetime]] = None, +) -> str: + """渲染飞书通知内容 + + Args: + report_data: 报告数据字典,包含 stats, new_titles, failed_ids, total_new_count + update_info: 版本更新信息(可选) + mode: 报告模式 ("daily", "incremental", "current") + separator: 内容分隔符 + reverse_content_order: 是否反转内容顺序(新增在前) + get_time_func: 获取当前时间的函数(可选,默认使用 datetime.now()) + + Returns: + 格式化的飞书消息内容 + """ + # 生成热点词汇统计部分 + stats_content = "" + if report_data["stats"]: + stats_content += "📊 **热点词汇统计**\n\n" + + total_count = len(report_data["stats"]) + + for i, stat in enumerate(report_data["stats"]): + word = stat["word"] + count = stat["count"] + + sequence_display = f"[{i + 1}/{total_count}]" + + if count >= 10: + stats_content += f"🔥 {sequence_display} **{word}** : {count} 条\n\n" + elif count >= 5: + stats_content += f"📈 {sequence_display} **{word}** : {count} 条\n\n" + else: + stats_content += f"📌 {sequence_display} **{word}** : {count} 条\n\n" + + for j, title_data in enumerate(stat["titles"], 1): + formatted_title = format_title_for_platform( + "feishu", title_data, show_source=True + ) + stats_content += f" {j}. {formatted_title}\n" + + if j < len(stat["titles"]): + stats_content += "\n" + + if i < len(report_data["stats"]) - 1: + stats_content += f"\n{separator}\n\n" + + # 生成新增新闻部分 + new_titles_content = "" + if report_data["new_titles"]: + new_titles_content += ( + f"🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n" + ) + + for source_data in report_data["new_titles"]: + new_titles_content += ( + f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n" + ) + + for j, title_data in enumerate(source_data["titles"], 1): + title_data_copy = title_data.copy() + title_data_copy["is_new"] = False + formatted_title = format_title_for_platform( + "feishu", title_data_copy, show_source=False + ) + new_titles_content += f" {j}. 
{formatted_title}\n" + + new_titles_content += "\n" + + # 根据配置决定内容顺序 + text_content = "" + if reverse_content_order: + # 新增热点在前,热点词汇统计在后 + if new_titles_content: + text_content += new_titles_content + if stats_content: + text_content += f"\n{separator}\n\n" + if stats_content: + text_content += stats_content + else: + # 默认:热点词汇统计在前,新增热点在后 + if stats_content: + text_content += stats_content + if new_titles_content: + text_content += f"\n{separator}\n\n" + if new_titles_content: + text_content += new_titles_content + + if not text_content: + if mode == "incremental": + mode_text = "增量模式下暂无新增匹配的热点词汇" + elif mode == "current": + mode_text = "当前榜单模式下暂无匹配的热点词汇" + else: + mode_text = "暂无匹配的热点词汇" + text_content = f"📭 {mode_text}\n\n" + + if report_data["failed_ids"]: + if text_content and "暂无匹配" not in text_content: + text_content += f"\n{separator}\n\n" + + text_content += "⚠️ **数据获取失败的平台:**\n\n" + for i, id_value in enumerate(report_data["failed_ids"], 1): + text_content += f" • {id_value}\n" + + # 获取当前时间 + now = get_time_func() if get_time_func else datetime.now() + text_content += ( + f"\n\n更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}" + ) + + if update_info: + text_content += f"\nTrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}" + + return text_content + + +def render_dingtalk_content( + report_data: Dict, + update_info: Optional[Dict] = None, + mode: str = "daily", + reverse_content_order: bool = False, + get_time_func: Optional[Callable[[], datetime]] = None, +) -> str: + """渲染钉钉通知内容 + + Args: + report_data: 报告数据字典,包含 stats, new_titles, failed_ids, total_new_count + update_info: 版本更新信息(可选) + mode: 报告模式 ("daily", "incremental", "current") + reverse_content_order: 是否反转内容顺序(新增在前) + get_time_func: 获取当前时间的函数(可选,默认使用 datetime.now()) + + Returns: + 格式化的钉钉消息内容 + """ + total_titles = sum( + len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0 + ) + now = get_time_func() if get_time_func else datetime.now() + + # 头部信息 + header_content = f"**总新闻数:** {total_titles}\n\n" + header_content += f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n" + header_content += "**类型:** 热点分析报告\n\n" + header_content += "---\n\n" + + # 生成热点词汇统计部分 + stats_content = "" + if report_data["stats"]: + stats_content += "📊 **热点词汇统计**\n\n" + + total_count = len(report_data["stats"]) + + for i, stat in enumerate(report_data["stats"]): + word = stat["word"] + count = stat["count"] + + sequence_display = f"[{i + 1}/{total_count}]" + + if count >= 10: + stats_content += f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n" + elif count >= 5: + stats_content += f"📈 {sequence_display} **{word}** : **{count}** 条\n\n" + else: + stats_content += f"📌 {sequence_display} **{word}** : {count} 条\n\n" + + for j, title_data in enumerate(stat["titles"], 1): + formatted_title = format_title_for_platform( + "dingtalk", title_data, show_source=True + ) + stats_content += f" {j}. 
{formatted_title}\n" + + if j < len(stat["titles"]): + stats_content += "\n" + + if i < len(report_data["stats"]) - 1: + stats_content += "\n---\n\n" + + # 生成新增新闻部分 + new_titles_content = "" + if report_data["new_titles"]: + new_titles_content += ( + f"🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n" + ) + + for source_data in report_data["new_titles"]: + new_titles_content += f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n" + + for j, title_data in enumerate(source_data["titles"], 1): + title_data_copy = title_data.copy() + title_data_copy["is_new"] = False + formatted_title = format_title_for_platform( + "dingtalk", title_data_copy, show_source=False + ) + new_titles_content += f" {j}. {formatted_title}\n" + + new_titles_content += "\n" + + # 根据配置决定内容顺序 + text_content = header_content + if reverse_content_order: + # 新增热点在前,热点词汇统计在后 + if new_titles_content: + text_content += new_titles_content + if stats_content: + text_content += "\n---\n\n" + if stats_content: + text_content += stats_content + else: + # 默认:热点词汇统计在前,新增热点在后 + if stats_content: + text_content += stats_content + if new_titles_content: + text_content += "\n---\n\n" + if new_titles_content: + text_content += new_titles_content + + if not stats_content and not new_titles_content: + if mode == "incremental": + mode_text = "增量模式下暂无新增匹配的热点词汇" + elif mode == "current": + mode_text = "当前榜单模式下暂无匹配的热点词汇" + else: + mode_text = "暂无匹配的热点词汇" + text_content += f"📭 {mode_text}\n\n" + + if report_data["failed_ids"]: + if "暂无匹配" not in text_content: + text_content += "\n---\n\n" + + text_content += "⚠️ **数据获取失败的平台:**\n\n" + for i, id_value in enumerate(report_data["failed_ids"], 1): + text_content += f" • **{id_value}**\n" + + text_content += f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}" + + if update_info: + text_content += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**" + + return text_content diff --git a/trendradar/notification/senders.py b/trendradar/notification/senders.py new file mode 100644 index 0000000..a295eb1 --- /dev/null +++ b/trendradar/notification/senders.py @@ -0,0 +1,1033 @@ +# coding=utf-8 +""" +消息发送器模块 + +将报告数据发送到各种通知渠道: +- 飞书 (Feishu/Lark) +- 钉钉 (DingTalk) +- 企业微信 (WeCom/WeWork) +- Telegram +- 邮件 (Email) +- ntfy +- Bark +- Slack + +每个发送函数都支持分批发送,并通过参数化配置实现与 CONFIG 的解耦。 +""" + +import smtplib +import time +from datetime import datetime +from email.header import Header +from email.mime.multipart import MIMEMultipart +from email.mime.text import MIMEText +from email.utils import formataddr, formatdate, make_msgid +from pathlib import Path +from typing import Callable, Dict, List, Optional +from urllib.parse import urlparse + +import requests + +from .batch import add_batch_headers, get_max_batch_header_size +from .formatters import convert_markdown_to_mrkdwn, strip_markdown + + +# === SMTP 邮件配置 === +SMTP_CONFIGS = { + # Gmail(使用 STARTTLS) + "gmail.com": {"server": "smtp.gmail.com", "port": 587, "encryption": "TLS"}, + # QQ邮箱(使用 SSL,更稳定) + "qq.com": {"server": "smtp.qq.com", "port": 465, "encryption": "SSL"}, + # Outlook(使用 STARTTLS) + "outlook.com": {"server": "smtp-mail.outlook.com", "port": 587, "encryption": "TLS"}, + "hotmail.com": {"server": "smtp-mail.outlook.com", "port": 587, "encryption": "TLS"}, + "live.com": {"server": "smtp-mail.outlook.com", "port": 587, "encryption": "TLS"}, + # 网易邮箱(使用 SSL,更稳定) + "163.com": {"server": "smtp.163.com", "port": 465, "encryption": "SSL"}, + "126.com": {"server": "smtp.126.com", "port": 
465, "encryption": "SSL"}, + # 新浪邮箱(使用 SSL) + "sina.com": {"server": "smtp.sina.com", "port": 465, "encryption": "SSL"}, + # 搜狐邮箱(使用 SSL) + "sohu.com": {"server": "smtp.sohu.com", "port": 465, "encryption": "SSL"}, + # 天翼邮箱(使用 SSL) + "189.cn": {"server": "smtp.189.cn", "port": 465, "encryption": "SSL"}, + # 阿里云邮箱(使用 TLS) + "aliyun.com": {"server": "smtp.aliyun.com", "port": 465, "encryption": "TLS"}, +} + + +def send_to_feishu( + webhook_url: str, + report_data: Dict, + report_type: str, + update_info: Optional[Dict] = None, + proxy_url: Optional[str] = None, + mode: str = "daily", + account_label: str = "", + *, + batch_size: int = 29000, + batch_interval: float = 1.0, + split_content_func: Callable = None, + get_time_func: Callable = None, +) -> bool: + """ + 发送到飞书(支持分批发送) + + Args: + webhook_url: 飞书 Webhook URL + report_data: 报告数据 + report_type: 报告类型 + update_info: 更新信息(可选) + proxy_url: 代理 URL(可选) + mode: 报告模式 (daily/current) + account_label: 账号标签(多账号时显示) + batch_size: 批次大小(字节) + batch_interval: 批次发送间隔(秒) + split_content_func: 内容分批函数 + get_time_func: 获取当前时间的函数 + + Returns: + bool: 发送是否成功 + """ + headers = {"Content-Type": "application/json"} + proxies = None + if proxy_url: + proxies = {"http": proxy_url, "https": proxy_url} + + # 日志前缀 + log_prefix = f"飞书{account_label}" if account_label else "飞书" + + # 预留批次头部空间,避免添加头部后超限 + header_reserve = get_max_batch_header_size("feishu") + batches = split_content_func( + report_data, + "feishu", + update_info, + max_bytes=batch_size - header_reserve, + mode=mode, + ) + + # 统一添加批次头部(已预留空间,不会超限) + batches = add_batch_headers(batches, "feishu", batch_size) + + print(f"{log_prefix}消息分为 {len(batches)} 批次发送 [{report_type}]") + + # 逐批发送 + for i, batch_content in enumerate(batches, 1): + content_size = len(batch_content.encode("utf-8")) + print( + f"发送{log_prefix}第 {i}/{len(batches)} 批次,大小:{content_size} 字节 [{report_type}]" + ) + + total_titles = sum( + len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0 + ) + now = get_time_func() if get_time_func else datetime.now() + + payload = { + "msg_type": "text", + "content": { + "total_titles": total_titles, + "timestamp": now.strftime("%Y-%m-%d %H:%M:%S"), + "report_type": report_type, + "text": batch_content, + }, + } + + try: + response = requests.post( + webhook_url, headers=headers, json=payload, proxies=proxies, timeout=30 + ) + if response.status_code == 200: + result = response.json() + # 检查飞书的响应状态 + if result.get("StatusCode") == 0 or result.get("code") == 0: + print(f"{log_prefix}第 {i}/{len(batches)} 批次发送成功 [{report_type}]") + # 批次间间隔 + if i < len(batches): + time.sleep(batch_interval) + else: + error_msg = result.get("msg") or result.get("StatusMessage", "未知错误") + print( + f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{error_msg}" + ) + return False + else: + print( + f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],状态码:{response.status_code}" + ) + return False + except Exception as e: + print(f"{log_prefix}第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}") + return False + + print(f"{log_prefix}所有 {len(batches)} 批次发送完成 [{report_type}]") + return True + + +def send_to_dingtalk( + webhook_url: str, + report_data: Dict, + report_type: str, + update_info: Optional[Dict] = None, + proxy_url: Optional[str] = None, + mode: str = "daily", + account_label: str = "", + *, + batch_size: int = 20000, + batch_interval: float = 1.0, + split_content_func: Callable = None, +) -> bool: + """ + 发送到钉钉(支持分批发送) + + Args: + webhook_url: 钉钉 Webhook URL + report_data: 报告数据 + 
report_type: 报告类型 + update_info: 更新信息(可选) + proxy_url: 代理 URL(可选) + mode: 报告模式 (daily/current) + account_label: 账号标签(多账号时显示) + batch_size: 批次大小(字节) + batch_interval: 批次发送间隔(秒) + split_content_func: 内容分批函数 + + Returns: + bool: 发送是否成功 + """ + headers = {"Content-Type": "application/json"} + proxies = None + if proxy_url: + proxies = {"http": proxy_url, "https": proxy_url} + + # 日志前缀 + log_prefix = f"钉钉{account_label}" if account_label else "钉钉" + + # 预留批次头部空间,避免添加头部后超限 + header_reserve = get_max_batch_header_size("dingtalk") + batches = split_content_func( + report_data, + "dingtalk", + update_info, + max_bytes=batch_size - header_reserve, + mode=mode, + ) + + # 统一添加批次头部(已预留空间,不会超限) + batches = add_batch_headers(batches, "dingtalk", batch_size) + + print(f"{log_prefix}消息分为 {len(batches)} 批次发送 [{report_type}]") + + # 逐批发送 + for i, batch_content in enumerate(batches, 1): + content_size = len(batch_content.encode("utf-8")) + print( + f"发送{log_prefix}第 {i}/{len(batches)} 批次,大小:{content_size} 字节 [{report_type}]" + ) + + payload = { + "msgtype": "markdown", + "markdown": { + "title": f"TrendRadar 热点分析报告 - {report_type}", + "text": batch_content, + }, + } + + try: + response = requests.post( + webhook_url, headers=headers, json=payload, proxies=proxies, timeout=30 + ) + if response.status_code == 200: + result = response.json() + if result.get("errcode") == 0: + print(f"{log_prefix}第 {i}/{len(batches)} 批次发送成功 [{report_type}]") + # 批次间间隔 + if i < len(batches): + time.sleep(batch_interval) + else: + print( + f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{result.get('errmsg')}" + ) + return False + else: + print( + f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],状态码:{response.status_code}" + ) + return False + except Exception as e: + print(f"{log_prefix}第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}") + return False + + print(f"{log_prefix}所有 {len(batches)} 批次发送完成 [{report_type}]") + return True + + +def send_to_wework( + webhook_url: str, + report_data: Dict, + report_type: str, + update_info: Optional[Dict] = None, + proxy_url: Optional[str] = None, + mode: str = "daily", + account_label: str = "", + *, + batch_size: int = 4000, + batch_interval: float = 1.0, + msg_type: str = "markdown", + split_content_func: Callable = None, +) -> bool: + """ + 发送到企业微信(支持分批发送,支持 markdown 和 text 两种格式) + + Args: + webhook_url: 企业微信 Webhook URL + report_data: 报告数据 + report_type: 报告类型 + update_info: 更新信息(可选) + proxy_url: 代理 URL(可选) + mode: 报告模式 (daily/current) + account_label: 账号标签(多账号时显示) + batch_size: 批次大小(字节) + batch_interval: 批次发送间隔(秒) + msg_type: 消息类型 (markdown/text) + split_content_func: 内容分批函数 + + Returns: + bool: 发送是否成功 + """ + headers = {"Content-Type": "application/json"} + proxies = None + if proxy_url: + proxies = {"http": proxy_url, "https": proxy_url} + + # 日志前缀 + log_prefix = f"企业微信{account_label}" if account_label else "企业微信" + + # 获取消息类型配置(markdown 或 text) + is_text_mode = msg_type.lower() == "text" + + if is_text_mode: + print(f"{log_prefix}使用 text 格式(个人微信模式)[{report_type}]") + else: + print(f"{log_prefix}使用 markdown 格式(群机器人模式)[{report_type}]") + + # text 模式使用 wework_text,markdown 模式使用 wework + header_format_type = "wework_text" if is_text_mode else "wework" + + # 获取分批内容,预留批次头部空间 + header_reserve = get_max_batch_header_size(header_format_type) + batches = split_content_func( + report_data, "wework", update_info, max_bytes=batch_size - header_reserve, mode=mode + ) + + # 统一添加批次头部(已预留空间,不会超限) + batches = add_batch_headers(batches, header_format_type, batch_size) + + 
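+    # Content was split to (batch_size - header_reserve) bytes, so prepending the
+    # batch header above cannot push any single message past batch_size.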
print(f"{log_prefix}消息分为 {len(batches)} 批次发送 [{report_type}]") + + # 逐批发送 + for i, batch_content in enumerate(batches, 1): + # 根据消息类型构建 payload + if is_text_mode: + # text 格式:去除 markdown 语法 + plain_content = strip_markdown(batch_content) + payload = {"msgtype": "text", "text": {"content": plain_content}} + content_size = len(plain_content.encode("utf-8")) + else: + # markdown 格式:保持原样 + payload = {"msgtype": "markdown", "markdown": {"content": batch_content}} + content_size = len(batch_content.encode("utf-8")) + + print( + f"发送{log_prefix}第 {i}/{len(batches)} 批次,大小:{content_size} 字节 [{report_type}]" + ) + + try: + response = requests.post( + webhook_url, headers=headers, json=payload, proxies=proxies, timeout=30 + ) + if response.status_code == 200: + result = response.json() + if result.get("errcode") == 0: + print(f"{log_prefix}第 {i}/{len(batches)} 批次发送成功 [{report_type}]") + # 批次间间隔 + if i < len(batches): + time.sleep(batch_interval) + else: + print( + f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{result.get('errmsg')}" + ) + return False + else: + print( + f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],状态码:{response.status_code}" + ) + return False + except Exception as e: + print(f"{log_prefix}第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}") + return False + + print(f"{log_prefix}所有 {len(batches)} 批次发送完成 [{report_type}]") + return True + + +def send_to_telegram( + bot_token: str, + chat_id: str, + report_data: Dict, + report_type: str, + update_info: Optional[Dict] = None, + proxy_url: Optional[str] = None, + mode: str = "daily", + account_label: str = "", + *, + batch_size: int = 4000, + batch_interval: float = 1.0, + split_content_func: Callable = None, +) -> bool: + """ + 发送到 Telegram(支持分批发送) + + Args: + bot_token: Telegram Bot Token + chat_id: Telegram Chat ID + report_data: 报告数据 + report_type: 报告类型 + update_info: 更新信息(可选) + proxy_url: 代理 URL(可选) + mode: 报告模式 (daily/current) + account_label: 账号标签(多账号时显示) + batch_size: 批次大小(字节) + batch_interval: 批次发送间隔(秒) + split_content_func: 内容分批函数 + + Returns: + bool: 发送是否成功 + """ + headers = {"Content-Type": "application/json"} + url = f"https://api.telegram.org/bot{bot_token}/sendMessage" + + proxies = None + if proxy_url: + proxies = {"http": proxy_url, "https": proxy_url} + + # 日志前缀 + log_prefix = f"Telegram{account_label}" if account_label else "Telegram" + + # 获取分批内容,预留批次头部空间 + header_reserve = get_max_batch_header_size("telegram") + batches = split_content_func( + report_data, "telegram", update_info, max_bytes=batch_size - header_reserve, mode=mode + ) + + # 统一添加批次头部(已预留空间,不会超限) + batches = add_batch_headers(batches, "telegram", batch_size) + + print(f"{log_prefix}消息分为 {len(batches)} 批次发送 [{report_type}]") + + # 逐批发送 + for i, batch_content in enumerate(batches, 1): + content_size = len(batch_content.encode("utf-8")) + print( + f"发送{log_prefix}第 {i}/{len(batches)} 批次,大小:{content_size} 字节 [{report_type}]" + ) + + payload = { + "chat_id": chat_id, + "text": batch_content, + "parse_mode": "HTML", + "disable_web_page_preview": True, + } + + try: + response = requests.post( + url, headers=headers, json=payload, proxies=proxies, timeout=30 + ) + if response.status_code == 200: + result = response.json() + if result.get("ok"): + print(f"{log_prefix}第 {i}/{len(batches)} 批次发送成功 [{report_type}]") + # 批次间间隔 + if i < len(batches): + time.sleep(batch_interval) + else: + print( + f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{result.get('description')}" + ) + return False + else: + print( + f"{log_prefix}第 
{i}/{len(batches)} 批次发送失败 [{report_type}],状态码:{response.status_code}" + ) + return False + except Exception as e: + print(f"{log_prefix}第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}") + return False + + print(f"{log_prefix}所有 {len(batches)} 批次发送完成 [{report_type}]") + return True + + +def send_to_email( + from_email: str, + password: str, + to_email: str, + report_type: str, + html_file_path: str, + custom_smtp_server: Optional[str] = None, + custom_smtp_port: Optional[int] = None, + *, + get_time_func: Callable = None, +) -> bool: + """ + 发送邮件通知 + + Args: + from_email: 发件人邮箱 + password: 邮箱密码/授权码 + to_email: 收件人邮箱(多个用逗号分隔) + report_type: 报告类型 + html_file_path: HTML 报告文件路径 + custom_smtp_server: 自定义 SMTP 服务器(可选) + custom_smtp_port: 自定义 SMTP 端口(可选) + get_time_func: 获取当前时间的函数 + + Returns: + bool: 发送是否成功 + """ + try: + if not html_file_path or not Path(html_file_path).exists(): + print(f"错误:HTML文件不存在或未提供: {html_file_path}") + return False + + print(f"使用HTML文件: {html_file_path}") + with open(html_file_path, "r", encoding="utf-8") as f: + html_content = f.read() + + domain = from_email.split("@")[-1].lower() + + if custom_smtp_server and custom_smtp_port: + # 使用自定义 SMTP 配置 + smtp_server = custom_smtp_server + smtp_port = int(custom_smtp_port) + # 根据端口判断加密方式:465=SSL, 587=TLS + if smtp_port == 465: + use_tls = False # SSL 模式(SMTP_SSL) + elif smtp_port == 587: + use_tls = True # TLS 模式(STARTTLS) + else: + # 其他端口优先尝试 TLS(更安全,更广泛支持) + use_tls = True + elif domain in SMTP_CONFIGS: + # 使用预设配置 + config = SMTP_CONFIGS[domain] + smtp_server = config["server"] + smtp_port = config["port"] + use_tls = config["encryption"] == "TLS" + else: + print(f"未识别的邮箱服务商: {domain},使用通用 SMTP 配置") + smtp_server = f"smtp.{domain}" + smtp_port = 587 + use_tls = True + + msg = MIMEMultipart("alternative") + + # 严格按照 RFC 标准设置 From header + sender_name = "TrendRadar" + msg["From"] = formataddr((sender_name, from_email)) + + # 设置收件人 + recipients = [addr.strip() for addr in to_email.split(",")] + if len(recipients) == 1: + msg["To"] = recipients[0] + else: + msg["To"] = ", ".join(recipients) + + # 设置邮件主题 + now = get_time_func() if get_time_func else datetime.now() + subject = f"TrendRadar 热点分析报告 - {report_type} - {now.strftime('%m月%d日 %H:%M')}" + msg["Subject"] = Header(subject, "utf-8") + + # 设置其他标准 header + msg["MIME-Version"] = "1.0" + msg["Date"] = formatdate(localtime=True) + msg["Message-ID"] = make_msgid() + + # 添加纯文本部分(作为备选) + text_content = f""" +TrendRadar 热点分析报告 +======================== +报告类型:{report_type} +生成时间:{now.strftime('%Y-%m-%d %H:%M:%S')} + +请使用支持HTML的邮件客户端查看完整报告内容。 + """ + text_part = MIMEText(text_content, "plain", "utf-8") + msg.attach(text_part) + + html_part = MIMEText(html_content, "html", "utf-8") + msg.attach(html_part) + + print(f"正在发送邮件到 {to_email}...") + print(f"SMTP 服务器: {smtp_server}:{smtp_port}") + print(f"发件人: {from_email}") + + try: + if use_tls: + # TLS 模式 + server = smtplib.SMTP(smtp_server, smtp_port, timeout=30) + server.set_debuglevel(0) # 设为1可以查看详细调试信息 + server.ehlo() + server.starttls() + server.ehlo() + else: + # SSL 模式 + server = smtplib.SMTP_SSL(smtp_server, smtp_port, timeout=30) + server.set_debuglevel(0) + server.ehlo() + + # 登录 + server.login(from_email, password) + + # 发送邮件 + server.send_message(msg) + server.quit() + + print(f"邮件发送成功 [{report_type}] -> {to_email}") + return True + + except smtplib.SMTPServerDisconnected: + print("邮件发送失败:服务器意外断开连接,请检查网络或稍后重试") + return False + + except smtplib.SMTPAuthenticationError as e: + print("邮件发送失败:认证错误,请检查邮箱和密码/授权码") + print(f"详细错误: 
{str(e)}") + return False + except smtplib.SMTPRecipientsRefused as e: + print(f"邮件发送失败:收件人地址被拒绝 {e}") + return False + except smtplib.SMTPSenderRefused as e: + print(f"邮件发送失败:发件人地址被拒绝 {e}") + return False + except smtplib.SMTPDataError as e: + print(f"邮件发送失败:邮件数据错误 {e}") + return False + except smtplib.SMTPConnectError as e: + print(f"邮件发送失败:无法连接到 SMTP 服务器 {smtp_server}:{smtp_port}") + print(f"详细错误: {str(e)}") + return False + except Exception as e: + print(f"邮件发送失败 [{report_type}]:{e}") + import traceback + traceback.print_exc() + return False + + +def send_to_ntfy( + server_url: str, + topic: str, + token: Optional[str], + report_data: Dict, + report_type: str, + update_info: Optional[Dict] = None, + proxy_url: Optional[str] = None, + mode: str = "daily", + account_label: str = "", + *, + batch_size: int = 3800, + split_content_func: Callable = None, +) -> bool: + """ + 发送到 ntfy(支持分批发送,严格遵守4KB限制) + + Args: + server_url: ntfy 服务器 URL + topic: ntfy 主题 + token: ntfy 访问令牌(可选) + report_data: 报告数据 + report_type: 报告类型 + update_info: 更新信息(可选) + proxy_url: 代理 URL(可选) + mode: 报告模式 (daily/current) + account_label: 账号标签(多账号时显示) + batch_size: 批次大小(字节) + split_content_func: 内容分批函数 + + Returns: + bool: 发送是否成功 + """ + # 日志前缀 + log_prefix = f"ntfy{account_label}" if account_label else "ntfy" + + # 避免 HTTP header 编码问题 + report_type_en_map = { + "当日汇总": "Daily Summary", + "当前榜单汇总": "Current Ranking", + "增量更新": "Incremental Update", + "实时增量": "Realtime Incremental", + "实时当前榜单": "Realtime Current Ranking", + } + report_type_en = report_type_en_map.get(report_type, "News Report") + + headers = { + "Content-Type": "text/plain; charset=utf-8", + "Markdown": "yes", + "Title": report_type_en, + "Priority": "default", + "Tags": "news", + } + + if token: + headers["Authorization"] = f"Bearer {token}" + + # 构建完整URL,确保格式正确 + base_url = server_url.rstrip("/") + if not base_url.startswith(("http://", "https://")): + base_url = f"https://{base_url}" + url = f"{base_url}/{topic}" + + proxies = None + if proxy_url: + proxies = {"http": proxy_url, "https": proxy_url} + + # 获取分批内容,预留批次头部空间 + header_reserve = get_max_batch_header_size("ntfy") + batches = split_content_func( + report_data, "ntfy", update_info, max_bytes=batch_size - header_reserve, mode=mode + ) + + # 统一添加批次头部(已预留空间,不会超限) + batches = add_batch_headers(batches, "ntfy", batch_size) + + total_batches = len(batches) + print(f"{log_prefix}消息分为 {total_batches} 批次发送 [{report_type}]") + + # 反转批次顺序,使得在ntfy客户端显示时顺序正确 + # ntfy显示最新消息在上面,所以我们从最后一批开始推送 + reversed_batches = list(reversed(batches)) + + print(f"{log_prefix}将按反向顺序推送(最后批次先推送),确保客户端显示顺序正确") + + # 逐批发送(反向顺序) + success_count = 0 + for idx, batch_content in enumerate(reversed_batches, 1): + # 计算正确的批次编号(用户视角的编号) + actual_batch_num = total_batches - idx + 1 + + content_size = len(batch_content.encode("utf-8")) + print( + f"发送{log_prefix}第 {actual_batch_num}/{total_batches} 批次(推送顺序: {idx}/{total_batches}),大小:{content_size} 字节 [{report_type}]" + ) + + # 检查消息大小,确保不超过4KB + if content_size > 4096: + print(f"警告:{log_prefix}第 {actual_batch_num} 批次消息过大({content_size} 字节),可能被拒绝") + + # 更新 headers 的批次标识 + current_headers = headers.copy() + if total_batches > 1: + current_headers["Title"] = f"{report_type_en} ({actual_batch_num}/{total_batches})" + + try: + response = requests.post( + url, + headers=current_headers, + data=batch_content.encode("utf-8"), + proxies=proxies, + timeout=30, + ) + + if response.status_code == 200: + print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送成功 [{report_type}]") + success_count += 
1 + if idx < total_batches: + # 公共服务器建议 2-3 秒,自托管可以更短 + interval = 2 if "ntfy.sh" in server_url else 1 + time.sleep(interval) + elif response.status_code == 429: + print( + f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次速率限制 [{report_type}],等待后重试" + ) + time.sleep(10) # 等待10秒后重试 + # 重试一次 + retry_response = requests.post( + url, + headers=current_headers, + data=batch_content.encode("utf-8"), + proxies=proxies, + timeout=30, + ) + if retry_response.status_code == 200: + print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次重试成功 [{report_type}]") + success_count += 1 + else: + print( + f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次重试失败,状态码:{retry_response.status_code}" + ) + elif response.status_code == 413: + print( + f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次消息过大被拒绝 [{report_type}],消息大小:{content_size} 字节" + ) + else: + print( + f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送失败 [{report_type}],状态码:{response.status_code}" + ) + try: + print(f"错误详情:{response.text}") + except: + pass + + except requests.exceptions.ConnectTimeout: + print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次连接超时 [{report_type}]") + except requests.exceptions.ReadTimeout: + print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次读取超时 [{report_type}]") + except requests.exceptions.ConnectionError as e: + print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次连接错误 [{report_type}]:{e}") + except Exception as e: + print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送异常 [{report_type}]:{e}") + + # 判断整体发送是否成功 + if success_count == total_batches: + print(f"{log_prefix}所有 {total_batches} 批次发送完成 [{report_type}]") + return True + elif success_count > 0: + print(f"{log_prefix}部分发送成功:{success_count}/{total_batches} 批次 [{report_type}]") + return True # 部分成功也视为成功 + else: + print(f"{log_prefix}发送完全失败 [{report_type}]") + return False + + +def send_to_bark( + bark_url: str, + report_data: Dict, + report_type: str, + update_info: Optional[Dict] = None, + proxy_url: Optional[str] = None, + mode: str = "daily", + account_label: str = "", + *, + batch_size: int = 3600, + batch_interval: float = 1.0, + split_content_func: Callable = None, +) -> bool: + """ + 发送到 Bark(支持分批发送,使用 markdown 格式) + + Args: + bark_url: Bark URL(包含 device_key) + report_data: 报告数据 + report_type: 报告类型 + update_info: 更新信息(可选) + proxy_url: 代理 URL(可选) + mode: 报告模式 (daily/current) + account_label: 账号标签(多账号时显示) + batch_size: 批次大小(字节) + batch_interval: 批次发送间隔(秒) + split_content_func: 内容分批函数 + + Returns: + bool: 发送是否成功 + """ + # 日志前缀 + log_prefix = f"Bark{account_label}" if account_label else "Bark" + + proxies = None + if proxy_url: + proxies = {"http": proxy_url, "https": proxy_url} + + # 解析 Bark URL,提取 device_key 和 API 端点 + # Bark URL 格式: https://api.day.app/device_key 或 https://bark.day.app/device_key + parsed_url = urlparse(bark_url) + device_key = parsed_url.path.strip('/').split('/')[0] if parsed_url.path else None + + if not device_key: + print(f"{log_prefix} URL 格式错误,无法提取 device_key: {bark_url}") + return False + + # 构建正确的 API 端点 + api_endpoint = f"{parsed_url.scheme}://{parsed_url.netloc}/push" + + # 获取分批内容,预留批次头部空间 + header_reserve = get_max_batch_header_size("bark") + batches = split_content_func( + report_data, "bark", update_info, max_bytes=batch_size - header_reserve, mode=mode + ) + + # 统一添加批次头部(已预留空间,不会超限) + batches = add_batch_headers(batches, "bark", batch_size) + + total_batches = len(batches) + print(f"{log_prefix}消息分为 {total_batches} 批次发送 [{report_type}]") + + # 反转批次顺序,使得在Bark客户端显示时顺序正确 + # 
Bark显示最新消息在上面,所以我们从最后一批开始推送 + reversed_batches = list(reversed(batches)) + + print(f"{log_prefix}将按反向顺序推送(最后批次先推送),确保客户端显示顺序正确") + + # 逐批发送(反向顺序) + success_count = 0 + for idx, batch_content in enumerate(reversed_batches, 1): + # 计算正确的批次编号(用户视角的编号) + actual_batch_num = total_batches - idx + 1 + + content_size = len(batch_content.encode("utf-8")) + print( + f"发送{log_prefix}第 {actual_batch_num}/{total_batches} 批次(推送顺序: {idx}/{total_batches}),大小:{content_size} 字节 [{report_type}]" + ) + + # 检查消息大小(Bark使用APNs,限制4KB) + if content_size > 4096: + print( + f"警告:{log_prefix}第 {actual_batch_num}/{total_batches} 批次消息过大({content_size} 字节),可能被拒绝" + ) + + # 构建JSON payload + payload = { + "title": report_type, + "markdown": batch_content, + "device_key": device_key, + "sound": "default", + "group": "TrendRadar", + "action": "none", # 点击推送跳到 APP 不弹出弹框,方便阅读 + } + + try: + response = requests.post( + api_endpoint, + json=payload, + proxies=proxies, + timeout=30, + ) + + if response.status_code == 200: + result = response.json() + if result.get("code") == 200: + print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送成功 [{report_type}]") + success_count += 1 + # 批次间间隔 + if idx < total_batches: + time.sleep(batch_interval) + else: + print( + f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送失败 [{report_type}],错误:{result.get('message', '未知错误')}" + ) + else: + print( + f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送失败 [{report_type}],状态码:{response.status_code}" + ) + try: + print(f"错误详情:{response.text}") + except: + pass + + except requests.exceptions.ConnectTimeout: + print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次连接超时 [{report_type}]") + except requests.exceptions.ReadTimeout: + print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次读取超时 [{report_type}]") + except requests.exceptions.ConnectionError as e: + print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次连接错误 [{report_type}]:{e}") + except Exception as e: + print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送异常 [{report_type}]:{e}") + + # 判断整体发送是否成功 + if success_count == total_batches: + print(f"{log_prefix}所有 {total_batches} 批次发送完成 [{report_type}]") + return True + elif success_count > 0: + print(f"{log_prefix}部分发送成功:{success_count}/{total_batches} 批次 [{report_type}]") + return True # 部分成功也视为成功 + else: + print(f"{log_prefix}发送完全失败 [{report_type}]") + return False + + +def send_to_slack( + webhook_url: str, + report_data: Dict, + report_type: str, + update_info: Optional[Dict] = None, + proxy_url: Optional[str] = None, + mode: str = "daily", + account_label: str = "", + *, + batch_size: int = 4000, + batch_interval: float = 1.0, + split_content_func: Callable = None, +) -> bool: + """ + 发送到 Slack(支持分批发送,使用 mrkdwn 格式) + + Args: + webhook_url: Slack Webhook URL + report_data: 报告数据 + report_type: 报告类型 + update_info: 更新信息(可选) + proxy_url: 代理 URL(可选) + mode: 报告模式 (daily/current) + account_label: 账号标签(多账号时显示) + batch_size: 批次大小(字节) + batch_interval: 批次发送间隔(秒) + split_content_func: 内容分批函数 + + Returns: + bool: 发送是否成功 + """ + headers = {"Content-Type": "application/json"} + proxies = None + if proxy_url: + proxies = {"http": proxy_url, "https": proxy_url} + + # 日志前缀 + log_prefix = f"Slack{account_label}" if account_label else "Slack" + + # 获取分批内容,预留批次头部空间 + header_reserve = get_max_batch_header_size("slack") + batches = split_content_func( + report_data, "slack", update_info, max_bytes=batch_size - header_reserve, mode=mode + ) + + # 统一添加批次头部(已预留空间,不会超限) + batches = add_batch_headers(batches, "slack", batch_size) + 
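+    # 所有 send_to_* 函数共用同一套分批流程:先由注入的 split_content_func
+    # (通常是 splitter.split_content_into_batches)按平台字节上限切分内容,
+    # 再用 add_batch_headers 统一补上批次标识,最后逐批推送并按 batch_interval 间隔。
+    # 调用方接入示意(参数取值仅为示例,以仓库中实际调用处为准):
+    #   send_to_slack(
+    #       webhook_url, report_data, "当日汇总",
+    #       split_content_func=split_content_into_batches,
+    #   )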
+ print(f"{log_prefix}消息分为 {len(batches)} 批次发送 [{report_type}]") + + # 逐批发送 + for i, batch_content in enumerate(batches, 1): + # 转换 Markdown 到 mrkdwn 格式 + mrkdwn_content = convert_markdown_to_mrkdwn(batch_content) + + content_size = len(mrkdwn_content.encode("utf-8")) + print( + f"发送{log_prefix}第 {i}/{len(batches)} 批次,大小:{content_size} 字节 [{report_type}]" + ) + + # 构建 Slack payload(使用简单的 text 字段,支持 mrkdwn) + payload = {"text": mrkdwn_content} + + try: + response = requests.post( + webhook_url, headers=headers, json=payload, proxies=proxies, timeout=30 + ) + + # Slack Incoming Webhooks 成功时返回 "ok" 文本 + if response.status_code == 200 and response.text == "ok": + print(f"{log_prefix}第 {i}/{len(batches)} 批次发送成功 [{report_type}]") + # 批次间间隔 + if i < len(batches): + time.sleep(batch_interval) + else: + error_msg = response.text if response.text else f"状态码:{response.status_code}" + print( + f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{error_msg}" + ) + return False + except Exception as e: + print(f"{log_prefix}第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}") + return False + + print(f"{log_prefix}所有 {len(batches)} 批次发送完成 [{report_type}]") + return True diff --git a/trendradar/notification/splitter.py b/trendradar/notification/splitter.py new file mode 100644 index 0000000..f137e3d --- /dev/null +++ b/trendradar/notification/splitter.py @@ -0,0 +1,580 @@ +# coding=utf-8 +""" +消息分批处理模块 + +提供消息内容分批拆分功能,确保消息大小不超过各平台限制 +""" + +from datetime import datetime +from typing import Dict, List, Optional, Callable + +from trendradar.report.formatter import format_title_for_platform + + +# 默认批次大小配置 +DEFAULT_BATCH_SIZES = { + "dingtalk": 20000, + "feishu": 29000, + "ntfy": 3800, + "default": 4000, +} + + +def split_content_into_batches( + report_data: Dict, + format_type: str, + update_info: Optional[Dict] = None, + max_bytes: Optional[int] = None, + mode: str = "daily", + batch_sizes: Optional[Dict[str, int]] = None, + feishu_separator: str = "---", + reverse_content_order: bool = False, + get_time_func: Optional[Callable[[], datetime]] = None, +) -> List[str]: + """分批处理消息内容,确保词组标题+至少第一条新闻的完整性 + + Args: + report_data: 报告数据字典,包含 stats, new_titles, failed_ids, total_new_count + format_type: 格式类型 (feishu, dingtalk, wework, telegram, ntfy, bark, slack) + update_info: 版本更新信息(可选) + max_bytes: 最大字节数(可选,如果不指定则使用默认配置) + mode: 报告模式 (daily, incremental, current) + batch_sizes: 批次大小配置字典(可选) + feishu_separator: 飞书消息分隔符 + reverse_content_order: 是否反转内容顺序(新增在前) + get_time_func: 获取当前时间的函数(可选) + + Returns: + 分批后的消息内容列表 + """ + # 合并批次大小配置 + sizes = {**DEFAULT_BATCH_SIZES, **(batch_sizes or {})} + + if max_bytes is None: + if format_type == "dingtalk": + max_bytes = sizes.get("dingtalk", 20000) + elif format_type == "feishu": + max_bytes = sizes.get("feishu", 29000) + elif format_type == "ntfy": + max_bytes = sizes.get("ntfy", 3800) + else: + max_bytes = sizes.get("default", 4000) + + batches = [] + + total_titles = sum( + len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0 + ) + now = get_time_func() if get_time_func else datetime.now() + + base_header = "" + if format_type in ("wework", "bark"): + base_header = f"**总新闻数:** {total_titles}\n\n\n\n" + elif format_type == "telegram": + base_header = f"总新闻数: {total_titles}\n\n" + elif format_type == "ntfy": + base_header = f"**总新闻数:** {total_titles}\n\n" + elif format_type == "feishu": + base_header = "" + elif format_type == "dingtalk": + base_header = f"**总新闻数:** {total_titles}\n\n" + base_header += f"**时间:** {now.strftime('%Y-%m-%d 
%H:%M:%S')}\n\n" + base_header += f"**类型:** 热点分析报告\n\n" + base_header += "---\n\n" + elif format_type == "slack": + base_header = f"*总新闻数:* {total_titles}\n\n" + + base_footer = "" + if format_type in ("wework", "bark"): + base_footer = f"\n\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}" + if update_info: + base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**" + elif format_type == "telegram": + base_footer = f"\n\n更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}" + if update_info: + base_footer += f"\nTrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}" + elif format_type == "ntfy": + base_footer = f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}" + if update_info: + base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**" + elif format_type == "feishu": + base_footer = f"\n\n更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}" + if update_info: + base_footer += f"\nTrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}" + elif format_type == "dingtalk": + base_footer = f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}" + if update_info: + base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**" + elif format_type == "slack": + base_footer = f"\n\n_更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}_" + if update_info: + base_footer += f"\n_TrendRadar 发现新版本 *{update_info['remote_version']}*,当前 *{update_info['current_version']}_" + + stats_header = "" + if report_data["stats"]: + if format_type in ("wework", "bark"): + stats_header = f"📊 **热点词汇统计**\n\n" + elif format_type == "telegram": + stats_header = f"📊 热点词汇统计\n\n" + elif format_type == "ntfy": + stats_header = f"📊 **热点词汇统计**\n\n" + elif format_type == "feishu": + stats_header = f"📊 **热点词汇统计**\n\n" + elif format_type == "dingtalk": + stats_header = f"📊 **热点词汇统计**\n\n" + elif format_type == "slack": + stats_header = f"📊 *热点词汇统计*\n\n" + + current_batch = base_header + current_batch_has_content = False + + if ( + not report_data["stats"] + and not report_data["new_titles"] + and not report_data["failed_ids"] + ): + if mode == "incremental": + mode_text = "增量模式下暂无新增匹配的热点词汇" + elif mode == "current": + mode_text = "当前榜单模式下暂无匹配的热点词汇" + else: + mode_text = "暂无匹配的热点词汇" + simple_content = f"📭 {mode_text}\n\n" + final_content = base_header + simple_content + base_footer + batches.append(final_content) + return batches + + # 定义处理热点词汇统计的函数 + def process_stats_section(current_batch, current_batch_has_content, batches): + """处理热点词汇统计""" + if not report_data["stats"]: + return current_batch, current_batch_has_content, batches + + total_count = len(report_data["stats"]) + + # 添加统计标题 + test_content = current_batch + stats_header + if ( + len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) + < max_bytes + ): + current_batch = test_content + current_batch_has_content = True + else: + if current_batch_has_content: + batches.append(current_batch + base_footer) + current_batch = base_header + stats_header + current_batch_has_content = True + + # 逐个处理词组(确保词组标题+第一条新闻的原子性) + for i, stat in enumerate(report_data["stats"]): + word = stat["word"] + count = stat["count"] + sequence_display = f"[{i + 1}/{total_count}]" + + # 构建词组标题 + word_header = "" + if format_type in ("wework", "bark"): + if count >= 10: + word_header = ( + f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n" + ) + elif count >= 5: + word_header = ( + f"📈 
{sequence_display} **{word}** : **{count}** 条\n\n" + ) + else: + word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n" + elif format_type == "telegram": + if count >= 10: + word_header = f"🔥 {sequence_display} {word} : {count} 条\n\n" + elif count >= 5: + word_header = f"📈 {sequence_display} {word} : {count} 条\n\n" + else: + word_header = f"📌 {sequence_display} {word} : {count} 条\n\n" + elif format_type == "ntfy": + if count >= 10: + word_header = ( + f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n" + ) + elif count >= 5: + word_header = ( + f"📈 {sequence_display} **{word}** : **{count}** 条\n\n" + ) + else: + word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n" + elif format_type == "feishu": + if count >= 10: + word_header = f"🔥 {sequence_display} **{word}** : {count} 条\n\n" + elif count >= 5: + word_header = f"📈 {sequence_display} **{word}** : {count} 条\n\n" + else: + word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n" + elif format_type == "dingtalk": + if count >= 10: + word_header = ( + f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n" + ) + elif count >= 5: + word_header = ( + f"📈 {sequence_display} **{word}** : **{count}** 条\n\n" + ) + else: + word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n" + elif format_type == "slack": + if count >= 10: + word_header = ( + f"🔥 {sequence_display} *{word}* : *{count}* 条\n\n" + ) + elif count >= 5: + word_header = ( + f"📈 {sequence_display} *{word}* : *{count}* 条\n\n" + ) + else: + word_header = f"📌 {sequence_display} *{word}* : {count} 条\n\n" + + # 构建第一条新闻 + first_news_line = "" + if stat["titles"]: + first_title_data = stat["titles"][0] + if format_type in ("wework", "bark"): + formatted_title = format_title_for_platform( + "wework", first_title_data, show_source=True + ) + elif format_type == "telegram": + formatted_title = format_title_for_platform( + "telegram", first_title_data, show_source=True + ) + elif format_type == "ntfy": + formatted_title = format_title_for_platform( + "ntfy", first_title_data, show_source=True + ) + elif format_type == "feishu": + formatted_title = format_title_for_platform( + "feishu", first_title_data, show_source=True + ) + elif format_type == "dingtalk": + formatted_title = format_title_for_platform( + "dingtalk", first_title_data, show_source=True + ) + elif format_type == "slack": + formatted_title = format_title_for_platform( + "slack", first_title_data, show_source=True + ) + else: + formatted_title = f"{first_title_data['title']}" + + first_news_line = f" 1. 
{formatted_title}\n" + if len(stat["titles"]) > 1: + first_news_line += "\n" + + # 原子性检查:词组标题+第一条新闻必须一起处理 + word_with_first_news = word_header + first_news_line + test_content = current_batch + word_with_first_news + + if ( + len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) + >= max_bytes + ): + # 当前批次容纳不下,开启新批次 + if current_batch_has_content: + batches.append(current_batch + base_footer) + current_batch = base_header + stats_header + word_with_first_news + current_batch_has_content = True + start_index = 1 + else: + current_batch = test_content + current_batch_has_content = True + start_index = 1 + + # 处理剩余新闻条目 + for j in range(start_index, len(stat["titles"])): + title_data = stat["titles"][j] + if format_type in ("wework", "bark"): + formatted_title = format_title_for_platform( + "wework", title_data, show_source=True + ) + elif format_type == "telegram": + formatted_title = format_title_for_platform( + "telegram", title_data, show_source=True + ) + elif format_type == "ntfy": + formatted_title = format_title_for_platform( + "ntfy", title_data, show_source=True + ) + elif format_type == "feishu": + formatted_title = format_title_for_platform( + "feishu", title_data, show_source=True + ) + elif format_type == "dingtalk": + formatted_title = format_title_for_platform( + "dingtalk", title_data, show_source=True + ) + elif format_type == "slack": + formatted_title = format_title_for_platform( + "slack", title_data, show_source=True + ) + else: + formatted_title = f"{title_data['title']}" + + news_line = f" {j + 1}. {formatted_title}\n" + if j < len(stat["titles"]) - 1: + news_line += "\n" + + test_content = current_batch + news_line + if ( + len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) + >= max_bytes + ): + if current_batch_has_content: + batches.append(current_batch + base_footer) + current_batch = base_header + stats_header + word_header + news_line + current_batch_has_content = True + else: + current_batch = test_content + current_batch_has_content = True + + # 词组间分隔符 + if i < len(report_data["stats"]) - 1: + separator = "" + if format_type in ("wework", "bark"): + separator = f"\n\n\n\n" + elif format_type == "telegram": + separator = f"\n\n" + elif format_type == "ntfy": + separator = f"\n\n" + elif format_type == "feishu": + separator = f"\n{feishu_separator}\n\n" + elif format_type == "dingtalk": + separator = f"\n---\n\n" + elif format_type == "slack": + separator = f"\n\n" + + test_content = current_batch + separator + if ( + len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) + < max_bytes + ): + current_batch = test_content + + return current_batch, current_batch_has_content, batches + + # 定义处理新增新闻的函数 + def process_new_titles_section(current_batch, current_batch_has_content, batches): + """处理新增新闻""" + if not report_data["new_titles"]: + return current_batch, current_batch_has_content, batches + + new_header = "" + if format_type in ("wework", "bark"): + new_header = f"\n\n\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n" + elif format_type == "telegram": + new_header = ( + f"\n\n🆕 本次新增热点新闻 (共 {report_data['total_new_count']} 条)\n\n" + ) + elif format_type == "ntfy": + new_header = f"\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n" + elif format_type == "feishu": + new_header = f"\n{feishu_separator}\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n" + elif format_type == "dingtalk": + new_header = f"\n---\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n" + elif 
format_type == "slack": + new_header = f"\n\n🆕 *本次新增热点新闻* (共 {report_data['total_new_count']} 条)\n\n" + + test_content = current_batch + new_header + if ( + len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) + >= max_bytes + ): + if current_batch_has_content: + batches.append(current_batch + base_footer) + current_batch = base_header + new_header + current_batch_has_content = True + else: + current_batch = test_content + current_batch_has_content = True + + # 逐个处理新增新闻来源 + for source_data in report_data["new_titles"]: + source_header = "" + if format_type in ("wework", "bark"): + source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n" + elif format_type == "telegram": + source_header = f"{source_data['source_name']} ({len(source_data['titles'])} 条):\n\n" + elif format_type == "ntfy": + source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n" + elif format_type == "feishu": + source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n" + elif format_type == "dingtalk": + source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n" + elif format_type == "slack": + source_header = f"*{source_data['source_name']}* ({len(source_data['titles'])} 条):\n\n" + + # 构建第一条新增新闻 + first_news_line = "" + if source_data["titles"]: + first_title_data = source_data["titles"][0] + title_data_copy = first_title_data.copy() + title_data_copy["is_new"] = False + + if format_type in ("wework", "bark"): + formatted_title = format_title_for_platform( + "wework", title_data_copy, show_source=False + ) + elif format_type == "telegram": + formatted_title = format_title_for_platform( + "telegram", title_data_copy, show_source=False + ) + elif format_type == "feishu": + formatted_title = format_title_for_platform( + "feishu", title_data_copy, show_source=False + ) + elif format_type == "dingtalk": + formatted_title = format_title_for_platform( + "dingtalk", title_data_copy, show_source=False + ) + elif format_type == "slack": + formatted_title = format_title_for_platform( + "slack", title_data_copy, show_source=False + ) + else: + formatted_title = f"{title_data_copy['title']}" + + first_news_line = f" 1. 
{formatted_title}\n" + + # 原子性检查:来源标题+第一条新闻 + source_with_first_news = source_header + first_news_line + test_content = current_batch + source_with_first_news + + if ( + len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) + >= max_bytes + ): + if current_batch_has_content: + batches.append(current_batch + base_footer) + current_batch = base_header + new_header + source_with_first_news + current_batch_has_content = True + start_index = 1 + else: + current_batch = test_content + current_batch_has_content = True + start_index = 1 + + # 处理剩余新增新闻 + for j in range(start_index, len(source_data["titles"])): + title_data = source_data["titles"][j] + title_data_copy = title_data.copy() + title_data_copy["is_new"] = False + + if format_type == "wework": + formatted_title = format_title_for_platform( + "wework", title_data_copy, show_source=False + ) + elif format_type == "telegram": + formatted_title = format_title_for_platform( + "telegram", title_data_copy, show_source=False + ) + elif format_type == "feishu": + formatted_title = format_title_for_platform( + "feishu", title_data_copy, show_source=False + ) + elif format_type == "dingtalk": + formatted_title = format_title_for_platform( + "dingtalk", title_data_copy, show_source=False + ) + elif format_type == "slack": + formatted_title = format_title_for_platform( + "slack", title_data_copy, show_source=False + ) + else: + formatted_title = f"{title_data_copy['title']}" + + news_line = f" {j + 1}. {formatted_title}\n" + + test_content = current_batch + news_line + if ( + len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) + >= max_bytes + ): + if current_batch_has_content: + batches.append(current_batch + base_footer) + current_batch = base_header + new_header + source_header + news_line + current_batch_has_content = True + else: + current_batch = test_content + current_batch_has_content = True + + current_batch += "\n" + + return current_batch, current_batch_has_content, batches + + # 根据配置决定处理顺序 + if reverse_content_order: + # 新增热点在前,热点词汇统计在后 + current_batch, current_batch_has_content, batches = process_new_titles_section( + current_batch, current_batch_has_content, batches + ) + current_batch, current_batch_has_content, batches = process_stats_section( + current_batch, current_batch_has_content, batches + ) + else: + # 默认:热点词汇统计在前,新增热点在后 + current_batch, current_batch_has_content, batches = process_stats_section( + current_batch, current_batch_has_content, batches + ) + current_batch, current_batch_has_content, batches = process_new_titles_section( + current_batch, current_batch_has_content, batches + ) + + if report_data["failed_ids"]: + failed_header = "" + if format_type == "wework": + failed_header = f"\n\n\n\n⚠️ **数据获取失败的平台:**\n\n" + elif format_type == "telegram": + failed_header = f"\n\n⚠️ 数据获取失败的平台:\n\n" + elif format_type == "ntfy": + failed_header = f"\n\n⚠️ **数据获取失败的平台:**\n\n" + elif format_type == "feishu": + failed_header = f"\n{feishu_separator}\n\n⚠️ **数据获取失败的平台:**\n\n" + elif format_type == "dingtalk": + failed_header = f"\n---\n\n⚠️ **数据获取失败的平台:**\n\n" + + test_content = current_batch + failed_header + if ( + len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) + >= max_bytes + ): + if current_batch_has_content: + batches.append(current_batch + base_footer) + current_batch = base_header + failed_header + current_batch_has_content = True + else: + current_batch = test_content + current_batch_has_content = True + + for i, id_value in enumerate(report_data["failed_ids"], 1): + if 
format_type == "feishu": + failed_line = f" • {id_value}\n" + elif format_type == "dingtalk": + failed_line = f" • **{id_value}**\n" + else: + failed_line = f" • {id_value}\n" + + test_content = current_batch + failed_line + if ( + len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) + >= max_bytes + ): + if current_batch_has_content: + batches.append(current_batch + base_footer) + current_batch = base_header + failed_header + failed_line + current_batch_has_content = True + else: + current_batch = test_content + current_batch_has_content = True + + # 完成最后批次 + if current_batch_has_content: + batches.append(current_batch + base_footer) + + return batches diff --git a/trendradar/report/__init__.py b/trendradar/report/__init__.py new file mode 100644 index 0000000..f9bc0f0 --- /dev/null +++ b/trendradar/report/__init__.py @@ -0,0 +1,40 @@ +# coding=utf-8 +""" +报告生成模块 + +提供报告生成和格式化功能,包括: +- HTML 报告生成 +- 标题格式化工具 + +模块结构: +- helpers: 报告辅助函数(清理、转义、格式化) +- formatter: 平台标题格式化 +- html: HTML 报告渲染 +- generator: 报告生成器 +""" + +from trendradar.report.helpers import ( + clean_title, + html_escape, + format_rank_display, +) +from trendradar.report.formatter import format_title_for_platform +from trendradar.report.html import render_html_content +from trendradar.report.generator import ( + prepare_report_data, + generate_html_report, +) + +__all__ = [ + # 辅助函数 + "clean_title", + "html_escape", + "format_rank_display", + # 格式化函数 + "format_title_for_platform", + # HTML 渲染 + "render_html_content", + # 报告生成器 + "prepare_report_data", + "generate_html_report", +] diff --git a/trendradar/report/formatter.py b/trendradar/report/formatter.py new file mode 100644 index 0000000..e634d07 --- /dev/null +++ b/trendradar/report/formatter.py @@ -0,0 +1,223 @@ +# coding=utf-8 +""" +平台标题格式化模块 + +提供多平台标题格式化功能 +""" + +from typing import Dict + +from trendradar.report.helpers import clean_title, html_escape, format_rank_display + + +def format_title_for_platform( + platform: str, title_data: Dict, show_source: bool = True +) -> str: + """统一的标题格式化方法 + + 为不同平台生成对应格式的标题字符串。 + + Args: + platform: 目标平台,支持: + - "feishu": 飞书 + - "dingtalk": 钉钉 + - "wework": 企业微信 + - "bark": Bark + - "telegram": Telegram + - "ntfy": ntfy + - "slack": Slack + - "html": HTML 报告 + title_data: 标题数据字典,包含以下字段: + - title: 标题文本 + - source_name: 来源名称 + - time_display: 时间显示 + - count: 出现次数 + - ranks: 排名列表 + - rank_threshold: 高亮阈值 + - url: PC端链接 + - mobile_url: 移动端链接(优先使用) + - is_new: 是否为新增标题(可选) + show_source: 是否显示来源名称 + + Returns: + 格式化后的标题字符串 + """ + rank_display = format_rank_display( + title_data["ranks"], title_data["rank_threshold"], platform + ) + + link_url = title_data["mobile_url"] or title_data["url"] + cleaned_title = clean_title(title_data["title"]) + + if platform == "feishu": + if link_url: + formatted_title = f"[{cleaned_title}]({link_url})" + else: + formatted_title = cleaned_title + + title_prefix = "🆕 " if title_data.get("is_new") else "" + + if show_source: + result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}" + else: + result = f"{title_prefix}{formatted_title}" + + if rank_display: + result += f" {rank_display}" + if title_data["time_display"]: + result += f" - {title_data['time_display']}" + if title_data["count"] > 1: + result += f" ({title_data['count']}次)" + + return result + + elif platform == "dingtalk": + if link_url: + formatted_title = f"[{cleaned_title}]({link_url})" + else: + formatted_title = cleaned_title + + title_prefix = "🆕 " if title_data.get("is_new") else "" + + if show_source: + 
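+        # 组装后的单条结果形如(示意):
+        #   [来源名] 🆕 [标题文本](链接) **[最小名次 - 最大名次]** - 时间显示 (出现次数次)
+        # 其中 🆕 前缀、名次区间、时间与次数均为可选,取决于 title_data 的实际字段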
result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}" + else: + result = f"{title_prefix}{formatted_title}" + + if rank_display: + result += f" {rank_display}" + if title_data["time_display"]: + result += f" - {title_data['time_display']}" + if title_data["count"] > 1: + result += f" ({title_data['count']}次)" + + return result + + elif platform in ("wework", "bark"): + # WeWork 和 Bark 使用 markdown 格式 + if link_url: + formatted_title = f"[{cleaned_title}]({link_url})" + else: + formatted_title = cleaned_title + + title_prefix = "🆕 " if title_data.get("is_new") else "" + + if show_source: + result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}" + else: + result = f"{title_prefix}{formatted_title}" + + if rank_display: + result += f" {rank_display}" + if title_data["time_display"]: + result += f" - {title_data['time_display']}" + if title_data["count"] > 1: + result += f" ({title_data['count']}次)" + + return result + + elif platform == "telegram": + if link_url: + formatted_title = f'{html_escape(cleaned_title)}' + else: + formatted_title = cleaned_title + + title_prefix = "🆕 " if title_data.get("is_new") else "" + + if show_source: + result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}" + else: + result = f"{title_prefix}{formatted_title}" + + if rank_display: + result += f" {rank_display}" + if title_data["time_display"]: + result += f" - {title_data['time_display']}" + if title_data["count"] > 1: + result += f" ({title_data['count']}次)" + + return result + + elif platform == "ntfy": + if link_url: + formatted_title = f"[{cleaned_title}]({link_url})" + else: + formatted_title = cleaned_title + + title_prefix = "🆕 " if title_data.get("is_new") else "" + + if show_source: + result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}" + else: + result = f"{title_prefix}{formatted_title}" + + if rank_display: + result += f" {rank_display}" + if title_data["time_display"]: + result += f" `- {title_data['time_display']}`" + if title_data["count"] > 1: + result += f" `({title_data['count']}次)`" + + return result + + elif platform == "slack": + # Slack 使用 mrkdwn 格式 + if link_url: + # Slack 链接格式: + formatted_title = f"<{link_url}|{cleaned_title}>" + else: + formatted_title = cleaned_title + + title_prefix = "🆕 " if title_data.get("is_new") else "" + + if show_source: + result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}" + else: + result = f"{title_prefix}{formatted_title}" + + # 排名(使用 * 加粗) + rank_display = format_rank_display( + title_data["ranks"], title_data["rank_threshold"], "slack" + ) + if rank_display: + result += f" {rank_display}" + if title_data["time_display"]: + result += f" `- {title_data['time_display']}`" + if title_data["count"] > 1: + result += f" `({title_data['count']}次)`" + + return result + + elif platform == "html": + rank_display = format_rank_display( + title_data["ranks"], title_data["rank_threshold"], "html" + ) + + link_url = title_data["mobile_url"] or title_data["url"] + + escaped_title = html_escape(cleaned_title) + escaped_source_name = html_escape(title_data["source_name"]) + + if link_url: + escaped_url = html_escape(link_url) + formatted_title = f'[{escaped_source_name}] {escaped_title}' + else: + formatted_title = ( + f'[{escaped_source_name}] {escaped_title}' + ) + + if rank_display: + formatted_title += f" {rank_display}" + if title_data["time_display"]: + escaped_time = html_escape(title_data["time_display"]) + formatted_title += f" - {escaped_time}" + if 
title_data["count"] > 1: + formatted_title += f" ({title_data['count']}次)" + + if title_data.get("is_new"): + formatted_title = f"🆕 {formatted_title}
" + + return formatted_title + + else: + return cleaned_title diff --git a/trendradar/report/generator.py b/trendradar/report/generator.py new file mode 100644 index 0000000..f62f417 --- /dev/null +++ b/trendradar/report/generator.py @@ -0,0 +1,235 @@ +# coding=utf-8 +""" +报告生成模块 + +提供报告数据准备和 HTML 生成功能: +- prepare_report_data: 准备报告数据 +- generate_html_report: 生成 HTML 报告 +""" + +from pathlib import Path +from typing import Dict, List, Optional, Callable + + +def prepare_report_data( + stats: List[Dict], + failed_ids: Optional[List] = None, + new_titles: Optional[Dict] = None, + id_to_name: Optional[Dict] = None, + mode: str = "daily", + rank_threshold: int = 3, + matches_word_groups_func: Optional[Callable] = None, + load_frequency_words_func: Optional[Callable] = None, +) -> Dict: + """ + 准备报告数据 + + Args: + stats: 统计结果列表 + failed_ids: 失败的 ID 列表 + new_titles: 新增标题 + id_to_name: ID 到名称的映射 + mode: 报告模式 (daily/incremental/current) + rank_threshold: 排名阈值 + matches_word_groups_func: 词组匹配函数 + load_frequency_words_func: 加载频率词函数 + + Returns: + Dict: 准备好的报告数据 + """ + processed_new_titles = [] + + # 在增量模式下隐藏新增新闻区域 + hide_new_section = mode == "incremental" + + # 只有在非隐藏模式下才处理新增新闻部分 + if not hide_new_section: + filtered_new_titles = {} + if new_titles and id_to_name: + # 如果提供了匹配函数,使用它过滤 + if matches_word_groups_func and load_frequency_words_func: + word_groups, filter_words, global_filters = load_frequency_words_func() + for source_id, titles_data in new_titles.items(): + filtered_titles = {} + for title, title_data in titles_data.items(): + if matches_word_groups_func(title, word_groups, filter_words, global_filters): + filtered_titles[title] = title_data + if filtered_titles: + filtered_new_titles[source_id] = filtered_titles + else: + # 没有匹配函数时,使用全部 + filtered_new_titles = new_titles + + # 打印过滤后的新增热点数(与推送显示一致) + original_new_count = sum(len(titles) for titles in new_titles.values()) if new_titles else 0 + filtered_new_count = sum(len(titles) for titles in filtered_new_titles.values()) if filtered_new_titles else 0 + if original_new_count > 0: + print(f"频率词过滤后:{filtered_new_count} 条新增热点匹配(原始 {original_new_count} 条)") + + if filtered_new_titles and id_to_name: + for source_id, titles_data in filtered_new_titles.items(): + source_name = id_to_name.get(source_id, source_id) + source_titles = [] + + for title, title_data in titles_data.items(): + url = title_data.get("url", "") + mobile_url = title_data.get("mobileUrl", "") + ranks = title_data.get("ranks", []) + + processed_title = { + "title": title, + "source_name": source_name, + "time_display": "", + "count": 1, + "ranks": ranks, + "rank_threshold": rank_threshold, + "url": url, + "mobile_url": mobile_url, + "is_new": True, + } + source_titles.append(processed_title) + + if source_titles: + processed_new_titles.append( + { + "source_id": source_id, + "source_name": source_name, + "titles": source_titles, + } + ) + + processed_stats = [] + for stat in stats: + if stat["count"] <= 0: + continue + + processed_titles = [] + for title_data in stat["titles"]: + processed_title = { + "title": title_data["title"], + "source_name": title_data["source_name"], + "time_display": title_data["time_display"], + "count": title_data["count"], + "ranks": title_data["ranks"], + "rank_threshold": title_data["rank_threshold"], + "url": title_data.get("url", ""), + "mobile_url": title_data.get("mobileUrl", ""), + "is_new": title_data.get("is_new", False), + } + processed_titles.append(processed_title) + + processed_stats.append( + { + "word": stat["word"], + "count": 
stat["count"], + "percentage": stat.get("percentage", 0), + "titles": processed_titles, + } + ) + + return { + "stats": processed_stats, + "new_titles": processed_new_titles, + "failed_ids": failed_ids or [], + "total_new_count": sum( + len(source["titles"]) for source in processed_new_titles + ), + } + + +def generate_html_report( + stats: List[Dict], + total_titles: int, + failed_ids: Optional[List] = None, + new_titles: Optional[Dict] = None, + id_to_name: Optional[Dict] = None, + mode: str = "daily", + is_daily_summary: bool = False, + update_info: Optional[Dict] = None, + rank_threshold: int = 3, + output_dir: str = "output", + date_folder: str = "", + time_filename: str = "", + render_html_func: Optional[Callable] = None, + matches_word_groups_func: Optional[Callable] = None, + load_frequency_words_func: Optional[Callable] = None, + enable_index_copy: bool = True, +) -> str: + """ + 生成 HTML 报告 + + Args: + stats: 统计结果列表 + total_titles: 总标题数 + failed_ids: 失败的 ID 列表 + new_titles: 新增标题 + id_to_name: ID 到名称的映射 + mode: 报告模式 (daily/incremental/current) + is_daily_summary: 是否是每日汇总 + update_info: 更新信息 + rank_threshold: 排名阈值 + output_dir: 输出目录 + date_folder: 日期文件夹名称 + time_filename: 时间文件名 + render_html_func: HTML 渲染函数 + matches_word_groups_func: 词组匹配函数 + load_frequency_words_func: 加载频率词函数 + enable_index_copy: 是否复制到 index.html + + Returns: + str: 生成的 HTML 文件路径 + """ + if is_daily_summary: + if mode == "current": + filename = "当前榜单汇总.html" + elif mode == "incremental": + filename = "当日增量.html" + else: + filename = "当日汇总.html" + else: + filename = f"{time_filename}.html" + + # 构建输出路径 + output_path = Path(output_dir) / date_folder / "html" + output_path.mkdir(parents=True, exist_ok=True) + file_path = str(output_path / filename) + + # 准备报告数据 + report_data = prepare_report_data( + stats, + failed_ids, + new_titles, + id_to_name, + mode, + rank_threshold, + matches_word_groups_func, + load_frequency_words_func, + ) + + # 渲染 HTML 内容 + if render_html_func: + html_content = render_html_func( + report_data, total_titles, is_daily_summary, mode, update_info + ) + else: + # 默认简单 HTML + html_content = f"

<html><head><title>Report</title></head><body>{report_data}</body></html>
" + + # 写入文件 + with open(file_path, "w", encoding="utf-8") as f: + f.write(html_content) + + # 如果是每日汇总且启用 index 复制 + if is_daily_summary and enable_index_copy: + # 生成到根目录(供 GitHub Pages 访问) + root_index_path = Path("index.html") + with open(root_index_path, "w", encoding="utf-8") as f: + f.write(html_content) + + # 同时生成到 output 目录(供 Docker Volume 挂载访问) + output_index_path = Path(output_dir) / "index.html" + Path(output_dir).mkdir(parents=True, exist_ok=True) + with open(output_index_path, "w", encoding="utf-8") as f: + f.write(html_content) + + return file_path diff --git a/trendradar/report/helpers.py b/trendradar/report/helpers.py new file mode 100644 index 0000000..1142eaa --- /dev/null +++ b/trendradar/report/helpers.py @@ -0,0 +1,125 @@ +# coding=utf-8 +""" +报告辅助函数模块 + +提供报告生成相关的通用辅助函数 +""" + +import re +from typing import List + + +def clean_title(title: str) -> str: + """清理标题中的特殊字符 + + 清理规则: + - 将换行符(\n, \r)替换为空格 + - 将多个连续空白字符合并为单个空格 + - 去除首尾空白 + + Args: + title: 原始标题字符串 + + Returns: + 清理后的标题字符串 + """ + if not isinstance(title, str): + title = str(title) + cleaned_title = title.replace("\n", " ").replace("\r", " ") + cleaned_title = re.sub(r"\s+", " ", cleaned_title) + cleaned_title = cleaned_title.strip() + return cleaned_title + + +def html_escape(text: str) -> str: + """HTML特殊字符转义 + + 转义规则(按顺序): + - & → & + - < → < + - > → > + - " → " + - ' → ' + + Args: + text: 原始文本 + + Returns: + 转义后的文本 + """ + if not isinstance(text, str): + text = str(text) + + return ( + text.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace('"', """) + .replace("'", "'") + ) + + +def format_rank_display(ranks: List[int], rank_threshold: int, format_type: str) -> str: + """格式化排名显示 + + 根据不同平台类型生成对应格式的排名字符串。 + 当最小排名小于等于阈值时,使用高亮格式。 + + Args: + ranks: 排名列表(可能包含重复值) + rank_threshold: 高亮阈值,小于等于此值的排名会高亮显示 + format_type: 平台类型,支持: + - "html": HTML格式 + - "feishu": 飞书格式 + - "dingtalk": 钉钉格式 + - "wework": 企业微信格式 + - "telegram": Telegram格式 + - "slack": Slack格式 + - 其他: 默认markdown格式 + + Returns: + 格式化后的排名字符串,如 "[1]" 或 "[1 - 5]" + 如果排名列表为空,返回空字符串 + """ + if not ranks: + return "" + + unique_ranks = sorted(set(ranks)) + min_rank = unique_ranks[0] + max_rank = unique_ranks[-1] + + # 根据平台类型选择高亮格式 + if format_type == "html": + highlight_start = "" + highlight_end = "" + elif format_type == "feishu": + highlight_start = "**" + highlight_end = "**" + elif format_type == "dingtalk": + highlight_start = "**" + highlight_end = "**" + elif format_type == "wework": + highlight_start = "**" + highlight_end = "**" + elif format_type == "telegram": + highlight_start = "" + highlight_end = "" + elif format_type == "slack": + highlight_start = "*" + highlight_end = "*" + else: + # 默认 markdown 格式 + highlight_start = "**" + highlight_end = "**" + + # 生成排名显示 + if min_rank <= rank_threshold: + if min_rank == max_rank: + return f"{highlight_start}[{min_rank}]{highlight_end}" + else: + return f"{highlight_start}[{min_rank} - {max_rank}]{highlight_end}" + else: + if min_rank == max_rank: + return f"[{min_rank}]" + else: + return f"[{min_rank} - {max_rank}]" diff --git a/trendradar/report/html.py b/trendradar/report/html.py new file mode 100644 index 0000000..e69216e --- /dev/null +++ b/trendradar/report/html.py @@ -0,0 +1,1050 @@ +# coding=utf-8 +""" +HTML 报告渲染模块 + +提供 HTML 格式的热点新闻报告生成功能 +""" + +from datetime import datetime +from typing import Dict, Optional, Callable + +from trendradar.report.helpers import html_escape + + +def render_html_content( + report_data: Dict, + total_titles: int, + is_daily_summary: bool = False, 
+ mode: str = "daily", + update_info: Optional[Dict] = None, + *, + reverse_content_order: bool = False, + get_time_func: Optional[Callable[[], datetime]] = None, +) -> str: + """渲染HTML内容 + + Args: + report_data: 报告数据字典,包含 stats, new_titles, failed_ids, total_new_count + total_titles: 新闻总数 + is_daily_summary: 是否为当日汇总 + mode: 报告模式 ("daily", "current", "incremental") + update_info: 更新信息(可选) + reverse_content_order: 是否反转内容顺序(新增热点在前) + get_time_func: 获取当前时间的函数(可选,默认使用 datetime.now) + + Returns: + 渲染后的 HTML 字符串 + """ + html = """ + + + + + + 热点新闻分析 + + + + +
+
+
+ + +
+
热点新闻分析
+
+
+ 报告类型 + """ + + # 处理报告类型显示 + if is_daily_summary: + if mode == "current": + html += "当前榜单" + elif mode == "incremental": + html += "增量模式" + else: + html += "当日汇总" + else: + html += "实时分析" + + html += """ +
+
+ 新闻总数 + """ + + html += f"{total_titles} 条" + + # 计算筛选后的热点新闻数量 + hot_news_count = sum(len(stat["titles"]) for stat in report_data["stats"]) + + html += """ +
+
+ 热点新闻 + """ + + html += f"{hot_news_count} 条" + + html += """ +
+
+ 生成时间 + """ + + # 使用提供的时间函数或默认 datetime.now + if get_time_func: + now = get_time_func() + else: + now = datetime.now() + html += now.strftime("%m-%d %H:%M") + + html += """ +
+
+
+ +
""" + + # 处理失败ID错误信息 + if report_data["failed_ids"]: + html += """ +
+
⚠️ 请求失败的平台
+
    """ + for id_value in report_data["failed_ids"]: + html += f'
  • {html_escape(id_value)}
  • ' + html += """ +
+
""" + + # 生成热点词汇统计部分的HTML + stats_html = "" + if report_data["stats"]: + total_count = len(report_data["stats"]) + + for i, stat in enumerate(report_data["stats"], 1): + count = stat["count"] + + # 确定热度等级 + if count >= 10: + count_class = "hot" + elif count >= 5: + count_class = "warm" + else: + count_class = "" + + escaped_word = html_escape(stat["word"]) + + stats_html += f""" +
+
+
+
{escaped_word}
+
{count} 条
+
+
{i}/{total_count}
+
""" + + # 处理每个词组下的新闻标题,给每条新闻标上序号 + for j, title_data in enumerate(stat["titles"], 1): + is_new = title_data.get("is_new", False) + new_class = "new" if is_new else "" + + stats_html += f""" +
+
{j}
+
+
+ {html_escape(title_data["source_name"])}""" + + # 处理排名显示 + ranks = title_data.get("ranks", []) + if ranks: + min_rank = min(ranks) + max_rank = max(ranks) + rank_threshold = title_data.get("rank_threshold", 10) + + # 确定排名等级 + if min_rank <= 3: + rank_class = "top" + elif min_rank <= rank_threshold: + rank_class = "high" + else: + rank_class = "" + + if min_rank == max_rank: + rank_text = str(min_rank) + else: + rank_text = f"{min_rank}-{max_rank}" + + stats_html += f'{rank_text}' + + # 处理时间显示 + time_display = title_data.get("time_display", "") + if time_display: + # 简化时间显示格式,将波浪线替换为~ + simplified_time = ( + time_display.replace(" ~ ", "~") + .replace("[", "") + .replace("]", "") + ) + stats_html += ( + f'{html_escape(simplified_time)}' + ) + + # 处理出现次数 + count_info = title_data.get("count", 1) + if count_info > 1: + stats_html += f'{count_info}次' + + stats_html += """ +
+
""" + + # 处理标题和链接 + escaped_title = html_escape(title_data["title"]) + link_url = title_data.get("mobile_url") or title_data.get("url", "") + + if link_url: + escaped_url = html_escape(link_url) + stats_html += f'{escaped_title}' + else: + stats_html += escaped_title + + stats_html += """ +
+
+
""" + + stats_html += """ +
""" + + # 生成新增新闻区域的HTML + new_titles_html = "" + if report_data["new_titles"]: + new_titles_html += f""" +
+
本次新增热点 (共 {report_data['total_new_count']} 条)
""" + + for source_data in report_data["new_titles"]: + escaped_source = html_escape(source_data["source_name"]) + titles_count = len(source_data["titles"]) + + new_titles_html += f""" +
+
{escaped_source} · {titles_count}条
""" + + # 为新增新闻也添加序号 + for idx, title_data in enumerate(source_data["titles"], 1): + ranks = title_data.get("ranks", []) + + # 处理新增新闻的排名显示 + rank_class = "" + if ranks: + min_rank = min(ranks) + if min_rank <= 3: + rank_class = "top" + elif min_rank <= title_data.get("rank_threshold", 10): + rank_class = "high" + + if len(ranks) == 1: + rank_text = str(ranks[0]) + else: + rank_text = f"{min(ranks)}-{max(ranks)}" + else: + rank_text = "?" + + new_titles_html += f""" +
+
{idx}
+
{rank_text}
+
+
""" + + # 处理新增新闻的链接 + escaped_title = html_escape(title_data["title"]) + link_url = title_data.get("mobile_url") or title_data.get("url", "") + + if link_url: + escaped_url = html_escape(link_url) + new_titles_html += f'{escaped_title}' + else: + new_titles_html += escaped_title + + new_titles_html += """ +
+
+
""" + + new_titles_html += """ +
""" + + new_titles_html += """ +
""" + + # 根据配置决定内容顺序 + if reverse_content_order: + # 新增热点在前,热点词汇统计在后 + html += new_titles_html + stats_html + else: + # 默认:热点词汇统计在前,新增热点在后 + html += stats_html + new_titles_html + + html += """ +
+ + +
+ + + + + """ + + return html diff --git a/trendradar/storage/__init__.py b/trendradar/storage/__init__.py new file mode 100644 index 0000000..1897feb --- /dev/null +++ b/trendradar/storage/__init__.py @@ -0,0 +1,44 @@ +# coding=utf-8 +""" +存储模块 - 支持多种存储后端 + +支持的存储后端: +- local: 本地 SQLite + TXT/HTML 文件 +- remote: 远程云存储(S3 兼容协议:R2/OSS/COS/S3 等) +- auto: 根据环境自动选择(GitHub Actions 用 remote,其他用 local) +""" + +from trendradar.storage.base import ( + StorageBackend, + NewsItem, + NewsData, + convert_crawl_results_to_news_data, + convert_news_data_to_results, +) +from trendradar.storage.local import LocalStorageBackend +from trendradar.storage.manager import StorageManager, get_storage_manager + +# 远程后端可选导入(需要 boto3) +try: + from trendradar.storage.remote import RemoteStorageBackend + HAS_REMOTE = True +except ImportError: + RemoteStorageBackend = None + HAS_REMOTE = False + +__all__ = [ + # 基础类 + "StorageBackend", + "NewsItem", + "NewsData", + # 转换函数 + "convert_crawl_results_to_news_data", + "convert_news_data_to_results", + # 后端实现 + "LocalStorageBackend", + "RemoteStorageBackend", + "HAS_REMOTE", + # 管理器 + "StorageManager", + "get_storage_manager", +] diff --git a/trendradar/storage/base.py b/trendradar/storage/base.py new file mode 100644 index 0000000..94f7a1c --- /dev/null +++ b/trendradar/storage/base.py @@ -0,0 +1,457 @@ +# coding=utf-8 +""" +存储后端抽象基类和数据模型 + +定义统一的存储接口,所有存储后端都需要实现这些方法 +""" + +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from datetime import datetime +from typing import Dict, List, Optional, Any +import json + + +@dataclass +class NewsItem: + """新闻条目数据模型""" + + title: str # 新闻标题 + source_id: str # 来源平台ID(如 toutiao, baidu) + source_name: str = "" # 来源平台名称(运行时使用,数据库不存储) + rank: int = 0 # 排名 + url: str = "" # 链接 URL + mobile_url: str = "" # 移动端 URL + crawl_time: str = "" # 抓取时间(HH:MM 格式) + + # 统计信息(用于分析) + ranks: List[int] = field(default_factory=list) # 历史排名列表 + first_time: str = "" # 首次出现时间 + last_time: str = "" # 最后出现时间 + count: int = 1 # 出现次数 + + def to_dict(self) -> Dict[str, Any]: + """转换为字典""" + return { + "title": self.title, + "source_id": self.source_id, + "source_name": self.source_name, + "rank": self.rank, + "url": self.url, + "mobile_url": self.mobile_url, + "crawl_time": self.crawl_time, + "ranks": self.ranks, + "first_time": self.first_time, + "last_time": self.last_time, + "count": self.count, + } + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "NewsItem": + """从字典创建""" + return cls( + title=data.get("title", ""), + source_id=data.get("source_id", ""), + source_name=data.get("source_name", ""), + rank=data.get("rank", 0), + url=data.get("url", ""), + mobile_url=data.get("mobile_url", ""), + crawl_time=data.get("crawl_time", ""), + ranks=data.get("ranks", []), + first_time=data.get("first_time", ""), + last_time=data.get("last_time", ""), + count=data.get("count", 1), + ) + + +@dataclass +class NewsData: + """ + 新闻数据集合 + + 结构: + - date: 日期(YYYY-MM-DD) + - crawl_time: 抓取时间(HH时MM分) + - items: 按来源ID分组的新闻条目 + - id_to_name: 来源ID到名称的映射 + - failed_ids: 失败的来源ID列表 + """ + + date: str # 日期 + crawl_time: str # 抓取时间 + items: Dict[str, List[NewsItem]] # 按来源分组的新闻 + id_to_name: Dict[str, str] = field(default_factory=dict) # ID到名称映射 + failed_ids: List[str] = field(default_factory=list) # 失败的ID + + def to_dict(self) -> Dict[str, Any]: + """转换为字典""" + items_dict = {} + for source_id, news_list in self.items.items(): + items_dict[source_id] = [item.to_dict() for item in news_list] + + return { + "date": self.date, + "crawl_time": 
self.crawl_time, + "items": items_dict, + "id_to_name": self.id_to_name, + "failed_ids": self.failed_ids, + } + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "NewsData": + """从字典创建""" + items = {} + items_data = data.get("items", {}) + for source_id, news_list in items_data.items(): + items[source_id] = [NewsItem.from_dict(item) for item in news_list] + + return cls( + date=data.get("date", ""), + crawl_time=data.get("crawl_time", ""), + items=items, + id_to_name=data.get("id_to_name", {}), + failed_ids=data.get("failed_ids", []), + ) + + def get_total_count(self) -> int: + """获取新闻总数""" + return sum(len(news_list) for news_list in self.items.values()) + + def merge_with(self, other: "NewsData") -> "NewsData": + """ + 合并另一个 NewsData 到当前数据 + + 合并规则: + - 相同 source_id + title 的新闻合并排名历史 + - 更新 last_time 和 count + - 保留较早的 first_time + """ + merged_items = {} + + # 复制当前数据 + for source_id, news_list in self.items.items(): + merged_items[source_id] = {item.title: item for item in news_list} + + # 合并其他数据 + for source_id, news_list in other.items.items(): + if source_id not in merged_items: + merged_items[source_id] = {} + + for item in news_list: + if item.title in merged_items[source_id]: + # 合并已存在的新闻 + existing = merged_items[source_id][item.title] + + # 合并排名 + existing_ranks = set(existing.ranks) if existing.ranks else set() + new_ranks = set(item.ranks) if item.ranks else set() + merged_ranks = sorted(existing_ranks | new_ranks) + existing.ranks = merged_ranks + + # 更新时间 + if item.first_time and (not existing.first_time or item.first_time < existing.first_time): + existing.first_time = item.first_time + if item.last_time and (not existing.last_time or item.last_time > existing.last_time): + existing.last_time = item.last_time + + # 更新计数 + existing.count += 1 + + # 保留URL(如果原来没有) + if not existing.url and item.url: + existing.url = item.url + if not existing.mobile_url and item.mobile_url: + existing.mobile_url = item.mobile_url + else: + # 添加新新闻 + merged_items[source_id][item.title] = item + + # 转换回列表格式 + final_items = {} + for source_id, items_dict in merged_items.items(): + final_items[source_id] = list(items_dict.values()) + + # 合并 id_to_name + merged_id_to_name = {**self.id_to_name, **other.id_to_name} + + # 合并 failed_ids(去重) + merged_failed_ids = list(set(self.failed_ids + other.failed_ids)) + + return NewsData( + date=self.date or other.date, + crawl_time=other.crawl_time, # 使用较新的抓取时间 + items=final_items, + id_to_name=merged_id_to_name, + failed_ids=merged_failed_ids, + ) + + +class StorageBackend(ABC): + """ + 存储后端抽象基类 + + 所有存储后端都需要实现这些方法,以支持: + - 保存新闻数据 + - 读取当天所有数据 + - 检测新增新闻 + - 生成报告文件(TXT/HTML) + """ + + @abstractmethod + def save_news_data(self, data: NewsData) -> bool: + """ + 保存新闻数据 + + Args: + data: 新闻数据 + + Returns: + 是否保存成功 + """ + pass + + @abstractmethod + def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]: + """ + 获取指定日期的所有新闻数据 + + Args: + date: 日期字符串(YYYY-MM-DD),默认为今天 + + Returns: + 合并后的新闻数据,如果没有数据返回 None + """ + pass + + @abstractmethod + def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]: + """ + 获取最新一次抓取的数据 + + Args: + date: 日期字符串,默认为今天 + + Returns: + 最新抓取的新闻数据 + """ + pass + + @abstractmethod + def detect_new_titles(self, current_data: NewsData) -> Dict[str, Dict]: + """ + 检测新增的标题 + + Args: + current_data: 当前抓取的数据 + + Returns: + 新增的标题数据,格式: {source_id: {title: title_data}} + """ + pass + + @abstractmethod + def save_txt_snapshot(self, data: NewsData) -> Optional[str]: + """ + 保存 TXT 快照(可选功能,本地环境可用) 
+ + Args: + data: 新闻数据 + + Returns: + 保存的文件路径,如果不支持返回 None + """ + pass + + @abstractmethod + def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]: + """ + 保存 HTML 报告 + + Args: + html_content: HTML 内容 + filename: 文件名 + is_summary: 是否为汇总报告 + + Returns: + 保存的文件路径 + """ + pass + + @abstractmethod + def is_first_crawl_today(self, date: Optional[str] = None) -> bool: + """ + 检查是否是当天第一次抓取 + + Args: + date: 日期字符串,默认为今天 + + Returns: + 是否是第一次抓取 + """ + pass + + @abstractmethod + def cleanup(self) -> None: + """ + 清理资源(如临时文件、数据库连接等) + """ + pass + + @abstractmethod + def cleanup_old_data(self, retention_days: int) -> int: + """ + 清理过期数据 + + Args: + retention_days: 保留天数(0 表示不清理) + + Returns: + 删除的日期目录数量 + """ + pass + + @property + @abstractmethod + def backend_name(self) -> str: + """ + 存储后端名称 + """ + pass + + @property + @abstractmethod + def supports_txt(self) -> bool: + """ + 是否支持生成 TXT 快照 + """ + pass + + # === 推送记录相关方法 === + + @abstractmethod + def has_pushed_today(self, date: Optional[str] = None) -> bool: + """ + 检查指定日期是否已推送过 + + Args: + date: 日期字符串(YYYY-MM-DD),默认为今天 + + Returns: + 是否已推送 + """ + pass + + @abstractmethod + def record_push(self, report_type: str, date: Optional[str] = None) -> bool: + """ + 记录推送 + + Args: + report_type: 报告类型 + date: 日期字符串(YYYY-MM-DD),默认为今天 + + Returns: + 是否记录成功 + """ + pass + + +def convert_crawl_results_to_news_data( + results: Dict[str, Dict], + id_to_name: Dict[str, str], + failed_ids: List[str], + crawl_time: str, + crawl_date: str, +) -> NewsData: + """ + 将爬虫结果转换为 NewsData 格式 + + Args: + results: 爬虫返回的结果 {source_id: {title: {ranks: [], url: "", mobileUrl: ""}}} + id_to_name: 来源ID到名称的映射 + failed_ids: 失败的来源ID + crawl_time: 抓取时间(HH:MM) + crawl_date: 抓取日期(YYYY-MM-DD) + + Returns: + NewsData 对象 + """ + items = {} + + for source_id, titles_data in results.items(): + source_name = id_to_name.get(source_id, source_id) + news_list = [] + + for title, data in titles_data.items(): + if isinstance(data, dict): + ranks = data.get("ranks", []) + url = data.get("url", "") + mobile_url = data.get("mobileUrl", "") + else: + # 兼容旧格式 + ranks = data if isinstance(data, list) else [] + url = "" + mobile_url = "" + + rank = ranks[0] if ranks else 99 + + news_item = NewsItem( + title=title, + source_id=source_id, + source_name=source_name, + rank=rank, + url=url, + mobile_url=mobile_url, + crawl_time=crawl_time, + ranks=ranks, + first_time=crawl_time, + last_time=crawl_time, + count=1, + ) + news_list.append(news_item) + + items[source_id] = news_list + + return NewsData( + date=crawl_date, + crawl_time=crawl_time, + items=items, + id_to_name=id_to_name, + failed_ids=failed_ids, + ) + + +def convert_news_data_to_results(data: NewsData) -> tuple: + """ + 将 NewsData 转换回原有的 results 格式(用于兼容现有代码) + + Args: + data: NewsData 对象 + + Returns: + (results, id_to_name, title_info) 元组 + """ + results = {} + title_info = {} + + for source_id, news_list in data.items.items(): + results[source_id] = {} + title_info[source_id] = {} + + for item in news_list: + results[source_id][item.title] = { + "ranks": item.ranks, + "url": item.url, + "mobileUrl": item.mobile_url, + } + + title_info[source_id][item.title] = { + "first_time": item.first_time, + "last_time": item.last_time, + "count": item.count, + "ranks": item.ranks, + "url": item.url, + "mobileUrl": item.mobile_url, + } + + return results, data.id_to_name, title_info diff --git a/trendradar/storage/local.py b/trendradar/storage/local.py new file mode 100644 index 0000000..f0fa3eb --- 
/dev/null +++ b/trendradar/storage/local.py @@ -0,0 +1,869 @@ +# coding=utf-8 +""" +本地存储后端 - SQLite + TXT/HTML + +使用 SQLite 作为主存储,支持可选的 TXT 快照和 HTML 报告 +""" + +import sqlite3 +import os +import shutil +import pytz +import re +from datetime import datetime, timedelta +from pathlib import Path +from typing import Dict, List, Optional, Any + +from trendradar.storage.base import StorageBackend, NewsItem, NewsData +from trendradar.utils.time import ( + get_configured_time, + format_date_folder, + format_time_filename, +) + + +class LocalStorageBackend(StorageBackend): + """ + 本地存储后端 + + 使用 SQLite 数据库存储新闻数据,支持: + - 按日期组织的 SQLite 数据库文件 + - 可选的 TXT 快照(用于调试) + - HTML 报告生成 + """ + + def __init__( + self, + data_dir: str = "output", + enable_txt: bool = True, + enable_html: bool = True, + timezone: str = "Asia/Shanghai", + ): + """ + 初始化本地存储后端 + + Args: + data_dir: 数据目录路径 + enable_txt: 是否启用 TXT 快照 + enable_html: 是否启用 HTML 报告 + timezone: 时区配置(默认 Asia/Shanghai) + """ + self.data_dir = Path(data_dir) + self.enable_txt = enable_txt + self.enable_html = enable_html + self.timezone = timezone + self._db_connections: Dict[str, sqlite3.Connection] = {} + + @property + def backend_name(self) -> str: + return "local" + + @property + def supports_txt(self) -> bool: + return self.enable_txt + + def _get_configured_time(self) -> datetime: + """获取配置时区的当前时间""" + return get_configured_time(self.timezone) + + def _format_date_folder(self, date: Optional[str] = None) -> str: + """格式化日期文件夹名 (ISO 格式: YYYY-MM-DD)""" + return format_date_folder(date, self.timezone) + + def _format_time_filename(self) -> str: + """格式化时间文件名 (格式: HH-MM)""" + return format_time_filename(self.timezone) + + def _get_db_path(self, date: Optional[str] = None) -> Path: + """获取 SQLite 数据库路径""" + date_folder = self._format_date_folder(date) + db_dir = self.data_dir / date_folder + db_dir.mkdir(parents=True, exist_ok=True) + return db_dir / "news.db" + + def _get_connection(self, date: Optional[str] = None) -> sqlite3.Connection: + """获取数据库连接(带缓存)""" + db_path = str(self._get_db_path(date)) + + if db_path not in self._db_connections: + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + self._init_tables(conn) + self._db_connections[db_path] = conn + + return self._db_connections[db_path] + + def _get_schema_path(self) -> Path: + """获取 schema.sql 文件路径""" + return Path(__file__).parent / "schema.sql" + + def _init_tables(self, conn: sqlite3.Connection) -> None: + """从 schema.sql 初始化数据库表结构""" + schema_path = self._get_schema_path() + + if schema_path.exists(): + with open(schema_path, "r", encoding="utf-8") as f: + schema_sql = f.read() + conn.executescript(schema_sql) + else: + raise FileNotFoundError(f"Schema file not found: {schema_path}") + + conn.commit() + + def save_news_data(self, data: NewsData) -> bool: + """ + 保存新闻数据到 SQLite(以 URL 为唯一标识,支持标题更新检测) + + Args: + data: 新闻数据 + + Returns: + 是否保存成功 + """ + try: + conn = self._get_connection(data.date) + cursor = conn.cursor() + + # 获取配置时区的当前时间 + now_str = self._get_configured_time().strftime("%Y-%m-%d %H:%M:%S") + + # 首先同步平台信息到 platforms 表 + for source_id, source_name in data.id_to_name.items(): + cursor.execute(""" + INSERT INTO platforms (id, name, updated_at) + VALUES (?, ?, ?) 
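+ -- upsert keyed on platform id: the id stays fixed, name and updated_at are refreshed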
+ ON CONFLICT(id) DO UPDATE SET + name = excluded.name, + updated_at = excluded.updated_at + """, (source_id, source_name, now_str)) + + # 统计计数器 + new_count = 0 + updated_count = 0 + title_changed_count = 0 + success_sources = [] + + for source_id, news_list in data.items.items(): + success_sources.append(source_id) + + for item in news_list: + try: + # 检查是否已存在(通过 URL + platform_id) + if item.url: + cursor.execute(""" + SELECT id, title FROM news_items + WHERE url = ? AND platform_id = ? + """, (item.url, source_id)) + existing = cursor.fetchone() + + if existing: + # 已存在,更新记录 + existing_id, existing_title = existing + + # 检查标题是否变化 + if existing_title != item.title: + # 记录标题变更 + cursor.execute(""" + INSERT INTO title_changes + (news_item_id, old_title, new_title, changed_at) + VALUES (?, ?, ?, ?) + """, (existing_id, existing_title, item.title, now_str)) + title_changed_count += 1 + + # 记录排名历史 + cursor.execute(""" + INSERT INTO rank_history + (news_item_id, rank, crawl_time, created_at) + VALUES (?, ?, ?, ?) + """, (existing_id, item.rank, data.crawl_time, now_str)) + + # 更新现有记录 + cursor.execute(""" + UPDATE news_items SET + title = ?, + rank = ?, + mobile_url = ?, + last_crawl_time = ?, + crawl_count = crawl_count + 1, + updated_at = ? + WHERE id = ? + """, (item.title, item.rank, item.mobile_url, + data.crawl_time, now_str, existing_id)) + updated_count += 1 + else: + # 不存在,插入新记录 + cursor.execute(""" + INSERT INTO news_items + (title, platform_id, rank, url, mobile_url, + first_crawl_time, last_crawl_time, crawl_count, + created_at, updated_at) + VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?) + """, (item.title, source_id, item.rank, item.url, + item.mobile_url, data.crawl_time, data.crawl_time, + now_str, now_str)) + new_id = cursor.lastrowid + # 记录初始排名 + cursor.execute(""" + INSERT INTO rank_history + (news_item_id, rank, crawl_time, created_at) + VALUES (?, ?, ?, ?) + """, (new_id, item.rank, data.crawl_time, now_str)) + new_count += 1 + else: + # URL 为空的情况,直接插入(不做去重) + cursor.execute(""" + INSERT INTO news_items + (title, platform_id, rank, url, mobile_url, + first_crawl_time, last_crawl_time, crawl_count, + created_at, updated_at) + VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?) + """, (item.title, source_id, item.rank, item.url, + item.mobile_url, data.crawl_time, data.crawl_time, + now_str, now_str)) + new_id = cursor.lastrowid + # 记录初始排名 + cursor.execute(""" + INSERT INTO rank_history + (news_item_id, rank, crawl_time, created_at) + VALUES (?, ?, ?, ?) + """, (new_id, item.rank, data.crawl_time, now_str)) + new_count += 1 + + except sqlite3.Error as e: + print(f"保存新闻条目失败 [{item.title[:30]}...]: {e}") + + total_items = new_count + updated_count + + # 记录抓取信息 + cursor.execute(""" + INSERT OR REPLACE INTO crawl_records + (crawl_time, total_items, created_at) + VALUES (?, ?, ?) + """, (data.crawl_time, total_items, now_str)) + + # 获取刚插入的 crawl_record 的 ID + cursor.execute(""" + SELECT id FROM crawl_records WHERE crawl_time = ? + """, (data.crawl_time,)) + record_row = cursor.fetchone() + if record_row: + crawl_record_id = record_row[0] + + # 记录成功的来源 + for source_id in success_sources: + cursor.execute(""" + INSERT OR REPLACE INTO crawl_source_status + (crawl_record_id, platform_id, status) + VALUES (?, ?, 'success') + """, (crawl_record_id, source_id)) + + # 记录失败的来源 + for failed_id in data.failed_ids: + # 确保失败的平台也在 platforms 表中 + cursor.execute(""" + INSERT OR IGNORE INTO platforms (id, name, updated_at) + VALUES (?, ?, ?) 
+ """, (failed_id, failed_id, now_str)) + + cursor.execute(""" + INSERT OR REPLACE INTO crawl_source_status + (crawl_record_id, platform_id, status) + VALUES (?, ?, 'failed') + """, (crawl_record_id, failed_id)) + + conn.commit() + + # 输出详细的存储统计日志 + log_parts = [f"[本地存储] 处理完成:新增 {new_count} 条"] + if updated_count > 0: + log_parts.append(f"更新 {updated_count} 条") + if title_changed_count > 0: + log_parts.append(f"标题变更 {title_changed_count} 条") + print(",".join(log_parts)) + + return True + + except Exception as e: + print(f"[本地存储] 保存失败: {e}") + return False + + def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]: + """ + 获取指定日期的所有新闻数据(合并后) + + Args: + date: 日期字符串,默认为今天 + + Returns: + 合并后的新闻数据 + """ + try: + db_path = self._get_db_path(date) + if not db_path.exists(): + return None + + conn = self._get_connection(date) + cursor = conn.cursor() + + # 获取所有新闻数据(包含 id 用于查询排名历史) + cursor.execute(""" + SELECT n.id, n.title, n.platform_id, p.name as platform_name, + n.rank, n.url, n.mobile_url, + n.first_crawl_time, n.last_crawl_time, n.crawl_count + FROM news_items n + LEFT JOIN platforms p ON n.platform_id = p.id + ORDER BY n.platform_id, n.last_crawl_time + """) + + rows = cursor.fetchall() + if not rows: + return None + + # 收集所有 news_item_id + news_ids = [row[0] for row in rows] + + # 批量查询排名历史 + rank_history_map: Dict[int, List[int]] = {} + if news_ids: + placeholders = ",".join("?" * len(news_ids)) + cursor.execute(f""" + SELECT news_item_id, rank FROM rank_history + WHERE news_item_id IN ({placeholders}) + ORDER BY news_item_id, crawl_time + """, news_ids) + for rh_row in cursor.fetchall(): + news_id, rank = rh_row[0], rh_row[1] + if news_id not in rank_history_map: + rank_history_map[news_id] = [] + if rank not in rank_history_map[news_id]: + rank_history_map[news_id].append(rank) + + # 按 platform_id 分组 + items: Dict[str, List[NewsItem]] = {} + id_to_name: Dict[str, str] = {} + crawl_date = self._format_date_folder(date) + + for row in rows: + news_id = row[0] + platform_id = row[2] + title = row[1] + platform_name = row[3] or platform_id + + id_to_name[platform_id] = platform_name + + if platform_id not in items: + items[platform_id] = [] + + # 获取排名历史,如果没有则使用当前排名 + ranks = rank_history_map.get(news_id, [row[4]]) + + items[platform_id].append(NewsItem( + title=title, + source_id=platform_id, + source_name=platform_name, + rank=row[4], + url=row[5] or "", + mobile_url=row[6] or "", + crawl_time=row[8], # last_crawl_time + ranks=ranks, + first_time=row[7], # first_crawl_time + last_time=row[8], # last_crawl_time + count=row[9], # crawl_count + )) + + final_items = items + + # 获取失败的来源 + cursor.execute(""" + SELECT DISTINCT css.platform_id + FROM crawl_source_status css + JOIN crawl_records cr ON css.crawl_record_id = cr.id + WHERE css.status = 'failed' + """) + failed_ids = [row[0] for row in cursor.fetchall()] + + # 获取最新的抓取时间 + cursor.execute(""" + SELECT crawl_time FROM crawl_records + ORDER BY crawl_time DESC + LIMIT 1 + """) + + time_row = cursor.fetchone() + crawl_time = time_row[0] if time_row else self._format_time_filename() + + return NewsData( + date=crawl_date, + crawl_time=crawl_time, + items=final_items, + id_to_name=id_to_name, + failed_ids=failed_ids, + ) + + except Exception as e: + print(f"[本地存储] 读取数据失败: {e}") + return None + + def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]: + """ + 获取最新一次抓取的数据 + + Args: + date: 日期字符串,默认为今天 + + Returns: + 最新抓取的新闻数据 + """ + try: + db_path = self._get_db_path(date) + if not 
db_path.exists(): + return None + + conn = self._get_connection(date) + cursor = conn.cursor() + + # 获取最新的抓取时间 + cursor.execute(""" + SELECT crawl_time FROM crawl_records + ORDER BY crawl_time DESC + LIMIT 1 + """) + + time_row = cursor.fetchone() + if not time_row: + return None + + latest_time = time_row[0] + + # 获取该时间的新闻数据(包含 id 用于查询排名历史) + cursor.execute(""" + SELECT n.id, n.title, n.platform_id, p.name as platform_name, + n.rank, n.url, n.mobile_url, + n.first_crawl_time, n.last_crawl_time, n.crawl_count + FROM news_items n + LEFT JOIN platforms p ON n.platform_id = p.id + WHERE n.last_crawl_time = ? + """, (latest_time,)) + + rows = cursor.fetchall() + if not rows: + return None + + # 收集所有 news_item_id + news_ids = [row[0] for row in rows] + + # 批量查询排名历史 + rank_history_map: Dict[int, List[int]] = {} + if news_ids: + placeholders = ",".join("?" * len(news_ids)) + cursor.execute(f""" + SELECT news_item_id, rank FROM rank_history + WHERE news_item_id IN ({placeholders}) + ORDER BY news_item_id, crawl_time + """, news_ids) + for rh_row in cursor.fetchall(): + news_id, rank = rh_row[0], rh_row[1] + if news_id not in rank_history_map: + rank_history_map[news_id] = [] + if rank not in rank_history_map[news_id]: + rank_history_map[news_id].append(rank) + + items: Dict[str, List[NewsItem]] = {} + id_to_name: Dict[str, str] = {} + crawl_date = self._format_date_folder(date) + + for row in rows: + news_id = row[0] + platform_id = row[2] + platform_name = row[3] or platform_id + id_to_name[platform_id] = platform_name + + if platform_id not in items: + items[platform_id] = [] + + # 获取排名历史,如果没有则使用当前排名 + ranks = rank_history_map.get(news_id, [row[4]]) + + items[platform_id].append(NewsItem( + title=row[1], + source_id=platform_id, + source_name=platform_name, + rank=row[4], + url=row[5] or "", + mobile_url=row[6] or "", + crawl_time=row[8], # last_crawl_time + ranks=ranks, + first_time=row[7], # first_crawl_time + last_time=row[8], # last_crawl_time + count=row[9], # crawl_count + )) + + # 获取失败的来源(针对最新一次抓取) + cursor.execute(""" + SELECT css.platform_id + FROM crawl_source_status css + JOIN crawl_records cr ON css.crawl_record_id = cr.id + WHERE cr.crawl_time = ? 
AND css.status = 'failed' + """, (latest_time,)) + + failed_ids = [row[0] for row in cursor.fetchall()] + + return NewsData( + date=crawl_date, + crawl_time=latest_time, + items=items, + id_to_name=id_to_name, + failed_ids=failed_ids, + ) + + except Exception as e: + print(f"[本地存储] 获取最新数据失败: {e}") + return None + + def detect_new_titles(self, current_data: NewsData) -> Dict[str, Dict]: + """ + 检测新增的标题 + + Args: + current_data: 当前抓取的数据 + + Returns: + 新增的标题数据 {source_id: {title: NewsItem}} + """ + try: + # 获取历史数据 + historical_data = self.get_today_all_data(current_data.date) + + if not historical_data: + # 没有历史数据,所有都是新的 + new_titles = {} + for source_id, news_list in current_data.items.items(): + new_titles[source_id] = {item.title: item for item in news_list} + return new_titles + + # 收集历史标题 + historical_titles: Dict[str, set] = {} + for source_id, news_list in historical_data.items.items(): + historical_titles[source_id] = {item.title for item in news_list} + + # 检测新增 + new_titles = {} + for source_id, news_list in current_data.items.items(): + hist_set = historical_titles.get(source_id, set()) + for item in news_list: + if item.title not in hist_set: + if source_id not in new_titles: + new_titles[source_id] = {} + new_titles[source_id][item.title] = item + + return new_titles + + except Exception as e: + print(f"[本地存储] 检测新标题失败: {e}") + return {} + + def save_txt_snapshot(self, data: NewsData) -> Optional[str]: + """ + 保存 TXT 快照 + + Args: + data: 新闻数据 + + Returns: + 保存的文件路径 + """ + if not self.enable_txt: + return None + + try: + date_folder = self._format_date_folder(data.date) + txt_dir = self.data_dir / date_folder / "txt" + txt_dir.mkdir(parents=True, exist_ok=True) + + file_path = txt_dir / f"{data.crawl_time}.txt" + + with open(file_path, "w", encoding="utf-8") as f: + for source_id, news_list in data.items.items(): + source_name = data.id_to_name.get(source_id, source_id) + + # 写入来源标题 + if source_name and source_name != source_id: + f.write(f"{source_id} | {source_name}\n") + else: + f.write(f"{source_id}\n") + + # 按排名排序 + sorted_news = sorted(news_list, key=lambda x: x.rank) + + for item in sorted_news: + line = f"{item.rank}. 
{item.title}" + if item.url: + line += f" [URL:{item.url}]" + if item.mobile_url: + line += f" [MOBILE:{item.mobile_url}]" + f.write(line + "\n") + + f.write("\n") + + # 写入失败的来源 + if data.failed_ids: + f.write("==== 以下ID请求失败 ====\n") + for failed_id in data.failed_ids: + f.write(f"{failed_id}\n") + + print(f"[本地存储] TXT 快照已保存: {file_path}") + return str(file_path) + + except Exception as e: + print(f"[本地存储] 保存 TXT 快照失败: {e}") + return None + + def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]: + """ + 保存 HTML 报告 + + Args: + html_content: HTML 内容 + filename: 文件名 + is_summary: 是否为汇总报告 + + Returns: + 保存的文件路径 + """ + if not self.enable_html: + return None + + try: + date_folder = self._format_date_folder() + html_dir = self.data_dir / date_folder / "html" + html_dir.mkdir(parents=True, exist_ok=True) + + file_path = html_dir / filename + + with open(file_path, "w", encoding="utf-8") as f: + f.write(html_content) + + print(f"[本地存储] HTML 报告已保存: {file_path}") + return str(file_path) + + except Exception as e: + print(f"[本地存储] 保存 HTML 报告失败: {e}") + return None + + def is_first_crawl_today(self, date: Optional[str] = None) -> bool: + """ + 检查是否是当天第一次抓取 + + Args: + date: 日期字符串,默认为今天 + + Returns: + 是否是第一次抓取 + """ + try: + db_path = self._get_db_path(date) + if not db_path.exists(): + return True + + conn = self._get_connection(date) + cursor = conn.cursor() + + cursor.execute(""" + SELECT COUNT(*) as count FROM crawl_records + """) + + row = cursor.fetchone() + count = row[0] if row else 0 + + # 如果只有一条或没有记录,视为第一次抓取 + return count <= 1 + + except Exception as e: + print(f"[本地存储] 检查首次抓取失败: {e}") + return True + + def get_crawl_times(self, date: Optional[str] = None) -> List[str]: + """ + 获取指定日期的所有抓取时间列表 + + Args: + date: 日期字符串,默认为今天 + + Returns: + 抓取时间列表(按时间排序) + """ + try: + db_path = self._get_db_path(date) + if not db_path.exists(): + return [] + + conn = self._get_connection(date) + cursor = conn.cursor() + + cursor.execute(""" + SELECT crawl_time FROM crawl_records + ORDER BY crawl_time + """) + + rows = cursor.fetchall() + return [row[0] for row in rows] + + except Exception as e: + print(f"[本地存储] 获取抓取时间列表失败: {e}") + return [] + + def cleanup(self) -> None: + """清理资源(关闭数据库连接)""" + for db_path, conn in self._db_connections.items(): + try: + conn.close() + print(f"[本地存储] 关闭数据库连接: {db_path}") + except Exception as e: + print(f"[本地存储] 关闭连接失败 {db_path}: {e}") + + self._db_connections.clear() + + def cleanup_old_data(self, retention_days: int) -> int: + """ + 清理过期数据 + + Args: + retention_days: 保留天数(0 表示不清理) + + Returns: + 删除的日期目录数量 + """ + if retention_days <= 0: + return 0 + + deleted_count = 0 + cutoff_date = self._get_configured_time() - timedelta(days=retention_days) + + try: + if not self.data_dir.exists(): + return 0 + + for date_folder in self.data_dir.iterdir(): + if not date_folder.is_dir() or date_folder.name.startswith('.'): + continue + + # 解析日期文件夹名(支持两种格式) + folder_date = None + try: + # ISO 格式: YYYY-MM-DD + date_match = re.match(r'(\d{4})-(\d{2})-(\d{2})', date_folder.name) + if date_match: + folder_date = datetime( + int(date_match.group(1)), + int(date_match.group(2)), + int(date_match.group(3)), + tzinfo=pytz.timezone("Asia/Shanghai") + ) + else: + # 旧中文格式: YYYY年MM月DD日 + date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_folder.name) + if date_match: + folder_date = datetime( + int(date_match.group(1)), + int(date_match.group(2)), + int(date_match.group(3)), + tzinfo=pytz.timezone("Asia/Shanghai") + ) + except Exception: + 
continue + + if folder_date and folder_date < cutoff_date: + # 先关闭该日期的数据库连接 + db_path = str(self._get_db_path(date_folder.name)) + if db_path in self._db_connections: + try: + self._db_connections[db_path].close() + del self._db_connections[db_path] + except Exception: + pass + + # 删除整个日期目录 + try: + shutil.rmtree(date_folder) + deleted_count += 1 + print(f"[本地存储] 清理过期数据: {date_folder.name}") + except Exception as e: + print(f"[本地存储] 删除目录失败 {date_folder.name}: {e}") + + if deleted_count > 0: + print(f"[本地存储] 共清理 {deleted_count} 个过期日期目录") + + return deleted_count + + except Exception as e: + print(f"[本地存储] 清理过期数据失败: {e}") + return deleted_count + + def has_pushed_today(self, date: Optional[str] = None) -> bool: + """ + 检查指定日期是否已推送过 + + Args: + date: 日期字符串(YYYY-MM-DD),默认为今天 + + Returns: + 是否已推送 + """ + try: + conn = self._get_connection(date) + cursor = conn.cursor() + + target_date = self._format_date_folder(date) + + cursor.execute(""" + SELECT pushed FROM push_records WHERE date = ? + """, (target_date,)) + + row = cursor.fetchone() + if row: + return bool(row[0]) + return False + + except Exception as e: + print(f"[本地存储] 检查推送记录失败: {e}") + return False + + def record_push(self, report_type: str, date: Optional[str] = None) -> bool: + """ + 记录推送 + + Args: + report_type: 报告类型 + date: 日期字符串(YYYY-MM-DD),默认为今天 + + Returns: + 是否记录成功 + """ + try: + conn = self._get_connection(date) + cursor = conn.cursor() + + target_date = self._format_date_folder(date) + now_str = self._get_configured_time().strftime("%Y-%m-%d %H:%M:%S") + + cursor.execute(""" + INSERT INTO push_records (date, pushed, push_time, report_type, created_at) + VALUES (?, 1, ?, ?, ?) + ON CONFLICT(date) DO UPDATE SET + pushed = 1, + push_time = excluded.push_time, + report_type = excluded.report_type + """, (target_date, now_str, report_type, now_str)) + + conn.commit() + + print(f"[本地存储] 推送记录已保存: {report_type} at {now_str}") + return True + + except Exception as e: + print(f"[本地存储] 记录推送失败: {e}") + return False + + def __del__(self): + """析构函数,确保关闭连接""" + self.cleanup() diff --git a/trendradar/storage/manager.py b/trendradar/storage/manager.py new file mode 100644 index 0000000..c488655 --- /dev/null +++ b/trendradar/storage/manager.py @@ -0,0 +1,316 @@ +# coding=utf-8 +""" +存储管理器 - 统一管理存储后端 + +根据环境和配置自动选择合适的存储后端 +""" + +import os +from typing import Optional + +from trendradar.storage.base import StorageBackend, NewsData + + +# 存储管理器单例 +_storage_manager: Optional["StorageManager"] = None + + +class StorageManager: + """ + 存储管理器 + + 功能: + - 自动检测运行环境(GitHub Actions / Docker / 本地) + - 根据配置选择存储后端(local / remote / auto) + - 提供统一的存储接口 + - 支持从远程拉取数据到本地 + """ + + def __init__( + self, + backend_type: str = "auto", + data_dir: str = "output", + enable_txt: bool = True, + enable_html: bool = True, + remote_config: Optional[dict] = None, + local_retention_days: int = 0, + remote_retention_days: int = 0, + pull_enabled: bool = False, + pull_days: int = 0, + timezone: str = "Asia/Shanghai", + ): + """ + 初始化存储管理器 + + Args: + backend_type: 存储后端类型 (local / remote / auto) + data_dir: 本地数据目录 + enable_txt: 是否启用 TXT 快照 + enable_html: 是否启用 HTML 报告 + remote_config: 远程存储配置(endpoint_url, bucket_name, access_key_id 等) + local_retention_days: 本地数据保留天数(0 = 无限制) + remote_retention_days: 远程数据保留天数(0 = 无限制) + pull_enabled: 是否启用启动时自动拉取 + pull_days: 拉取最近 N 天的数据 + timezone: 时区配置(默认 Asia/Shanghai) + """ + self.backend_type = backend_type + self.data_dir = data_dir + self.enable_txt = enable_txt + self.enable_html = enable_html + self.remote_config = remote_config or 
{} + self.local_retention_days = local_retention_days + self.remote_retention_days = remote_retention_days + self.pull_enabled = pull_enabled + self.pull_days = pull_days + self.timezone = timezone + + self._backend: Optional[StorageBackend] = None + self._remote_backend: Optional[StorageBackend] = None + + @staticmethod + def is_github_actions() -> bool: + """检测是否在 GitHub Actions 环境中运行""" + return os.environ.get("GITHUB_ACTIONS") == "true" + + @staticmethod + def is_docker() -> bool: + """检测是否在 Docker 容器中运行""" + # 方法1: 检查 /.dockerenv 文件 + if os.path.exists("/.dockerenv"): + return True + + # 方法2: 检查 cgroup(Linux) + try: + with open("/proc/1/cgroup", "r") as f: + return "docker" in f.read() + except (FileNotFoundError, PermissionError): + pass + + # 方法3: 检查环境变量 + return os.environ.get("DOCKER_CONTAINER") == "true" + + def _resolve_backend_type(self) -> str: + """解析实际使用的后端类型""" + if self.backend_type == "auto": + if self.is_github_actions(): + # GitHub Actions 环境,检查是否配置了远程存储 + if self._has_remote_config(): + return "remote" + else: + print("[存储管理器] GitHub Actions 环境但未配置远程存储,使用本地存储") + return "local" + else: + return "local" + return self.backend_type + + def _has_remote_config(self) -> bool: + """检查是否有有效的远程存储配置""" + # 检查配置或环境变量 + bucket_name = self.remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME") + access_key = self.remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID") + secret_key = self.remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY") + endpoint = self.remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL") + + # 调试日志 + has_config = bool(bucket_name and access_key and secret_key and endpoint) + if not has_config: + print(f"[存储管理器] 远程存储配置检查失败:") + print(f" - bucket_name: {'已配置' if bucket_name else '未配置'}") + print(f" - access_key_id: {'已配置' if access_key else '未配置'}") + print(f" - secret_access_key: {'已配置' if secret_key else '未配置'}") + print(f" - endpoint_url: {'已配置' if endpoint else '未配置'}") + + return has_config + + def _create_remote_backend(self) -> Optional[StorageBackend]: + """创建远程存储后端""" + try: + from trendradar.storage.remote import RemoteStorageBackend + + return RemoteStorageBackend( + bucket_name=self.remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME", ""), + access_key_id=self.remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID", ""), + secret_access_key=self.remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY", ""), + endpoint_url=self.remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL", ""), + region=self.remote_config.get("region") or os.environ.get("S3_REGION", ""), + enable_txt=self.enable_txt, + enable_html=self.enable_html, + timezone=self.timezone, + ) + except ImportError as e: + print(f"[存储管理器] 远程后端导入失败: {e}") + print("[存储管理器] 请确保已安装 boto3: pip install boto3") + return None + except Exception as e: + print(f"[存储管理器] 远程后端初始化失败: {e}") + return None + + def get_backend(self) -> StorageBackend: + """获取存储后端实例""" + if self._backend is None: + resolved_type = self._resolve_backend_type() + + if resolved_type == "remote": + self._backend = self._create_remote_backend() + if self._backend: + print(f"[存储管理器] 使用远程存储后端") + else: + print("[存储管理器] 回退到本地存储") + resolved_type = "local" + + if resolved_type == "local" or self._backend is None: + from trendradar.storage.local import LocalStorageBackend + + self._backend = LocalStorageBackend( + data_dir=self.data_dir, + enable_txt=self.enable_txt, + 
enable_html=self.enable_html, + timezone=self.timezone, + ) + print(f"[存储管理器] 使用本地存储后端 (数据目录: {self.data_dir})") + + return self._backend + + def pull_from_remote(self) -> int: + """ + 从远程拉取数据到本地 + + Returns: + 成功拉取的文件数量 + """ + if not self.pull_enabled or self.pull_days <= 0: + return 0 + + if not self._has_remote_config(): + print("[存储管理器] 未配置远程存储,无法拉取") + return 0 + + # 创建远程后端(如果还没有) + if self._remote_backend is None: + self._remote_backend = self._create_remote_backend() + + if self._remote_backend is None: + print("[存储管理器] 无法创建远程后端,拉取失败") + return 0 + + # 调用拉取方法 + return self._remote_backend.pull_recent_days(self.pull_days, self.data_dir) + + def save_news_data(self, data: NewsData) -> bool: + """保存新闻数据""" + return self.get_backend().save_news_data(data) + + def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]: + """获取当天所有数据""" + return self.get_backend().get_today_all_data(date) + + def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]: + """获取最新抓取数据""" + return self.get_backend().get_latest_crawl_data(date) + + def detect_new_titles(self, current_data: NewsData) -> dict: + """检测新增标题""" + return self.get_backend().detect_new_titles(current_data) + + def save_txt_snapshot(self, data: NewsData) -> Optional[str]: + """保存 TXT 快照""" + return self.get_backend().save_txt_snapshot(data) + + def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]: + """保存 HTML 报告""" + return self.get_backend().save_html_report(html_content, filename, is_summary) + + def is_first_crawl_today(self, date: Optional[str] = None) -> bool: + """检查是否是当天第一次抓取""" + return self.get_backend().is_first_crawl_today(date) + + def cleanup(self) -> None: + """清理资源""" + if self._backend: + self._backend.cleanup() + if self._remote_backend: + self._remote_backend.cleanup() + + def cleanup_old_data(self) -> int: + """ + 清理过期数据 + + Returns: + 删除的日期目录数量 + """ + total_deleted = 0 + + # 清理本地数据 + if self.local_retention_days > 0: + total_deleted += self.get_backend().cleanup_old_data(self.local_retention_days) + + # 清理远程数据(如果配置了) + if self.remote_retention_days > 0 and self._has_remote_config(): + if self._remote_backend is None: + self._remote_backend = self._create_remote_backend() + if self._remote_backend: + total_deleted += self._remote_backend.cleanup_old_data(self.remote_retention_days) + + return total_deleted + + @property + def backend_name(self) -> str: + """获取当前后端名称""" + return self.get_backend().backend_name + + @property + def supports_txt(self) -> bool: + """是否支持 TXT 快照""" + return self.get_backend().supports_txt + + +def get_storage_manager( + backend_type: str = "auto", + data_dir: str = "output", + enable_txt: bool = True, + enable_html: bool = True, + remote_config: Optional[dict] = None, + local_retention_days: int = 0, + remote_retention_days: int = 0, + pull_enabled: bool = False, + pull_days: int = 0, + timezone: str = "Asia/Shanghai", + force_new: bool = False, +) -> StorageManager: + """ + 获取存储管理器单例 + + Args: + backend_type: 存储后端类型 + data_dir: 本地数据目录 + enable_txt: 是否启用 TXT 快照 + enable_html: 是否启用 HTML 报告 + remote_config: 远程存储配置 + local_retention_days: 本地数据保留天数(0 = 无限制) + remote_retention_days: 远程数据保留天数(0 = 无限制) + pull_enabled: 是否启用启动时自动拉取 + pull_days: 拉取最近 N 天的数据 + timezone: 时区配置(默认 Asia/Shanghai) + force_new: 是否强制创建新实例 + + Returns: + StorageManager 实例 + """ + global _storage_manager + + if _storage_manager is None or force_new: + _storage_manager = StorageManager( + backend_type=backend_type, + 
data_dir=data_dir, + enable_txt=enable_txt, + enable_html=enable_html, + remote_config=remote_config, + local_retention_days=local_retention_days, + remote_retention_days=remote_retention_days, + pull_enabled=pull_enabled, + pull_days=pull_days, + timezone=timezone, + ) + + return _storage_manager diff --git a/trendradar/storage/remote.py b/trendradar/storage/remote.py new file mode 100644 index 0000000..e3486d0 --- /dev/null +++ b/trendradar/storage/remote.py @@ -0,0 +1,1071 @@ +# coding=utf-8 +""" +远程存储后端(S3 兼容协议) + +支持 Cloudflare R2、阿里云 OSS、腾讯云 COS、AWS S3、MinIO 等 +使用 S3 兼容 API (boto3) 访问对象存储 +数据流程:下载当天 SQLite → 合并新数据 → 上传回远程 +""" + +import atexit +import os +import pytz +import re +import shutil +import sys +import tempfile +import sqlite3 +from datetime import datetime, timedelta +from pathlib import Path +from typing import Dict, List, Optional, Any + +try: + import boto3 + from botocore.exceptions import ClientError + HAS_BOTO3 = True +except ImportError: + HAS_BOTO3 = False + boto3 = None + ClientError = Exception + +from trendradar.storage.base import StorageBackend, NewsItem, NewsData +from trendradar.utils.time import ( + get_configured_time, + format_date_folder, + format_time_filename, +) + + +class RemoteStorageBackend(StorageBackend): + """ + 远程云存储后端(S3 兼容协议) + + 特点: + - 使用 S3 兼容 API 访问远程存储 + - 支持 Cloudflare R2、阿里云 OSS、腾讯云 COS、AWS S3、MinIO 等 + - 下载 SQLite 到临时目录进行操作 + - 支持数据合并和上传 + - 支持从远程拉取历史数据到本地 + - 运行结束后自动清理临时文件 + """ + + def __init__( + self, + bucket_name: str, + access_key_id: str, + secret_access_key: str, + endpoint_url: str, + region: str = "", + enable_txt: bool = False, # 远程模式默认不生成 TXT + enable_html: bool = True, + temp_dir: Optional[str] = None, + timezone: str = "Asia/Shanghai", + ): + """ + 初始化远程存储后端 + + Args: + bucket_name: 存储桶名称 + access_key_id: 访问密钥 ID + secret_access_key: 访问密钥 + endpoint_url: 服务端点 URL + region: 区域(可选,部分服务商需要) + enable_txt: 是否启用 TXT 快照(默认关闭) + enable_html: 是否启用 HTML 报告 + temp_dir: 临时目录路径(默认使用系统临时目录) + timezone: 时区配置(默认 Asia/Shanghai) + """ + if not HAS_BOTO3: + raise ImportError("远程存储后端需要安装 boto3: pip install boto3") + + self.bucket_name = bucket_name + self.endpoint_url = endpoint_url + self.region = region + self.enable_txt = enable_txt + self.enable_html = enable_html + self.timezone = timezone + + # 创建临时目录 + self.temp_dir = Path(temp_dir) if temp_dir else Path(tempfile.mkdtemp(prefix="trendradar_")) + self.temp_dir.mkdir(parents=True, exist_ok=True) + + # 初始化 S3 客户端 + client_kwargs = { + "endpoint_url": endpoint_url, + "aws_access_key_id": access_key_id, + "aws_secret_access_key": secret_access_key, + } + if region: + client_kwargs["region_name"] = region + + self.s3_client = boto3.client("s3", **client_kwargs) + + # 跟踪下载的文件(用于清理) + self._downloaded_files: List[Path] = [] + self._db_connections: Dict[str, sqlite3.Connection] = {} + + print(f"[远程存储] 初始化完成,存储桶: {bucket_name}") + + @property + def backend_name(self) -> str: + return "remote" + + @property + def supports_txt(self) -> bool: + return self.enable_txt + + def _get_configured_time(self) -> datetime: + """获取配置时区的当前时间""" + return get_configured_time(self.timezone) + + def _format_date_folder(self, date: Optional[str] = None) -> str: + """格式化日期文件夹名 (ISO 格式: YYYY-MM-DD)""" + return format_date_folder(date, self.timezone) + + def _format_time_filename(self) -> str: + """格式化时间文件名 (格式: HH-MM)""" + return format_time_filename(self.timezone) + + def _get_remote_db_key(self, date: Optional[str] = None) -> str: + """获取 R2 中 SQLite 文件的对象键""" + date_folder = self._format_date_folder(date) + 
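# one SQLite object per day under the "news/" prefix: news/YYYY-MM-DD.db +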
return f"news/{date_folder}.db" + + def _get_local_db_path(self, date: Optional[str] = None) -> Path: + """获取本地临时 SQLite 文件路径""" + date_folder = self._format_date_folder(date) + return self.temp_dir / date_folder / "news.db" + + def _check_object_exists(self, r2_key: str) -> bool: + """ + 检查 R2 中对象是否存在 + + Args: + r2_key: R2 对象键 + + Returns: + 是否存在 + """ + try: + self.s3_client.head_object(Bucket=self.bucket_name, Key=r2_key) + return True + except ClientError as e: + error_code = e.response.get("Error", {}).get("Code", "") + # R2/S3 可能返回 404, NoSuchKey, 或其他变体 + if error_code in ("404", "NoSuchKey", "Not Found"): + return False + # 其他错误(如权限问题)也视为不存在,但打印警告 + print(f"[远程存储] 检查对象存在性失败 ({r2_key}): {e}") + return False + except Exception as e: + print(f"[远程存储] 检查对象存在性异常 ({r2_key}): {e}") + return False + + def _download_sqlite(self, date: Optional[str] = None) -> Optional[Path]: + """ + 从 R2 下载当天的 SQLite 文件到本地临时目录 + + Args: + date: 日期字符串 + + Returns: + 本地文件路径,如果不存在返回 None + """ + r2_key = self._get_remote_db_key(date) + local_path = self._get_local_db_path(date) + + # 确保目录存在 + local_path.parent.mkdir(parents=True, exist_ok=True) + + # 先检查文件是否存在 + if not self._check_object_exists(r2_key): + print(f"[远程存储] 文件不存在,将创建新数据库: {r2_key}") + return None + + try: + self.s3_client.download_file(self.bucket_name, r2_key, str(local_path)) + self._downloaded_files.append(local_path) + print(f"[远程存储] 已下载: {r2_key} -> {local_path}") + return local_path + except ClientError as e: + error_code = e.response.get("Error", {}).get("Code", "") + # R2/S3 可能返回不同的错误码 + if error_code in ("404", "NoSuchKey", "Not Found"): + print(f"[远程存储] 文件不存在,将创建新数据库: {r2_key}") + return None + else: + print(f"[远程存储] 下载失败 (错误码: {error_code}): {e}") + raise + except Exception as e: + print(f"[远程存储] 下载异常: {e}") + raise + + def _upload_sqlite(self, date: Optional[str] = None) -> bool: + """ + 上传本地 SQLite 文件到 R2 + + Args: + date: 日期字符串 + + Returns: + 是否上传成功 + """ + local_path = self._get_local_db_path(date) + r2_key = self._get_remote_db_key(date) + + if not local_path.exists(): + print(f"[远程存储] 本地文件不存在,无法上传: {local_path}") + return False + + try: + # 获取本地文件大小 + local_size = local_path.stat().st_size + print(f"[远程存储] 准备上传: {local_path} ({local_size} bytes) -> {r2_key}") + + self.s3_client.upload_file(str(local_path), self.bucket_name, r2_key) + print(f"[远程存储] 已上传: {local_path} -> {r2_key}") + + # 验证上传成功 + if self._check_object_exists(r2_key): + print(f"[远程存储] 上传验证成功: {r2_key}") + return True + else: + print(f"[远程存储] 上传验证失败: 文件未在 R2 中找到") + return False + + except Exception as e: + print(f"[远程存储] 上传失败: {e}") + return False + + def _get_connection(self, date: Optional[str] = None) -> sqlite3.Connection: + """获取数据库连接""" + local_path = self._get_local_db_path(date) + db_path = str(local_path) + + if db_path not in self._db_connections: + # 确保目录存在 + local_path.parent.mkdir(parents=True, exist_ok=True) + + # 如果本地不存在,尝试从 R2 下载 + if not local_path.exists(): + self._download_sqlite(date) + + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + self._init_tables(conn) + self._db_connections[db_path] = conn + + return self._db_connections[db_path] + + def _get_schema_path(self) -> Path: + """获取 schema.sql 文件路径""" + return Path(__file__).parent / "schema.sql" + + def _init_tables(self, conn: sqlite3.Connection) -> None: + """从 schema.sql 初始化数据库表结构""" + schema_path = self._get_schema_path() + + if schema_path.exists(): + with open(schema_path, "r", encoding="utf-8") as f: + schema_sql = f.read() + conn.executescript(schema_sql) + else: + 
raise FileNotFoundError(f"Schema file not found: {schema_path}") + + conn.commit() + + def save_news_data(self, data: NewsData) -> bool: + """ + 保存新闻数据到 R2(以 URL 为唯一标识,支持标题更新检测) + + 流程:下载现有数据库 → 插入/更新数据 → 上传回 R2 + + Args: + data: 新闻数据 + + Returns: + 是否保存成功 + """ + try: + conn = self._get_connection(data.date) + cursor = conn.cursor() + + # 查询已有记录数 + cursor.execute("SELECT COUNT(*) as count FROM news_items") + row = cursor.fetchone() + existing_count = row[0] if row else 0 + if existing_count > 0: + print(f"[远程存储] 已有 {existing_count} 条历史记录,将合并新数据") + + # 获取配置时区的当前时间 + now_str = self._get_configured_time().strftime("%Y-%m-%d %H:%M:%S") + + # 首先同步平台信息到 platforms 表 + for source_id, source_name in data.id_to_name.items(): + cursor.execute(""" + INSERT INTO platforms (id, name, updated_at) + VALUES (?, ?, ?) + ON CONFLICT(id) DO UPDATE SET + name = excluded.name, + updated_at = excluded.updated_at + """, (source_id, source_name, now_str)) + + # 统计计数器 + new_count = 0 + updated_count = 0 + title_changed_count = 0 + success_sources = [] + + for source_id, news_list in data.items.items(): + success_sources.append(source_id) + + for item in news_list: + try: + # 检查是否已存在(通过 URL + platform_id) + if item.url: + cursor.execute(""" + SELECT id, title FROM news_items + WHERE url = ? AND platform_id = ? + """, (item.url, source_id)) + existing = cursor.fetchone() + + if existing: + # 已存在,更新记录 + existing_id, existing_title = existing + + # 检查标题是否变化 + if existing_title != item.title: + # 记录标题变更 + cursor.execute(""" + INSERT INTO title_changes + (news_item_id, old_title, new_title, changed_at) + VALUES (?, ?, ?, ?) + """, (existing_id, existing_title, item.title, now_str)) + title_changed_count += 1 + + # 记录排名历史 + cursor.execute(""" + INSERT INTO rank_history + (news_item_id, rank, crawl_time, created_at) + VALUES (?, ?, ?, ?) + """, (existing_id, item.rank, data.crawl_time, now_str)) + + # 更新现有记录 + cursor.execute(""" + UPDATE news_items SET + title = ?, + rank = ?, + mobile_url = ?, + last_crawl_time = ?, + crawl_count = crawl_count + 1, + updated_at = ? + WHERE id = ? + """, (item.title, item.rank, item.mobile_url, + data.crawl_time, now_str, existing_id)) + updated_count += 1 + else: + # 不存在,插入新记录 + cursor.execute(""" + INSERT INTO news_items + (title, platform_id, rank, url, mobile_url, + first_crawl_time, last_crawl_time, crawl_count, + created_at, updated_at) + VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?) + """, (item.title, source_id, item.rank, item.url, + item.mobile_url, data.crawl_time, data.crawl_time, + now_str, now_str)) + new_id = cursor.lastrowid + # 记录初始排名 + cursor.execute(""" + INSERT INTO rank_history + (news_item_id, rank, crawl_time, created_at) + VALUES (?, ?, ?, ?) + """, (new_id, item.rank, data.crawl_time, now_str)) + new_count += 1 + else: + # URL 为空的情况,直接插入(不做去重) + cursor.execute(""" + INSERT INTO news_items + (title, platform_id, rank, url, mobile_url, + first_crawl_time, last_crawl_time, crawl_count, + created_at, updated_at) + VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?) + """, (item.title, source_id, item.rank, item.url, + item.mobile_url, data.crawl_time, data.crawl_time, + now_str, now_str)) + new_id = cursor.lastrowid + # 记录初始排名 + cursor.execute(""" + INSERT INTO rank_history + (news_item_id, rank, crawl_time, created_at) + VALUES (?, ?, ?, ?) 
+ """, (new_id, item.rank, data.crawl_time, now_str)) + new_count += 1 + + except sqlite3.Error as e: + print(f"[远程存储] 保存新闻条目失败 [{item.title[:30]}...]: {e}") + + total_items = new_count + updated_count + + # 记录抓取信息 + cursor.execute(""" + INSERT OR REPLACE INTO crawl_records + (crawl_time, total_items, created_at) + VALUES (?, ?, ?) + """, (data.crawl_time, total_items, now_str)) + + # 获取刚插入的 crawl_record 的 ID + cursor.execute(""" + SELECT id FROM crawl_records WHERE crawl_time = ? + """, (data.crawl_time,)) + record_row = cursor.fetchone() + if record_row: + crawl_record_id = record_row[0] + + # 记录成功的来源 + for source_id in success_sources: + cursor.execute(""" + INSERT OR REPLACE INTO crawl_source_status + (crawl_record_id, platform_id, status) + VALUES (?, ?, 'success') + """, (crawl_record_id, source_id)) + + # 记录失败的来源 + for failed_id in data.failed_ids: + # 确保失败的平台也在 platforms 表中 + cursor.execute(""" + INSERT OR IGNORE INTO platforms (id, name, updated_at) + VALUES (?, ?, ?) + """, (failed_id, failed_id, now_str)) + + cursor.execute(""" + INSERT OR REPLACE INTO crawl_source_status + (crawl_record_id, platform_id, status) + VALUES (?, ?, 'failed') + """, (crawl_record_id, failed_id)) + + conn.commit() + + # 查询合并后的总记录数 + cursor.execute("SELECT COUNT(*) as count FROM news_items") + row = cursor.fetchone() + final_count = row[0] if row else 0 + + # 输出详细的存储统计日志 + log_parts = [f"[远程存储] 处理完成:新增 {new_count} 条"] + if updated_count > 0: + log_parts.append(f"更新 {updated_count} 条") + if title_changed_count > 0: + log_parts.append(f"标题变更 {title_changed_count} 条") + log_parts.append(f"(去重后总计: {final_count} 条)") + print(",".join(log_parts)) + + # 上传到 R2 + if self._upload_sqlite(data.date): + print(f"[远程存储] 数据已同步到 R2") + return True + else: + print(f"[远程存储] 上传 R2 失败") + return False + + except Exception as e: + print(f"[远程存储] 保存失败: {e}") + return False + + def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]: + """获取指定日期的所有新闻数据(合并后)""" + try: + conn = self._get_connection(date) + cursor = conn.cursor() + + # 获取所有新闻数据(包含 id 用于查询排名历史) + cursor.execute(""" + SELECT n.id, n.title, n.platform_id, p.name as platform_name, + n.rank, n.url, n.mobile_url, + n.first_crawl_time, n.last_crawl_time, n.crawl_count + FROM news_items n + LEFT JOIN platforms p ON n.platform_id = p.id + ORDER BY n.platform_id, n.last_crawl_time + """) + + rows = cursor.fetchall() + if not rows: + return None + + # 收集所有 news_item_id + news_ids = [row[0] for row in rows] + + # 批量查询排名历史 + rank_history_map: Dict[int, List[int]] = {} + if news_ids: + placeholders = ",".join("?" 
* len(news_ids)) + cursor.execute(f""" + SELECT news_item_id, rank FROM rank_history + WHERE news_item_id IN ({placeholders}) + ORDER BY news_item_id, crawl_time + """, news_ids) + for rh_row in cursor.fetchall(): + news_id, rank = rh_row[0], rh_row[1] + if news_id not in rank_history_map: + rank_history_map[news_id] = [] + if rank not in rank_history_map[news_id]: + rank_history_map[news_id].append(rank) + + # 按 platform_id 分组 + items: Dict[str, List[NewsItem]] = {} + id_to_name: Dict[str, str] = {} + crawl_date = self._format_date_folder(date) + + for row in rows: + news_id = row[0] + platform_id = row[2] + title = row[1] + platform_name = row[3] or platform_id + + id_to_name[platform_id] = platform_name + + if platform_id not in items: + items[platform_id] = [] + + # 获取排名历史,如果没有则使用当前排名 + ranks = rank_history_map.get(news_id, [row[4]]) + + items[platform_id].append(NewsItem( + title=title, + source_id=platform_id, + source_name=platform_name, + rank=row[4], + url=row[5] or "", + mobile_url=row[6] or "", + crawl_time=row[8], # last_crawl_time + ranks=ranks, + first_time=row[7], # first_crawl_time + last_time=row[8], # last_crawl_time + count=row[9], # crawl_count + )) + + final_items = items + + # 获取失败的来源 + cursor.execute(""" + SELECT DISTINCT css.platform_id + FROM crawl_source_status css + JOIN crawl_records cr ON css.crawl_record_id = cr.id + WHERE css.status = 'failed' + """) + failed_ids = [row[0] for row in cursor.fetchall()] + + # 获取最新的抓取时间 + cursor.execute(""" + SELECT crawl_time FROM crawl_records + ORDER BY crawl_time DESC + LIMIT 1 + """) + + time_row = cursor.fetchone() + crawl_time = time_row[0] if time_row else self._format_time_filename() + + return NewsData( + date=crawl_date, + crawl_time=crawl_time, + items=final_items, + id_to_name=id_to_name, + failed_ids=failed_ids, + ) + + except Exception as e: + print(f"[远程存储] 读取数据失败: {e}") + return None + + def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]: + """获取最新一次抓取的数据""" + try: + conn = self._get_connection(date) + cursor = conn.cursor() + + # 获取最新的抓取时间 + cursor.execute(""" + SELECT crawl_time FROM crawl_records + ORDER BY crawl_time DESC + LIMIT 1 + """) + + time_row = cursor.fetchone() + if not time_row: + return None + + latest_time = time_row[0] + + # 获取该时间的新闻数据,通过 JOIN 获取平台名称 + cursor.execute(""" + SELECT n.title, n.platform_id, p.name as platform_name, + n.rank, n.url, n.mobile_url, + n.first_crawl_time, n.last_crawl_time, n.crawl_count + FROM news_items n + LEFT JOIN platforms p ON n.platform_id = p.id + WHERE n.last_crawl_time = ? + """, (latest_time,)) + + rows = cursor.fetchall() + if not rows: + return None + + items: Dict[str, List[NewsItem]] = {} + id_to_name: Dict[str, str] = {} + crawl_date = self._format_date_folder(date) + + for row in rows: + platform_id = row[1] + platform_name = row[2] or platform_id + id_to_name[platform_id] = platform_name + + if platform_id not in items: + items[platform_id] = [] + + items[platform_id].append(NewsItem( + title=row[0], + source_id=platform_id, + source_name=platform_name, + rank=row[3], + url=row[4] or "", + mobile_url=row[5] or "", + crawl_time=row[7], # last_crawl_time + ranks=[row[3]], + first_time=row[6], # first_crawl_time + last_time=row[7], # last_crawl_time + count=row[8], # crawl_count + )) + + # 获取失败的来源(针对最新一次抓取) + cursor.execute(""" + SELECT css.platform_id + FROM crawl_source_status css + JOIN crawl_records cr ON css.crawl_record_id = cr.id + WHERE cr.crawl_time = ? 
AND css.status = 'failed' + """, (latest_time,)) + + failed_ids = [row[0] for row in cursor.fetchall()] + + return NewsData( + date=crawl_date, + crawl_time=latest_time, + items=items, + id_to_name=id_to_name, + failed_ids=failed_ids, + ) + + except Exception as e: + print(f"[远程存储] 获取最新数据失败: {e}") + return None + + def detect_new_titles(self, current_data: NewsData) -> Dict[str, Dict]: + """检测新增的标题""" + try: + historical_data = self.get_today_all_data(current_data.date) + + if not historical_data: + new_titles = {} + for source_id, news_list in current_data.items.items(): + new_titles[source_id] = {item.title: item for item in news_list} + return new_titles + + historical_titles: Dict[str, set] = {} + for source_id, news_list in historical_data.items.items(): + historical_titles[source_id] = {item.title for item in news_list} + + new_titles = {} + for source_id, news_list in current_data.items.items(): + hist_set = historical_titles.get(source_id, set()) + for item in news_list: + if item.title not in hist_set: + if source_id not in new_titles: + new_titles[source_id] = {} + new_titles[source_id][item.title] = item + + return new_titles + + except Exception as e: + print(f"[远程存储] 检测新标题失败: {e}") + return {} + + def save_txt_snapshot(self, data: NewsData) -> Optional[str]: + """保存 TXT 快照(R2 模式下默认不支持)""" + if not self.enable_txt: + return None + + # 如果启用,保存到本地临时目录 + try: + date_folder = self._format_date_folder(data.date) + txt_dir = self.temp_dir / date_folder / "txt" + txt_dir.mkdir(parents=True, exist_ok=True) + + file_path = txt_dir / f"{data.crawl_time}.txt" + + with open(file_path, "w", encoding="utf-8") as f: + for source_id, news_list in data.items.items(): + source_name = data.id_to_name.get(source_id, source_id) + + if source_name and source_name != source_id: + f.write(f"{source_id} | {source_name}\n") + else: + f.write(f"{source_id}\n") + + sorted_news = sorted(news_list, key=lambda x: x.rank) + + for item in sorted_news: + line = f"{item.rank}. 
{item.title}" + if item.url: + line += f" [URL:{item.url}]" + if item.mobile_url: + line += f" [MOBILE:{item.mobile_url}]" + f.write(line + "\n") + + f.write("\n") + + if data.failed_ids: + f.write("==== 以下ID请求失败 ====\n") + for failed_id in data.failed_ids: + f.write(f"{failed_id}\n") + + print(f"[远程存储] TXT 快照已保存: {file_path}") + return str(file_path) + + except Exception as e: + print(f"[远程存储] 保存 TXT 快照失败: {e}") + return None + + def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]: + """保存 HTML 报告到临时目录""" + if not self.enable_html: + return None + + try: + date_folder = self._format_date_folder() + html_dir = self.temp_dir / date_folder / "html" + html_dir.mkdir(parents=True, exist_ok=True) + + file_path = html_dir / filename + + with open(file_path, "w", encoding="utf-8") as f: + f.write(html_content) + + print(f"[远程存储] HTML 报告已保存: {file_path}") + return str(file_path) + + except Exception as e: + print(f"[远程存储] 保存 HTML 报告失败: {e}") + return None + + def is_first_crawl_today(self, date: Optional[str] = None) -> bool: + """检查是否是当天第一次抓取""" + try: + conn = self._get_connection(date) + cursor = conn.cursor() + + cursor.execute(""" + SELECT COUNT(*) as count FROM crawl_records + """) + + row = cursor.fetchone() + count = row[0] if row else 0 + + return count <= 1 + + except Exception as e: + print(f"[远程存储] 检查首次抓取失败: {e}") + return True + + def cleanup(self) -> None: + """清理资源(关闭连接和删除临时文件)""" + # 检查 Python 是否正在关闭 + if sys.meta_path is None: + return + + # 关闭数据库连接 + db_connections = getattr(self, "_db_connections", {}) + for db_path, conn in list(db_connections.items()): + try: + conn.close() + print(f"[远程存储] 关闭数据库连接: {db_path}") + except Exception as e: + print(f"[远程存储] 关闭连接失败 {db_path}: {e}") + + if db_connections: + db_connections.clear() + + # 删除临时目录 + temp_dir = getattr(self, "temp_dir", None) + if temp_dir: + try: + if temp_dir.exists(): + shutil.rmtree(temp_dir) + print(f"[远程存储] 临时目录已清理: {temp_dir}") + except Exception as e: + # 忽略 Python 关闭时的错误 + if sys.meta_path is not None: + print(f"[远程存储] 清理临时目录失败: {e}") + + downloaded_files = getattr(self, "_downloaded_files", None) + if downloaded_files: + downloaded_files.clear() + + def cleanup_old_data(self, retention_days: int) -> int: + """ + 清理 R2 上的过期数据 + + Args: + retention_days: 保留天数(0 表示不清理) + + Returns: + 删除的数据库文件数量 + """ + if retention_days <= 0: + return 0 + + deleted_count = 0 + cutoff_date = self._get_configured_time() - timedelta(days=retention_days) + + try: + # 列出 R2 中 news/ 前缀下的所有对象 + paginator = self.s3_client.get_paginator('list_objects_v2') + pages = paginator.paginate(Bucket=self.bucket_name, Prefix="news/") + + # 收集需要删除的对象键 + objects_to_delete = [] + deleted_dates = set() + + for page in pages: + if 'Contents' not in page: + continue + + for obj in page['Contents']: + key = obj['Key'] + + # 解析日期(格式: news/YYYY-MM-DD.db 或 news/YYYY年MM月DD日.db) + folder_date = None + try: + # ISO 格式: news/YYYY-MM-DD.db + date_match = re.match(r'news/(\d{4})-(\d{2})-(\d{2})\.db$', key) + if date_match: + folder_date = datetime( + int(date_match.group(1)), + int(date_match.group(2)), + int(date_match.group(3)), + tzinfo=pytz.timezone("Asia/Shanghai") + ) + date_str = f"{date_match.group(1)}-{date_match.group(2)}-{date_match.group(3)}" + else: + # 旧中文格式: news/YYYY年MM月DD日.db + date_match = re.match(r'news/(\d{4})年(\d{2})月(\d{2})日\.db$', key) + if date_match: + folder_date = datetime( + int(date_match.group(1)), + int(date_match.group(2)), + int(date_match.group(3)), + 
tzinfo=pytz.timezone("Asia/Shanghai") + ) + date_str = f"{date_match.group(1)}年{date_match.group(2)}月{date_match.group(3)}日" + except Exception: + continue + + if folder_date and folder_date < cutoff_date: + objects_to_delete.append({'Key': key}) + deleted_dates.add(date_str) + + # 批量删除对象(每次最多 1000 个) + if objects_to_delete: + batch_size = 1000 + for i in range(0, len(objects_to_delete), batch_size): + batch = objects_to_delete[i:i + batch_size] + try: + self.s3_client.delete_objects( + Bucket=self.bucket_name, + Delete={'Objects': batch} + ) + print(f"[远程存储] 删除 {len(batch)} 个对象") + except Exception as e: + print(f"[远程存储] 批量删除失败: {e}") + + deleted_count = len(deleted_dates) + for date_str in sorted(deleted_dates): + print(f"[远程存储] 清理过期数据: news/{date_str}.db") + + print(f"[远程存储] 共清理 {deleted_count} 个过期日期数据库文件") + + return deleted_count + + except Exception as e: + print(f"[远程存储] 清理过期数据失败: {e}") + return deleted_count + + def has_pushed_today(self, date: Optional[str] = None) -> bool: + """ + 检查指定日期是否已推送过 + + Args: + date: 日期字符串(YYYY-MM-DD),默认为今天 + + Returns: + 是否已推送 + """ + try: + conn = self._get_connection(date) + cursor = conn.cursor() + + target_date = self._format_date_folder(date) + + cursor.execute(""" + SELECT pushed FROM push_records WHERE date = ? + """, (target_date,)) + + row = cursor.fetchone() + if row: + return bool(row[0]) + return False + + except Exception as e: + print(f"[远程存储] 检查推送记录失败: {e}") + return False + + def record_push(self, report_type: str, date: Optional[str] = None) -> bool: + """ + 记录推送 + + Args: + report_type: 报告类型 + date: 日期字符串(YYYY-MM-DD),默认为今天 + + Returns: + 是否记录成功 + """ + try: + conn = self._get_connection(date) + cursor = conn.cursor() + + target_date = self._format_date_folder(date) + now_str = self._get_configured_time().strftime("%Y-%m-%d %H:%M:%S") + + cursor.execute(""" + INSERT INTO push_records (date, pushed, push_time, report_type, created_at) + VALUES (?, 1, ?, ?, ?) 
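+ -- push_records.date is UNIQUE: a second push on the same day updates the existing row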
+ ON CONFLICT(date) DO UPDATE SET + pushed = 1, + push_time = excluded.push_time, + report_type = excluded.report_type + """, (target_date, now_str, report_type, now_str)) + + conn.commit() + + print(f"[远程存储] 推送记录已保存: {report_type} at {now_str}") + + # 上传到 R2 确保记录持久化 + if self._upload_sqlite(date): + print(f"[远程存储] 推送记录已同步到 R2") + return True + else: + print(f"[远程存储] 推送记录同步到 R2 失败") + return False + + except Exception as e: + print(f"[远程存储] 记录推送失败: {e}") + return False + + def __del__(self): + """析构函数""" + # 检查 Python 是否正在关闭 + if sys.meta_path is None: + return + try: + self.cleanup() + except Exception: + # Python 关闭时可能会出错,忽略即可 + pass + + def pull_recent_days(self, days: int, local_data_dir: str = "output") -> int: + """ + 从远程拉取最近 N 天的数据到本地 + + Args: + days: 拉取天数 + local_data_dir: 本地数据目录 + + Returns: + 成功拉取的数据库文件数量 + """ + if days <= 0: + return 0 + + local_dir = Path(local_data_dir) + local_dir.mkdir(parents=True, exist_ok=True) + + pulled_count = 0 + now = self._get_configured_time() + + print(f"[远程存储] 开始拉取最近 {days} 天的数据...") + + for i in range(days): + date = now - timedelta(days=i) + date_str = date.strftime("%Y-%m-%d") + + # 本地目标路径 + local_date_dir = local_dir / date_str + local_db_path = local_date_dir / "news.db" + + # 如果本地已存在,跳过 + if local_db_path.exists(): + print(f"[远程存储] 跳过(本地已存在): {date_str}") + continue + + # 远程对象键 + remote_key = f"news/{date_str}.db" + + # 检查远程是否存在 + if not self._check_object_exists(remote_key): + print(f"[远程存储] 跳过(远程不存在): {date_str}") + continue + + # 下载 + try: + local_date_dir.mkdir(parents=True, exist_ok=True) + self.s3_client.download_file( + self.bucket_name, + remote_key, + str(local_db_path) + ) + print(f"[远程存储] 已拉取: {remote_key} -> {local_db_path}") + pulled_count += 1 + except Exception as e: + print(f"[远程存储] 拉取失败 ({date_str}): {e}") + + print(f"[远程存储] 拉取完成,共下载 {pulled_count} 个数据库文件") + return pulled_count + + def list_remote_dates(self) -> List[str]: + """ + 列出远程存储中所有可用的日期 + + Returns: + 日期字符串列表(YYYY-MM-DD 格式) + """ + dates = [] + + try: + paginator = self.s3_client.get_paginator('list_objects_v2') + pages = paginator.paginate(Bucket=self.bucket_name, Prefix="news/") + + for page in pages: + if 'Contents' not in page: + continue + + for obj in page['Contents']: + key = obj['Key'] + # 解析日期 + date_match = re.match(r'news/(\d{4}-\d{2}-\d{2})\.db$', key) + if date_match: + dates.append(date_match.group(1)) + + return sorted(dates, reverse=True) + + except Exception as e: + print(f"[远程存储] 列出远程日期失败: {e}") + return [] diff --git a/trendradar/storage/schema.sql b/trendradar/storage/schema.sql new file mode 100644 index 0000000..a669407 --- /dev/null +++ b/trendradar/storage/schema.sql @@ -0,0 +1,117 @@ +-- TrendRadar 数据库表结构 + +-- ============================================ +-- 平台信息表 +-- 核心:id 不变,name 可变 +-- ============================================ +CREATE TABLE IF NOT EXISTS platforms ( + id TEXT PRIMARY KEY, + name TEXT NOT NULL, + is_active INTEGER DEFAULT 1, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +-- ============================================ +-- 新闻条目表 +-- 以 URL + platform_id 为唯一标识,支持去重存储 +-- ============================================ +CREATE TABLE IF NOT EXISTS news_items ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + title TEXT NOT NULL, + platform_id TEXT NOT NULL, + rank INTEGER NOT NULL, + url TEXT DEFAULT '', + mobile_url TEXT DEFAULT '', + first_crawl_time TEXT NOT NULL, -- 首次抓取时间 + last_crawl_time TEXT NOT NULL, -- 最后抓取时间 + crawl_count INTEGER DEFAULT 1, -- 抓取次数 + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at 
TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (platform_id) REFERENCES platforms(id) +); + +-- ============================================ +-- 标题变更历史表 +-- 记录同一 URL 下标题的变化 +-- ============================================ +CREATE TABLE IF NOT EXISTS title_changes ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + news_item_id INTEGER NOT NULL, + old_title TEXT NOT NULL, + new_title TEXT NOT NULL, + changed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (news_item_id) REFERENCES news_items(id) +); + +-- ============================================ +-- 排名历史表 +-- 记录每次抓取时的排名变化 +-- ============================================ +CREATE TABLE IF NOT EXISTS rank_history ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + news_item_id INTEGER NOT NULL, + rank INTEGER NOT NULL, + crawl_time TEXT NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (news_item_id) REFERENCES news_items(id) +); + +-- ============================================ +-- 抓取记录表 +-- 记录每次抓取的时间和数量 +-- ============================================ +CREATE TABLE IF NOT EXISTS crawl_records ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + crawl_time TEXT NOT NULL UNIQUE, + total_items INTEGER DEFAULT 0, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +-- ============================================ +-- 抓取来源状态表 +-- 记录每次抓取各平台的成功/失败状态 +-- ============================================ +CREATE TABLE IF NOT EXISTS crawl_source_status ( + crawl_record_id INTEGER NOT NULL, + platform_id TEXT NOT NULL, + status TEXT NOT NULL CHECK(status IN ('success', 'failed')), + PRIMARY KEY (crawl_record_id, platform_id), + FOREIGN KEY (crawl_record_id) REFERENCES crawl_records(id), + FOREIGN KEY (platform_id) REFERENCES platforms(id) +); + +-- ============================================ +-- 推送记录表 +-- 用于 push_window once_per_day 功能 +-- ============================================ +CREATE TABLE IF NOT EXISTS push_records ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + date TEXT NOT NULL UNIQUE, + pushed INTEGER DEFAULT 0, + push_time TEXT, + report_type TEXT, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +-- ============================================ +-- 索引定义 +-- ============================================ + +-- 平台索引 +CREATE INDEX IF NOT EXISTS idx_news_platform ON news_items(platform_id); + +-- 时间索引(用于查询最新数据) +CREATE INDEX IF NOT EXISTS idx_news_crawl_time ON news_items(last_crawl_time); + +-- 标题索引(用于标题搜索) +CREATE INDEX IF NOT EXISTS idx_news_title ON news_items(title); + +-- URL + platform_id 唯一索引(仅对非空 URL,实现去重) +CREATE UNIQUE INDEX IF NOT EXISTS idx_news_url_platform + ON news_items(url, platform_id) WHERE url != ''; + +-- 抓取状态索引 +CREATE INDEX IF NOT EXISTS idx_crawl_status_record ON crawl_source_status(crawl_record_id); + +-- 排名历史索引 +CREATE INDEX IF NOT EXISTS idx_rank_history_news ON rank_history(news_item_id); diff --git a/trendradar/utils/__init__.py b/trendradar/utils/__init__.py new file mode 100644 index 0000000..0b6a46d --- /dev/null +++ b/trendradar/utils/__init__.py @@ -0,0 +1,20 @@ +# coding=utf-8 +""" +工具模块 - 公共工具函数 +""" + +from trendradar.utils.time import ( + get_configured_time, + format_date_folder, + format_time_filename, + get_current_time_display, + convert_time_for_display, +) + +__all__ = [ + "get_configured_time", + "format_date_folder", + "format_time_filename", + "get_current_time_display", + "convert_time_for_display", +] diff --git a/trendradar/utils/time.py b/trendradar/utils/time.py new file mode 100644 index 0000000..1e5b853 --- /dev/null +++ b/trendradar/utils/time.py @@ -0,0 +1,91 @@ +# 
coding=utf-8 +""" +时间工具模块 - 统一时间处理函数 +""" + +from datetime import datetime +from typing import Optional + +import pytz + +# 默认时区 +DEFAULT_TIMEZONE = "Asia/Shanghai" + + +def get_configured_time(timezone: str = DEFAULT_TIMEZONE) -> datetime: + """ + 获取配置时区的当前时间 + + Args: + timezone: 时区名称,如 'Asia/Shanghai', 'America/Los_Angeles' + + Returns: + 带时区信息的当前时间 + """ + try: + tz = pytz.timezone(timezone) + except pytz.UnknownTimeZoneError: + print(f"[警告] 未知时区 '{timezone}',使用默认时区 {DEFAULT_TIMEZONE}") + tz = pytz.timezone(DEFAULT_TIMEZONE) + return datetime.now(tz) + + +def format_date_folder( + date: Optional[str] = None, timezone: str = DEFAULT_TIMEZONE +) -> str: + """ + 格式化日期文件夹名 (ISO 格式: YYYY-MM-DD) + + Args: + date: 指定日期字符串,为 None 则使用当前日期 + timezone: 时区名称 + + Returns: + 格式化后的日期字符串,如 '2025-12-09' + """ + if date: + return date + return get_configured_time(timezone).strftime("%Y-%m-%d") + + +def format_time_filename(timezone: str = DEFAULT_TIMEZONE) -> str: + """ + 格式化时间文件名 (格式: HH-MM,用于文件名) + + Windows 系统不支持冒号作为文件名,因此使用连字符 + + Args: + timezone: 时区名称 + + Returns: + 格式化后的时间字符串,如 '15-30' + """ + return get_configured_time(timezone).strftime("%H-%M") + + +def get_current_time_display(timezone: str = DEFAULT_TIMEZONE) -> str: + """ + 获取当前时间显示 (格式: HH:MM,用于显示) + + Args: + timezone: 时区名称 + + Returns: + 格式化后的时间字符串,如 '15:30' + """ + return get_configured_time(timezone).strftime("%H:%M") + + +def convert_time_for_display(time_str: str) -> str: + """ + 将 HH-MM 格式转换为 HH:MM 格式用于显示 + + Args: + time_str: 输入时间字符串,如 '15-30' + + Returns: + 转换后的时间字符串,如 '15:30' + """ + if time_str and "-" in time_str and len(time_str) == 5: + return time_str.replace("-", ":") + return time_str diff --git a/version b/version index e5b8203..0c89fc9 100644 --- a/version +++ b/version @@ -1 +1 @@ -3.5.0 \ No newline at end of file +4.0.0 \ No newline at end of file
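Note: the push-record bookkeeping above relies on SQLite's upsert syntax (INSERT ... ON CONFLICT(date) DO UPDATE ...), which requires SQLite 3.24 or newer. A minimal sketch of that same pattern against the push_records table defined in schema.sql follows; the database path and the literal values are illustrative only and are not taken from the project:

# Upsert sketch against push_records. Assumes schema.sql has already been
# applied to this database file; the path and values are made up for illustration.
import sqlite3

conn = sqlite3.connect("output/2025-12-09/news.db")
conn.execute(
    """
    INSERT INTO push_records (date, pushed, push_time, report_type)
    VALUES (?, 1, ?, ?)
    ON CONFLICT(date) DO UPDATE SET
        pushed = 1,
        push_time = excluded.push_time,
        report_type = excluded.report_type
    """,
    ("2025-12-09", "2025-12-09 15:30:00", "daily"),
)
conn.commit()
conn.close()

Running this twice with the same date updates the existing row instead of inserting a duplicate, which is what keeps the once-per-day push guard idempotent.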
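For readers exploring the new schema, here is a small read-only sketch that joins news_items with platforms. Table and column names come from schema.sql above; the file path is again only an example:

# Read-back sketch: headlines per platform from a news.db created with schema.sql.
import sqlite3

conn = sqlite3.connect("output/2025-12-09/news.db")
rows = conn.execute(
    """
    SELECT p.name, n.rank, n.title, n.crawl_count, n.url
    FROM news_items AS n
    JOIN platforms AS p ON p.id = n.platform_id
    ORDER BY p.name, n.rank
    """
).fetchall()
for name, rank, title, crawl_count, url in rows:
    print(f"[{name}] #{rank} {title} (crawled {crawl_count}x) {url}")
conn.close()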
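Finally, a quick usage sketch for the new trendradar/utils/time.py helpers, using only the functions shown in the diff (requires pytz, which the module itself imports); the printed values are examples, not fixed outputs:

# Usage sketch for trendradar.utils.time.
from trendradar.utils.time import (
    get_configured_time,
    format_date_folder,
    format_time_filename,
    get_current_time_display,
    convert_time_for_display,
)

now = get_configured_time("Asia/Shanghai")   # timezone-aware datetime
print(format_date_folder())                  # e.g. '2025-12-09'
print(format_date_folder("2025-01-01"))      # passthrough when a date is supplied
print(format_time_filename())                # e.g. '15-30' (colon-free, Windows-safe)
print(get_current_time_display())            # e.g. '15:30'
print(convert_time_for_display("15-30"))     # '15:30'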