v4.0.0 Major Update

This commit is contained in:
sansan 2025-12-13 13:44:35 +08:00
parent 97c05aa33c
commit c7bacdfff7
61 changed files with 12407 additions and 5889 deletions


@ -4,8 +4,6 @@ name: 🐛 遇到问题了
description: 程序运行不正常或出现错误
title: "[问题] "
labels: ["bug"]
assignees:
- sansan0
body:
- type: markdown
attributes:


@ -4,8 +4,6 @@ name: 💡 我有个想法
description: 建议新功能或改进现有功能
title: "[建议] "
labels: ["enhancement"]
assignees:
- sansan0
body:
- type: markdown
attributes:


@ -4,8 +4,6 @@ name: ⚙️ 设置遇到困难
description: 配置相关的问题或需要帮助
title: "[设置] "
labels: ["配置", "帮助"]
assignees:
- sansan0
body:
- type: markdown
attributes:

28
.github/workflows/clean-crawler.yml vendored Normal file

@ -0,0 +1,28 @@
name: Check In
# ✅ 签到续期:运行此 workflow 可重置 7 天计时,保持 "Get Hot News" 正常运行
# ✅ Renewal: Run this workflow to reset the 7-day timer and keep "Get Hot News" active
#
# 📌 操作方法 / How to use:
# 1. 点击 "Run workflow" 按钮 / Click "Run workflow" button
# 2. 每 7 天内至少运行一次 / Run at least once every 7 days
on:
workflow_dispatch:
jobs:
del_runs:
runs-on: ubuntu-latest
permissions:
actions: write
contents: read
steps:
- name: Delete all workflow runs
uses: Mattraks/delete-workflow-runs@v2
with:
token: ${{ github.token }}
repository: ${{ github.repository }}
retain_days: 0
keep_minimum_runs: 0
delete_workflow_by_state_pattern: "ALL"
delete_run_by_conclusion_pattern: "ALL"

163
.github/workflows/crawler.yml vendored Normal file

@ -0,0 +1,163 @@
name: Get Hot News
on:
schedule:
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# ⚠️ 试用版说明 / Trial Mode
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
#
# 🔄 运行机制 / How it works:
# - 每个周期为 7 天,届时自动停止
# - 运行 "Check In" 会重置周期(重新开始 7 天倒计时,而非累加)
# - Each cycle is 7 days, then auto-stops
# - "Check In" resets the cycle (restarts 7-day countdown, not cumulative)
#
# 💡 设计初衷 / Why this design:
# 如果 7 天都忘了签到,或许这些资讯对你来说并非刚需
# 适时的暂停,能帮你从信息流中抽离,给大脑留出喘息的空间
# If you forget for 7 days, maybe you don't really need it
# A timely pause helps you detach from the stream and gives your mind space
#
# 🙏 珍惜资源 / Respect shared resources:
# GitHub Actions 是平台提供的公共资源,每次运行都会消耗算力
# 签到机制确保资源分配给真正需要的用户,感谢你的理解与配合
# GitHub Actions is a shared public resource provided by the platform
# Check-in ensures resources go to those who truly need it — thank you
#
# 🚀 长期使用请部署 Docker 版本 / For long-term use, deploy Docker version
#
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
#
# 📝 修改运行时间只改第一个数字0-59表示每小时第几分钟运行
# 📝 Change time: Only modify the first number (0-59) = minute of each hour
#
# 示例 / Examples:
# "15 * * * *" → 每小时第15分钟 / minute 15 every hour
# "30 0-14 * * *" → 北京时间 8:00-22:00 每小时第30分钟 / Beijing 8am-10pm
#
- cron: "33 * * * *"
workflow_dispatch:
concurrency:
group: crawler-${{ github.ref_name }}
cancel-in-progress: true
permissions:
contents: read
actions: write
jobs:
crawl:
runs-on: ubuntu-latest
timeout-minutes: 15
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
fetch-depth: 1
clean: true
- name: Check Expiration
env:
GH_TOKEN: ${{ github.token }}
run: |
WORKFLOW_FILE="crawler.yml"
API_URL="repos/${{ github.repository }}/actions/workflows/$WORKFLOW_FILE/runs"
TOTAL=$(gh api "$API_URL" --jq '.total_count')
if [ -z "$TOTAL" ] || [ "$TOTAL" -eq 0 ]; then
echo "No previous runs found, skipping expiration check"
exit 0
fi
LAST_PAGE=$(( (TOTAL + 99) / 100 ))
FIRST_RUN_DATE=$(gh api "$API_URL?per_page=100&page=$LAST_PAGE" --jq '.workflow_runs[-1].created_at')
if [ -n "$FIRST_RUN_DATE" ]; then
CURRENT_TIMESTAMP=$(date +%s)
FIRST_RUN_TIMESTAMP=$(date -d "$FIRST_RUN_DATE" +%s)
DIFF_SECONDS=$((CURRENT_TIMESTAMP - FIRST_RUN_TIMESTAMP))
LIMIT_SECONDS=604800
if [ $DIFF_SECONDS -gt $LIMIT_SECONDS ]; then
echo "⚠️ 试用期已结束,请运行 'Check In' 签到续期"
echo "⚠️ Trial expired. Run 'Check In' to renew."
gh workflow disable "$WORKFLOW_FILE"
exit 1
else
DAYS_LEFT=$(( (LIMIT_SECONDS - DIFF_SECONDS) / 86400 ))
echo "✅ 试用期剩余 ${DAYS_LEFT} 天,到期前请运行 'Check In' 签到续期"
echo "✅ Trial: ${DAYS_LEFT} days left. Run 'Check In' before expiry to renew."
fi
fi
# --------------------------------------------------------------------------------
# 🚦 TRAFFIC CONTROL / 流量控制
# --------------------------------------------------------------------------------
# EN: Generates a random delay between 1 and 300 seconds (5 minutes).
# Critical for load balancing.
#
# CN: 生成 1 到 300 秒5分钟之间的随机延迟。
# 这对负载均衡至关重要。
- name: Random Delay (Traffic Control)
if: success()
run: |
echo "🎲 Traffic Control: Generating random delay..."
DELAY=$(( ( RANDOM % 300 ) + 1 ))
echo "⏸️ Sleeping for ${DELAY} seconds to spread the load..."
sleep ${DELAY}s
echo "▶️ Delay finished. Starting crawler..."
- name: Set up Python
if: success()
uses: actions/setup-python@v5
with:
python-version: "3.10"
cache: "pip"
- name: Install dependencies
if: success()
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Verify required files
if: success()
run: |
if [ ! -f config/config.yaml ]; then
echo "Error: Config missing"
exit 1
fi
- name: Run crawler
if: success()
env:
FEISHU_WEBHOOK_URL: ${{ secrets.FEISHU_WEBHOOK_URL }}
TELEGRAM_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
TELEGRAM_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }}
DINGTALK_WEBHOOK_URL: ${{ secrets.DINGTALK_WEBHOOK_URL }}
WEWORK_WEBHOOK_URL: ${{ secrets.WEWORK_WEBHOOK_URL }}
WEWORK_MSG_TYPE: ${{ secrets.WEWORK_MSG_TYPE }}
EMAIL_FROM: ${{ secrets.EMAIL_FROM }}
EMAIL_PASSWORD: ${{ secrets.EMAIL_PASSWORD }}
EMAIL_TO: ${{ secrets.EMAIL_TO }}
EMAIL_SMTP_SERVER: ${{ secrets.EMAIL_SMTP_SERVER }}
EMAIL_SMTP_PORT: ${{ secrets.EMAIL_SMTP_PORT }}
NTFY_TOPIC: ${{ secrets.NTFY_TOPIC }}
NTFY_SERVER_URL: ${{ secrets.NTFY_SERVER_URL }}
NTFY_TOKEN: ${{ secrets.NTFY_TOKEN }}
BARK_URL: ${{ secrets.BARK_URL }}
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
STORAGE_BACKEND: auto
LOCAL_RETENTION_DAYS: ${{ secrets.LOCAL_RETENTION_DAYS }}
REMOTE_RETENTION_DAYS: ${{ secrets.REMOTE_RETENTION_DAYS }}
S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }}
S3_ACCESS_KEY_ID: ${{ secrets.S3_ACCESS_KEY_ID }}
S3_SECRET_ACCESS_KEY: ${{ secrets.S3_SECRET_ACCESS_KEY }}
S3_ENDPOINT_URL: ${{ secrets.S3_ENDPOINT_URL }}
S3_REGION: ${{ secrets.S3_REGION }}
GITHUB_ACTIONS: true
run: python -m trendradar


@ -1,6 +1,6 @@
<div align="center" id="trendradar"> <div align="center" id="trendradar">
> **📢 Announcement:** After communicating with GitHub officials, "One-Click Fork Deployment" will be restored after compliance adjustments are completed. Please stay tuned for **v4.0.0** update > **📢 Announcement:** **v4.0.0** has been released! Including storage architecture refactoring, database optimization, modularization improvements, and more major updates
<a href="https://github.com/sansan0/TrendRadar" title="TrendRadar"> <a href="https://github.com/sansan0/TrendRadar" title="TrendRadar">
<img src="/_image/banner.webp" alt="TrendRadar Banner" width="80%"> <img src="/_image/banner.webp" alt="TrendRadar Banner" width="80%">
@ -16,8 +16,8 @@
[![GitHub Stars](https://img.shields.io/github/stars/sansan0/TrendRadar?style=flat-square&logo=github&color=yellow)](https://github.com/sansan0/TrendRadar/stargazers) [![GitHub Stars](https://img.shields.io/github/stars/sansan0/TrendRadar?style=flat-square&logo=github&color=yellow)](https://github.com/sansan0/TrendRadar/stargazers)
[![GitHub Forks](https://img.shields.io/github/forks/sansan0/TrendRadar?style=flat-square&logo=github&color=blue)](https://github.com/sansan0/TrendRadar/network/members) [![GitHub Forks](https://img.shields.io/github/forks/sansan0/TrendRadar?style=flat-square&logo=github&color=blue)](https://github.com/sansan0/TrendRadar/network/members)
[![License](https://img.shields.io/badge/license-GPL--3.0-blue.svg?style=flat-square)](LICENSE) [![License](https://img.shields.io/badge/license-GPL--3.0-blue.svg?style=flat-square)](LICENSE)
[![Version](https://img.shields.io/badge/version-v3.5.0-blue.svg)](https://github.com/sansan0/TrendRadar) [![Version](https://img.shields.io/badge/version-v4.0.0-blue.svg)](https://github.com/sansan0/TrendRadar)
[![MCP](https://img.shields.io/badge/MCP-v1.0.3-green.svg)](https://github.com/sansan0/TrendRadar) [![MCP](https://img.shields.io/badge/MCP-v1.1.0-green.svg)](https://github.com/sansan0/TrendRadar)
[![WeWork](https://img.shields.io/badge/WeWork-Notification-00D4AA?style=flat-square)](https://work.weixin.qq.com/) [![WeWork](https://img.shields.io/badge/WeWork-Notification-00D4AA?style=flat-square)](https://work.weixin.qq.com/)
[![WeChat](https://img.shields.io/badge/WeChat-Notification-00D4AA?style=flat-square)](https://weixin.qq.com/) [![WeChat](https://img.shields.io/badge/WeChat-Notification-00D4AA?style=flat-square)](https://weixin.qq.com/)
@ -48,62 +48,61 @@
<br> <br>
<details> <details>
<summary>🚨 <strong>【MUST READ】Important Announcement: The Correct Way to Deploy This Project</strong></summary> <summary>🚨 <strong>【Must Read】Important Announcement: v4.0.0 Deployment & Storage Architecture Changes</strong></summary>
<br> <br>
> **⚠️ December 2025 Urgent Notice** ### 🛠️ Choose the Deployment Method That Fits You
>
> Due to a surge in Fork numbers causing excessive load on GitHub servers, **GitHub Actions and GitHub Pages deployments are currently restricted**. Please read the following instructions carefully to ensure successful deployment.
### 1. ✅ Only Recommended Deployment Method: Docker #### 🅰️ Option 1: Docker Deployment (Recommended 🔥)
**This is currently the most stable solution, free from GitHub restrictions.** Data is stored locally and won't be affected by GitHub policy changes. * **Features**: Most stable and simplest. Data is stored in **local SQLite**, fully under your control.
* **Best for**: Users with their own server, NAS, or an always-on PC.
* 👉 [Jump to Docker Deployment Tutorial](#6-docker-deployment) * 👉 [Jump to Docker Deployment Tutorial](#6-docker-deployment)
--- ---
### 2. If You Were Planning to Fork This Project... #### 🅱️ Option 2: GitHub Actions Deployment (Restored ✅)
To reduce pressure on GitHub servers, **please DO NOT directly click the "Fork" button!** * **Features**: Data is no longer committed directly to the repo. Instead, it is stored in **Remote Cloud Storage** (supports S3-compatible protocols: Cloudflare R2, Alibaba Cloud OSS, Tencent Cloud COS, etc.).
Please use the **"Use this template"** feature instead of Fork: * **Requirement**: You **must** configure an S3-compatible object storage service (Cloudflare R2 recommended, it's free).
> **⚠️ Note**: If you choose this option, you must complete the following two configuration steps:
#### 1. 🚀 Recommended Start: Use this template
To keep the repository clean and avoid inheriting redundant history, I **recommend** using Template mode:
1. **Click** the green **[Use this template]** button at the top right of the original repository page.
1. **Click** the green **[Use this template]** button in the top right corner of the original repository page.
2. **Select** "Create a new repository". 2. **Select** "Create a new repository".
**Why do this?** > **💡 Why do this?**
* **❌ Fork**: Copies complete history records. Many forks running simultaneously will trigger GitHub risk control. > * **Use this template**: Creates a brand new, clean repository with no historical baggage.
* **✅ Use this template**: Creates a completely new independent repository without historical baggage, more server-friendly. > * **Fork**: Retains the complete commit history and relationships, consuming more GitHub resources.
--- #### 2. ☁️ About the Mandatory Remote Storage for GitHub Actions
### 3. About New Data Storage If you choose **Option 2 (GitHub Actions)**, you must configure an S3-compatible object storage service.
The new version will use **Cloudflare R2** to store news data, ensuring data persistence. **Supported Storage Services:**
- **Cloudflare R2** (Recommended, generous free tier)
- Other S3-compatible services
**⚠️ Configuration Prerequisites:** **⚠️ Configuration Prerequisites (Using Cloudflare R2 as Example):**
According to Cloudflare platform rules, activating R2 requires binding a payment method. According to Cloudflare platform rules, enabling R2 requires binding a payment method.
- **Purpose:** Identity verification only (Verify Only), no charges will be incurred. * **Purpose**: Identity verification only (Verify Only). **No charges will be incurred**.
- **Payment:** Supports credit cards or PayPal (China region).
- **Usage:** R2's free tier is sufficient to cover this project's daily operation, no payment required.
--- * **Payment**: Supports international credit cards or PayPal.
### 4. 📅 Future Plans & Documentation Reading Notes * **Usage**: The R2 free tier (10GB storage/month) is sufficient to cover the daily operation of this project. No need to worry about costs.
> **Future Plans:** 👉 **[Click to View Detailed Configuration Tutorial](#-quick-start)**
> - Exploring new approach: keep Actions for fetching and pushing, but no longer save data to repository, use external storage instead.
**⚠️ Reading Note:**
Given that the above plans mean **Fork deployment mode may return in a new form in the future**, and the workload to fully revise documentation is massive, we have temporarily retained the old descriptions.
**At the current stage, if "Fork" related expressions still appear in subsequent tutorials, please ignore them or understand them as "Use this template"**.
👉 **[Click here to view TrendRadar's latest official documentation](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)**
</details> </details>
@ -287,10 +286,32 @@ Supports **WeWork** (+ WeChat push solution), **Feishu**, **DingTalk**, **Telegr
- ⚠️ **Paired Configuration**: Telegram and ntfy require paired parameter quantities to match (e.g., token and chat_id both have 2 values) - ⚠️ **Paired Configuration**: Telegram and ntfy require paired parameter quantities to match (e.g., token and chat_id both have 2 values)
- ⚠️ **Quantity Limit**: Default maximum 3 accounts per channel, exceeded values will be truncated - ⚠️ **Quantity Limit**: Default maximum 3 accounts per channel, exceeded values will be truncated
### **Multi-Platform Support** ### **Flexible Storage Architecture (v4.0.0 Major Update)**
- **GitHub Pages**: Auto-generate beautiful web reports, PC/mobile adapted
**Multi-Backend Support**:
- ☁️ **Remote Cloud Storage**: GitHub Actions environment default, supports S3-compatible protocols (R2/OSS/COS, etc.), data stored in cloud, keeping repository clean
- 💾 **Local SQLite**: Traditional SQLite database, stable and efficient (Docker/local deployment)
- 🔀 **Auto Selection**: Auto-selects appropriate backend based on runtime environment
**Data Format Hierarchy**:
| Format | Role | Description |
|--------|------|-------------|
| **SQLite** | Primary storage | Complete data with statistics information |
| **TXT** | Human-readable backup | Optional text records for manual viewing |
| **HTML** | Web report | Beautiful visual report (GitHub Pages) |
**Data Management Features**:
- Auto data cleanup (configurable retention period)
- Timezone support (configurable IANA time zone)
- Cloud/local seamless switching
> 💡 For storage configuration details, see [Configuration Details - Storage Configuration](#11-storage-configuration-v400-new)
### **Multi-Platform Deployment**
- **GitHub Actions**: Cloud automated operations (7-day check-in cycle + remote cloud storage)
- **Docker Deployment**: Supports multi-architecture containerized operation - **Docker Deployment**: Supports multi-architecture containerized operation
- **Data Persistence**: HTML/TXT multi-format history saving - **Local Running**: Python environment direct execution
### **AI Smart Analysis (v3.0.0 New)** ### **AI Smart Analysis (v3.0.0 New)**
@ -341,10 +362,32 @@ Transform from "algorithm recommendation captivity" to "actively getting the inf
>**Upgrade Instructions**: >**Upgrade Instructions**:
- **📌 Check Latest Updates**: **[Original Repository Changelog](https://github.com/sansan0/TrendRadar?tab=readme-ov-file#-changelog)** - **📌 Check Latest Updates**: **[Original Repository Changelog](https://github.com/sansan0/TrendRadar?tab=readme-ov-file#-changelog)**
- **Tip**: Do NOT update this project via **Sync fork**. Check [Changelog] to understand specific [Upgrade Methods] and [Features] - **Tip**: Do NOT update this project via **Sync fork**. Check [Changelog] to understand specific [Upgrade Methods] and [Features]
- **Minor Version Update**: Upgrading from v2.x to v2.y, replace `main.py` in your forked repo with the latest version
- **Major Version Upgrade**: Upgrading from v1.x to v2.y, recommend deleting existing fork and re-forking to save effort and avoid config conflicts - **Major Version Upgrade**: Upgrading from v1.x to v2.y, recommend deleting existing fork and re-forking to save effort and avoid config conflicts
### 2025/12/13 - v4.0.0
**🎉 Major Update: Comprehensive Refactoring of Storage and Core Architecture**
- **Multi-Storage Backend Support**: Introduced a brand new storage module supporting local SQLite and remote cloud storage (S3-compatible protocols, Cloudflare R2 recommended for free tier), adaptable to GitHub Actions, Docker, and local environments.
- **Database Structure Optimization**: Refactored SQLite database table structures to improve data efficiency and query performance.
- **Enhanced Features**: Implemented date format standardization, data retention policies, timezone configuration support, and optimized time display. Fixed remote storage data persistence issues to ensure accurate data merging.
- **Cleanup and Compatibility**: Removed most legacy compatibility code and unified data storage and retrieval methods.
### 2025/12/13 - mcp-v1.1.0
**MCP Module Update:**
- Adapted for v4.0.0, while maintaining compatibility with v3.x data.
- Added storage sync tools:
- `sync_from_remote`: Pull data from remote storage to local
- `get_storage_status`: Get storage configuration and status
- `list_available_dates`: List available dates in local/remote storage
<details>
<summary>👉 Click to expand: <strong>Historical Updates</strong></summary>
### 2025/12/03 - v3.5.0 ### 2025/12/03 - v3.5.0
**🎉 Core Feature Enhancements** **🎉 Core Feature Enhancements**
@ -397,7 +440,7 @@ Transform from "algorithm recommendation captivity" to "actively getting the inf
**🔧 Upgrade Instructions**: **🔧 Upgrade Instructions**:
- **GitHub Fork Users**: Update `main.py`, `config/config.yaml` (Added multi-account push support, existing single-account configuration unaffected) - **GitHub Fork Users**: Update `main.py`, `config/config.yaml` (Added multi-account push support, existing single-account configuration unaffected)
- **Docker Users**: Update `.env`, `docker compose.yml` or set environment variables `REVERSE_CONTENT_ORDER`, `MAX_ACCOUNTS_PER_CHANNEL` - **Docker Users**: Update `.env`, `docker-compose.yml` or set environment variables `REVERSE_CONTENT_ORDER`, `MAX_ACCOUNTS_PER_CHANNEL`
- **Multi-Account Push**: New feature, disabled by default, existing single-account configuration unaffected - **Multi-Account Push**: New feature, disabled by default, existing single-account configuration unaffected
@ -431,10 +474,6 @@ Transform from "algorithm recommendation captivity" to "actively getting the inf
- Tool count increased from 13 to 14 - Tool count increased from 13 to 14
<details>
<summary>👉 Click to expand: <strong>Historical Updates</strong></summary>
### 2025/11/25 - v3.4.0 ### 2025/11/25 - v3.4.0
**🎉 Added Slack Push Support** **🎉 Added Slack Push Support**
@ -819,11 +858,44 @@ frequency_words.txt file added **required word** feature, using + sign
> **📖 Reminder**: Fork users should first **[check the latest official documentation](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)** to ensure the configuration steps are up to date. > **📖 Reminder**: Fork users should first **[check the latest official documentation](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)** to ensure the configuration steps are up to date.
### ⚠️ GitHub Actions Usage Instructions
**v4.0.0 Important Change**: Introduced "Activity Detection" mechanism—GitHub Actions now requires periodic check-in to maintain operation.
#### 🔄 Check-In Renewal Mechanism
- **Running Cycle**: Valid for **7 days**—service will automatically suspend when countdown ends.
- **Renewal Method**: Manually trigger the "Check In" workflow on the Actions page to reset the 7-day validity period.
- **Operation Path**: `Actions``Check In``Run workflow`
- **Design Philosophy**:
- If you forget for 7 days, maybe you don't really need it. Letting it stop is a digital detox, freeing you from the constant impact.
- GitHub Actions is a valuable public computing resource. The check-in mechanism aims to prevent wasted computing cycles, ensuring resources are allocated to truly active users who need them. Thank you for your understanding and support.
#### 📦 Data Storage (Required Configuration)
In GitHub Actions environment, data is stored in **Remote Cloud Storage** (supports S3-compatible protocols, Cloudflare R2 recommended for free tier), keeping your repository clean (see **Required Configuration: Remote Cloud Storage** below).
#### 🚀 Recommended: Docker Deployment
For long-term stable operation, we recommend [Docker Deployment](#6-docker-deployment), with data stored locally and no check-in required—though it does require purchasing a cloud server.
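For reference, this is roughly the arithmetic the "Check Expiration" step in `crawler.yml` performs (the shipped workflow does it in shell with `gh api`). The sketch below is an illustrative Python version, assuming a token with Actions read access; the helper name is made up:

```python
# Sketch: measure the age of the current trial cycle from the GitHub REST API.
# Mirrors the "Check Expiration" step in crawler.yml; names are illustrative.
import requests
from datetime import datetime, timezone

def days_since_oldest_run(repo: str, token: str, workflow: str = "crawler.yml") -> float:
    url = f"https://api.github.com/repos/{repo}/actions/workflows/{workflow}/runs"
    headers = {"Authorization": f"Bearer {token}", "Accept": "application/vnd.github+json"}
    total = requests.get(url, headers=headers, params={"per_page": 1}).json()["total_count"]
    if total == 0:
        return 0.0  # no history yet, nothing to expire
    last_page = (total + 99) // 100
    runs = requests.get(url, headers=headers, params={"per_page": 100, "page": last_page}).json()
    oldest = runs["workflow_runs"][-1]["created_at"]               # e.g. "2025-12-13T05:44:35Z"
    started = datetime.fromisoformat(oldest.replace("Z", "+00:00"))
    return (datetime.now(timezone.utc) - started).total_seconds() / 86400
```

Because the age is measured from the oldest retained run, the "Check In" workflow (which deletes all stored runs) effectively restarts the 7-day countdown.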
<br>
> 🎉 **Now Supported: Multi-Cloud Storage Options**
>
> This project now supports S3-compatible protocols. You can choose:
> - **Cloudflare R2** (Recommended, generous free tier)
> - Other S3-compatible storage services
>
> Simply configure the corresponding `S3_ENDPOINT_URL`, `S3_BUCKET_NAME` and other environment variables to switch.
---
1. **Fork this project** to your GitHub account 1. **Fork this project** to your GitHub account
- Click the "Fork" button at the top right of this page - Click the "Fork" button at the top right of this page
2. **Setup GitHub Secrets (Choose your needed platforms)**: 2. **Setup GitHub Secrets (Required + Optional Platforms)**:
In your forked repo, go to `Settings` > `Secrets and variables` > `Actions` > `New repository secret` In your forked repo, go to `Settings` > `Secrets and variables` > `Actions` > `New repository secret`
@ -862,6 +934,35 @@ frequency_words.txt file added **required word** feature, using + sign
<br> <br>
<details>
<summary>⚠️ <strong>Required Configuration: Remote Cloud Storage</strong> (Required for GitHub Actions Environment, Cloudflare R2 Recommended)</summary>
<br>
**GitHub Secret Configuration (⚠️ All 4 configuration items below are required):**
| Name | Secret (Value) Description |
|------|----------------------------|
| `S3_BUCKET_NAME` | Bucket name (e.g., `trendradar-data`) |
| `S3_ACCESS_KEY_ID` | Access key ID |
| `S3_SECRET_ACCESS_KEY` | Access key |
| `S3_ENDPOINT_URL` | S3 API endpoint (e.g., R2: `https://<account-id>.r2.cloudflarestorage.com`) |
<br>
**How to Get Credentials (Using Cloudflare R2 as Example):**
1. Visit [Cloudflare Dashboard](https://dash.cloudflare.com/) and log in
2. Select `R2` in left menu → Click `Create Bucket` → Enter name (e.g., `trendradar-data`)
3. Click `Manage R2 API Tokens` at top right → `Create API Token`
4. Select `Object Read & Write` permission → After creation, it will display `Access Key ID` and `Secret Access Key`
5. Endpoint URL can be found in bucket details page (format: `https://<account-id>.r2.cloudflarestorage.com`)
**Notes**:
- R2 free tier: 10GB storage + 1 million reads per month, sufficient for this project
- Activation requires binding a payment method (identity verification only, no charges)
- Data stored in cloud, keeps GitHub repository clean
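If you want to confirm the four secrets before handing them to Actions, a minimal local check is enough. A sketch assuming `boto3` is installed and the same variable names are exported in your shell:

```python
# Sketch: verify the S3/R2 credentials that the workflow will use.
import os
import boto3

s3 = boto3.client(
    "s3",
    endpoint_url=os.environ["S3_ENDPOINT_URL"],         # e.g. https://<account-id>.r2.cloudflarestorage.com
    aws_access_key_id=os.environ["S3_ACCESS_KEY_ID"],
    aws_secret_access_key=os.environ["S3_SECRET_ACCESS_KEY"],
)
s3.head_bucket(Bucket=os.environ["S3_BUCKET_NAME"])     # raises ClientError if the bucket or keys are wrong
print("credentials OK")
```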
</details>
<details> <details>
<summary> <strong>👉 Click to expand: WeWork Bot</strong> (Simplest and fastest configuration)</summary> <summary> <strong>👉 Click to expand: WeWork Bot</strong> (Simplest and fastest configuration)</summary>
@ -2041,7 +2142,7 @@ TrendRadar provides two independent Docker images, deploy according to your need
# Download docker compose config # Download docker compose config
wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/.env -P docker/ wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/.env -P docker/
wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker compose.yml -P docker/ wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker-compose.yml -P docker/
``` ```
> 💡 **Note**: Key directory structure required for Docker deployment: > 💡 **Note**: Key directory structure required for Docker deployment:
@ -2052,7 +2153,7 @@ current directory/
│ └── frequency_words.txt │ └── frequency_words.txt
└── docker/ └── docker/
├── .env ├── .env
└── docker compose.yml └── docker-compose.yml
``` ```
2. **Config File Description**: 2. **Config File Description**:
@ -2146,7 +2247,7 @@ vim config/frequency_words.txt
# Use build version docker compose # Use build version docker compose
cd docker cd docker
cp docker compose-build.yml docker compose.yml cp docker-compose-build.yml docker-compose.yml
``` ```
**Build and Start Services**: **Build and Start Services**:
@ -2232,7 +2333,7 @@ docker rm trend-radar
> 💡 **Web Server Notes**: > 💡 **Web Server Notes**:
> - After starting, access latest report at `http://localhost:8080` > - After starting, access latest report at `http://localhost:8080`
> - Access historical reports via directory navigation (e.g., `http://localhost:8080/2025年xx月xx日/`) > - Access historical reports via directory navigation (e.g., `http://localhost:8080/2025-xx-xx/`)
> - Port can be configured in `.env` file with `WEBSERVER_PORT` parameter > - Port can be configured in `.env` file with `WEBSERVER_PORT` parameter
> - Auto-start: Set `ENABLE_WEBSERVER=true` in `.env` > - Auto-start: Set `ENABLE_WEBSERVER=true` in `.env`
> - Security: Static files only, limited to output directory, localhost binding only > - Security: Static files only, limited to output directory, localhost binding only
@ -2249,7 +2350,7 @@ TrendRadar generates daily summary HTML reports to two locations simultaneously:
|--------------|---------------|----------| |--------------|---------------|----------|
| `output/index.html` | Direct host access | **Docker Deployment** (via Volume mount, visible on host) | | `output/index.html` | Direct host access | **Docker Deployment** (via Volume mount, visible on host) |
| `index.html` | Root directory access | **GitHub Pages** (repository root, auto-detected by Pages) | | `index.html` | Root directory access | **GitHub Pages** (repository root, auto-detected by Pages) |
| `output/YYYY年MM月DD日/html/当日汇总.html` | Historical reports | All environments (archived by date) | | `output/YYYY-MM-DD/html/当日汇总.html` | Historical reports | All environments (archived by date) |
**Local Access Examples**: **Local Access Examples**:
```bash ```bash
@ -2258,8 +2359,8 @@ TrendRadar generates daily summary HTML reports to two locations simultaneously:
docker exec -it trend-radar python manage.py start_webserver docker exec -it trend-radar python manage.py start_webserver
# 2. Access in browser # 2. Access in browser
http://localhost:8080 # Access latest report (default index.html) http://localhost:8080 # Access latest report (default index.html)
http://localhost:8080/2025年xx月xx日/ # Access reports for specific date http://localhost:8080/2025-xx-xx/ # Access reports for specific date
http://localhost:8080/2025年xx月xx日/html/ # Browse all HTML files for that date http://localhost:8080/2025-xx-xx/html/ # Browse all HTML files for that date
# Method 2: Direct file access (local environment) # Method 2: Direct file access (local environment)
open ./output/index.html # macOS open ./output/index.html # macOS
@ -2267,7 +2368,7 @@ start ./output/index.html # Windows
xdg-open ./output/index.html # Linux xdg-open ./output/index.html # Linux
# Method 3: Access historical archives # Method 3: Access historical archives
open ./output/2025年xx月xx日/html/当日汇总.html open ./output/2025-xx-xx/html/当日汇总.html
``` ```
**Why two index.html files?** **Why two index.html files?**
@ -2324,10 +2425,20 @@ flowchart TB
Use docker compose to start both news push and MCP services: Use docker compose to start both news push and MCP services:
```bash ```bash
# Download latest docker compose.yml (includes MCP service config) # Method 1: Clone project (Recommended)
wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker compose.yml git clone https://github.com/sansan0/TrendRadar.git
cd TrendRadar/docker
docker compose up -d
# Start all services # Method 2: Download docker-compose.yml separately
mkdir trendradar && cd trendradar
wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker-compose.yml
wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/.env
mkdir -p config output
# Download config files
wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/config/config.yaml -P config/
wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/config/frequency_words.txt -P config/
# Modify volume paths in docker-compose.yml: ../config -> ./config, ../output -> ./output
docker compose up -d docker compose up -d
# Check running status # Check running status
@ -2337,18 +2448,29 @@ docker ps | grep trend-radar
**Start MCP Service Separately**: **Start MCP Service Separately**:
```bash ```bash
# Linux/Mac
docker run -d --name trend-radar-mcp \ docker run -d --name trend-radar-mcp \
-p 127.0.0.1:3333:3333 \ -p 127.0.0.1:3333:3333 \
-v ./config:/app/config:ro \ -v $(pwd)/config:/app/config:ro \
-v ./output:/app/output:ro \ -v $(pwd)/output:/app/output:ro \
-e TZ=Asia/Shanghai \ -e TZ=Asia/Shanghai \
wantcat/trendradar-mcp:latest wantcat/trendradar-mcp:latest
# Windows PowerShell
docker run -d --name trend-radar-mcp `
-p 127.0.0.1:3333:3333 `
-v ${PWD}/config:/app/config:ro `
-v ${PWD}/output:/app/output:ro `
-e TZ=Asia/Shanghai `
wantcat/trendradar-mcp:latest
``` ```
> ⚠️ **Note**: Ensure `config/` and `output/` folders exist in current directory with config files and news data before running.
**Verify Service**: **Verify Service**:
```bash ```bash
# Check if MCP service is running properly # Check MCP service health
curl http://127.0.0.1:3333/mcp curl http://127.0.0.1:3333/mcp
# View MCP service logs # View MCP service logs
@ -2357,14 +2479,20 @@ docker logs -f trend-radar-mcp
**Configure in AI Clients**: **Configure in AI Clients**:
After MCP service starts, configure in Claude Desktop, Cherry Studio, Cursor, etc.: After MCP service starts, configure based on your client:
**Cherry Studio** (Recommended, GUI config):
- Settings → MCP Server → Add
- Type: `streamableHttp`
- URL: `http://127.0.0.1:3333/mcp`
**Claude Desktop / Cline** (JSON config):
```json ```json
{ {
"mcpServers": { "mcpServers": {
"trendradar": { "trendradar": {
"url": "http://127.0.0.1:3333/mcp", "url": "http://127.0.0.1:3333/mcp",
"description": "TrendRadar News Trending Analysis" "type": "streamableHttp"
} }
} }
} }
@ -2452,7 +2580,6 @@ notification:
start: "20:00" # Start time (Beijing time) start: "20:00" # Start time (Beijing time)
end: "22:00" # End time (Beijing time) end: "22:00" # End time (Beijing time)
once_per_day: true # Push only once per day once_per_day: true # Push only once per day
push_record_retention_days: 7 # Push record retention days
``` ```
#### Configuration Details #### Configuration Details
@ -2463,7 +2590,6 @@ notification:
| `time_range.start` | string | `"20:00"` | Push window start time (Beijing time, HH:MM format) | | `time_range.start` | string | `"20:00"` | Push window start time (Beijing time, HH:MM format) |
| `time_range.end` | string | `"22:00"` | Push window end time (Beijing time, HH:MM format) | | `time_range.end` | string | `"22:00"` | Push window end time (Beijing time, HH:MM format) |
| `once_per_day` | bool | `true` | `true`=push only once per day within window, `false`=push every execution within window | | `once_per_day` | bool | `true` | `true`=push only once per day within window, `false`=push every execution within window |
| `push_record_retention_days` | int | `7` | Push record retention days (used to determine if already pushed) |
#### Use Cases #### Use Cases
@ -2487,7 +2613,6 @@ PUSH_WINDOW_ENABLED=true
PUSH_WINDOW_START=09:00 PUSH_WINDOW_START=09:00
PUSH_WINDOW_END=18:00 PUSH_WINDOW_END=18:00
PUSH_WINDOW_ONCE_PER_DAY=false PUSH_WINDOW_ONCE_PER_DAY=false
PUSH_WINDOW_RETENTION_DAYS=7
``` ```
#### Complete Configuration Examples #### Complete Configuration Examples
@ -2502,7 +2627,6 @@ notification:
start: "20:00" start: "20:00"
end: "22:00" end: "22:00"
once_per_day: true once_per_day: true
push_record_retention_days: 7
``` ```
**Scenario: Push every hour during working hours** **Scenario: Push every hour during working hours**
@ -2515,7 +2639,6 @@ notification:
start: "09:00" start: "09:00"
end: "18:00" end: "18:00"
once_per_day: false once_per_day: false
push_record_retention_days: 7
``` ```
</details> </details>
@ -2811,6 +2934,207 @@ notification:
<br> <br>
### 11. Storage Configuration (v4.0.0 New)
<details>
<summary>👉 Click to expand: <strong>Storage Configuration Guide</strong></summary>
<br>
#### Storage Backend Selection
TrendRadar v4.0.0 introduces **multi-backend storage architecture**, supporting automatic backend selection or manual specification:
| Configuration Value | Description | Applicable Scenarios |
|---------------------|-------------|---------------------|
| `auto` (default) | Auto-select backend: GitHub Actions→R2, other environments→Local | Most users (recommended) |
| `local` | Force use of local SQLite | Docker/local deployment |
| `r2` | Force use of Cloudflare R2 | Cloud storage required |
**Configuration Location**:
- GitHub Actions: Set `STORAGE_BACKEND` environment variable in GitHub Secrets
- Docker: Configure `STORAGE_BACKEND=local` in `.env` file
- Local: Add `STORAGE_BACKEND` in environment variables or use auto mode
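As a rough illustration of what `auto` means in practice (the function below is illustrative, not the project's internal API): GitHub Actions runners set `GITHUB_ACTIONS=true`, so the remote backend is picked there and local SQLite everywhere else.

```python
# Sketch of STORAGE_BACKEND=auto resolution; illustrative only.
import os

def resolve_backend() -> str:
    explicit = os.environ.get("STORAGE_BACKEND", "auto").lower()
    if explicit in ("local", "r2"):
        return explicit                                   # manual override wins
    # auto: GitHub Actions -> remote (R2), anything else -> local SQLite
    return "r2" if os.environ.get("GITHUB_ACTIONS") == "true" else "local"
```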
---
#### Database Structure Optimization (v4.0.0)
v4.0.0 made significant optimizations to database structure, removing redundant fields and improving data normalization:
##### 1. Removed Redundant Fields
Removed the following redundant fields from the `news` table:
| Field Name | Removal Reason | Alternative |
|------------|----------------|------------|
| `source_name` | Duplicate with platform name | Get via `platforms` table JOIN query |
| `crawl_date` | Duplicate with file path date | Infer from file path timestamp |
**Migration Notes**: Old databases are incompatible; see the [Breaking Changes](#breaking-changes-v400) section
##### 2. New Platforms Table
Added `platforms` table for unified management of platform information:
```sql
CREATE TABLE IF NOT EXISTS platforms (
id TEXT PRIMARY KEY, -- Platform ID (immutable, e.g., 'zhihu', 'weibo')
name TEXT NOT NULL, -- Platform display name (mutable, e.g., 'Zhihu', 'Weibo')
enabled INTEGER DEFAULT 1 -- Whether enabled (1=enabled, 0=disabled)
);
```
**Design Advantages**:
- `id` field is immutable, maintains data consistency
- `name` field is mutable, supports internationalization and customization
- Historical data remains valid when modifying platform names
##### 3. Crawl Source Status Normalization
Replaced the original `successful_sources` field (a comma-separated string) with a normalized `crawl_source_status` table:
```sql
CREATE TABLE IF NOT EXISTS crawl_source_status (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_path TEXT NOT NULL, -- File path (e.g., 'output/2025-12-09/news.db')
platform_id TEXT NOT NULL, -- Platform ID (foreign key to platforms.id)
success INTEGER NOT NULL, -- Whether crawl succeeded (1=success, 0=failed)
crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (platform_id) REFERENCES platforms(id)
);
```
**Design Advantages**:
- Supports efficient SQL queries (e.g., calculate success rate by platform)
- Easy statistics and analysis (no string splitting required)
- Normalized structure, avoids data redundancy
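For example, the "success rate by platform" query mentioned above becomes a single statement against a day's `news.db`; a sketch using the schema shown here (the file path follows the format described in the next subsection):

```python
# Sketch: crawl success rate per platform from the normalized status table.
import sqlite3

conn = sqlite3.connect("output/2025-12-09/news.db")
rows = conn.execute(
    """
    SELECT p.name,
           ROUND(100.0 * AVG(s.success), 1) AS success_rate_pct,
           COUNT(*)                         AS attempts
    FROM crawl_source_status AS s
    JOIN platforms           AS p ON p.id = s.platform_id
    GROUP BY p.id
    ORDER BY success_rate_pct DESC
    """
).fetchall()
for name, rate, attempts in rows:
    print(f"{name}: {rate}% over {attempts} crawls")
conn.close()
```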
##### 4. File Path Format Standardization
**Old Format**: `output/2025年12月09日/news_14-30.txt`
**New Format**: `output/2025-12-09/news.db`
**Changes**:
- Date format: Chinese format → ISO 8601 standard format
- Filename: Multiple time-stamped TXT files → single SQLite database file
- Extension: `.txt``.db`
**Advantages**:
- Cross-platform compatibility (avoids Chinese path issues)
- Easier programmatic parsing
- International standard, better maintainability
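Because the new layout is plain ISO 8601, paths round-trip through the standard library with no locale handling; a minimal sketch:

```python
# Sketch: build and parse the v4.0.0 data path.
from datetime import date
from pathlib import Path

db_path = Path("output") / date.today().isoformat() / "news.db"   # e.g. output/2025-12-09/news.db
day = date.fromisoformat(db_path.parent.name)                      # parses back without any Chinese-date handling
```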
---
#### Remote Cloud Storage Configuration
When using remote cloud storage (required for GitHub Actions environment), configure the following environment variables:
| Environment Variable | Description | Required | Example Value |
|----------------------|-------------|----------|--------------|
| `S3_BUCKET_NAME` | Bucket name | ✅ Yes | `trendradar-data` |
| `S3_ACCESS_KEY_ID` | Access key ID | ✅ Yes | `abc123...` |
| `S3_SECRET_ACCESS_KEY` | Access key | ✅ Yes | `xyz789...` |
| `S3_ENDPOINT_URL` | S3 API endpoint | ✅ Yes | `https://<account-id>.r2.cloudflarestorage.com` |
| `S3_REGION` | Region (optional) | ❌ No | `auto` |
**Configuration Method**:
- GitHub Actions: Configure in GitHub Secrets (see [Quick Start - Remote Storage Configuration](#2-setup-github-secrets-required--optional-platforms))
- Docker/Local: Configure in `.env` file (remote storage is optional)
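With those variables in place, moving a day's database between the runner and the bucket is a couple of calls. A sketch assuming `boto3`; the `news/<date>.db` object key matches the cleanup example below, the rest is illustrative:

```python
# Sketch: push/pull one day's SQLite file to the configured bucket.
import os
import boto3

s3 = boto3.client(
    "s3",
    endpoint_url=os.environ["S3_ENDPOINT_URL"],
    aws_access_key_id=os.environ["S3_ACCESS_KEY_ID"],
    aws_secret_access_key=os.environ["S3_SECRET_ACCESS_KEY"],
)
bucket = os.environ["S3_BUCKET_NAME"]

s3.upload_file("output/2025-12-09/news.db", bucket, "news/2025-12-09.db")    # after a crawl
s3.download_file(bucket, "news/2025-12-09.db", "output/2025-12-09/news.db")  # before local analysis
```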
---
#### Data Cleanup Strategy
v4.0.0 added an automatic data cleanup feature, supporting scheduled cleanup of old data:
**Configuration Items**: `LOCAL_RETENTION_DAYS` and `REMOTE_RETENTION_DAYS`
| Configuration Value | Description |
|---------------------|-------------|
| `0` (default) | Disable cleanup, keep all data |
| Positive integer (e.g., `30`) | Only keep recent N days of data, auto-delete old data |
**Configuration Method**:
```bash
# GitHub Actions: Configure in GitHub Secrets
LOCAL_RETENTION_DAYS=30
REMOTE_RETENTION_DAYS=30
# Docker: Configure in .env file
LOCAL_RETENTION_DAYS=30
REMOTE_RETENTION_DAYS=30
# Local: Add to environment variables
export LOCAL_RETENTION_DAYS=30
```
**Cleanup Rules**:
- Cleanup executes during each crawl task
- Local: Deletes `output/YYYY-MM-DD/` directories older than N days
- Remote: Deletes cloud objects older than N days (e.g., `news/2025-11-10.db`)
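The local half of that rule boils down to deleting day-directories older than the cutoff; a sketch (illustrative, not the project's internal cleanup code; the remote half would list and delete `news/<date>.db` objects the same way):

```python
# Sketch: delete local output/YYYY-MM-DD/ directories older than N days.
import os
import shutil
from datetime import date, timedelta
from pathlib import Path

def cleanup_local(retention_days: int, root: Path = Path("output")) -> None:
    if retention_days <= 0 or not root.exists():
        return                                            # 0 = keep everything
    cutoff = date.today() - timedelta(days=retention_days)
    for day_dir in root.iterdir():
        try:
            day = date.fromisoformat(day_dir.name)        # e.g. output/2025-11-10/
        except ValueError:
            continue                                      # skip index.html and friends
        if day < cutoff and day_dir.is_dir():
            shutil.rmtree(day_dir)

cleanup_local(int(os.environ.get("LOCAL_RETENTION_DAYS", "0")))
```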
---
#### Timezone Configuration
v4.0.0 added timezone configuration support, using IANA standard time zone names:
**Configuration Item**: `TIMEZONE`
| Configuration Value | Description | Example |
|---------------------|-------------|---------|
| Not set (default) | Use UTC+0 | - |
| IANA time zone name | Specify time zone | `Asia/Shanghai`, `America/New_York`, `Europe/London` |
**Configuration Method**:
```bash
# GitHub Actions: Configure in GitHub Secrets
TIMEZONE=Asia/Shanghai
# Docker: Configure in .env file
TIMEZONE=Asia/Shanghai
# Local: Add to environment variables
export TIMEZONE=Asia/Shanghai
```
**Common IANA Time Zones**:
- China: `Asia/Shanghai`
- United States East: `America/New_York`
- United States West: `America/Los_Angeles`
- United Kingdom: `Europe/London`
- Japan: `Asia/Tokyo`
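Since these are IANA names, the Python standard library resolves them directly; a small sketch of how the setting affects displayed times:

```python
# Sketch: render the current time in the configured IANA time zone.
import os
from datetime import datetime, timezone
from zoneinfo import ZoneInfo

tz_name = os.environ.get("TIMEZONE")                      # unset -> UTC+0, as documented above
tz = ZoneInfo(tz_name) if tz_name else timezone.utc
print(datetime.now(tz).strftime("%Y-%m-%d %H:%M %Z"))     # e.g. "2025-12-13 13:44 CST" for Asia/Shanghai
```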
---
#### Breaking Changes (v4.0.0)
**⚠️ Important Notice**: v4.0.0 made breaking changes to the database structure; **old databases are incompatible**
**Impact**:
- Cannot directly read v3.x version data
- Need to re-crawl data to build new database
- **No automatic migration tool provided**
**Recommendations**:
1. **Fresh Start**: Starting from scratch and letting new data accumulate is the recommended path
2. **Keep Historical Data**: If you need to preserve v3.x historical data, rename the old `output/` directory (e.g., to `output_v3_backup/`) before running the new version
**Data Format Comparison**:
| Item | v3.x | v4.0.0 |
|------|------|--------|
| File path format | `output/2025年12月09日/` | `output/2025-12-09/` |
| Data file | Multiple `news_HH-MM.txt` files | Single `news.db` file |
| Database fields | Contains `source_name`, `crawl_date` | Removed redundant fields |
| Platform management | No independent table | Added `platforms` table |
| Crawl status | Comma-separated string | Normalized `crawl_source_status` table |
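If you go the second route, parking the old data is a one-liner; the backup directory name below is just the example suggested above:

```python
# Sketch: move v3.x data aside before the first v4.0.0 run.
from pathlib import Path

old = Path("output")
if old.exists():
    old.rename("output_v3_backup")        # v4.0.0 recreates output/ on the next crawl
```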
</details>
<br>
## 🤖 AI Analysis ## 🤖 AI Analysis
TrendRadar v3.0.0 added **MCP (Model Context Protocol)** based AI analysis feature, allowing natural language conversations with news data for deep analysis. TrendRadar v3.0.0 added **MCP (Model Context Protocol)** based AI analysis feature, allowing natural language conversations with news data for deep analysis.


@ -450,7 +450,89 @@ AI: (date_range={"start": "2024-12-01", "end": "2024-12-31"})
--- ---
### Q14: How to parse natural language date expressions? (Recommended to use first) ## Storage Sync
### Q14: How to sync data from remote storage to local?
**You can ask like this:**
- "Sync last 7 days data from remote"
- "Pull data from remote storage to local"
- "Sync last 30 days of news data"
**Tool called:** `sync_from_remote`
**Use cases:**
- Crawler deployed in the cloud (e.g., GitHub Actions), data stored remotely (e.g., Cloudflare R2)
- MCP Server deployed locally, needs to pull data from remote for analysis
**Return information:**
- synced_files: Number of successfully synced files
- synced_dates: List of successfully synced dates
- skipped_dates: Skipped dates (already exist locally)
- failed_dates: Failed dates and error information
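Put together, a successful sync report carries roughly the following shape (an illustration built from the fields listed above, shown as a Python dict; exact formatting may differ):

```python
# Illustrative shape of a sync_from_remote result; field names from the list above.
result = {
    "synced_files": 5,
    "synced_dates": ["2025-12-09", "2025-12-10", "2025-12-11", "2025-12-12", "2025-12-13"],
    "skipped_dates": ["2025-12-08"],      # already present locally
    "failed_dates": [],                   # dates that could not be synced, with error details
}
```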
**Prerequisites:**
Need to configure remote storage in `config/config.yaml` or set environment variables:
- `S3_ENDPOINT_URL`: Service endpoint
- `S3_BUCKET_NAME`: Bucket name
- `S3_ACCESS_KEY_ID`: Access key ID
- `S3_SECRET_ACCESS_KEY`: Secret access key
---
### Q15: How to view storage status?
**You can ask like this:**
- "View current storage status"
- "What's the storage configuration"
- "How much data is stored locally"
- "Is remote storage configured"
**Tool called:** `get_storage_status`
**Return information:**
| Category | Information |
|----------|-------------|
| **Local Storage** | Data directory, total size, date count, date range |
| **Remote Storage** | Whether configured, endpoint URL, bucket name, date count |
| **Pull Config** | Whether auto-pull enabled, pull days |
---
### Q16: How to view available data dates?
**You can ask like this:**
- "What dates are available locally"
- "What dates are in remote storage"
- "Compare local and remote data dates"
- "Which dates only exist remotely"
**Tool called:** `list_available_dates`
**Three query modes:**
| Mode | Description | Example Question |
|------|-------------|------------------|
| **local** | View local only | "What dates are available locally" |
| **remote** | View remote only | "What dates are in remote" |
| **both** | Compare both (default) | "Compare local and remote data" |
**Return information (both mode):**
- only_local: Dates only existing locally
- only_remote: Dates only existing remotely (useful for deciding which dates to sync)
- both: Dates existing in both places
---
### Q17: How to parse natural language date expressions? (Recommended to use first)
**You can ask like this:** **You can ask like this:**


@ -450,7 +450,89 @@ AIdate_range={"start": "2024-12-01", "end": "2024-12-31"}
--- ---
### Q14: 如何解析自然语言日期表达式?(推荐优先使用) ## 存储同步
### Q14: 如何从远程存储同步数据到本地?
**你可以这样问:**
- "从远程同步最近 7 天的数据"
- "拉取远程存储的数据到本地"
- "同步最近 30 天的新闻数据"
**调用的工具:** `sync_from_remote`
**使用场景:**
- 爬虫部署在云端(如 GitHub Actions数据存储到远程如 Cloudflare R2
- MCP Server 部署在本地,需要从远程拉取数据进行分析
**返回信息:**
- synced_files: 成功同步的文件数量
- synced_dates: 成功同步的日期列表
- skipped_dates: 跳过的日期(本地已存在)
- failed_dates: 失败的日期及错误信息
**前提条件:**
需要在 `config/config.yaml` 中配置远程存储或设置环境变量:
- `S3_ENDPOINT_URL`: 服务端点
- `S3_BUCKET_NAME`: 存储桶名称
- `S3_ACCESS_KEY_ID`: 访问密钥 ID
- `S3_SECRET_ACCESS_KEY`: 访问密钥
---
### Q15: 如何查看存储状态?
**你可以这样问:**
- "查看当前存储状态"
- "存储配置是什么"
- "本地有多少数据"
- "远程存储配置了吗"
**调用的工具:** `get_storage_status`
**返回信息:**
| 类别 | 信息 |
|------|------|
| **本地存储** | 数据目录、总大小、日期数量、日期范围 |
| **远程存储** | 是否配置、端点地址、存储桶名称、日期数量 |
| **拉取配置** | 是否启用自动拉取、拉取天数 |
---
### Q16: 如何查看可用的数据日期?
**你可以这样问:**
- "本地有哪些日期的数据"
- "远程存储有哪些日期"
- "对比本地和远程的数据日期"
- "哪些日期只在远程有"
**调用的工具:** `list_available_dates`
**三种查询模式:**
| 模式 | 说明 | 示例问法 |
|------|------|---------|
| **local** | 仅查看本地 | "本地有哪些日期" |
| **remote** | 仅查看远程 | "远程有哪些日期" |
| **both** | 对比两者(默认) | "对比本地和远程的数据" |
**返回信息both 模式):**
- only_local: 仅本地存在的日期
- only_remote: 仅远程存在的日期(可用于决定同步哪些日期)
- both: 两边都存在的日期
---
### Q17: 如何解析自然语言日期表达式?(推荐优先使用)
**你可以这样问:** **你可以这样问:**

394
README.md

@ -1,6 +1,6 @@
<div align="center" id="trendradar"> <div align="center" id="trendradar">
> **📢 公告:** 经与 GitHub 官方沟通,完成合规调整后将恢复"一键 Fork 部署",请关注 **v4.0.0** 版本的更新 > **📢 公告:** **v4.0.0** 版本已发布!包含存储架构重构、数据库优化、模块化改进等重大更新
<a href="https://github.com/sansan0/TrendRadar" title="TrendRadar"> <a href="https://github.com/sansan0/TrendRadar" title="TrendRadar">
<img src="/_image/banner.webp" alt="TrendRadar Banner" width="80%"> <img src="/_image/banner.webp" alt="TrendRadar Banner" width="80%">
@ -16,8 +16,8 @@
[![GitHub Stars](https://img.shields.io/github/stars/sansan0/TrendRadar?style=flat-square&logo=github&color=yellow)](https://github.com/sansan0/TrendRadar/stargazers) [![GitHub Stars](https://img.shields.io/github/stars/sansan0/TrendRadar?style=flat-square&logo=github&color=yellow)](https://github.com/sansan0/TrendRadar/stargazers)
[![GitHub Forks](https://img.shields.io/github/forks/sansan0/TrendRadar?style=flat-square&logo=github&color=blue)](https://github.com/sansan0/TrendRadar/network/members) [![GitHub Forks](https://img.shields.io/github/forks/sansan0/TrendRadar?style=flat-square&logo=github&color=blue)](https://github.com/sansan0/TrendRadar/network/members)
[![License](https://img.shields.io/badge/license-GPL--3.0-blue.svg?style=flat-square)](LICENSE) [![License](https://img.shields.io/badge/license-GPL--3.0-blue.svg?style=flat-square)](LICENSE)
[![Version](https://img.shields.io/badge/version-v3.5.0-blue.svg)](https://github.com/sansan0/TrendRadar) [![Version](https://img.shields.io/badge/version-v4.0.0-blue.svg)](https://github.com/sansan0/TrendRadar)
[![MCP](https://img.shields.io/badge/MCP-v1.0.3-green.svg)](https://github.com/sansan0/TrendRadar) [![MCP](https://img.shields.io/badge/MCP-v1.1.0-green.svg)](https://github.com/sansan0/TrendRadar)
[![企业微信通知](https://img.shields.io/badge/企业微信-通知-00D4AA?style=flat-square)](https://work.weixin.qq.com/) [![企业微信通知](https://img.shields.io/badge/企业微信-通知-00D4AA?style=flat-square)](https://work.weixin.qq.com/)
[![个人微信通知](https://img.shields.io/badge/个人微信-通知-00D4AA?style=flat-square)](https://weixin.qq.com/) [![个人微信通知](https://img.shields.io/badge/个人微信-通知-00D4AA?style=flat-square)](https://weixin.qq.com/)
@ -48,62 +48,61 @@
<br> <br>
<details> <details>
<summary>🚨 <strong>【必读】重要公告:本项目的正确部署姿势</strong></summary> <summary>🚨 <strong>【必读】重要公告:v4.0.0 部署方式与存储架构变更</strong></summary>
<br> <br>
> **⚠️ 2025年12月紧急通知** ### 🛠️ 请选择适合你的部署方式
>
> 由于 Fork 数量激增导致 GitHub 服务器压力过大,**GitHub Actions 及 GitHub Pages 部署目前已受限**。为确保顺利部署,请务必阅读以下说明。
### 1. ✅ 唯一推荐部署方式Docker #### 🅰️ 方案一Docker 部署(推荐 🔥)
**这是目前最稳定、不受 GitHub 限制的方案。** 数据存储在本地,不会因为 GitHub 策略调整而失效。 * **特点**:最稳定、最简单,数据存储在 **本地 SQLite**,完全自主可控。
* **适用**有自己的服务器、NAS 或长期运行的电脑。
* 👉 [跳转到 Docker 部署教程](#6-docker-部署) * 👉 [跳转到 Docker 部署教程](#6-docker-部署)
--- ---
### 2. 如果你本打算 Fork 本项目... #### 🅱️ 方案二GitHub Actions 部署(已恢复 ✅)
为了减少对 GitHub 服务器的压力,**请千万不要直接点击 "Fork" 按钮!** * **特点**数据不再直接写入仓库Git Commit而是存储在 **远程云存储**(支持 S3 兼容协议Cloudflare R2、阿里云 OSS、腾讯云 COS 等)。
请务必使用 **"Use this template"** 功能来替代 Fork * **门槛****必须**配置一个 S3 兼容的对象存储服务(推荐免费的 Cloudflare R2
> **⚠️ 注意**:选择此方案,请务必执行以下两步配置:
#### 1. 🚀 推荐的开始方式Use this template
为了保持仓库整洁,避免继承冗余的历史记录,我**建议**你使用 Template 模式:
1. **点击**原仓库页面右上角的绿色 **[Use this template]** 按钮。
1. **点击**原仓库页面右上角的绿色的 **[Use this template]** 按钮。
2. **选择** "Create a new repository"。 2. **选择** "Create a new repository"。
**为什么要这样做?** > **💡 为什么要这样做?**
* **❌ Fork**:复制完整历史记录,大量 Fork 同时运行会触发 GitHub 风控。 > * **Use this template**:创建一个全新的、干净的仓库,没有历史包袱
* **✅ Use this template**:创建的是一个全新的独立仓库,没有历史包袱,对服务器更友好。 > * **Fork**:会保留完整的提交历史和关联关系,占用 GitHub 更多资源
--- #### 2. ☁️ 关于 GitHub Actions 必配的远程存储
### 3. 关于新版数据存储的说明 如果你选择 **方案二 (GitHub Actions)**,则必须配置一个 S3 兼容的对象存储服务。
新版将使用 **Cloudflare R2** 存储新闻数据,以保证持久化。 **支持的存储服务:**
- **Cloudflare R2**(推荐,免费额度充足)
- 其他 S3 兼容服务
**⚠️ 配置前置条件:** **⚠️ 以 Cloudflare R2 为例的配置前置条件:**
根据 Cloudflare 平台规则,开通 R2 需绑定支付方式。 根据 Cloudflare 平台规则,开通 R2 需绑定支付方式。
- **目的:** 仅作身份验证Verify Only不产生扣费。 * **目的**仅作身份验证Verify Only**不产生扣费**。
- **支付:** 支持信用卡或国区 PayPal。
- **用量:** R2 的免费额度足以覆盖本项目日常运行,无需付费。
--- * **支付**:支持双币信用卡或国区 PayPal。
### 4. 📅 后续计划与文档阅读说明 * **用量**R2 的免费额度10GB存储/月)足以覆盖本项目日常运行,无需担心付费。
> **后续计划:** 👉 **[点击查看详细配置教程](#-快速开始)**
> - 探索新方案:保留 Actions 用于抓取和推送,但不再将数据保存到仓库,改用外部存储。
**⚠️ 阅读注意:**
鉴于上述计划意味着 **Fork 部署模式未来可能会以新形式回归**,且当前全面修改文档工作量巨大,我们暂时保留了旧版描述。
**在当前阶段,若后续教程中仍出现 "Fork" 相关表述,请一律忽略或将其理解为 "Use this template"**。
👉 **[点击此处查看 TrendRadar 最新官方文档](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)**
</details> </details>
@ -335,10 +334,30 @@
- ⚠️ **配对配置**Telegram 和 ntfy 需要保证配对参数数量一致(如 token 和 chat_id 都是 2 个) - ⚠️ **配对配置**Telegram 和 ntfy 需要保证配对参数数量一致(如 token 和 chat_id 都是 2 个)
- ⚠️ **数量限制**:默认每个渠道最多 3 个账号,超出会被截断 - ⚠️ **数量限制**:默认每个渠道最多 3 个账号,超出会被截断
### **多端适配** ### **灵活存储架构**v4.0.0 重大更新)
- **GitHub Pages**自动生成精美网页报告PC/移动端适配
- **Docker部署**:支持多架构容器化运行 **多存储后端支持**
- **数据持久化**HTML/TXT多格式历史记录保存 - ☁️ **远程云存储**GitHub Actions 环境默认,支持 S3 兼容协议R2/OSS/COS 等),数据存储在云端,不污染仓库
- 💾 **本地 SQLite 数据库**Docker/本地环境默认,数据完全可控
- 🔄 **自动后端选择**:根据运行环境智能切换存储方式
**数据格式**
| 格式 | 用途 | 说明 |
|------|------|------|
| **SQLite** | 主存储 | 单文件数据库,查询快速,支持 MCP AI 分析 |
| **TXT** | 可选快照 | 可读文本格式,方便直接查看 |
| **HTML** | 报告展示 | 精美可视化页面PC/移动端适配 |
**数据管理**
- ✅ 自动清理过期数据(可配置保留天数)
- ✅ 时区配置支持(全球时区)
> 💡 详细说明见 [配置详解 - 存储配置](#9-存储配置)
### **多端部署**
- **GitHub Actions**:定时自动爬取 + 远程云存储(需签到续期)
- **Docker 部署**:支持多架构容器化运行,数据本地存储
- **本地运行**Windows/Mac/Linux 直接运行
### **AI 智能分析v3.0.0 新增)** ### **AI 智能分析v3.0.0 新增)**
@ -389,10 +408,34 @@ GitHub 一键 Fork 即可使用,无需编程基础。
>**升级说明** >**升级说明**
- **📌 查看最新更新****[原仓库更新日志](https://github.com/sansan0/TrendRadar?tab=readme-ov-file#-更新日志)** - **📌 查看最新更新****[原仓库更新日志](https://github.com/sansan0/TrendRadar?tab=readme-ov-file#-更新日志)**
- **提示**:不要通过 **Sync fork** 更新本项目,建议查看【历史更新】,明确具体的【升级方式】和【功能内容】 - **提示**:不要通过 **Sync fork** 更新本项目,建议查看【历史更新】,明确具体的【升级方式】和【功能内容】
- **小版本更新**:从 v2.x 升级到 v2.y用本项目的 `main.py` 代码替换你 fork 仓库中的对应文件
- **大版本升级**:从 v1.x 升级到 v2.y建议删除现有 fork 后重新 fork这样更省力且避免配置冲突 - **大版本升级**:从 v1.x 升级到 v2.y建议删除现有 fork 后重新 fork这样更省力且避免配置冲突
### 2025/12/13 - v4.0.0
**🎉 重大更新:全面重构存储和核心架构**
- **多存储后端支持**:引入全新的存储模块,支持本地 SQLite 和远程云存储S3 兼容协议,推荐免费的 Cloudflare R2适应 GitHub Actions、Docker 和本地环境。
- **数据库结构优化**:重构 SQLite 数据库表结构,提升数据效率和查询能力。
- **核心代码模块化**:将主程序逻辑拆分为 trendradar 包的多个模块,显著提升代码可维护性。
- **增强功能**:实现日期格式标准化、数据保留策略、时区配置支持、时间显示优化,并修复远程存储数据持久化问题,确保数据合并的准确性。
- **清理和兼容**:移除了大部分历史兼容代码,统一了数据存储和读取方式。
### 2025/12/13 - mcp-v1.1.0
**MCP 模块更新:**
- 适配 v4.0.0,同时也兼容 v3.x 的数据
- 新增存储同步工具:
- `sync_from_remote`: 从远程存储拉取数据到本地
- `get_storage_status`: 获取存储配置和状态
- `list_available_dates`: 列出本地/远程可用日期范围
<details>
<summary>👉 点击展开:<strong>历史更新</strong></summary>
### 2025/12/03 - v3.5.0 ### 2025/12/03 - v3.5.0
**🎉 核心功能增强** **🎉 核心功能增强**
@ -456,10 +499,6 @@ GitHub 一键 Fork 即可使用,无需编程基础。
- 工具总数从 13 个增加到 14 个 - 工具总数从 13 个增加到 14 个
<details>
<summary>👉 点击展开:<strong>历史更新</strong></summary>
### 2025/11/28 - v3.4.1 ### 2025/11/28 - v3.4.1
**🔧 格式优化** **🔧 格式优化**
@ -857,11 +896,44 @@ frequency_words.txt 文件增加了一个【必须词】功能,使用 + 号
> **📖 提醒**Fork 用户建议先 **[查看最新官方文档](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)**,确保配置步骤是最新的。 > **📖 提醒**Fork 用户建议先 **[查看最新官方文档](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)**,确保配置步骤是最新的。
### ⚠️ GitHub Actions 使用说明
**v4.0.0 重要变更**引入「活跃度检测」机制GitHub Actions 需定期签到以维持运行。
#### 🔄 签到续期机制
- **运行周期**:有效期为 **7 天**,倒计时结束后服务将自动挂起。
- **续期方式**:在 Actions 页面手动触发 "Check In" workflow即可重置 7 天有效期。
- **操作路径**`Actions` → `Check In``Run workflow`
- **设计理念**
- 如果 7 天都忘了签到,或许这些资讯对你来说并非刚需。适时的暂停,能帮你从信息流中抽离,给大脑留出喘息的空间。
- GitHub Actions 是宝贵的公共计算资源。引入签到机制旨在避免算力的无效空转,确保资源能分配给真正活跃且需要的用户。感谢你的理解与支持。
#### 📦 数据存储(必需配置)
GitHub Actions 环境下,数据存储在 **远程云存储**(支持 S3 兼容协议,推荐免费的 Cloudflare R2不会污染仓库见下方 **必需配置:远程云存储**
#### 🚀 推荐Docker 部署
如需长期稳定运行,建议使用 [Docker 部署](#6-docker-部署),数据存储在本地,无需签到,不过需要额外付费购买云服务器。
<br>
> 🎉 **已支持:多云存储方案**
>
> 本项目现已支持 S3 兼容协议,你可以选择:
> - **Cloudflare R2**(推荐,免费额度充足)
> - 其他 S3 兼容存储服务
>
> 只需配置对应的 `S3_ENDPOINT_URL`、`S3_BUCKET_NAME` 等环境变量即可切换。
---
1. **Fork 本项目**到你的 GitHub 账户 1. **Fork 本项目**到你的 GitHub 账户
- 点击本页面右上角的"Fork"按钮 - 点击本页面右上角的"Fork"按钮
2. **设置 GitHub Secrets选择你需要的平台**: 2. **设置 GitHub Secrets必需 + 可选平台)**:
在你 Fork 后的仓库中,进入 `Settings` > `Secrets and variables` > `Actions` > `New repository secret` 在你 Fork 后的仓库中,进入 `Settings` > `Secrets and variables` > `Actions` > `New repository secret`
@ -900,6 +972,53 @@ frequency_words.txt 文件增加了一个【必须词】功能,使用 + 号
<br> <br>
<details>
<summary>⚠️ <strong>必需配置:远程云存储</strong>GitHub Actions 环境必需,推荐 Cloudflare R2</summary>
<br>
**GitHub Secret 配置(⚠️ 以下 4 个配置项都是必需的):**
| Name名称 | Secret说明 |
|-------------|-----------------|
| `S3_BUCKET_NAME` | 存储桶名称(如 `trendradar-data` |
| `S3_ACCESS_KEY_ID` | 访问密钥 IDAccess Key ID |
| `S3_SECRET_ACCESS_KEY` | 访问密钥Secret Access Key |
| `S3_ENDPOINT_URL` | S3 API 端点(如 R2`https://<account-id>.r2.cloudflarestorage.com` |
<br>
**如何获取凭据(以 Cloudflare R2 为例):**
1. **进入 R2 概览**
- 登录 [Cloudflare Dashboard](https://dash.cloudflare.com/)。
- 在左侧侧边栏找到并点击 `R2对象存储`
<br>
2. **创建存储桶**
- 点击`概述`
- 点击右上角的 `创建存储桶` (Create bucket)。
- 输入名称(例如 `trendradar-data`),点击 `创建存储桶`
<br>
3. **创建 API 令牌**
- 回到 **概述**页面。
- 点击**右下角** `Account Details `找到并点击 `Manage` (Manage R2 API Tokens)。
- 同时你会看到 `S3 API``https://<account-id>.r2.cloudflarestorage.com`(这就是 S3_ENDPOINT_URL)
- 点击 `创建 Account APl 令牌`
- **⚠️ 关键设置**
- **令牌名称**:随意填写(如 `github-action-write`)。
- **权限**:选择 `管理员读和写`
- **指定存储桶**:为了安全,建议选择 `仅适用于指定存储桶` 并选中你的桶(如 `trendradar-data`)。
- 点击 `创建 API 令牌`**立即复制** 显示的 `Access Key ID``Secret Access Key`(只显示一次!)。
<br>
- **R2 免费额度**:每月 10GB 存储 + 100万次读取对本项目来说非常充足。
- **支付验证**:开通 R2 即使是免费额度Cloudflare 也要求绑定 PayPal 或信用卡进行身份验证(不会实际扣费,除非超过额度)。
</details>
<details> <details>
<summary>👉 点击展开:<strong>企业微信机器人</strong>(配置最简单最迅速)</summary> <summary>👉 点击展开:<strong>企业微信机器人</strong>(配置最简单最迅速)</summary>
@ -1489,10 +1608,11 @@ frequency_words.txt 文件增加了一个【必须词】功能,使用 + 号
**测试步骤** **测试步骤**
1. 进入你项目的 Actions 页面 1. 进入你项目的 Actions 页面
2. 找到 **"Hot News Crawler"** 点进去 2. 找到 **"Get Hot News"**(必须得是这个字)点进去,点击右侧的 **"Run workflow"** 按钮运行
- 如果看不到该字样,参照 [#109](https://github.com/sansan0/TrendRadar/issues/109) 解决 - 如果看不到该字样,参照 [#109](https://github.com/sansan0/TrendRadar/issues/109) 解决
3. 点击右侧的 **"Run workflow"** 按钮运行 3. 3 分钟左右,消息会推送到你配置的平台
4. 等待 1 分钟左右,消息会推送到你配置的平台
<br>
> ⏱️ **测试提示** > ⏱️ **测试提示**
> - 手动测试不要太频繁,避免触发 GitHub Actions 限制 > - 手动测试不要太频繁,避免触发 GitHub Actions 限制
@ -2069,7 +2189,7 @@ TrendRadar 提供两个独立的 Docker 镜像,可根据需求选择部署:
# 下载 docker compose 配置 # 下载 docker compose 配置
wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/.env -P docker/ wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/.env -P docker/
wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker compose.yml -P docker/ wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker-compose.yml -P docker/
``` ```
> 💡 **说明**Docker 部署需要的关键目录结构如下: > 💡 **说明**Docker 部署需要的关键目录结构如下:
@ -2080,7 +2200,7 @@ TrendRadar 提供两个独立的 Docker 镜像,可根据需求选择部署:
│ └── frequency_words.txt │ └── frequency_words.txt
└── docker/ └── docker/
├── .env ├── .env
└── docker compose.yml └── docker-compose.yml
``` ```
2. **配置文件说明**: 2. **配置文件说明**:
@ -2174,7 +2294,7 @@ vim config/frequency_words.txt
# 使用构建版本的 docker compose # 使用构建版本的 docker compose
cd docker cd docker
cp docker compose-build.yml docker compose.yml cp docker-compose-build.yml docker-compose.yml
``` ```
**构建并启动服务** **构建并启动服务**
@ -2260,7 +2380,7 @@ docker rm trend-radar
> 💡 **Web 服务器说明** > 💡 **Web 服务器说明**
> - 启动后可通过浏览器访问 `http://localhost:8080` 查看最新报告 > - 启动后可通过浏览器访问 `http://localhost:8080` 查看最新报告
> - 通过目录导航访问历史报告(如:`http://localhost:8080/2025年xx月xx日/` > - 通过目录导航访问历史报告(如:`http://localhost:8080/2025-xx-xx/`
> - 端口可在 `.env` 文件中配置 `WEBSERVER_PORT` 参数 > - 端口可在 `.env` 文件中配置 `WEBSERVER_PORT` 参数
> - 自动启动:在 `.env` 中设置 `ENABLE_WEBSERVER=true` > - 自动启动:在 `.env` 中设置 `ENABLE_WEBSERVER=true`
> - 安全提示:仅提供静态文件访问,限制在 output 目录,只绑定本地访问 > - 安全提示:仅提供静态文件访问,限制在 output 目录,只绑定本地访问
@ -2277,7 +2397,7 @@ TrendRadar 生成的当日汇总 HTML 报告会同时保存到两个位置:
|---------|---------|---------| |---------|---------|---------|
| `output/index.html` | 宿主机直接访问 | **Docker 部署**(通过 Volume 挂载,宿主机可见) | | `output/index.html` | 宿主机直接访问 | **Docker 部署**(通过 Volume 挂载,宿主机可见) |
| `index.html` | 根目录访问 | **GitHub Pages**仓库根目录Pages 自动识别) | | `index.html` | 根目录访问 | **GitHub Pages**仓库根目录Pages 自动识别) |
| `output/YYYY年MM月DD日/html/当日汇总.html` | 历史报告访问 | 所有环境(按日期归档) | | `output/YYYY-MM-DD/html/当日汇总.html` | 历史报告访问 | 所有环境(按日期归档) |
**本地访问示例** **本地访问示例**
```bash ```bash
@ -2286,8 +2406,8 @@ TrendRadar 生成的当日汇总 HTML 报告会同时保存到两个位置:
docker exec -it trend-radar python manage.py start_webserver docker exec -it trend-radar python manage.py start_webserver
# 2. 在浏览器访问 # 2. 在浏览器访问
http://localhost:8080 # 访问最新报告(默认 index.html http://localhost:8080 # 访问最新报告(默认 index.html
http://localhost:8080/2025年xx月xx日/ # 访问指定日期的报告 http://localhost:8080/2025-xx-xx/ # 访问指定日期的报告
http://localhost:8080/2025年xx月xx日/html/ # 浏览该日期下的所有 HTML 文件 http://localhost:8080/2025-xx-xx/html/ # 浏览该日期下的所有 HTML 文件
# 方式 2直接打开文件本地环境 # 方式 2直接打开文件本地环境
open ./output/index.html # macOS open ./output/index.html # macOS
@ -2295,7 +2415,7 @@ start ./output/index.html # Windows
xdg-open ./output/index.html # Linux xdg-open ./output/index.html # Linux
# 方式 3访问历史归档 # 方式 3访问历史归档
open ./output/2025年xx月xx日/html/当日汇总.html open ./output/2025-xx-xx/html/当日汇总.html
``` ```
**为什么有两个 index.html** **为什么有两个 index.html**
@ -2349,34 +2469,42 @@ flowchart TB
**快速启动** **快速启动**
使用 docker compose 同时启动新闻推送和 MCP 服务: 如果已按照 [方式一:使用 docker compose](#方式一使用-docker-compose推荐) 完成部署,只需启动 MCP 服务:
```bash ```bash
# 下载最新的 docker compose.yml已包含 MCP 服务配置) cd TrendRadar/docker
wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker compose.yml docker compose up -d trend-radar-mcp
# 启动所有服务
docker compose up -d
# 查看运行状态 # 查看运行状态
docker ps | grep trend-radar docker ps | grep trend-radar-mcp
``` ```
**单独启动 MCP 服务** **单独启动 MCP 服务**(不使用 docker compose
```bash ```bash
# Linux/Mac
docker run -d --name trend-radar-mcp \ docker run -d --name trend-radar-mcp \
-p 127.0.0.1:3333:3333 \ -p 127.0.0.1:3333:3333 \
-v ./config:/app/config:ro \ -v $(pwd)/config:/app/config:ro \
-v ./output:/app/output:ro \ -v $(pwd)/output:/app/output:ro \
-e TZ=Asia/Shanghai \ -e TZ=Asia/Shanghai \
wantcat/trendradar-mcp:latest wantcat/trendradar-mcp:latest
# Windows PowerShell
docker run -d --name trend-radar-mcp `
-p 127.0.0.1:3333:3333 `
-v ${PWD}/config:/app/config:ro `
-v ${PWD}/output:/app/output:ro `
-e TZ=Asia/Shanghai `
wantcat/trendradar-mcp:latest
``` ```
> ⚠️ **注意**:单独运行时,确保当前目录下有 `config/``output/` 文件夹,且包含配置文件和新闻数据。
**验证服务** **验证服务**
```bash ```bash
# 检查 MCP 服务是否正常运行 # 检查 MCP 服务健康状态
curl http://127.0.0.1:3333/mcp curl http://127.0.0.1:3333/mcp
# 查看 MCP 服务日志 # 查看 MCP 服务日志
@ -2385,14 +2513,20 @@ docker logs -f trend-radar-mcp
**在 AI 客户端中配置** **在 AI 客户端中配置**
MCP 服务启动后,在 Claude Desktop、Cherry Studio、Cursor 等客户端中配置: MCP 服务启动后,根据不同客户端进行配置:
**Cherry Studio**推荐GUI 配置):
- 设置 → MCP 服务器 → 添加
- 类型:`streamableHttp`
- URL`http://127.0.0.1:3333/mcp`
**Claude Desktop / Cline**JSON 配置):
```json ```json
{ {
"mcpServers": { "mcpServers": {
"trendradar": { "trendradar": {
"url": "http://127.0.0.1:3333/mcp", "url": "http://127.0.0.1:3333/mcp",
"description": "TrendRadar 新闻热点分析" "type": "streamableHttp"
} }
} }
} }
@ -2480,7 +2614,6 @@ notification:
start: "20:00" # 开始时间(北京时间) start: "20:00" # 开始时间(北京时间)
end: "22:00" # 结束时间(北京时间) end: "22:00" # 结束时间(北京时间)
once_per_day: true # 每天只推送一次 once_per_day: true # 每天只推送一次
push_record_retention_days: 7 # 推送记录保留天数
``` ```
#### 配置项详解 #### 配置项详解
@ -2491,7 +2624,6 @@ notification:
| `time_range.start` | string | `"20:00"` | 推送时间窗口开始时间北京时间HH:MM 格式) | | `time_range.start` | string | `"20:00"` | 推送时间窗口开始时间北京时间HH:MM 格式) |
| `time_range.end` | string | `"22:00"` | 推送时间窗口结束时间北京时间HH:MM 格式) | | `time_range.end` | string | `"22:00"` | 推送时间窗口结束时间北京时间HH:MM 格式) |
| `once_per_day` | bool | `true` | `true`=每天在窗口内只推送一次,`false`=窗口内每次执行都推送 | | `once_per_day` | bool | `true` | `true`=每天在窗口内只推送一次,`false`=窗口内每次执行都推送 |
| `push_record_retention_days` | int | `7` | 推送记录保留天数(用于判断是否已推送) |
#### 使用场景 #### 使用场景
@ -2515,7 +2647,6 @@ PUSH_WINDOW_ENABLED=true
PUSH_WINDOW_START=09:00 PUSH_WINDOW_START=09:00
PUSH_WINDOW_END=18:00 PUSH_WINDOW_END=18:00
PUSH_WINDOW_ONCE_PER_DAY=false PUSH_WINDOW_ONCE_PER_DAY=false
PUSH_WINDOW_RETENTION_DAYS=7
``` ```
#### 完整配置示例 #### 完整配置示例
@ -2530,7 +2661,6 @@ notification:
start: "20:00" start: "20:00"
end: "22:00" end: "22:00"
once_per_day: true once_per_day: true
push_record_retention_days: 7
``` ```
**场景:工作时间内每小时推送** **场景:工作时间内每小时推送**
@ -2543,7 +2673,6 @@ notification:
start: "09:00" start: "09:00"
end: "18:00" end: "18:00"
once_per_day: false once_per_day: false
push_record_retention_days: 7
``` ```
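
推送窗口的核心判断可以概括为:当前时间是否落在 `start` 与 `end` 之间,且在 `once_per_day` 开启时当天尚未推送过。下面是一个简化的判断示意(非项目源码,未处理跨天窗口等细节):

```python
# 示意:判断当前时间是否在推送窗口内
from datetime import datetime, time

def in_push_window(now: datetime, start: str = "20:00", end: str = "22:00") -> bool:
    """start/end 为 HH:MM 字符串,对应 time_range.start / time_range.end"""
    start_t = time(*map(int, start.split(":")))
    end_t = time(*map(int, end.split(":")))
    return start_t <= now.time() <= end_t  # 注意:此示意未处理跨越零点的窗口

# 配合 once_per_day:窗口内首次推送后记录状态,当天后续执行直接跳过
already_pushed_today = False  # 实际实现需要持久化该状态
if in_push_window(datetime.now()) and not already_pushed_today:
    print("在推送窗口内,且今天尚未推送 → 执行推送")
```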
</details> </details>
@ -2829,6 +2958,123 @@ notification:
</details> </details>
### 11. 存储配置
<details id="storage-config">
<summary>👉 点击展开:<strong>存储架构配置详解</strong></summary>
<br>
#### 存储后端选择
**配置位置**`config/config.yaml` 的 `storage` 部分
v4.0.0 版本重构了存储架构,支持多种存储后端:
```yaml
storage:
backend: auto # 存储后端auto自动选择/ local本地SQLite/ remote远程云存储
formats:
sqlite: true # 是否启用SQLite存储
txt: true # 是否生成TXT快照
html: true # 是否生成HTML报告
local:
data_dir: "output" # 本地存储目录
retention_days: 0 # 本地数据保留天数0表示永久保留
remote:
endpoint_url: "" # S3 API 端点
bucket_name: "" # 存储桶名称
access_key_id: "" # 访问密钥ID
secret_access_key: "" # 访问密钥
region: "" # 区域(可选)
retention_days: 0 # 远程数据保留天数0表示永久保留
pull:
enabled: false # 是否启用启动时从远程拉取数据
days: 7 # 拉取最近N天的数据
```
```

#### 后端选择策略
| backend 值 | 说明 | 适用场景 |
|-----------|------|---------|
| `auto` | **自动选择**(推荐) | 根据运行环境智能选择:<br>• GitHub Actions → Remote<br>• Docker/本地 → Local |
| `local` | 本地 SQLite 数据库 | Docker 部署、本地开发 |
| `remote` | 远程云存储S3 兼容,如 Cloudflare R2 | GitHub Actions、多机器同步 |
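
`auto` 模式的选择策略大致等价于下面几行 Python(示意代码,非项目源码;"是否已配置远程存储"这里用桶名等关键项是否齐全做简化判断):

```python
# 示意:auto 模式下的存储后端选择策略
import os

def resolve_backend(backend: str, remote_configured: bool) -> str:
    if backend in ("local", "remote"):
        return backend
    # auto:GitHub Actions 环境且配置了远程存储 → remote,否则 → local
    in_github_actions = os.environ.get("GITHUB_ACTIONS") == "true"
    return "remote" if (in_github_actions and remote_configured) else "local"

remote_ok = bool(os.environ.get("S3_BUCKET_NAME"))  # 简化判断,仅作演示
print(resolve_backend("auto", remote_configured=remote_ok))
```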
#### 远程云存储配置
**环境变量**(推荐方式):
```bash
# GitHub Actions / Docker 环境变量
STORAGE_BACKEND=remote # 或 auto
# 本地/远程数据保留天数0 表示永久保留)
LOCAL_RETENTION_DAYS=0
REMOTE_RETENTION_DAYS=0
# S3 兼容存储配置(以 Cloudflare R2 为例)
S3_BUCKET_NAME=your-bucket-name
S3_ACCESS_KEY_ID=your-access-key-id
S3_SECRET_ACCESS_KEY=your-secret-access-key
S3_ENDPOINT_URL=https://<account-id>.r2.cloudflarestorage.com
S3_REGION=auto
# 数据拉取配置(可选,从远程同步到本地)
PULL_ENABLED=false
PULL_DAYS=7
```
```

**获取凭据**:参见 [快速开始 - 远程存储配置](#-快速开始)
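
程序读取这些配置时,常见做法是把 `config.yaml` 与环境变量合并。下面的示意与仓库中 MCP 同步工具的写法一致(配置文件值优先、为空时回退到环境变量;主程序的具体优先级以源码为准):

```python
# 示意:合并 config.yaml 与环境变量中的远程存储配置
import os

def merge_remote_config(remote_cfg: dict) -> dict:
    return {
        "endpoint_url": remote_cfg.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL", ""),
        "bucket_name": remote_cfg.get("bucket_name") or os.environ.get("S3_BUCKET_NAME", ""),
        "access_key_id": remote_cfg.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID", ""),
        "secret_access_key": remote_cfg.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY", ""),
        "region": remote_cfg.get("region") or os.environ.get("S3_REGION", ""),
    }

print(merge_remote_config({"bucket_name": "trendradar-data"}))  # 桶名仅为示例
```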
#### 数据清理策略
**自动清理**:每次运行结束时检查并删除超过保留天数的数据。
```yaml
storage:
local:
retention_days: 30 # 本地保留最近30天数据
remote:
retention_days: 30 # 远程保留最近30天数据
```
**清理逻辑**
- 本地存储:删除过期日期的文件夹(如 `output/2025-11-10/`
- 远程存储:批量删除过期的云端对象(如 `news/2025-11-10.db`
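
以本地存储为例,清理动作可以概括为"遍历数据目录下的日期文件夹,删除早于保留期限的目录"。下面是一个示意实现(非项目源码,目录名按 v4.0.0 的 ISO 日期格式假设):

```python
# 示意:按 retention_days 清理本地过期日期目录(如 output/2025-11-10/)
import shutil
from datetime import datetime, timedelta
from pathlib import Path

def cleanup_local(data_dir: str = "output", retention_days: int = 30) -> None:
    if retention_days <= 0:  # 0 表示永久保留,不执行清理
        return
    cutoff = datetime.now() - timedelta(days=retention_days)  # 实际实现会使用配置时区的当前时间
    for folder in Path(data_dir).iterdir():
        if not folder.is_dir():
            continue
        try:
            folder_date = datetime.strptime(folder.name, "%Y-%m-%d")
        except ValueError:
            continue  # 非日期目录,跳过
        if folder_date < cutoff:
            shutil.rmtree(folder)
            print(f"已删除过期目录: {folder}")

cleanup_local("output", retention_days=30)
```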
#### 时区配置v4.0.0 新增)
**全球时区支持**:时区配置影响所有时间显示、推送窗口判断与按日期归档的数据存储,解决非中国时区用户推送时间不准的问题。
```yaml
app:
timezone: "Asia/Shanghai" # 默认中国时区
# 其他示例:
# timezone: "America/Los_Angeles" # 美西时间
# timezone: "Europe/London" # 英国时间
```
**支持所有 IANA 时区名称**[时区列表](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones)
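
程序在判断推送窗口、生成日期目录等场景时,会先把"当前时间"换算到配置的时区。下面用 pytz(项目依赖之一)给出一个示意,函数名仅为演示:

```python
# 示意:按配置的 IANA 时区获取"当前时间"
from datetime import datetime
import pytz

def get_configured_now(tz_name: str = "Asia/Shanghai") -> datetime:
    return datetime.now(pytz.timezone(tz_name))

now = get_configured_now("America/Los_Angeles")
print(now.strftime("%Y-%m-%d %H:%M:%S %Z"))  # 日期目录即取 %Y-%m-%d 部分
```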
#### 不兼容变更
⚠️ **v4.0.0 不兼容 v3.x 数据**
1. 数据库结构完全重构,无法读取旧数据
2. 文件路径格式变更:日期目录由 `YYYY年MM月DD日` 改为 ISO 格式 `YYYY-MM-DD`
**迁移建议**
- 从 v4.0.0 开始重新收集数据
- 旧数据如需保留,请手动将目录重命名为 ISO 日期格式(不推荐;如确需迁移,可参考下面的示意脚本)
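
如果确实想保留 v3.x 的历史目录,可以参考下面的批量重命名示意脚本,把 `YYYY年MM月DD日` 目录改名为 `YYYY-MM-DD`。注意这只是示意:即便改名,v4.0.0 也无法读取旧的数据库内容,操作前请自行备份:

```python
# 示意:把旧的中文日期目录重命名为 ISO 格式(仅处理目录名,不转换数据)
import re
from pathlib import Path

pattern = re.compile(r"^(\d{4})年(\d{2})月(\d{2})日$")
for folder in Path("output").iterdir():
    m = pattern.match(folder.name)
    if folder.is_dir() and m:
        new_name = f"{m.group(1)}-{m.group(2)}-{m.group(3)}"
        folder.rename(folder.parent / new_name)
        print(f"{folder.name} -> {new_name}")
```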
</details>
<br> <br>
## 🤖 AI 智能分析 ## 🤖 AI 智能分析
@ -2846,7 +3092,7 @@ AI 分析功能**不是**直接查询网络实时数据,而是分析你**本
#### 使用说明: #### 使用说明:
1. **项目自带测试数据**`output` 目录默认包含 **2025年11月1日11月15日** 的新闻数据,可用于快速体验 AI 功能 1. **项目自带测试数据**`output` 目录默认包含 **2025-11-012025-11-15** 的新闻数据,可用于快速体验 AI 功能
2. **查询限制** 2. **查询限制**
- ✅ 只能查询已有日期范围内的数据11月1-15日 - ✅ 只能查询已有日期范围内的数据11月1-15日

View File

@ -1,12 +1,60 @@
app: app:
version_check_url: "https://raw.githubusercontent.com/sansan0/TrendRadar/refs/heads/master/version" version_check_url: "https://raw.githubusercontent.com/sansan0/TrendRadar/refs/heads/master/version"
show_version_update: true # 控制显示版本更新提示,如果 false则不接受新版本提示 show_version_update: true # 控制显示版本更新提示,如果 false则不接受新版本提示
# 时区配置(影响所有时间显示、推送窗口判断、数据存储)
# 常用时区:
# - Asia/Shanghai (北京时间 UTC+8)
# - America/New_York (美东时间 UTC-5/-4)
# - Europe/London (伦敦时间 UTC+0/+1)
# 完整时区列表: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
timezone: "Asia/Shanghai"
# 存储配置
storage:
# 存储后端选择: local / remote / auto
# - local: 本地 SQLite + TXT/HTML 文件
# - remote: 远程云存储S3 兼容协议,支持 R2/OSS/COS 等)
# - auto: 自动选择GitHub Actions 环境且配置了远程存储则用 remote否则用 local
backend: "auto"
# 数据格式选项
formats:
sqlite: true # 主存储(必须启用)
txt: false # 是否生成 TXT 快照
html: false # 是否生成 HTML 报告
# 本地存储配置
local:
data_dir: "output" # 数据目录
retention_days: 0 # 本地数据保留天数0 = 不清理)
# 远程存储配置S3 兼容协议)
# 支持: Cloudflare R2, 阿里云 OSS, 腾讯云 COS, AWS S3, MinIO 等
# 建议将敏感信息配置在 GitHub Secrets 或环境变量中
remote:
# 数据保留天数0 = 不清理远程数据)
retention_days: 0
# S3 兼容配置
endpoint_url: "" # 服务端点(或环境变量 S3_ENDPOINT_URL
# Cloudflare R2: https://<account_id>.r2.cloudflarestorage.com
# 阿里云 OSS: https://oss-cn-hangzhou.aliyuncs.com
# 腾讯云 COS: https://cos.ap-guangzhou.myqcloud.com
bucket_name: "" # 存储桶名称(或环境变量 S3_BUCKET_NAME
access_key_id: "" # 访问密钥 ID或环境变量 S3_ACCESS_KEY_ID
secret_access_key: "" # 访问密钥(或环境变量 S3_SECRET_ACCESS_KEY
region: "" # 区域(可选,部分服务商需要,或环境变量 S3_REGION
# 数据拉取配置(从远程同步到本地)
# 用于 MCP Server 等场景爬虫存到远程MCP 拉取到本地分析
pull:
enabled: false # 是否启用启动时自动拉取
days: 7 # 拉取最近 N 天的数据0 = 不拉取)
crawler: crawler:
request_interval: 1000 # 请求间隔(毫秒) request_interval: 1000 # 请求间隔(毫秒)
enable_crawler: true # 是否启用爬取新闻功能,如果 false则直接停止程序 enable_crawler: true # 是否启用爬取新闻功能,如果 false则直接停止程序
use_proxy: false # 是否启用代理false 时为关闭 use_proxy: false # 是否启用代理false 时为关闭
default_proxy: "http://127.0.0.1:10086" default_proxy: "http://127.0.0.1:10801"
# 🔸 daily当日汇总模式 # 🔸 daily当日汇总模式
# • 推送时机:按时推送(默认每小时推送一次) # • 推送时机:按时推送(默认每小时推送一次)
@ -55,7 +103,6 @@ notification:
start: "20:00" # 推送时间窗口开始(北京时间) start: "20:00" # 推送时间窗口开始(北京时间)
end: "22:00" # 推送时间窗口结束(北京时间) end: "22:00" # 推送时间窗口结束(北京时间)
once_per_day: true # 每天在时间窗口内只推送一次,如果 false则窗口内每次执行都推送 once_per_day: true # 每天在时间窗口内只推送一次,如果 false则窗口内每次执行都推送
push_record_retention_days: 7 # 推送记录保留天数
# ⚠️⚠️⚠️ 重要安全警告 / IMPORTANT SECURITY WARNING ⚠️⚠️⚠️ # ⚠️⚠️⚠️ 重要安全警告 / IMPORTANT SECURITY WARNING ⚠️⚠️⚠️
# #

View File

@ -40,8 +40,6 @@ PUSH_WINDOW_START=
PUSH_WINDOW_END= PUSH_WINDOW_END=
# 每天只推送一次 (true/false) # 每天只推送一次 (true/false)
PUSH_WINDOW_ONCE_PER_DAY= PUSH_WINDOW_ONCE_PER_DAY=
# 推送记录保留天数 (数字,如 7)
PUSH_WINDOW_RETENTION_DAYS=
# ============================================ # ============================================
# 多账号配置 # 多账号配置
@ -87,6 +85,39 @@ BARK_URL=
# Slack 推送配置(多账号用 ; 分隔) # Slack 推送配置(多账号用 ; 分隔)
SLACK_WEBHOOK_URL= SLACK_WEBHOOK_URL=
# ============================================
# 存储配置
# ============================================
# 存储后端选择 (local/remote/auto)
# - local: 本地 SQLite + TXT/HTML 文件
# - remote: 远程云存储S3 兼容协议)
# - auto: 自动选择GitHub Actions 用 remote其他用 local
STORAGE_BACKEND=auto
# 本地数据保留天数0 = 无限制,不清理历史数据)
LOCAL_RETENTION_DAYS=0
# 远程数据保留天数0 = 无限制,不清理历史数据)
REMOTE_RETENTION_DAYS=0
# 是否生成 TXT 快照 (true/false)
STORAGE_TXT_ENABLED=
# 是否生成 HTML 报告 (true/false)
STORAGE_HTML_ENABLED=
# 远程存储配置S3 兼容协议,支持 R2/OSS/COS/S3 等)
S3_ENDPOINT_URL=
S3_BUCKET_NAME=
S3_ACCESS_KEY_ID=
S3_SECRET_ACCESS_KEY=
S3_REGION=
# 数据拉取配置(从远程同步到本地)
PULL_ENABLED=false
PULL_DAYS=7
# ============================================ # ============================================
# 运行配置 # 运行配置
# ============================================ # ============================================

View File

@ -53,8 +53,8 @@ RUN set -ex && \
COPY requirements.txt . COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt RUN pip install --no-cache-dir -r requirements.txt
COPY main.py .
COPY docker/manage.py . COPY docker/manage.py .
COPY trendradar/ ./trendradar/
# 复制 entrypoint.sh 并强制转换为 LF 格式 # 复制 entrypoint.sh 并强制转换为 LF 格式
COPY docker/entrypoint.sh /entrypoint.sh.tmp COPY docker/entrypoint.sh /entrypoint.sh.tmp

View File

@ -8,6 +8,8 @@ RUN pip install --no-cache-dir -r requirements.txt
# 复制 MCP 服务器代码 # 复制 MCP 服务器代码
COPY mcp_server/ ./mcp_server/ COPY mcp_server/ ./mcp_server/
# 复制 trendradar 模块MCP 服务需要读取 SQLite 数据)
COPY trendradar/ ./trendradar/
# 创建必要目录 # 创建必要目录
RUN mkdir -p /app/config /app/output RUN mkdir -p /app/config /app/output

View File

@ -32,7 +32,6 @@ services:
- PUSH_WINDOW_START=${PUSH_WINDOW_START:-} - PUSH_WINDOW_START=${PUSH_WINDOW_START:-}
- PUSH_WINDOW_END=${PUSH_WINDOW_END:-} - PUSH_WINDOW_END=${PUSH_WINDOW_END:-}
- PUSH_WINDOW_ONCE_PER_DAY=${PUSH_WINDOW_ONCE_PER_DAY:-} - PUSH_WINDOW_ONCE_PER_DAY=${PUSH_WINDOW_ONCE_PER_DAY:-}
- PUSH_WINDOW_RETENTION_DAYS=${PUSH_WINDOW_RETENTION_DAYS:-}
# 通知渠道 # 通知渠道
- FEISHU_WEBHOOK_URL=${FEISHU_WEBHOOK_URL:-} - FEISHU_WEBHOOK_URL=${FEISHU_WEBHOOK_URL:-}
- TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN:-} - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN:-}
@ -54,6 +53,21 @@ services:
- BARK_URL=${BARK_URL:-} - BARK_URL=${BARK_URL:-}
# Slack配置 # Slack配置
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL:-} - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL:-}
# 存储配置
- STORAGE_BACKEND=${STORAGE_BACKEND:-auto}
- LOCAL_RETENTION_DAYS=${LOCAL_RETENTION_DAYS:-0}
- REMOTE_RETENTION_DAYS=${REMOTE_RETENTION_DAYS:-0}
- STORAGE_TXT_ENABLED=${STORAGE_TXT_ENABLED:-true}
- STORAGE_HTML_ENABLED=${STORAGE_HTML_ENABLED:-true}
# 远程存储配置S3 兼容协议)
- S3_ENDPOINT_URL=${S3_ENDPOINT_URL:-}
- S3_BUCKET_NAME=${S3_BUCKET_NAME:-}
- S3_ACCESS_KEY_ID=${S3_ACCESS_KEY_ID:-}
- S3_SECRET_ACCESS_KEY=${S3_SECRET_ACCESS_KEY:-}
- S3_REGION=${S3_REGION:-}
# 数据拉取配置
- PULL_ENABLED=${PULL_ENABLED:-false}
- PULL_DAYS=${PULL_DAYS:-7}
# 运行模式 # 运行模式
- CRON_SCHEDULE=${CRON_SCHEDULE:-*/5 * * * *} - CRON_SCHEDULE=${CRON_SCHEDULE:-*/5 * * * *}
- RUN_MODE=${RUN_MODE:-cron} - RUN_MODE=${RUN_MODE:-cron}
@ -71,7 +85,7 @@ services:
volumes: volumes:
- ../config:/app/config:ro - ../config:/app/config:ro
- ../output:/app/output:ro - ../output:/app/output
environment: environment:
- TZ=Asia/Shanghai - TZ=Asia/Shanghai

View File

@ -30,7 +30,6 @@ services:
- PUSH_WINDOW_START=${PUSH_WINDOW_START:-} - PUSH_WINDOW_START=${PUSH_WINDOW_START:-}
- PUSH_WINDOW_END=${PUSH_WINDOW_END:-} - PUSH_WINDOW_END=${PUSH_WINDOW_END:-}
- PUSH_WINDOW_ONCE_PER_DAY=${PUSH_WINDOW_ONCE_PER_DAY:-} - PUSH_WINDOW_ONCE_PER_DAY=${PUSH_WINDOW_ONCE_PER_DAY:-}
- PUSH_WINDOW_RETENTION_DAYS=${PUSH_WINDOW_RETENTION_DAYS:-}
# 通知渠道 # 通知渠道
- FEISHU_WEBHOOK_URL=${FEISHU_WEBHOOK_URL:-} - FEISHU_WEBHOOK_URL=${FEISHU_WEBHOOK_URL:-}
- TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN:-} - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN:-}
@ -52,6 +51,21 @@ services:
- BARK_URL=${BARK_URL:-} - BARK_URL=${BARK_URL:-}
# Slack配置 # Slack配置
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL:-} - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL:-}
# 存储配置
- STORAGE_BACKEND=${STORAGE_BACKEND:-auto}
- LOCAL_RETENTION_DAYS=${LOCAL_RETENTION_DAYS:-0}
- REMOTE_RETENTION_DAYS=${REMOTE_RETENTION_DAYS:-0}
- STORAGE_TXT_ENABLED=${STORAGE_TXT_ENABLED:-true}
- STORAGE_HTML_ENABLED=${STORAGE_HTML_ENABLED:-true}
# 远程存储配置S3 兼容协议)
- S3_ENDPOINT_URL=${S3_ENDPOINT_URL:-}
- S3_BUCKET_NAME=${S3_BUCKET_NAME:-}
- S3_ACCESS_KEY_ID=${S3_ACCESS_KEY_ID:-}
- S3_SECRET_ACCESS_KEY=${S3_SECRET_ACCESS_KEY:-}
- S3_REGION=${S3_REGION:-}
# 数据拉取配置
- PULL_ENABLED=${PULL_ENABLED:-false}
- PULL_DAYS=${PULL_DAYS:-7}
# 运行模式 # 运行模式
- CRON_SCHEDULE=${CRON_SCHEDULE:-*/5 * * * *} - CRON_SCHEDULE=${CRON_SCHEDULE:-*/5 * * * *}
- RUN_MODE=${RUN_MODE:-cron} - RUN_MODE=${RUN_MODE:-cron}
@ -67,7 +81,7 @@ services:
volumes: volumes:
- ../config:/app/config:ro - ../config:/app/config:ro
- ../output:/app/output:ro - ../output:/app/output
environment: environment:
- TZ=Asia/Shanghai - TZ=Asia/Shanghai

View File

@ -13,11 +13,11 @@ env >> /etc/environment
case "${RUN_MODE:-cron}" in case "${RUN_MODE:-cron}" in
"once") "once")
echo "🔄 单次执行" echo "🔄 单次执行"
exec /usr/local/bin/python main.py exec /usr/local/bin/python -m trendradar
;; ;;
"cron") "cron")
# 生成 crontab # 生成 crontab
echo "${CRON_SCHEDULE:-*/30 * * * *} cd /app && /usr/local/bin/python main.py" > /tmp/crontab echo "${CRON_SCHEDULE:-*/30 * * * *} cd /app && /usr/local/bin/python -m trendradar" > /tmp/crontab
echo "📅 生成的crontab内容:" echo "📅 生成的crontab内容:"
cat /tmp/crontab cat /tmp/crontab
@ -30,7 +30,7 @@ case "${RUN_MODE:-cron}" in
# 立即执行一次(如果配置了) # 立即执行一次(如果配置了)
if [ "${IMMEDIATE_RUN:-false}" = "true" ]; then if [ "${IMMEDIATE_RUN:-false}" = "true" ]; then
echo "▶️ 立即执行一次" echo "▶️ 立即执行一次"
/usr/local/bin/python main.py /usr/local/bin/python -m trendradar
fi fi
# 启动 Web 服务器(如果配置了) # 启动 Web 服务器(如果配置了)

View File

@ -33,7 +33,7 @@ def manual_run():
print("🔄 手动执行爬虫...") print("🔄 手动执行爬虫...")
try: try:
result = subprocess.run( result = subprocess.run(
["python", "main.py"], cwd="/app", capture_output=False, text=True ["python", "-m", "trendradar"], cwd="/app", capture_output=False, text=True
) )
if result.returncode == 0: if result.returncode == 0:
print("✅ 执行完成") print("✅ 执行完成")
@ -285,12 +285,24 @@ def show_config():
"TELEGRAM_CHAT_ID", "TELEGRAM_CHAT_ID",
"CONFIG_PATH", "CONFIG_PATH",
"FREQUENCY_WORDS_PATH", "FREQUENCY_WORDS_PATH",
# 存储配置
"STORAGE_BACKEND",
"LOCAL_RETENTION_DAYS",
"REMOTE_RETENTION_DAYS",
"STORAGE_TXT_ENABLED",
"STORAGE_HTML_ENABLED",
"S3_BUCKET_NAME",
"S3_ACCESS_KEY_ID",
"S3_ENDPOINT_URL",
"S3_REGION",
"PULL_ENABLED",
"PULL_DAYS",
] ]
for var in env_vars: for var in env_vars:
value = os.environ.get(var, "未设置") value = os.environ.get(var, "未设置")
# 隐藏敏感信息 # 隐藏敏感信息
if any(sensitive in var for sensitive in ["WEBHOOK", "TOKEN", "KEY"]): if any(sensitive in var for sensitive in ["WEBHOOK", "TOKEN", "KEY", "SECRET"]):
if value and value != "未设置": if value and value != "未设置":
masked_value = value[:10] + "***" if len(value) > 10 else "***" masked_value = value[:10] + "***" if len(value) > 10 else "***"
print(f" {var}: {masked_value}") print(f" {var}: {masked_value}")
@ -331,6 +343,17 @@ def show_files():
# 显示最近2天的文件 # 显示最近2天的文件
for date_dir in date_dirs[:2]: for date_dir in date_dirs[:2]:
print(f" 📅 {date_dir.name}:") print(f" 📅 {date_dir.name}:")
# 检查 SQLite 数据库文件
db_files = list(date_dir.glob("*.db"))
if db_files:
print(f" 💾 SQLite: {len(db_files)} 个数据库")
for db_file in db_files[:3]:
mtime = time.ctime(db_file.stat().st_mtime)
size_kb = db_file.stat().st_size // 1024
print(f" 📀 {db_file.name} ({size_kb}KB, {mtime.split()[3][:5]})")
# 检查子目录html, txt
for subdir in ["html", "txt"]: for subdir in ["html", "txt"]:
sub_path = date_dir / subdir sub_path = date_dir / subdir
if sub_path.exists(): if sub_path.exists():

5431
main.py

File diff suppressed because it is too large

View File

@ -4,4 +4,4 @@ TrendRadar MCP Server
提供基于MCP协议的新闻聚合数据查询和系统管理接口 提供基于MCP协议的新闻聚合数据查询和系统管理接口
""" """
__version__ = "1.0.0" __version__ = "1.1.0"

View File

@ -15,6 +15,7 @@ from .tools.analytics import AnalyticsTools
from .tools.search_tools import SearchTools from .tools.search_tools import SearchTools
from .tools.config_mgmt import ConfigManagementTools from .tools.config_mgmt import ConfigManagementTools
from .tools.system import SystemManagementTools from .tools.system import SystemManagementTools
from .tools.storage_sync import StorageSyncTools
from .utils.date_parser import DateParser from .utils.date_parser import DateParser
from .utils.errors import MCPError from .utils.errors import MCPError
@ -34,6 +35,7 @@ def _get_tools(project_root: Optional[str] = None):
_tools_instances['search'] = SearchTools(project_root) _tools_instances['search'] = SearchTools(project_root)
_tools_instances['config'] = ConfigManagementTools(project_root) _tools_instances['config'] = ConfigManagementTools(project_root)
_tools_instances['system'] = SystemManagementTools(project_root) _tools_instances['system'] = SystemManagementTools(project_root)
_tools_instances['storage'] = StorageSyncTools(project_root)
return _tools_instances return _tools_instances
@ -657,6 +659,127 @@ async def trigger_crawl(
return json.dumps(result, ensure_ascii=False, indent=2) return json.dumps(result, ensure_ascii=False, indent=2)
# ==================== 存储同步工具 ====================
@mcp.tool
async def sync_from_remote(
days: int = 7
) -> str:
"""
从远程存储拉取数据到本地
用于 MCP Server 等场景爬虫存到远程云存储 Cloudflare R2
MCP Server 拉取到本地进行分析查询
Args:
days: 拉取最近 N 天的数据默认 7
- 0: 不拉取
- 7: 拉取最近一周的数据
- 30: 拉取最近一个月的数据
Returns:
JSON格式的同步结果包含
- success: 是否成功
- synced_files: 成功同步的文件数量
- synced_dates: 成功同步的日期列表
- skipped_dates: 跳过的日期本地已存在
- failed_dates: 失败的日期及错误信息
- message: 操作结果描述
Examples:
- sync_from_remote() # 拉取最近7天
- sync_from_remote(days=30) # 拉取最近30天
Note:
需要在 config/config.yaml 中配置远程存储storage.remote或设置环境变量
- S3_ENDPOINT_URL: 服务端点
- S3_BUCKET_NAME: 存储桶名称
- S3_ACCESS_KEY_ID: 访问密钥 ID
- S3_SECRET_ACCESS_KEY: 访问密钥
"""
tools = _get_tools()
result = tools['storage'].sync_from_remote(days=days)
return json.dumps(result, ensure_ascii=False, indent=2)
@mcp.tool
async def get_storage_status() -> str:
"""
获取存储配置和状态
查看当前存储后端配置本地和远程存储的状态信息
Returns:
JSON格式的存储状态信息包含
- backend: 当前使用的后端类型local/remote/auto
- local: 本地存储状态
- data_dir: 数据目录
- retention_days: 保留天数
- total_size: 总大小
- date_count: 日期数量
- earliest_date: 最早日期
- latest_date: 最新日期
- remote: 远程存储状态
- configured: 是否已配置
- endpoint_url: 服务端点
- bucket_name: 存储桶名称
- date_count: 远程日期数量
- pull: 拉取配置
- enabled: 是否启用自动拉取
- days: 自动拉取天数
Examples:
- get_storage_status() # 查看所有存储状态
"""
tools = _get_tools()
result = tools['storage'].get_storage_status()
return json.dumps(result, ensure_ascii=False, indent=2)
@mcp.tool
async def list_available_dates(
source: str = "both"
) -> str:
"""
列出本地/远程可用的日期范围
查看本地和远程存储中有哪些日期的数据可用
帮助了解数据覆盖范围和同步状态
Args:
source: 数据来源可选值
- "local": 仅列出本地可用日期
- "remote": 仅列出远程可用日期
- "both": 同时列出两者并进行对比默认
Returns:
JSON格式的日期列表包含
- local: 本地日期信息如果 source 包含 local
- dates: 日期列表按时间倒序
- count: 日期数量
- earliest: 最早日期
- latest: 最新日期
- remote: 远程日期信息如果 source 包含 remote
- configured: 是否已配置远程存储
- dates: 日期列表
- count: 日期数量
- earliest: 最早日期
- latest: 最新日期
- comparison: 对比结果仅当 source="both"
- only_local: 仅本地存在的日期
- only_remote: 仅远程存在的日期
- both: 两边都存在的日期
Examples:
- list_available_dates() # 查看本地和远程的对比
- list_available_dates(source="local") # 仅查看本地
- list_available_dates(source="remote") # 仅查看远程
"""
tools = _get_tools()
result = tools['storage'].list_available_dates(source=source)
return json.dumps(result, ensure_ascii=False, indent=2)
# ==================== 启动入口 ==================== # ==================== 启动入口 ====================
def run_server( def run_server(
@ -721,6 +844,11 @@ def run_server(
print(" 11. get_current_config - 获取当前系统配置") print(" 11. get_current_config - 获取当前系统配置")
print(" 12. get_system_status - 获取系统运行状态") print(" 12. get_system_status - 获取系统运行状态")
print(" 13. trigger_crawl - 手动触发爬取任务") print(" 13. trigger_crawl - 手动触发爬取任务")
print()
print(" === 存储同步工具 ===")
print(" 14. sync_from_remote - 从远程存储拉取数据到本地")
print(" 15. get_storage_status - 获取存储配置和状态")
print(" 16. list_available_dates - 列出本地/远程可用日期")
print("=" * 60) print("=" * 60)
print() print()

View File

@ -517,24 +517,55 @@ class DataService:
# 遍历日期文件夹 # 遍历日期文件夹
for date_folder in output_dir.iterdir(): for date_folder in output_dir.iterdir():
if date_folder.is_dir() and not date_folder.name.startswith('.'): if date_folder.is_dir() and not date_folder.name.startswith('.'):
# 解析日期(格式: YYYY年MM月DD日 folder_date = self._parse_date_folder_name(date_folder.name)
try: if folder_date:
date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_folder.name)
if date_match:
folder_date = datetime(
int(date_match.group(1)),
int(date_match.group(2)),
int(date_match.group(3))
)
available_dates.append(folder_date) available_dates.append(folder_date)
except Exception:
pass
if not available_dates: if not available_dates:
return (None, None) return (None, None)
return (min(available_dates), max(available_dates)) return (min(available_dates), max(available_dates))
def _parse_date_folder_name(self, folder_name: str) -> Optional[datetime]:
"""
解析日期文件夹名称兼容中文和ISO格式
支持两种格式
- 中文格式YYYY年MM月DD日
- ISO格式YYYY-MM-DD
Args:
folder_name: 文件夹名称
Returns:
datetime 对象解析失败返回 None
"""
# 尝试中文格式YYYY年MM月DD日
chinese_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', folder_name)
if chinese_match:
try:
return datetime(
int(chinese_match.group(1)),
int(chinese_match.group(2)),
int(chinese_match.group(3))
)
except ValueError:
pass
# 尝试 ISO 格式YYYY-MM-DD
iso_match = re.match(r'(\d{4})-(\d{2})-(\d{2})', folder_name)
if iso_match:
try:
return datetime(
int(iso_match.group(1)),
int(iso_match.group(2)),
int(iso_match.group(3))
)
except ValueError:
pass
return None
def get_system_status(self) -> Dict: def get_system_status(self) -> Dict:
""" """
获取系统运行状态 获取系统运行状态
@ -553,27 +584,15 @@ class DataService:
if output_dir.exists(): if output_dir.exists():
# 遍历日期文件夹 # 遍历日期文件夹
for date_folder in output_dir.iterdir(): for date_folder in output_dir.iterdir():
if date_folder.is_dir(): if date_folder.is_dir() and not date_folder.name.startswith('.'):
# 解析日期 # 解析日期兼容中文和ISO格式
try: folder_date = self._parse_date_folder_name(date_folder.name)
date_str = date_folder.name if folder_date:
# 格式: YYYY年MM月DD日
date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_str)
if date_match:
folder_date = datetime(
int(date_match.group(1)),
int(date_match.group(2)),
int(date_match.group(3))
)
if oldest_record is None or folder_date < oldest_record: if oldest_record is None or folder_date < oldest_record:
oldest_record = folder_date oldest_record = folder_date
if latest_record is None or folder_date > latest_record: if latest_record is None or folder_date > latest_record:
latest_record = folder_date latest_record = folder_date
except:
pass
# 计算存储大小 # 计算存储大小
for item in date_folder.rglob("*"): for item in date_folder.rglob("*"):
if item.is_file(): if item.is_file():

View File

@ -2,9 +2,12 @@
文件解析服务 文件解析服务
提供txt格式新闻数据和YAML配置文件的解析功能 提供txt格式新闻数据和YAML配置文件的解析功能
支持从 SQLite 数据库和 TXT 文件两种数据源读取
""" """
import json
import re import re
import sqlite3
from pathlib import Path from pathlib import Path
from typing import Dict, List, Tuple, Optional from typing import Dict, List, Tuple, Optional
from datetime import datetime from datetime import datetime
@ -145,17 +148,310 @@ class ParserService:
def get_date_folder_name(self, date: datetime = None) -> str: def get_date_folder_name(self, date: datetime = None) -> str:
""" """
获取日期文件夹名称 获取日期文件夹名称兼容中文和ISO格式
Args: Args:
date: 日期对象默认为今天 date: 日期对象默认为今天
Returns: Returns:
文件夹名称格式: YYYY年MM月DD日 实际存在的文件夹名称优先返回中文格式YYYY年MM月DD日
若不存在则返回 ISO 格式YYYY-MM-DD
""" """
if date is None: if date is None:
date = datetime.now() date = datetime.now()
return date.strftime("%Y年%m月%d") return self._find_date_folder(date)
def _get_date_folder_name(self, date: datetime = None) -> str:
"""
获取日期文件夹名称兼容中文和ISO格式
Args:
date: 日期对象默认为今天
Returns:
实际存在的文件夹名称优先返回中文格式YYYY年MM月DD日
若不存在则返回 ISO 格式YYYY-MM-DD
"""
if date is None:
date = datetime.now()
return self._find_date_folder(date)
def _find_date_folder(self, date: datetime) -> str:
"""
查找实际存在的日期文件夹
支持两种格式
- 中文格式YYYY年MM月DD日优先
- ISO格式YYYY-MM-DD
Args:
date: 日期对象
Returns:
实际存在的文件夹名称若都不存在则返回中文格式
"""
output_dir = self.project_root / "output"
# 中文格式YYYY年MM月DD日
chinese_format = date.strftime("%Y年%m月%d")
# ISO格式YYYY-MM-DD
iso_format = date.strftime("%Y-%m-%d")
# 优先检查中文格式
if (output_dir / chinese_format).exists():
return chinese_format
# 其次检查 ISO 格式
if (output_dir / iso_format).exists():
return iso_format
# 都不存在,返回中文格式(与项目现有风格一致)
return chinese_format
def _get_sqlite_db_path(self, date: datetime = None) -> Optional[Path]:
"""
获取 SQLite 数据库文件路径
Args:
date: 日期对象默认为今天
Returns:
数据库文件路径如果不存在则返回 None
"""
date_folder = self._get_date_folder_name(date)
db_path = self.project_root / "output" / date_folder / "news.db"
if db_path.exists():
return db_path
return None
def _get_txt_folder_path(self, date: datetime = None) -> Optional[Path]:
"""
获取 TXT 文件夹路径
Args:
date: 日期对象默认为今天
Returns:
TXT 文件夹路径如果不存在则返回 None
"""
date_folder = self._get_date_folder_name(date)
txt_path = self.project_root / "output" / date_folder / "txt"
if txt_path.exists() and txt_path.is_dir():
return txt_path
return None
def _read_from_txt(
self,
date: datetime = None,
platform_ids: Optional[List[str]] = None
) -> Optional[Tuple[Dict, Dict, Dict]]:
"""
TXT 文件夹读取新闻数据
Args:
date: 日期对象默认为今天
platform_ids: 平台ID列表None表示所有平台
Returns:
(all_titles, id_to_name, all_timestamps) 元组如果不存在返回 None
"""
txt_folder = self._get_txt_folder_path(date)
if txt_folder is None:
return None
# 获取所有 TXT 文件并按时间排序
txt_files = sorted(txt_folder.glob("*.txt"))
if not txt_files:
return None
all_titles = {}
id_to_name = {}
all_timestamps = {}
for txt_file in txt_files:
try:
titles_by_id, file_id_to_name = self.parse_txt_file(txt_file)
# 记录时间戳
all_timestamps[txt_file.name] = txt_file.stat().st_mtime
# 合并 id_to_name
id_to_name.update(file_id_to_name)
# 合并标题数据
for source_id, titles in titles_by_id.items():
# 如果指定了 platform_ids过滤
if platform_ids and source_id not in platform_ids:
continue
if source_id not in all_titles:
all_titles[source_id] = {}
for title, data in titles.items():
if title not in all_titles[source_id]:
# 新标题
all_titles[source_id][title] = {
"ranks": data.get("ranks", []),
"url": data.get("url", ""),
"mobileUrl": data.get("mobileUrl", ""),
"first_time": txt_file.stem, # 使用文件名作为时间
"last_time": txt_file.stem,
"count": 1,
}
else:
# 合并已存在的标题
existing = all_titles[source_id][title]
# 合并排名
for rank in data.get("ranks", []):
if rank not in existing["ranks"]:
existing["ranks"].append(rank)
# 更新 last_time
existing["last_time"] = txt_file.stem
existing["count"] += 1
# 保留 URL
if not existing["url"] and data.get("url"):
existing["url"] = data["url"]
if not existing["mobileUrl"] and data.get("mobileUrl"):
existing["mobileUrl"] = data["mobileUrl"]
except Exception as e:
print(f"Warning: 解析 TXT 文件失败 {txt_file}: {e}")
continue
if not all_titles:
return None
return (all_titles, id_to_name, all_timestamps)
def _read_from_sqlite(
self,
date: datetime = None,
platform_ids: Optional[List[str]] = None
) -> Optional[Tuple[Dict, Dict, Dict]]:
"""
SQLite 数据库读取新闻数据
新表结构数据已按 URL 去重包含
- first_crawl_time: 首次抓取时间
- last_crawl_time: 最后抓取时间
- crawl_count: 抓取次数
Args:
date: 日期对象默认为今天
platform_ids: 平台ID列表None表示所有平台
Returns:
(all_titles, id_to_name, all_timestamps) 元组如果数据库不存在返回 None
"""
db_path = self._get_sqlite_db_path(date)
if db_path is None:
return None
all_titles = {}
id_to_name = {}
all_timestamps = {}
try:
conn = sqlite3.connect(str(db_path))
conn.row_factory = sqlite3.Row
cursor = conn.cursor()
# 检查表是否存在
cursor.execute("""
SELECT name FROM sqlite_master
WHERE type='table' AND name='news_items'
""")
if not cursor.fetchone():
conn.close()
return None
# 构建查询
if platform_ids:
placeholders = ','.join(['?' for _ in platform_ids])
query = f"""
SELECT n.id, n.platform_id, p.name as platform_name, n.title,
n.rank, n.url, n.mobile_url,
n.first_crawl_time, n.last_crawl_time, n.crawl_count
FROM news_items n
LEFT JOIN platforms p ON n.platform_id = p.id
WHERE n.platform_id IN ({placeholders})
"""
cursor.execute(query, platform_ids)
else:
cursor.execute("""
SELECT n.id, n.platform_id, p.name as platform_name, n.title,
n.rank, n.url, n.mobile_url,
n.first_crawl_time, n.last_crawl_time, n.crawl_count
FROM news_items n
LEFT JOIN platforms p ON n.platform_id = p.id
""")
rows = cursor.fetchall()
# 收集所有 news_item_id 用于查询历史排名
news_ids = [row['id'] for row in rows]
rank_history_map = {}
if news_ids:
placeholders = ",".join("?" * len(news_ids))
cursor.execute(f"""
SELECT news_item_id, rank FROM rank_history
WHERE news_item_id IN ({placeholders})
ORDER BY news_item_id, crawl_time
""", news_ids)
for rh_row in cursor.fetchall():
news_id = rh_row['news_item_id']
rank = rh_row['rank']
if news_id not in rank_history_map:
rank_history_map[news_id] = []
rank_history_map[news_id].append(rank)
for row in rows:
news_id = row['id']
platform_id = row['platform_id']
platform_name = row['platform_name'] or platform_id
title = row['title']
# 更新 id_to_name
if platform_id not in id_to_name:
id_to_name[platform_id] = platform_name
# 初始化平台字典
if platform_id not in all_titles:
all_titles[platform_id] = {}
# 获取排名历史,如果为空则使用当前排名
ranks = rank_history_map.get(news_id, [row['rank']])
# 直接使用数据(已去重)
all_titles[platform_id][title] = {
"ranks": ranks,
"url": row['url'] or "",
"mobileUrl": row['mobile_url'] or "",
"first_time": row['first_crawl_time'] or "",
"last_time": row['last_crawl_time'] or "",
"count": row['crawl_count'] or 1,
}
# 获取抓取时间作为 timestamps
cursor.execute("""
SELECT crawl_time FROM crawl_records
ORDER BY crawl_time
""")
for row in cursor.fetchall():
crawl_time = row['crawl_time']
all_timestamps[f"{crawl_time}.db"] = 0 # 用虚拟时间戳
conn.close()
if not all_titles:
return None
return (all_titles, id_to_name, all_timestamps)
except Exception as e:
print(f"Warning: 从 SQLite 读取数据失败: {e}")
return None
def read_all_titles_for_date( def read_all_titles_for_date(
self, self,
@ -163,7 +459,7 @@ class ParserService:
platform_ids: Optional[List[str]] = None platform_ids: Optional[List[str]] = None
) -> Tuple[Dict, Dict, Dict]: ) -> Tuple[Dict, Dict, Dict]:
""" """
读取指定日期的所有标题文件带缓存 读取指定日期的所有标题带缓存
Args: Args:
date: 日期对象默认为今天 date: 日期对象默认为今天
@ -193,72 +489,24 @@ class ParserService:
if cached: if cached:
return cached return cached
# 缓存未命中,读取文件 # 优先从 SQLite 读取
date_folder = self.get_date_folder_name(date) sqlite_result = self._read_from_sqlite(date, platform_ids)
txt_dir = self.project_root / "output" / date_folder / "txt" if sqlite_result:
self.cache.set(cache_key, sqlite_result)
return sqlite_result
if not txt_dir.exists(): # SQLite 不存在,尝试从 TXT 读取
txt_result = self._read_from_txt(date, platform_ids)
if txt_result:
self.cache.set(cache_key, txt_result)
return txt_result
# 两种数据源都不存在
raise DataNotFoundError( raise DataNotFoundError(
f"未找到 {date_folder} 的数据目录", f"未找到 {date_str} 的数据",
suggestion="请先运行爬虫或检查日期是否正确" suggestion="请先运行爬虫或检查日期是否正确"
) )
all_titles = {}
id_to_name = {}
all_timestamps = {}
# 读取所有txt文件
txt_files = sorted(txt_dir.glob("*.txt"))
if not txt_files:
raise DataNotFoundError(
f"{date_folder} 没有数据文件",
suggestion="请等待爬虫任务完成"
)
for txt_file in txt_files:
try:
titles_by_id, file_id_to_name = self.parse_txt_file(txt_file)
# 更新id_to_name
id_to_name.update(file_id_to_name)
# 合并标题数据
for platform_id, titles in titles_by_id.items():
# 如果指定了平台过滤
if platform_ids and platform_id not in platform_ids:
continue
if platform_id not in all_titles:
all_titles[platform_id] = {}
for title, info in titles.items():
if title in all_titles[platform_id]:
# 合并排名
all_titles[platform_id][title]["ranks"].extend(info["ranks"])
else:
all_titles[platform_id][title] = info.copy()
# 记录文件时间戳
all_timestamps[txt_file.name] = txt_file.stat().st_mtime
except Exception as e:
# 忽略单个文件的解析错误,继续处理其他文件
print(f"Warning: 解析文件 {txt_file} 失败: {e}")
continue
if not all_titles:
raise DataNotFoundError(
f"{date_folder} 没有有效的数据",
suggestion="请检查数据文件格式或重新运行爬虫"
)
# 缓存结果
result = (all_titles, id_to_name, all_timestamps)
self.cache.set(cache_key, result)
return result
def parse_yaml_config(self, config_path: str = None) -> dict: def parse_yaml_config(self, config_path: str = None) -> dict:
""" """
解析YAML配置文件 解析YAML配置文件

View File

@ -25,7 +25,6 @@ def calculate_news_weight(news_data: Dict, rank_threshold: int = 5) -> float:
""" """
计算新闻权重用于排序 计算新闻权重用于排序
基于 main.py 的权重算法实现综合考虑
- 排名权重 (60%)新闻在榜单中的排名 - 排名权重 (60%)新闻在榜单中的排名
- 频次权重 (30%)新闻出现的次数 - 频次权重 (30%)新闻出现的次数
- 热度权重 (10%)高排名出现的比例 - 热度权重 (10%)高排名出现的比例

View File

@ -0,0 +1,468 @@
# coding=utf-8
"""
存储同步工具
实现从远程存储拉取数据到本地获取存储状态列出可用日期等功能
"""
import os
import re
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Optional
import yaml
from ..utils.errors import MCPError
class StorageSyncTools:
"""存储同步工具类"""
def __init__(self, project_root: str = None):
"""
初始化存储同步工具
Args:
project_root: 项目根目录
"""
if project_root:
self.project_root = Path(project_root)
else:
current_file = Path(__file__)
self.project_root = current_file.parent.parent.parent
self._config = None
self._remote_backend = None
def _load_config(self) -> dict:
"""加载配置文件"""
if self._config is None:
config_path = self.project_root / "config" / "config.yaml"
if config_path.exists():
with open(config_path, "r", encoding="utf-8") as f:
self._config = yaml.safe_load(f)
else:
self._config = {}
return self._config
def _get_storage_config(self) -> dict:
"""获取存储配置"""
config = self._load_config()
return config.get("storage", {})
def _get_remote_config(self) -> dict:
"""
获取远程存储配置合并配置文件和环境变量
"""
storage_config = self._get_storage_config()
remote_config = storage_config.get("remote", {})
return {
"endpoint_url": remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL", ""),
"bucket_name": remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME", ""),
"access_key_id": remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID", ""),
"secret_access_key": remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY", ""),
"region": remote_config.get("region") or os.environ.get("S3_REGION", ""),
}
def _has_remote_config(self) -> bool:
"""检查是否有有效的远程存储配置"""
config = self._get_remote_config()
return bool(
config.get("bucket_name") and
config.get("access_key_id") and
config.get("secret_access_key") and
config.get("endpoint_url")
)
def _get_remote_backend(self):
"""获取远程存储后端实例"""
if self._remote_backend is not None:
return self._remote_backend
if not self._has_remote_config():
return None
try:
from trendradar.storage.remote import RemoteStorageBackend
remote_config = self._get_remote_config()
config = self._load_config()
timezone = config.get("app", {}).get("timezone", "Asia/Shanghai")
self._remote_backend = RemoteStorageBackend(
bucket_name=remote_config["bucket_name"],
access_key_id=remote_config["access_key_id"],
secret_access_key=remote_config["secret_access_key"],
endpoint_url=remote_config["endpoint_url"],
region=remote_config.get("region", ""),
timezone=timezone,
)
return self._remote_backend
except ImportError:
print("[存储同步] 远程存储后端需要安装 boto3: pip install boto3")
return None
except Exception as e:
print(f"[存储同步] 创建远程后端失败: {e}")
return None
def _get_local_data_dir(self) -> Path:
"""获取本地数据目录"""
storage_config = self._get_storage_config()
local_config = storage_config.get("local", {})
data_dir = local_config.get("data_dir", "output")
return self.project_root / data_dir
def _parse_date_folder_name(self, folder_name: str) -> Optional[datetime]:
"""
解析日期文件夹名称兼容中文和 ISO 格式
支持两种格式
- 中文格式YYYY年MM月DD日
- ISO 格式YYYY-MM-DD
"""
# 尝试 ISO 格式
iso_match = re.match(r'(\d{4})-(\d{2})-(\d{2})', folder_name)
if iso_match:
try:
return datetime(
int(iso_match.group(1)),
int(iso_match.group(2)),
int(iso_match.group(3))
)
except ValueError:
pass
# 尝试中文格式
chinese_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', folder_name)
if chinese_match:
try:
return datetime(
int(chinese_match.group(1)),
int(chinese_match.group(2)),
int(chinese_match.group(3))
)
except ValueError:
pass
return None
def _get_local_dates(self) -> List[str]:
"""获取本地可用的日期列表"""
local_dir = self._get_local_data_dir()
dates = []
if not local_dir.exists():
return dates
for item in local_dir.iterdir():
if item.is_dir() and not item.name.startswith('.'):
folder_date = self._parse_date_folder_name(item.name)
if folder_date:
dates.append(folder_date.strftime("%Y-%m-%d"))
return sorted(dates, reverse=True)
def _calculate_dir_size(self, path: Path) -> int:
"""计算目录大小(字节)"""
total_size = 0
if path.exists():
for item in path.rglob("*"):
if item.is_file():
total_size += item.stat().st_size
return total_size
def sync_from_remote(self, days: int = 7) -> Dict:
"""
从远程存储拉取数据到本地
Args:
days: 拉取最近 N 天的数据默认 7
Returns:
同步结果字典
"""
try:
# 检查远程配置
if not self._has_remote_config():
return {
"success": False,
"error": {
"code": "REMOTE_NOT_CONFIGURED",
"message": "未配置远程存储",
"suggestion": "请在 config/config.yaml 中配置 storage.remote 或设置环境变量"
}
}
# 获取远程后端
remote_backend = self._get_remote_backend()
if remote_backend is None:
return {
"success": False,
"error": {
"code": "REMOTE_BACKEND_FAILED",
"message": "无法创建远程存储后端",
"suggestion": "请检查远程存储配置和 boto3 是否已安装"
}
}
# 获取本地数据目录
local_dir = self._get_local_data_dir()
local_dir.mkdir(parents=True, exist_ok=True)
# 获取远程可用日期
remote_dates = remote_backend.list_remote_dates()
# 获取本地已有日期
local_dates = set(self._get_local_dates())
# 计算需要拉取的日期(最近 N 天)
from trendradar.utils.time import get_configured_time
config = self._load_config()
timezone = config.get("app", {}).get("timezone", "Asia/Shanghai")
now = get_configured_time(timezone)
target_dates = []
for i in range(days):
date = now - timedelta(days=i)
date_str = date.strftime("%Y-%m-%d")
if date_str in remote_dates:
target_dates.append(date_str)
# 执行拉取
synced_dates = []
skipped_dates = []
failed_dates = []
for date_str in target_dates:
# 检查本地是否已存在
if date_str in local_dates:
skipped_dates.append(date_str)
continue
# 拉取单个日期
try:
local_date_dir = local_dir / date_str
local_db_path = local_date_dir / "news.db"
remote_key = f"news/{date_str}.db"
local_date_dir.mkdir(parents=True, exist_ok=True)
remote_backend.s3_client.download_file(
remote_backend.bucket_name,
remote_key,
str(local_db_path)
)
synced_dates.append(date_str)
print(f"[存储同步] 已拉取: {date_str}")
except Exception as e:
failed_dates.append({"date": date_str, "error": str(e)})
print(f"[存储同步] 拉取失败 ({date_str}): {e}")
return {
"success": True,
"synced_files": len(synced_dates),
"synced_dates": synced_dates,
"skipped_dates": skipped_dates,
"failed_dates": failed_dates,
"message": f"成功同步 {len(synced_dates)} 天数据" + (
f",跳过 {len(skipped_dates)} 天(本地已存在)" if skipped_dates else ""
) + (
f",失败 {len(failed_dates)}" if failed_dates else ""
)
}
except MCPError as e:
return {
"success": False,
"error": e.to_dict()
}
except Exception as e:
return {
"success": False,
"error": {
"code": "INTERNAL_ERROR",
"message": str(e)
}
}
def get_storage_status(self) -> Dict:
"""
获取存储配置和状态
Returns:
存储状态字典
"""
try:
storage_config = self._get_storage_config()
config = self._load_config()
# 本地存储状态
local_config = storage_config.get("local", {})
local_dir = self._get_local_data_dir()
local_size = self._calculate_dir_size(local_dir)
local_dates = self._get_local_dates()
local_status = {
"data_dir": local_config.get("data_dir", "output"),
"retention_days": local_config.get("retention_days", 0),
"total_size": f"{local_size / 1024 / 1024:.2f} MB",
"total_size_bytes": local_size,
"date_count": len(local_dates),
"earliest_date": local_dates[-1] if local_dates else None,
"latest_date": local_dates[0] if local_dates else None,
}
# 远程存储状态
remote_config = storage_config.get("remote", {})
has_remote = self._has_remote_config()
remote_status = {
"configured": has_remote,
"retention_days": remote_config.get("retention_days", 0),
}
if has_remote:
merged_config = self._get_remote_config()
# 脱敏显示
endpoint = merged_config.get("endpoint_url", "")
bucket = merged_config.get("bucket_name", "")
remote_status["endpoint_url"] = endpoint
remote_status["bucket_name"] = bucket
# 尝试获取远程日期列表
remote_backend = self._get_remote_backend()
if remote_backend:
try:
remote_dates = remote_backend.list_remote_dates()
remote_status["date_count"] = len(remote_dates)
remote_status["earliest_date"] = remote_dates[-1] if remote_dates else None
remote_status["latest_date"] = remote_dates[0] if remote_dates else None
except Exception as e:
remote_status["error"] = str(e)
# 拉取配置状态
pull_config = storage_config.get("pull", {})
pull_status = {
"enabled": pull_config.get("enabled", False),
"days": pull_config.get("days", 7),
}
return {
"success": True,
"backend": storage_config.get("backend", "auto"),
"local": local_status,
"remote": remote_status,
"pull": pull_status,
}
except MCPError as e:
return {
"success": False,
"error": e.to_dict()
}
except Exception as e:
return {
"success": False,
"error": {
"code": "INTERNAL_ERROR",
"message": str(e)
}
}
def list_available_dates(self, source: str = "both") -> Dict:
"""
列出可用的日期范围
Args:
source: 数据来源
- "local": 仅本地
- "remote": 仅远程
- "both": 两者都列出默认
Returns:
日期列表字典
"""
try:
result = {
"success": True,
}
# 本地日期
if source in ("local", "both"):
local_dates = self._get_local_dates()
result["local"] = {
"dates": local_dates,
"count": len(local_dates),
"earliest": local_dates[-1] if local_dates else None,
"latest": local_dates[0] if local_dates else None,
}
# 远程日期
if source in ("remote", "both"):
if not self._has_remote_config():
result["remote"] = {
"configured": False,
"dates": [],
"count": 0,
"earliest": None,
"latest": None,
"error": "未配置远程存储"
}
else:
remote_backend = self._get_remote_backend()
if remote_backend:
try:
remote_dates = remote_backend.list_remote_dates()
result["remote"] = {
"configured": True,
"dates": remote_dates,
"count": len(remote_dates),
"earliest": remote_dates[-1] if remote_dates else None,
"latest": remote_dates[0] if remote_dates else None,
}
except Exception as e:
result["remote"] = {
"configured": True,
"dates": [],
"count": 0,
"earliest": None,
"latest": None,
"error": str(e)
}
else:
result["remote"] = {
"configured": True,
"dates": [],
"count": 0,
"earliest": None,
"latest": None,
"error": "无法创建远程存储后端"
}
# 如果同时查询两者,计算差异
if source == "both" and "local" in result and "remote" in result:
local_set = set(result["local"]["dates"])
remote_set = set(result["remote"].get("dates", []))
result["comparison"] = {
"only_local": sorted(list(local_set - remote_set), reverse=True),
"only_remote": sorted(list(remote_set - local_set), reverse=True),
"both": sorted(list(local_set & remote_set), reverse=True),
}
return result
except MCPError as e:
return {
"success": False,
"error": e.to_dict()
}
except Exception as e:
return {
"success": False,
"error": {
"code": "INTERNAL_ERROR",
"message": str(e)
}
}

View File

@ -87,13 +87,13 @@ class SystemManagementTools:
>>> print(result['saved_files']) >>> print(result['saved_files'])
""" """
try: try:
import json
import time import time
import random
import requests
from datetime import datetime
import pytz
import yaml import yaml
from trendradar.crawler.fetcher import DataFetcher
from trendradar.storage.local import LocalStorageBackend
from trendradar.storage.base import convert_crawl_results_to_news_data
from trendradar.utils.time import get_configured_time, format_date_folder, format_time_filename
from ..services.cache_service import get_cache
# 参数验证 # 参数验证
platforms = validate_platforms(platforms) platforms = validate_platforms(platforms)
@ -129,9 +129,6 @@ class SystemManagementTools:
else: else:
target_platforms = all_platforms target_platforms = all_platforms
# 获取请求间隔
request_interval = config_data.get("crawler", {}).get("request_interval", 100)
# 构建平台ID列表 # 构建平台ID列表
ids = [] ids = []
for platform in target_platforms: for platform in target_platforms:
@ -142,87 +139,82 @@ class SystemManagementTools:
print(f"开始临时爬取,平台: {[p.get('name', p['id']) for p in target_platforms]}") print(f"开始临时爬取,平台: {[p.get('name', p['id']) for p in target_platforms]}")
# 爬取数据 # 初始化数据获取器
results = {} crawler_config = config_data.get("crawler", {})
id_to_name = {} proxy_url = None
failed_ids = [] if crawler_config.get("use_proxy"):
proxy_url = crawler_config.get("proxy_url")
for i, id_info in enumerate(ids): fetcher = DataFetcher(proxy_url=proxy_url)
if isinstance(id_info, tuple): request_interval = crawler_config.get("request_interval", 100)
id_value, name = id_info
else:
id_value = id_info
name = id_value
id_to_name[id_value] = name # 执行爬取
results, id_to_name, failed_ids = fetcher.crawl_websites(
ids_list=ids,
request_interval=request_interval
)
# 构建请求URL # 获取当前时间(统一使用 trendradar 的时间工具)
url = f"https://newsnow.busiyi.world/api/s?id={id_value}&latest" # 从配置中读取时区,默认为 Asia/Shanghai
timezone = config_data.get("app", {}).get("timezone", "Asia/Shanghai")
current_time = get_configured_time(timezone)
crawl_date = format_date_folder(None, timezone)
crawl_time_str = format_time_filename(timezone)
headers = { # 转换为标准数据模型
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", news_data = convert_crawl_results_to_news_data(
"Accept": "application/json, text/plain, */*", results=results,
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", id_to_name=id_to_name,
"Connection": "keep-alive", failed_ids=failed_ids,
"Cache-Control": "no-cache", crawl_time=crawl_time_str,
} crawl_date=crawl_date
)
# 重试机制 # 初始化存储后端
max_retries = 2 storage = LocalStorageBackend(
retries = 0 data_dir=str(self.project_root / "output"),
success = False enable_txt=True,
enable_html=True,
timezone=timezone
)
# 尝试持久化数据
save_success = False
save_error_msg = ""
saved_files = {}
while retries <= max_retries and not success:
try: try:
response = requests.get(url, headers=headers, timeout=10) # 1. 保存到 SQLite (核心持久化)
response.raise_for_status() if storage.save_news_data(news_data):
save_success = True
data_text = response.text # 2. 如果请求保存到本地,生成 TXT/HTML 快照
data_json = json.loads(data_text) if save_to_local:
# 保存 TXT
txt_path = storage.save_txt_snapshot(news_data)
if txt_path:
saved_files["txt"] = txt_path
status = data_json.get("status", "未知") # 保存 HTML (使用简化版生成器)
if status not in ["success", "cache"]: html_content = self._generate_simple_html(results, id_to_name, failed_ids, current_time)
raise ValueError(f"响应状态异常: {status}") html_filename = f"{crawl_time_str}.html"
html_path = storage.save_html_report(html_content, html_filename)
status_info = "最新数据" if status == "success" else "缓存数据" if html_path:
print(f"获取 {id_value} 成功({status_info}") saved_files["html"] = html_path
# 解析数据
results[id_value] = {}
for index, item in enumerate(data_json.get("items", []), 1):
title = item["title"]
url_link = item.get("url", "")
mobile_url = item.get("mobileUrl", "")
if title in results[id_value]:
results[id_value][title]["ranks"].append(index)
else:
results[id_value][title] = {
"ranks": [index],
"url": url_link,
"mobileUrl": mobile_url,
}
success = True
except Exception as e: except Exception as e:
retries += 1 # 捕获所有保存错误(特别是 Docker 只读卷导致的 PermissionError
if retries <= max_retries: print(f"[System] 数据保存失败: {e}")
wait_time = random.uniform(3, 5) save_success = False
print(f"请求 {id_value} 失败: {e}. {wait_time:.2f}秒后重试...") save_error_msg = str(e)
time.sleep(wait_time)
else:
print(f"请求 {id_value} 失败: {e}")
failed_ids.append(id_value)
# 请求间隔 # 3. 清除缓存,确保下次查询获取最新数据
if i < len(ids) - 1: # 即使保存失败,内存中的数据可能已经通过其他方式更新,或者是临时的
actual_interval = request_interval + random.randint(-10, 20) get_cache().clear()
actual_interval = max(50, actual_interval) print("[System] 缓存已清除")
time.sleep(actual_interval / 1000)
# 格式化返回数据 # 构建返回结果
news_data = [] news_response_data = []
for platform_id, titles_data in results.items(): for platform_id, titles_data in results.items():
platform_name = id_to_name.get(platform_id, platform_id) platform_name = id_to_name.get(platform_id, platform_id)
for title, info in titles_data.items(): for title, info in titles_data.items():
@ -230,131 +222,42 @@ class SystemManagementTools:
"platform_id": platform_id, "platform_id": platform_id,
"platform_name": platform_name, "platform_name": platform_name,
"title": title, "title": title,
"ranks": info["ranks"] "ranks": info.get("ranks", [])
} }
# 条件性添加 URL 字段
if include_url: if include_url:
news_item["url"] = info.get("url", "") news_item["url"] = info.get("url", "")
news_item["mobile_url"] = info.get("mobileUrl", "") news_item["mobile_url"] = info.get("mobileUrl", "")
news_response_data.append(news_item)
news_data.append(news_item)
# 获取北京时间
beijing_tz = pytz.timezone("Asia/Shanghai")
now = datetime.now(beijing_tz)
# 构建返回结果
result = { result = {
"success": True, "success": True,
"task_id": f"crawl_{int(time.time())}", "task_id": f"crawl_{int(time.time())}",
"status": "completed", "status": "completed",
"crawl_time": now.strftime("%Y-%m-%d %H:%M:%S"), "crawl_time": current_time.strftime("%Y-%m-%d %H:%M:%S"),
"platforms": list(results.keys()), "platforms": list(results.keys()),
"total_news": len(news_data), "total_news": len(news_response_data),
"failed_platforms": failed_ids, "failed_platforms": failed_ids,
"data": news_data, "data": news_response_data,
"saved_to_local": save_to_local "saved_to_local": save_success and save_to_local
} }
# 如果需要持久化,调用保存逻辑 if save_success:
if save_to_local: if save_to_local:
try: result["saved_files"] = saved_files
import re result["note"] = "数据已保存到 SQLite 数据库及 output 文件夹"
# 辅助函数:清理标题
def clean_title(title: str) -> str:
"""清理标题中的特殊字符"""
if not isinstance(title, str):
title = str(title)
cleaned_title = title.replace("\n", " ").replace("\r", " ")
cleaned_title = re.sub(r"\s+", " ", cleaned_title)
cleaned_title = cleaned_title.strip()
return cleaned_title
# 辅助函数:创建目录
def ensure_directory_exists(directory: str):
"""确保目录存在"""
Path(directory).mkdir(parents=True, exist_ok=True)
# 格式化日期和时间
date_folder = now.strftime("%Y年%m月%d")
time_filename = now.strftime("%H时%M分")
# 创建 txt 文件路径
txt_dir = self.project_root / "output" / date_folder / "txt"
ensure_directory_exists(str(txt_dir))
txt_file_path = txt_dir / f"{time_filename}.txt"
# 创建 html 文件路径
html_dir = self.project_root / "output" / date_folder / "html"
ensure_directory_exists(str(html_dir))
html_file_path = html_dir / f"{time_filename}.html"
# 保存 txt 文件(按照 main.py 的格式)
with open(txt_file_path, "w", encoding="utf-8") as f:
for id_value, title_data in results.items():
# id | name 或 id
name = id_to_name.get(id_value)
if name and name != id_value:
f.write(f"{id_value} | {name}\n")
else: else:
f.write(f"{id_value}\n") result["note"] = "数据已保存到 SQLite 数据库 (仅内存中返回结果未生成TXT快照)"
# 按排名排序标题
sorted_titles = []
for title, info in title_data.items():
cleaned = clean_title(title)
if isinstance(info, dict):
ranks = info.get("ranks", [])
url = info.get("url", "")
mobile_url = info.get("mobileUrl", "")
else: else:
ranks = info if isinstance(info, list) else [] # 明确告知用户保存失败
url = "" result["saved_to_local"] = False
mobile_url = "" result["save_error"] = save_error_msg
if "Read-only file system" in save_error_msg or "Permission denied" in save_error_msg:
rank = ranks[0] if ranks else 1 result["note"] = "爬取成功但无法写入数据库Docker只读模式。数据仅在本次返回中有效。"
sorted_titles.append((rank, cleaned, url, mobile_url))
sorted_titles.sort(key=lambda x: x[0])
for rank, cleaned, url, mobile_url in sorted_titles:
line = f"{rank}. {cleaned}"
if url:
line += f" [URL:{url}]"
if mobile_url:
line += f" [MOBILE:{mobile_url}]"
f.write(line + "\n")
f.write("\n")
if failed_ids:
f.write("==== 以下ID请求失败 ====\n")
for id_value in failed_ids:
f.write(f"{id_value}\n")
# 保存 html 文件(简化版)
html_content = self._generate_simple_html(results, id_to_name, failed_ids, now)
with open(html_file_path, "w", encoding="utf-8") as f:
f.write(html_content)
print(f"数据已保存到:")
print(f" TXT: {txt_file_path}")
print(f" HTML: {html_file_path}")
result["saved_files"] = {
"txt": str(txt_file_path),
"html": str(html_file_path)
}
result["note"] = "数据已持久化到 output 文件夹"
except Exception as e:
print(f"保存文件失败: {e}")
result["save_error"] = str(e)
result["note"] = "爬取成功但保存失败,数据仅在内存中"
else: else:
result["note"] = "临时爬取结果未持久化到output文件夹" result["note"] = f"爬取成功但保存失败: {save_error_msg}"
# 清理资源
storage.cleanup()
return result return result

View File

@ -283,13 +283,13 @@ class DateParser:
date: datetime对象 date: datetime对象
Returns: Returns:
文件夹名称格式: YYYY年MM月DD日 文件夹名称格式: YYYY-MM-DD
Examples: Examples:
>>> DateParser.format_date_folder(datetime(2025, 10, 11)) >>> DateParser.format_date_folder(datetime(2025, 10, 11))
'2025年10月11日' '2025-10-11'
""" """
return date.strftime("%Y%m月%d") return date.strftime("%Y-%m-%d")
@staticmethod @staticmethod
def validate_date_not_future(date: datetime) -> None: def validate_date_not_future(date: datetime) -> None:

View File

@ -1,6 +1,6 @@
[project] [project]
name = "trendradar-mcp" name = "trendradar-mcp"
version = "1.0.3" version = "1.1.0"
description = "TrendRadar MCP Server - 新闻热点聚合工具" description = "TrendRadar MCP Server - 新闻热点聚合工具"
requires-python = ">=3.10" requires-python = ">=3.10"
dependencies = [ dependencies = [

View File

@ -3,3 +3,4 @@ pytz>=2025.2,<2026.0
PyYAML>=6.0.3,<7.0.0 PyYAML>=6.0.3,<7.0.0
fastmcp>=2.12.0,<2.14.0 fastmcp>=2.12.0,<2.14.0
websockets>=13.0,<14.0 websockets>=13.0,<14.0
boto3>=1.35.0,<2.0.0

13
trendradar/__init__.py Normal file
View File

@ -0,0 +1,13 @@
# coding=utf-8
"""
TrendRadar - 热点新闻聚合与分析工具
使用方式:
python -m trendradar # 模块执行
trendradar # 安装后执行
"""
from trendradar.context import AppContext
__version__ = "4.0.0"
__all__ = ["AppContext", "__version__"]

719
trendradar/__main__.py Normal file
View File

@ -0,0 +1,719 @@
# coding=utf-8
"""
TrendRadar 主程序
热点新闻聚合与分析工具
支持: python -m trendradar
"""
import os
import webbrowser
from pathlib import Path
from typing import Dict, List, Tuple, Optional
import requests
from trendradar.context import AppContext
# 版本号直接定义,避免循环导入
VERSION = "4.0.0"
from trendradar.core import load_config
from trendradar.crawler import DataFetcher
from trendradar.storage import convert_crawl_results_to_news_data
def check_version_update(
current_version: str, version_url: str, proxy_url: Optional[str] = None
) -> Tuple[bool, Optional[str]]:
"""检查版本更新"""
try:
proxies = None
if proxy_url:
proxies = {"http": proxy_url, "https": proxy_url}
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
"Accept": "text/plain, */*",
"Cache-Control": "no-cache",
}
response = requests.get(
version_url, proxies=proxies, headers=headers, timeout=10
)
response.raise_for_status()
remote_version = response.text.strip()
print(f"当前版本: {current_version}, 远程版本: {remote_version}")
# 比较版本
def parse_version(version_str):
try:
parts = version_str.strip().split(".")
if len(parts) != 3:
raise ValueError("版本号格式不正确")
return int(parts[0]), int(parts[1]), int(parts[2])
except:
return 0, 0, 0
current_tuple = parse_version(current_version)
remote_tuple = parse_version(remote_version)
need_update = current_tuple < remote_tuple
return need_update, remote_version if need_update else None
except Exception as e:
print(f"版本检查失败: {e}")
return False, None
# === 主分析器 ===
class NewsAnalyzer:
"""新闻分析器"""
# 模式策略定义
MODE_STRATEGIES = {
"incremental": {
"mode_name": "增量模式",
"description": "增量模式(只关注新增新闻,无新增时不推送)",
"realtime_report_type": "实时增量",
"summary_report_type": "当日汇总",
"should_send_realtime": True,
"should_generate_summary": True,
"summary_mode": "daily",
},
"current": {
"mode_name": "当前榜单模式",
"description": "当前榜单模式(当前榜单匹配新闻 + 新增新闻区域 + 按时推送)",
"realtime_report_type": "实时当前榜单",
"summary_report_type": "当前榜单汇总",
"should_send_realtime": True,
"should_generate_summary": True,
"summary_mode": "current",
},
"daily": {
"mode_name": "当日汇总模式",
"description": "当日汇总模式(所有匹配新闻 + 新增新闻区域 + 按时推送)",
"realtime_report_type": "",
"summary_report_type": "当日汇总",
"should_send_realtime": False,
"should_generate_summary": True,
"summary_mode": "daily",
},
}
def __init__(self):
# 加载配置
print("正在加载配置...")
config = load_config()
print(f"TrendRadar v{VERSION} 配置加载完成")
print(f"监控平台数量: {len(config['PLATFORMS'])}")
print(f"时区: {config.get('TIMEZONE', 'Asia/Shanghai')}")
# 创建应用上下文
self.ctx = AppContext(config)
self.request_interval = self.ctx.config["REQUEST_INTERVAL"]
self.report_mode = self.ctx.config["REPORT_MODE"]
self.rank_threshold = self.ctx.rank_threshold
self.is_github_actions = os.environ.get("GITHUB_ACTIONS") == "true"
self.is_docker_container = self._detect_docker_environment()
self.update_info = None
self.proxy_url = None
self._setup_proxy()
self.data_fetcher = DataFetcher(self.proxy_url)
# 初始化存储管理器(使用 AppContext
self._init_storage_manager()
if self.is_github_actions:
self._check_version_update()
def _init_storage_manager(self) -> None:
"""初始化存储管理器(使用 AppContext"""
# 获取数据保留天数(支持环境变量覆盖)
env_retention = os.environ.get("STORAGE_RETENTION_DAYS", "").strip()
if env_retention:
# 环境变量覆盖配置
self.ctx.config["STORAGE"]["RETENTION_DAYS"] = int(env_retention)
self.storage_manager = self.ctx.get_storage_manager()
print(f"存储后端: {self.storage_manager.backend_name}")
retention_days = self.ctx.config.get("STORAGE", {}).get("RETENTION_DAYS", 0)
if retention_days > 0:
print(f"数据保留天数: {retention_days}")
def _detect_docker_environment(self) -> bool:
"""检测是否运行在 Docker 容器中"""
try:
if os.environ.get("DOCKER_CONTAINER") == "true":
return True
if os.path.exists("/.dockerenv"):
return True
return False
except Exception:
return False
def _should_open_browser(self) -> bool:
"""判断是否应该打开浏览器"""
return not self.is_github_actions and not self.is_docker_container
def _setup_proxy(self) -> None:
"""设置代理配置"""
if not self.is_github_actions and self.ctx.config["USE_PROXY"]:
self.proxy_url = self.ctx.config["DEFAULT_PROXY"]
print("本地环境,使用代理")
elif not self.is_github_actions and not self.ctx.config["USE_PROXY"]:
print("本地环境,未启用代理")
else:
print("GitHub Actions环境不使用代理")
def _check_version_update(self) -> None:
"""检查版本更新"""
try:
need_update, remote_version = check_version_update(
VERSION, self.ctx.config["VERSION_CHECK_URL"], self.proxy_url
)
if need_update and remote_version:
self.update_info = {
"current_version": VERSION,
"remote_version": remote_version,
}
print(f"发现新版本: {remote_version} (当前: {VERSION})")
else:
print("版本检查完成,当前为最新版本")
except Exception as e:
print(f"版本检查出错: {e}")
def _get_mode_strategy(self) -> Dict:
"""获取当前模式的策略配置"""
return self.MODE_STRATEGIES.get(self.report_mode, self.MODE_STRATEGIES["daily"])
def _has_notification_configured(self) -> bool:
"""检查是否配置了任何通知渠道"""
cfg = self.ctx.config
return any(
[
cfg["FEISHU_WEBHOOK_URL"],
cfg["DINGTALK_WEBHOOK_URL"],
cfg["WEWORK_WEBHOOK_URL"],
(cfg["TELEGRAM_BOT_TOKEN"] and cfg["TELEGRAM_CHAT_ID"]),
(
cfg["EMAIL_FROM"]
and cfg["EMAIL_PASSWORD"]
and cfg["EMAIL_TO"]
),
(cfg["NTFY_SERVER_URL"] and cfg["NTFY_TOPIC"]),
cfg["BARK_URL"],
cfg["SLACK_WEBHOOK_URL"],
]
)
def _has_valid_content(
self, stats: List[Dict], new_titles: Optional[Dict] = None
) -> bool:
"""检查是否有有效的新闻内容"""
if self.report_mode in ["incremental", "current"]:
# 增量模式和current模式下只要stats有内容就说明有匹配的新闻
return any(stat["count"] > 0 for stat in stats)
else:
# 当日汇总模式下,检查是否有匹配的频率词新闻或新增新闻
has_matched_news = any(stat["count"] > 0 for stat in stats)
has_new_news = bool(
new_titles and any(len(titles) > 0 for titles in new_titles.values())
)
return has_matched_news or has_new_news
def _load_analysis_data(
self,
) -> Optional[Tuple[Dict, Dict, Dict, Dict, List, List]]:
"""统一的数据加载和预处理,使用当前监控平台列表过滤历史数据"""
try:
# 获取当前配置的监控平台ID列表
current_platform_ids = self.ctx.platform_ids
print(f"当前监控平台: {current_platform_ids}")
all_results, id_to_name, title_info = self.ctx.read_today_titles(
current_platform_ids
)
if not all_results:
print("没有找到当天的数据")
return None
total_titles = sum(len(titles) for titles in all_results.values())
print(f"读取到 {total_titles} 个标题(已按当前监控平台过滤)")
new_titles = self.ctx.detect_new_titles(current_platform_ids)
word_groups, filter_words, global_filters = self.ctx.load_frequency_words()
return (
all_results,
id_to_name,
title_info,
new_titles,
word_groups,
filter_words,
global_filters,
)
except Exception as e:
print(f"数据加载失败: {e}")
return None
def _prepare_current_title_info(self, results: Dict, time_info: str) -> Dict:
"""从当前抓取结果构建标题信息"""
title_info = {}
for source_id, titles_data in results.items():
title_info[source_id] = {}
for title, title_data in titles_data.items():
ranks = title_data.get("ranks", [])
url = title_data.get("url", "")
mobile_url = title_data.get("mobileUrl", "")
title_info[source_id][title] = {
"first_time": time_info,
"last_time": time_info,
"count": 1,
"ranks": ranks,
"url": url,
"mobileUrl": mobile_url,
}
return title_info
def _run_analysis_pipeline(
self,
data_source: Dict,
mode: str,
title_info: Dict,
new_titles: Dict,
word_groups: List[Dict],
filter_words: List[str],
id_to_name: Dict,
failed_ids: Optional[List] = None,
is_daily_summary: bool = False,
global_filters: Optional[List[str]] = None,
) -> Tuple[List[Dict], Optional[str]]:
"""统一的分析流水线:数据处理 → 统计计算 → HTML生成"""
# 统计计算(使用 AppContext
stats, total_titles = self.ctx.count_frequency(
data_source,
word_groups,
filter_words,
id_to_name,
title_info,
new_titles,
mode=mode,
global_filters=global_filters,
)
# HTML生成如果启用
html_file = None
if self.ctx.config["STORAGE"]["FORMATS"]["HTML"]:
html_file = self.ctx.generate_html(
stats,
total_titles,
failed_ids=failed_ids,
new_titles=new_titles,
id_to_name=id_to_name,
mode=mode,
is_daily_summary=is_daily_summary,
update_info=self.update_info if self.ctx.config["SHOW_VERSION_UPDATE"] else None,
)
return stats, html_file
def _send_notification_if_needed(
self,
stats: List[Dict],
report_type: str,
mode: str,
failed_ids: Optional[List] = None,
new_titles: Optional[Dict] = None,
id_to_name: Optional[Dict] = None,
html_file_path: Optional[str] = None,
) -> bool:
"""统一的通知发送逻辑,包含所有判断条件"""
has_notification = self._has_notification_configured()
cfg = self.ctx.config
if (
cfg["ENABLE_NOTIFICATION"]
and has_notification
and self._has_valid_content(stats, new_titles)
):
# 推送窗口控制
if cfg["PUSH_WINDOW"]["ENABLED"]:
push_manager = self.ctx.create_push_manager()
time_range_start = cfg["PUSH_WINDOW"]["TIME_RANGE"]["START"]
time_range_end = cfg["PUSH_WINDOW"]["TIME_RANGE"]["END"]
if not push_manager.is_in_time_range(time_range_start, time_range_end):
now = self.ctx.get_time()
print(
f"推送窗口控制:当前时间 {now.strftime('%H:%M')} 不在推送时间窗口 {time_range_start}-{time_range_end} 内,跳过推送"
)
return False
if cfg["PUSH_WINDOW"]["ONCE_PER_DAY"]:
if push_manager.has_pushed_today():
print(f"推送窗口控制:今天已推送过,跳过本次推送")
return False
else:
print(f"推送窗口控制:今天首次推送")
# 准备报告数据
report_data = self.ctx.prepare_report(stats, failed_ids, new_titles, id_to_name, mode)
# 是否发送版本更新信息
update_info_to_send = self.update_info if cfg["SHOW_VERSION_UPDATE"] else None
# 使用 NotificationDispatcher 发送到所有渠道
dispatcher = self.ctx.create_notification_dispatcher()
results = dispatcher.dispatch_all(
report_data=report_data,
report_type=report_type,
update_info=update_info_to_send,
proxy_url=self.proxy_url,
mode=mode,
html_file_path=html_file_path,
)
if not results:
print("未配置任何通知渠道,跳过通知发送")
return False
# 如果成功发送了任何通知,且启用了每天只推一次,则记录推送
if (
cfg["PUSH_WINDOW"]["ENABLED"]
and cfg["PUSH_WINDOW"]["ONCE_PER_DAY"]
and any(results.values())
):
push_manager = self.ctx.create_push_manager()
push_manager.record_push(report_type)
return True
elif cfg["ENABLE_NOTIFICATION"] and not has_notification:
print("⚠️ 警告:通知功能已启用但未配置任何通知渠道,将跳过通知发送")
elif not cfg["ENABLE_NOTIFICATION"]:
print(f"跳过{report_type}通知:通知功能已禁用")
elif (
cfg["ENABLE_NOTIFICATION"]
and has_notification
and not self._has_valid_content(stats, new_titles)
):
mode_strategy = self._get_mode_strategy()
if "实时" in report_type:
print(
f"跳过实时推送通知:{mode_strategy['mode_name']}下未检测到匹配的新闻"
)
else:
print(
f"跳过{mode_strategy['summary_report_type']}通知:未匹配到有效的新闻内容"
)
return False
def _generate_summary_report(self, mode_strategy: Dict) -> Optional[str]:
"""生成汇总报告(带通知)"""
summary_type = (
"当前榜单汇总" if mode_strategy["summary_mode"] == "current" else "当日汇总"
)
print(f"生成{summary_type}报告...")
# 加载分析数据
analysis_data = self._load_analysis_data()
if not analysis_data:
return None
all_results, id_to_name, title_info, new_titles, word_groups, filter_words, global_filters = (
analysis_data
)
# 运行分析流水线
stats, html_file = self._run_analysis_pipeline(
all_results,
mode_strategy["summary_mode"],
title_info,
new_titles,
word_groups,
filter_words,
id_to_name,
is_daily_summary=True,
global_filters=global_filters,
)
if html_file:
print(f"{summary_type}报告已生成: {html_file}")
# 发送通知
self._send_notification_if_needed(
stats,
mode_strategy["summary_report_type"],
mode_strategy["summary_mode"],
failed_ids=[],
new_titles=new_titles,
id_to_name=id_to_name,
html_file_path=html_file,
)
return html_file
def _generate_summary_html(self, mode: str = "daily") -> Optional[str]:
"""生成汇总HTML"""
summary_type = "当前榜单汇总" if mode == "current" else "当日汇总"
print(f"生成{summary_type}HTML...")
# 加载分析数据
analysis_data = self._load_analysis_data()
if not analysis_data:
return None
all_results, id_to_name, title_info, new_titles, word_groups, filter_words, global_filters = (
analysis_data
)
# 运行分析流水线
_, html_file = self._run_analysis_pipeline(
all_results,
mode,
title_info,
new_titles,
word_groups,
filter_words,
id_to_name,
is_daily_summary=True,
global_filters=global_filters,
)
if html_file:
print(f"{summary_type}HTML已生成: {html_file}")
return html_file
def _initialize_and_check_config(self) -> None:
"""通用初始化和配置检查"""
now = self.ctx.get_time()
print(f"当前北京时间: {now.strftime('%Y-%m-%d %H:%M:%S')}")
if not self.ctx.config["ENABLE_CRAWLER"]:
print("爬虫功能已禁用ENABLE_CRAWLER=False程序退出")
return
has_notification = self._has_notification_configured()
if not self.ctx.config["ENABLE_NOTIFICATION"]:
print("通知功能已禁用ENABLE_NOTIFICATION=False将只进行数据抓取")
elif not has_notification:
print("未配置任何通知渠道,将只进行数据抓取,不发送通知")
else:
print("通知功能已启用,将发送通知")
mode_strategy = self._get_mode_strategy()
print(f"报告模式: {self.report_mode}")
print(f"运行模式: {mode_strategy['description']}")
def _crawl_data(self) -> Tuple[Dict, Dict, List]:
"""执行数据爬取"""
ids = []
for platform in self.ctx.platforms:
if "name" in platform:
ids.append((platform["id"], platform["name"]))
else:
ids.append(platform["id"])
print(
f"配置的监控平台: {[p.get('name', p['id']) for p in self.ctx.platforms]}"
)
print(f"开始爬取数据,请求间隔 {self.request_interval} 毫秒")
Path("output").mkdir(parents=True, exist_ok=True)
results, id_to_name, failed_ids = self.data_fetcher.crawl_websites(
ids, self.request_interval
)
# 转换为 NewsData 格式并保存到存储后端
crawl_time = self.ctx.format_time()
crawl_date = self.ctx.format_date()
news_data = convert_crawl_results_to_news_data(
results, id_to_name, failed_ids, crawl_time, crawl_date
)
# 保存到存储后端SQLite
if self.storage_manager.save_news_data(news_data):
print(f"数据已保存到存储后端: {self.storage_manager.backend_name}")
# 保存 TXT 快照(如果启用)
txt_file = self.storage_manager.save_txt_snapshot(news_data)
if txt_file:
print(f"TXT 快照已保存: {txt_file}")
# 兼容:同时保存到原有 TXT 格式(确保向后兼容)
if self.ctx.config["STORAGE"]["FORMATS"]["TXT"]:
title_file = self.ctx.save_titles(results, id_to_name, failed_ids)
print(f"标题已保存到: {title_file}")
return results, id_to_name, failed_ids
def _execute_mode_strategy(
self, mode_strategy: Dict, results: Dict, id_to_name: Dict, failed_ids: List
) -> Optional[str]:
"""执行模式特定逻辑"""
# 获取当前监控平台ID列表
current_platform_ids = self.ctx.platform_ids
new_titles = self.ctx.detect_new_titles(current_platform_ids)
time_info = self.ctx.format_time()
if self.ctx.config["STORAGE"]["FORMATS"]["TXT"]:
self.ctx.save_titles(results, id_to_name, failed_ids)
word_groups, filter_words, global_filters = self.ctx.load_frequency_words()
# current模式下实时推送需要使用完整的历史数据来保证统计信息的完整性
if self.report_mode == "current":
# 加载完整的历史数据(已按当前平台过滤)
analysis_data = self._load_analysis_data()
if analysis_data:
(
all_results,
historical_id_to_name,
historical_title_info,
historical_new_titles,
_,
_,
_,
) = analysis_data
print(
f"current模式使用过滤后的历史数据包含平台{list(all_results.keys())}"
)
stats, html_file = self._run_analysis_pipeline(
all_results,
self.report_mode,
historical_title_info,
historical_new_titles,
word_groups,
filter_words,
historical_id_to_name,
failed_ids=failed_ids,
global_filters=global_filters,
)
combined_id_to_name = {**historical_id_to_name, **id_to_name}
if html_file:
print(f"HTML报告已生成: {html_file}")
# 发送实时通知(使用完整历史数据的统计结果)
summary_html = None
if mode_strategy["should_send_realtime"]:
self._send_notification_if_needed(
stats,
mode_strategy["realtime_report_type"],
self.report_mode,
failed_ids=failed_ids,
new_titles=historical_new_titles,
id_to_name=combined_id_to_name,
html_file_path=html_file,
)
else:
print("❌ 严重错误:无法读取刚保存的数据文件")
raise RuntimeError("数据一致性检查失败:保存后立即读取失败")
else:
title_info = self._prepare_current_title_info(results, time_info)
stats, html_file = self._run_analysis_pipeline(
results,
self.report_mode,
title_info,
new_titles,
word_groups,
filter_words,
id_to_name,
failed_ids=failed_ids,
global_filters=global_filters,
)
if html_file:
print(f"HTML报告已生成: {html_file}")
# 发送实时通知(如果需要)
summary_html = None
if mode_strategy["should_send_realtime"]:
self._send_notification_if_needed(
stats,
mode_strategy["realtime_report_type"],
self.report_mode,
failed_ids=failed_ids,
new_titles=new_titles,
id_to_name=id_to_name,
html_file_path=html_file,
)
# 生成汇总报告(如果需要)
summary_html = None
if mode_strategy["should_generate_summary"]:
if mode_strategy["should_send_realtime"]:
# 如果已经发送了实时通知汇总只生成HTML不发送通知
summary_html = self._generate_summary_html(
mode_strategy["summary_mode"]
)
else:
# daily模式直接生成汇总报告并发送通知
summary_html = self._generate_summary_report(mode_strategy)
# 打开浏览器(仅在非容器环境)
if self._should_open_browser() and html_file:
if summary_html:
summary_url = "file://" + str(Path(summary_html).resolve())
print(f"正在打开汇总报告: {summary_url}")
webbrowser.open(summary_url)
else:
file_url = "file://" + str(Path(html_file).resolve())
print(f"正在打开HTML报告: {file_url}")
webbrowser.open(file_url)
elif self.is_docker_container and html_file:
if summary_html:
print(f"汇总报告已生成Docker环境: {summary_html}")
else:
print(f"HTML报告已生成Docker环境: {html_file}")
return summary_html
def run(self) -> None:
"""执行分析流程"""
try:
self._initialize_and_check_config()
mode_strategy = self._get_mode_strategy()
results, id_to_name, failed_ids = self._crawl_data()
self._execute_mode_strategy(mode_strategy, results, id_to_name, failed_ids)
except Exception as e:
print(f"分析流程执行出错: {e}")
raise
finally:
# 清理资源(包括过期数据清理和数据库连接关闭)
self.ctx.cleanup()
def main():
"""主程序入口"""
try:
analyzer = NewsAnalyzer()
analyzer.run()
except FileNotFoundError as e:
print(f"❌ 配置文件错误: {e}")
print("\n请确保以下文件存在:")
print(" • config/config.yaml")
print(" • config/frequency_words.txt")
print("\n参考项目文档进行正确配置")
except Exception as e:
print(f"❌ 程序运行错误: {e}")
raise
if __name__ == "__main__":
main()
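
A minimal usage sketch for the new entry point (illustrative only, not part of this commit; it assumes config/config.yaml and config/frequency_words.txt are already in place):

# Command line (environment variables can override parts of the config):
#   REPORT_MODE=current ENABLE_NOTIFICATION=false python -m trendradar
# Programmatic equivalent:
from trendradar.__main__ import NewsAnalyzer

analyzer = NewsAnalyzer()   # loads config, picks the storage backend, detects the runtime environment
analyzer.run()              # crawl -> analyze -> generate reports -> push if configured, then clean up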

388
trendradar/context.py Normal file
View File

@ -0,0 +1,388 @@
# coding=utf-8
"""
应用上下文模块
提供配置上下文类,封装所有依赖配置的操作,消除全局状态和包装函数
"""
from datetime import datetime
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple
from trendradar.utils.time import (
get_configured_time,
format_date_folder,
format_time_filename,
get_current_time_display,
convert_time_for_display,
)
from trendradar.core import (
load_frequency_words,
matches_word_groups,
save_titles_to_file,
read_all_today_titles,
detect_latest_new_titles,
is_first_crawl_today,
count_word_frequency,
)
from trendradar.report import (
clean_title,
prepare_report_data,
generate_html_report,
render_html_content,
)
from trendradar.notification import (
render_feishu_content,
render_dingtalk_content,
split_content_into_batches,
NotificationDispatcher,
PushRecordManager,
)
from trendradar.storage import get_storage_manager
class AppContext:
"""
应用上下文类
封装所有依赖配置的操作,提供统一的接口
消除对全局 CONFIG 的依赖,提高可测试性
使用示例:
config = load_config()
ctx = AppContext(config)
# 时间操作
now = ctx.get_time()
date_folder = ctx.format_date()
# 存储操作
storage = ctx.get_storage_manager()
# 报告生成
html = ctx.generate_html_report(stats, total_titles, ...)
"""
def __init__(self, config: Dict[str, Any]):
"""
初始化应用上下文
Args:
config: 完整的配置字典
"""
self.config = config
self._storage_manager = None
# === 配置访问 ===
@property
def timezone(self) -> str:
"""获取配置的时区"""
return self.config.get("TIMEZONE", "Asia/Shanghai")
@property
def rank_threshold(self) -> int:
"""获取排名阈值"""
return self.config.get("RANK_THRESHOLD", 50)
@property
def weight_config(self) -> Dict:
"""获取权重配置"""
return self.config.get("WEIGHT_CONFIG", {})
@property
def platforms(self) -> List[Dict]:
"""获取平台配置列表"""
return self.config.get("PLATFORMS", [])
@property
def platform_ids(self) -> List[str]:
"""获取平台ID列表"""
return [p["id"] for p in self.platforms]
# === 时间操作 ===
def get_time(self) -> datetime:
"""获取当前配置时区的时间"""
return get_configured_time(self.timezone)
def format_date(self) -> str:
"""格式化日期文件夹 (YYYY-MM-DD)"""
return format_date_folder(timezone=self.timezone)
def format_time(self) -> str:
"""格式化时间文件名 (HH-MM)"""
return format_time_filename(self.timezone)
def get_time_display(self) -> str:
"""获取时间显示 (HH:MM)"""
return get_current_time_display(self.timezone)
@staticmethod
def convert_time_display(time_str: str) -> str:
"""将 HH-MM 转换为 HH:MM"""
return convert_time_for_display(time_str)
# === 存储操作 ===
def get_storage_manager(self):
"""获取存储管理器(延迟初始化,单例)"""
if self._storage_manager is None:
storage_config = self.config.get("STORAGE", {})
remote_config = storage_config.get("REMOTE", {})
local_config = storage_config.get("LOCAL", {})
pull_config = storage_config.get("PULL", {})
self._storage_manager = get_storage_manager(
backend_type=storage_config.get("BACKEND", "auto"),
data_dir=local_config.get("DATA_DIR", "output"),
enable_txt=storage_config.get("FORMATS", {}).get("TXT", True),
enable_html=storage_config.get("FORMATS", {}).get("HTML", True),
remote_config={
"bucket_name": remote_config.get("BUCKET_NAME", ""),
"access_key_id": remote_config.get("ACCESS_KEY_ID", ""),
"secret_access_key": remote_config.get("SECRET_ACCESS_KEY", ""),
"endpoint_url": remote_config.get("ENDPOINT_URL", ""),
"region": remote_config.get("REGION", ""),
},
local_retention_days=local_config.get("RETENTION_DAYS", 0),
remote_retention_days=remote_config.get("RETENTION_DAYS", 0),
pull_enabled=pull_config.get("ENABLED", False),
pull_days=pull_config.get("DAYS", 7),
timezone=self.timezone,
)
return self._storage_manager
def get_output_path(self, subfolder: str, filename: str) -> str:
"""获取输出路径"""
output_dir = Path("output") / self.format_date() / subfolder
output_dir.mkdir(parents=True, exist_ok=True)
return str(output_dir / filename)
# === 数据处理 ===
def save_titles(self, results: Dict, id_to_name: Dict, failed_ids: List) -> str:
"""保存标题到文件"""
output_path = self.get_output_path("txt", f"{self.format_time()}.txt")
return save_titles_to_file(results, id_to_name, failed_ids, output_path, clean_title)
def read_today_titles(
self, platform_ids: Optional[List[str]] = None
) -> Tuple[Dict, Dict, Dict]:
"""读取当天所有标题"""
return read_all_today_titles(self.get_storage_manager(), platform_ids)
def detect_new_titles(
self, platform_ids: Optional[List[str]] = None
) -> Dict:
"""检测最新批次的新增标题"""
return detect_latest_new_titles(self.get_storage_manager(), platform_ids)
def is_first_crawl(self) -> bool:
"""检测是否是当天第一次爬取"""
return is_first_crawl_today("output", self.format_date())
# === 频率词处理 ===
def load_frequency_words(
self, frequency_file: Optional[str] = None
) -> Tuple[List[Dict], List[str], List[str]]:
"""加载频率词配置"""
return load_frequency_words(frequency_file)
def matches_word_groups(
self,
title: str,
word_groups: List[Dict],
filter_words: List[str],
global_filters: Optional[List[str]] = None,
) -> bool:
"""检查标题是否匹配词组规则"""
return matches_word_groups(title, word_groups, filter_words, global_filters)
# === 统计分析 ===
def count_frequency(
self,
results: Dict,
word_groups: List[Dict],
filter_words: List[str],
id_to_name: Dict,
title_info: Optional[Dict] = None,
new_titles: Optional[Dict] = None,
mode: str = "daily",
global_filters: Optional[List[str]] = None,
) -> Tuple[List[Dict], int]:
"""统计词频"""
return count_word_frequency(
results=results,
word_groups=word_groups,
filter_words=filter_words,
id_to_name=id_to_name,
title_info=title_info,
rank_threshold=self.rank_threshold,
new_titles=new_titles,
mode=mode,
global_filters=global_filters,
weight_config=self.weight_config,
max_news_per_keyword=self.config.get("MAX_NEWS_PER_KEYWORD", 0),
sort_by_position_first=self.config.get("SORT_BY_POSITION_FIRST", False),
is_first_crawl_func=self.is_first_crawl,
convert_time_func=self.convert_time_display,
)
# === 报告生成 ===
def prepare_report(
self,
stats: List[Dict],
failed_ids: Optional[List] = None,
new_titles: Optional[Dict] = None,
id_to_name: Optional[Dict] = None,
mode: str = "daily",
) -> Dict:
"""准备报告数据"""
return prepare_report_data(
stats=stats,
failed_ids=failed_ids,
new_titles=new_titles,
id_to_name=id_to_name,
mode=mode,
rank_threshold=self.rank_threshold,
matches_word_groups_func=self.matches_word_groups,
load_frequency_words_func=self.load_frequency_words,
)
def generate_html(
self,
stats: List[Dict],
total_titles: int,
failed_ids: Optional[List] = None,
new_titles: Optional[Dict] = None,
id_to_name: Optional[Dict] = None,
mode: str = "daily",
is_daily_summary: bool = False,
update_info: Optional[Dict] = None,
) -> str:
"""生成HTML报告"""
return generate_html_report(
stats=stats,
total_titles=total_titles,
failed_ids=failed_ids,
new_titles=new_titles,
id_to_name=id_to_name,
mode=mode,
is_daily_summary=is_daily_summary,
update_info=update_info,
rank_threshold=self.rank_threshold,
output_dir="output",
date_folder=self.format_date(),
time_filename=self.format_time(),
render_html_func=lambda *args, **kwargs: self.render_html(*args, **kwargs),
matches_word_groups_func=self.matches_word_groups,
load_frequency_words_func=self.load_frequency_words,
enable_index_copy=True,
)
def render_html(
self,
report_data: Dict,
total_titles: int,
is_daily_summary: bool = False,
mode: str = "daily",
update_info: Optional[Dict] = None,
) -> str:
"""渲染HTML内容"""
return render_html_content(
report_data=report_data,
total_titles=total_titles,
is_daily_summary=is_daily_summary,
mode=mode,
update_info=update_info,
reverse_content_order=self.config.get("REVERSE_CONTENT_ORDER", False),
get_time_func=self.get_time,
)
# === 通知内容渲染 ===
def render_feishu(
self,
report_data: Dict,
update_info: Optional[Dict] = None,
mode: str = "daily",
) -> str:
"""渲染飞书内容"""
return render_feishu_content(
report_data=report_data,
update_info=update_info,
mode=mode,
separator=self.config.get("FEISHU_MESSAGE_SEPARATOR", "---"),
reverse_content_order=self.config.get("REVERSE_CONTENT_ORDER", False),
get_time_func=self.get_time,
)
def render_dingtalk(
self,
report_data: Dict,
update_info: Optional[Dict] = None,
mode: str = "daily",
) -> str:
"""渲染钉钉内容"""
return render_dingtalk_content(
report_data=report_data,
update_info=update_info,
mode=mode,
reverse_content_order=self.config.get("REVERSE_CONTENT_ORDER", False),
get_time_func=self.get_time,
)
def split_content(
self,
report_data: Dict,
format_type: str,
update_info: Optional[Dict] = None,
max_bytes: Optional[int] = None,
mode: str = "daily",
) -> List[str]:
"""分批处理消息内容"""
return split_content_into_batches(
report_data=report_data,
format_type=format_type,
update_info=update_info,
max_bytes=max_bytes,
mode=mode,
batch_sizes={
"dingtalk": self.config.get("DINGTALK_BATCH_SIZE", 20000),
"feishu": self.config.get("FEISHU_BATCH_SIZE", 29000),
"default": self.config.get("MESSAGE_BATCH_SIZE", 4000),
},
feishu_separator=self.config.get("FEISHU_MESSAGE_SEPARATOR", "---"),
reverse_content_order=self.config.get("REVERSE_CONTENT_ORDER", False),
get_time_func=self.get_time,
)
# === 通知发送 ===
def create_notification_dispatcher(self) -> NotificationDispatcher:
"""创建通知调度器"""
return NotificationDispatcher(
config=self.config,
get_time_func=self.get_time,
split_content_func=self.split_content,
)
def create_push_manager(self) -> PushRecordManager:
"""创建推送记录管理器"""
return PushRecordManager(
storage_backend=self.get_storage_manager(),
get_time_func=self.get_time,
)
# === 资源清理 ===
def cleanup(self):
"""清理资源"""
if self._storage_manager:
self._storage_manager.cleanup_old_data()
self._storage_manager.cleanup()
self._storage_manager = None
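
A hedged lifecycle sketch for AppContext (not part of this commit), showing the lazily created storage manager and the cleanup step:

from trendradar.core import load_config
from trendradar.context import AppContext

ctx = AppContext(load_config())
try:
    storage = ctx.get_storage_manager()   # created on first access, cached on the context
    print(ctx.format_date(), storage.backend_name)
finally:
    ctx.cleanup()   # purge expired data, close the backend, drop the cached manager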

View File

@ -0,0 +1,47 @@
# coding=utf-8
"""
核心模块 - 配置管理和核心工具
"""
from trendradar.core.config import (
parse_multi_account_config,
validate_paired_configs,
limit_accounts,
get_account_at_index,
)
from trendradar.core.loader import load_config
from trendradar.core.frequency import load_frequency_words, matches_word_groups
from trendradar.core.data import (
save_titles_to_file,
read_all_today_titles_from_storage,
read_all_today_titles,
detect_latest_new_titles_from_storage,
detect_latest_new_titles,
is_first_crawl_today,
)
from trendradar.core.analyzer import (
calculate_news_weight,
format_time_display,
count_word_frequency,
)
__all__ = [
"parse_multi_account_config",
"validate_paired_configs",
"limit_accounts",
"get_account_at_index",
"load_config",
"load_frequency_words",
"matches_word_groups",
# 数据处理
"save_titles_to_file",
"read_all_today_titles_from_storage",
"read_all_today_titles",
"detect_latest_new_titles_from_storage",
"detect_latest_new_titles",
"is_first_crawl_today",
# 统计分析
"calculate_news_weight",
"format_time_display",
"count_word_frequency",
]

469
trendradar/core/analyzer.py Normal file
View File

@ -0,0 +1,469 @@
# coding=utf-8
"""
统计分析模块
提供新闻统计和分析功能
- calculate_news_weight: 计算新闻权重
- format_time_display: 格式化时间显示
- count_word_frequency: 统计词频
"""
from typing import Dict, List, Tuple, Optional, Callable
from trendradar.core.frequency import matches_word_groups
def calculate_news_weight(
title_data: Dict,
rank_threshold: int,
weight_config: Dict,
) -> float:
"""
计算新闻权重,用于排序
Args:
title_data: 标题数据,包含 ranks 和 count
rank_threshold: 排名阈值
weight_config: 权重配置 {RANK_WEIGHT, FREQUENCY_WEIGHT, HOTNESS_WEIGHT}
Returns:
float: 计算出的权重值
"""
ranks = title_data.get("ranks", [])
if not ranks:
return 0.0
count = title_data.get("count", len(ranks))
# 排名权重:Σ(11 - min(rank, 10)) / 出现次数
rank_scores = []
for rank in ranks:
score = 11 - min(rank, 10)
rank_scores.append(score)
rank_weight = sum(rank_scores) / len(ranks) if ranks else 0
# 频次权重min(出现次数, 10) × 10
frequency_weight = min(count, 10) * 10
# 热度加成:高排名次数 / 总出现次数 × 100
high_rank_count = sum(1 for rank in ranks if rank <= rank_threshold)
hotness_ratio = high_rank_count / len(ranks) if ranks else 0
hotness_weight = hotness_ratio * 100
total_weight = (
rank_weight * weight_config["RANK_WEIGHT"]
+ frequency_weight * weight_config["FREQUENCY_WEIGHT"]
+ hotness_weight * weight_config["HOTNESS_WEIGHT"]
)
return total_weight
def format_time_display(
first_time: str,
last_time: str,
convert_time_func: Callable[[str], str],
) -> str:
"""
格式化时间显示,将 HH-MM 转换为 HH:MM
Args:
first_time: 首次出现时间
last_time: 最后出现时间
convert_time_func: 时间格式转换函数
Returns:
str: 格式化后的时间显示字符串
"""
if not first_time:
return ""
# 转换为显示格式
first_display = convert_time_func(first_time)
last_display = convert_time_func(last_time)
if first_display == last_display or not last_display:
return first_display
else:
return f"[{first_display} ~ {last_display}]"
def count_word_frequency(
results: Dict,
word_groups: List[Dict],
filter_words: List[str],
id_to_name: Dict,
title_info: Optional[Dict] = None,
rank_threshold: int = 3,
new_titles: Optional[Dict] = None,
mode: str = "daily",
global_filters: Optional[List[str]] = None,
weight_config: Optional[Dict] = None,
max_news_per_keyword: int = 0,
sort_by_position_first: bool = False,
is_first_crawl_func: Optional[Callable[[], bool]] = None,
convert_time_func: Optional[Callable[[str], str]] = None,
) -> Tuple[List[Dict], int]:
"""
统计词频,支持必须词、频率词、过滤词、全局过滤词,并标记新增标题
Args:
results: 抓取结果 {source_id: {title: title_data}}
word_groups: 词组配置列表
filter_words: 过滤词列表
id_to_name: ID 到名称的映射
title_info: 标题统计信息(可选)
rank_threshold: 排名阈值
new_titles: 新增标题(可选)
mode: 报告模式 (daily/incremental/current)
global_filters: 全局过滤词(可选)
weight_config: 权重配置
max_news_per_keyword: 每个关键词最大显示数量
sort_by_position_first: 是否优先按配置位置排序
is_first_crawl_func: 检测是否是当天第一次爬取的函数
convert_time_func: 时间格式转换函数
Returns:
Tuple[List[Dict], int]: (统计结果列表, 总标题数)
"""
# 默认权重配置
if weight_config is None:
weight_config = {
"RANK_WEIGHT": 0.4,
"FREQUENCY_WEIGHT": 0.3,
"HOTNESS_WEIGHT": 0.3,
}
# 默认时间转换函数
if convert_time_func is None:
convert_time_func = lambda x: x
# 默认首次爬取检测函数
if is_first_crawl_func is None:
is_first_crawl_func = lambda: True
# 如果没有配置词组,创建一个包含所有新闻的虚拟词组
if not word_groups:
print("频率词配置为空,将显示所有新闻")
word_groups = [{"required": [], "normal": [], "group_key": "全部新闻"}]
filter_words = [] # 清空过滤词,显示所有新闻
is_first_today = is_first_crawl_func()
# 确定处理的数据源和新增标记逻辑
if mode == "incremental":
if is_first_today:
# 增量模式 + 当天第一次:处理所有新闻,都标记为新增
results_to_process = results
all_news_are_new = True
else:
# 增量模式 + 当天非第一次:只处理新增的新闻
results_to_process = new_titles if new_titles else {}
all_news_are_new = True
elif mode == "current":
# current 模式:只处理当前时间批次的新闻,但统计信息来自全部历史
if title_info:
latest_time = None
for source_titles in title_info.values():
for title_data in source_titles.values():
last_time = title_data.get("last_time", "")
if last_time:
if latest_time is None or last_time > latest_time:
latest_time = last_time
# 只处理 last_time 等于最新时间的新闻
if latest_time:
results_to_process = {}
for source_id, source_titles in results.items():
if source_id in title_info:
filtered_titles = {}
for title, title_data in source_titles.items():
if title in title_info[source_id]:
info = title_info[source_id][title]
if info.get("last_time") == latest_time:
filtered_titles[title] = title_data
if filtered_titles:
results_to_process[source_id] = filtered_titles
print(
f"当前榜单模式:最新时间 {latest_time},筛选出 {sum(len(titles) for titles in results_to_process.values())} 条当前榜单新闻"
)
else:
results_to_process = results
else:
results_to_process = results
all_news_are_new = False
else:
# 当日汇总模式:处理所有新闻
results_to_process = results
all_news_are_new = False
total_input_news = sum(len(titles) for titles in results.values())
filter_status = (
"全部显示"
if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
else "频率词过滤"
)
print(f"当日汇总模式:处理 {total_input_news} 条新闻,模式:{filter_status}")
word_stats = {}
total_titles = 0
processed_titles = {}
matched_new_count = 0
if title_info is None:
title_info = {}
if new_titles is None:
new_titles = {}
for group in word_groups:
group_key = group["group_key"]
word_stats[group_key] = {"count": 0, "titles": {}}
for source_id, titles_data in results_to_process.items():
total_titles += len(titles_data)
if source_id not in processed_titles:
processed_titles[source_id] = {}
for title, title_data in titles_data.items():
if title in processed_titles.get(source_id, {}):
continue
# 使用统一的匹配逻辑
matches_frequency_words = matches_word_groups(
title, word_groups, filter_words, global_filters
)
if not matches_frequency_words:
continue
# 如果是增量模式或 current 模式第一次,统计匹配的新增新闻数量
if (mode == "incremental" and all_news_are_new) or (
mode == "current" and is_first_today
):
matched_new_count += 1
source_ranks = title_data.get("ranks", [])
source_url = title_data.get("url", "")
source_mobile_url = title_data.get("mobileUrl", "")
# 找到匹配的词组(防御性转换确保类型安全)
title_lower = str(title).lower() if not isinstance(title, str) else title.lower()
for group in word_groups:
required_words = group["required"]
normal_words = group["normal"]
# 如果是"全部新闻"模式,所有标题都匹配第一个(唯一的)词组
if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻":
group_key = group["group_key"]
word_stats[group_key]["count"] += 1
if source_id not in word_stats[group_key]["titles"]:
word_stats[group_key]["titles"][source_id] = []
else:
# 原有的匹配逻辑
if required_words:
all_required_present = all(
req_word.lower() in title_lower
for req_word in required_words
)
if not all_required_present:
continue
if normal_words:
any_normal_present = any(
normal_word.lower() in title_lower
for normal_word in normal_words
)
if not any_normal_present:
continue
group_key = group["group_key"]
word_stats[group_key]["count"] += 1
if source_id not in word_stats[group_key]["titles"]:
word_stats[group_key]["titles"][source_id] = []
first_time = ""
last_time = ""
count_info = 1
ranks = source_ranks if source_ranks else []
url = source_url
mobile_url = source_mobile_url
# 对于 current 模式,从历史统计信息中获取完整数据
if (
mode == "current"
and title_info
and source_id in title_info
and title in title_info[source_id]
):
info = title_info[source_id][title]
first_time = info.get("first_time", "")
last_time = info.get("last_time", "")
count_info = info.get("count", 1)
if "ranks" in info and info["ranks"]:
ranks = info["ranks"]
url = info.get("url", source_url)
mobile_url = info.get("mobileUrl", source_mobile_url)
elif (
title_info
and source_id in title_info
and title in title_info[source_id]
):
info = title_info[source_id][title]
first_time = info.get("first_time", "")
last_time = info.get("last_time", "")
count_info = info.get("count", 1)
if "ranks" in info and info["ranks"]:
ranks = info["ranks"]
url = info.get("url", source_url)
mobile_url = info.get("mobileUrl", source_mobile_url)
if not ranks:
ranks = [99]
time_display = format_time_display(first_time, last_time, convert_time_func)
source_name = id_to_name.get(source_id, source_id)
# 判断是否为新增
is_new = False
if all_news_are_new:
# 增量模式下所有处理的新闻都是新增,或者当天第一次的所有新闻都是新增
is_new = True
elif new_titles and source_id in new_titles:
# 检查是否在新增列表中
new_titles_for_source = new_titles[source_id]
is_new = title in new_titles_for_source
word_stats[group_key]["titles"][source_id].append(
{
"title": title,
"source_name": source_name,
"first_time": first_time,
"last_time": last_time,
"time_display": time_display,
"count": count_info,
"ranks": ranks,
"rank_threshold": rank_threshold,
"url": url,
"mobileUrl": mobile_url,
"is_new": is_new,
}
)
if source_id not in processed_titles:
processed_titles[source_id] = {}
processed_titles[source_id][title] = True
break
# 最后统一打印汇总信息
if mode == "incremental":
if is_first_today:
total_input_news = sum(len(titles) for titles in results.values())
filter_status = (
"全部显示"
if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
else "频率词匹配"
)
print(
f"增量模式:当天第一次爬取,{total_input_news} 条新闻中有 {matched_new_count}{filter_status}"
)
else:
if new_titles:
total_new_count = sum(len(titles) for titles in new_titles.values())
filter_status = (
"全部显示"
if len(word_groups) == 1
and word_groups[0]["group_key"] == "全部新闻"
else "匹配频率词"
)
print(
f"增量模式:{total_new_count} 条新增新闻中,有 {matched_new_count}{filter_status}"
)
if matched_new_count == 0 and len(word_groups) > 1:
print("增量模式:没有新增新闻匹配频率词,将不会发送通知")
else:
print("增量模式:未检测到新增新闻")
elif mode == "current":
total_input_news = sum(len(titles) for titles in results_to_process.values())
if is_first_today:
filter_status = (
"全部显示"
if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
else "频率词匹配"
)
print(
f"当前榜单模式:当天第一次爬取,{total_input_news} 条当前榜单新闻中有 {matched_new_count}{filter_status}"
)
else:
matched_count = sum(stat["count"] for stat in word_stats.values())
filter_status = (
"全部显示"
if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
else "频率词匹配"
)
print(
f"当前榜单模式:{total_input_news} 条当前榜单新闻中有 {matched_count}{filter_status}"
)
stats = []
# 创建 group_key 到位置和最大数量的映射
group_key_to_position = {
group["group_key"]: idx for idx, group in enumerate(word_groups)
}
group_key_to_max_count = {
group["group_key"]: group.get("max_count", 0) for group in word_groups
}
for group_key, data in word_stats.items():
all_titles = []
for source_id, title_list in data["titles"].items():
all_titles.extend(title_list)
# 按权重排序
sorted_titles = sorted(
all_titles,
key=lambda x: (
-calculate_news_weight(x, rank_threshold, weight_config),
min(x["ranks"]) if x["ranks"] else 999,
-x["count"],
),
)
# 应用最大显示数量限制(优先级:单独配置 > 全局配置)
group_max_count = group_key_to_max_count.get(group_key, 0)
if group_max_count == 0:
# 使用全局配置
group_max_count = max_news_per_keyword
if group_max_count > 0:
sorted_titles = sorted_titles[:group_max_count]
stats.append(
{
"word": group_key,
"count": data["count"],
"position": group_key_to_position.get(group_key, 999),
"titles": sorted_titles,
"percentage": (
round(data["count"] / total_titles * 100, 2)
if total_titles > 0
else 0
),
}
)
# 根据配置选择排序优先级
if sort_by_position_first:
# 先按配置位置,再按热点条数
stats.sort(key=lambda x: (x["position"], -x["count"]))
else:
# 先按热点条数,再按配置位置(原逻辑)
stats.sort(key=lambda x: (-x["count"], x["position"]))
# 打印过滤后的匹配新闻数(与推送显示一致)
matched_news_count = sum(len(stat["titles"]) for stat in stats if stat["count"] > 0)
if mode == "daily":
print(f"频率词过滤后:{matched_news_count} 条新闻匹配(将显示在推送中)")
return stats, total_titles
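
A worked example of the weight formula above (illustrative only; the numbers are invented): a title seen at ranks 1, 3 and 12 across three crawls, with rank_threshold=10 and the default 0.4/0.3/0.3 weights used by count_word_frequency.

from trendradar.core.analyzer import calculate_news_weight

title_data = {"ranks": [1, 3, 12], "count": 3}
weights = {"RANK_WEIGHT": 0.4, "FREQUENCY_WEIGHT": 0.3, "HOTNESS_WEIGHT": 0.3}

# rank component:      ((11-1) + (11-3) + (11-10)) / 3 = 19/3 ≈ 6.33
# frequency component: min(3, 10) * 10 = 30
# hotness component:   2 of 3 ranks are <= 10 -> 2/3 * 100 ≈ 66.67
# total ≈ 6.33*0.4 + 30*0.3 + 66.67*0.3 ≈ 31.5
print(round(calculate_news_weight(title_data, 10, weights), 1))  # 31.5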

152
trendradar/core/config.py Normal file
View File

@ -0,0 +1,152 @@
# coding=utf-8
"""
配置工具模块 - 多账号配置解析和验证
提供多账号推送配置的解析、验证和限制功能
"""
from typing import Dict, List, Optional, Tuple
def parse_multi_account_config(config_value: str, separator: str = ";") -> List[str]:
"""
解析多账号配置,返回账号列表
Args:
config_value: 配置值字符串,多个账号用分隔符分隔
separator: 分隔符,默认为 ;
Returns:
账号列表,空字符串会被保留用于占位
Examples:
>>> parse_multi_account_config("url1;url2;url3")
['url1', 'url2', 'url3']
>>> parse_multi_account_config(";token2") # 第一个账号无token
['', 'token2']
>>> parse_multi_account_config("")
[]
"""
if not config_value:
return []
# 保留空字符串用于占位(如 ";token2" 表示第一个账号无token
accounts = [acc.strip() for acc in config_value.split(separator)]
# 过滤掉全部为空的情况
if all(not acc for acc in accounts):
return []
return accounts
def validate_paired_configs(
configs: Dict[str, List[str]],
channel_name: str,
required_keys: Optional[List[str]] = None
) -> Tuple[bool, int]:
"""
验证配对配置的数量是否一致
对于需要多个配置项配对的渠道(如 Telegram 的 token 和 chat_id),
验证所有配置项的账号数量是否一致
Args:
configs: 配置字典,key 为配置名,value 为账号列表
channel_name: 渠道名称,用于日志输出
required_keys: 必须有值的配置项列表
Returns:
(是否验证通过, 账号数量)
Examples:
>>> validate_paired_configs({
... "token": ["t1", "t2"],
... "chat_id": ["c1", "c2"]
... }, "Telegram", ["token", "chat_id"])
(True, 2)
>>> validate_paired_configs({
... "token": ["t1", "t2"],
... "chat_id": ["c1"] # 数量不匹配
... }, "Telegram", ["token", "chat_id"])
(False, 0)
"""
# 过滤掉空列表
non_empty_configs = {k: v for k, v in configs.items() if v}
if not non_empty_configs:
return True, 0
# 检查必须项
if required_keys:
for key in required_keys:
if key not in non_empty_configs or not non_empty_configs[key]:
return True, 0 # 必须项为空,视为未配置
# 获取所有非空配置的长度
lengths = {k: len(v) for k, v in non_empty_configs.items()}
unique_lengths = set(lengths.values())
if len(unique_lengths) > 1:
print(f"{channel_name} 配置错误:配对配置数量不一致,将跳过该渠道推送")
for key, length in lengths.items():
print(f" - {key}: {length}")
return False, 0
return True, list(unique_lengths)[0] if unique_lengths else 0
def limit_accounts(
accounts: List[str],
max_count: int,
channel_name: str
) -> List[str]:
"""
限制账号数量
当配置的账号数量超过最大限制时,只使用前 N 个账号,
并输出警告信息
Args:
accounts: 账号列表
max_count: 最大账号数量
channel_name: 渠道名称,用于日志输出
Returns:
限制后的账号列表
Examples:
>>> limit_accounts(["a1", "a2", "a3"], 2, "飞书")
飞书 配置了 3 个账号,超过最大限制 2,只使用前 2 个
['a1', 'a2']
"""
if len(accounts) > max_count:
print(f"⚠️ {channel_name} 配置了 {len(accounts)} 个账号,超过最大限制 {max_count},只使用前 {max_count}")
print(f" ⚠️ 警告:如果您是 fork 用户,过多账号可能导致 GitHub Actions 运行时间过长,存在账号风险")
return accounts[:max_count]
return accounts
def get_account_at_index(accounts: List[str], index: int, default: str = "") -> str:
"""
安全获取指定索引的账号值
当索引超出范围或账号值为空时返回默认值
Args:
accounts: 账号列表
index: 索引
default: 默认值
Returns:
账号值或默认值
Examples:
>>> get_account_at_index(["a", "b", "c"], 1)
'b'
>>> get_account_at_index(["a", "", "c"], 1, "default")
'default'
>>> get_account_at_index(["a"], 5, "default")
'default'
"""
if index < len(accounts):
return accounts[index] if accounts[index] else default
return default
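
A hypothetical end-to-end sketch (not part of this commit) of how these helpers compose for a paired channel such as Telegram:

from trendradar.core.config import (
    parse_multi_account_config, validate_paired_configs,
    limit_accounts, get_account_at_index,
)

tokens = parse_multi_account_config("t1;t2;t3")
chat_ids = parse_multi_account_config("c1;c2;c3")
ok, count = validate_paired_configs(
    {"token": tokens, "chat_id": chat_ids}, "Telegram", ["token", "chat_id"]
)
if ok and count:
    tokens = limit_accounts(tokens, 3, "Telegram")      # cap at MAX_ACCOUNTS_PER_CHANNEL
    chat_ids = limit_accounts(chat_ids, 3, "Telegram")
    for i in range(min(count, 3)):
        token = get_account_at_index(tokens, i)
        chat_id = get_account_at_index(chat_ids, i)
        # ... push to this (token, chat_id) pair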

291
trendradar/core/data.py Normal file
View File

@ -0,0 +1,291 @@
# coding=utf-8
"""
数据处理模块
提供数据读取、保存和检测功能
- save_titles_to_file: 保存标题到 TXT 文件
- read_all_today_titles: 从存储后端读取当天所有标题
- detect_latest_new_titles: 检测最新批次的新增标题
Author: TrendRadar Team
"""
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Callable
def save_titles_to_file(
results: Dict,
id_to_name: Dict,
failed_ids: List,
output_path: str,
clean_title_func: Callable[[str], str],
) -> str:
"""
保存标题到 TXT 文件
Args:
results: 抓取结果 {source_id: {title: title_data}}
id_to_name: ID 到名称的映射
failed_ids: 失败的 ID 列表
output_path: 输出文件路径
clean_title_func: 标题清理函数
Returns:
str: 保存的文件路径
"""
# 确保目录存在
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
for id_value, title_data in results.items():
# id | name 或 id
name = id_to_name.get(id_value)
if name and name != id_value:
f.write(f"{id_value} | {name}\n")
else:
f.write(f"{id_value}\n")
# 按排名排序标题
sorted_titles = []
for title, info in title_data.items():
cleaned_title = clean_title_func(title)
if isinstance(info, dict):
ranks = info.get("ranks", [])
url = info.get("url", "")
mobile_url = info.get("mobileUrl", "")
else:
ranks = info if isinstance(info, list) else []
url = ""
mobile_url = ""
rank = ranks[0] if ranks else 1
sorted_titles.append((rank, cleaned_title, url, mobile_url))
sorted_titles.sort(key=lambda x: x[0])
for rank, cleaned_title, url, mobile_url in sorted_titles:
line = f"{rank}. {cleaned_title}"
if url:
line += f" [URL:{url}]"
if mobile_url:
line += f" [MOBILE:{mobile_url}]"
f.write(line + "\n")
f.write("\n")
if failed_ids:
f.write("==== 以下ID请求失败 ====\n")
for id_value in failed_ids:
f.write(f"{id_value}\n")
return output_path
def read_all_today_titles_from_storage(
storage_manager,
current_platform_ids: Optional[List[str]] = None,
) -> Tuple[Dict, Dict, Dict]:
"""
从存储后端读取当天所有标题(SQLite 数据)
Args:
storage_manager: 存储管理器实例
current_platform_ids: 当前监控的平台 ID 列表用于过滤
Returns:
Tuple[Dict, Dict, Dict]: (all_results, id_to_name, title_info)
"""
try:
news_data = storage_manager.get_today_all_data()
if not news_data or not news_data.items:
return {}, {}, {}
all_results = {}
final_id_to_name = {}
title_info = {}
for source_id, news_list in news_data.items.items():
# 按平台过滤
if current_platform_ids is not None and source_id not in current_platform_ids:
continue
# 获取来源名称
source_name = news_data.id_to_name.get(source_id, source_id)
final_id_to_name[source_id] = source_name
if source_id not in all_results:
all_results[source_id] = {}
title_info[source_id] = {}
for item in news_list:
title = item.title
ranks = getattr(item, 'ranks', [item.rank])
first_time = getattr(item, 'first_time', item.crawl_time)
last_time = getattr(item, 'last_time', item.crawl_time)
count = getattr(item, 'count', 1)
all_results[source_id][title] = {
"ranks": ranks,
"url": item.url or "",
"mobileUrl": item.mobile_url or "",
}
title_info[source_id][title] = {
"first_time": first_time,
"last_time": last_time,
"count": count,
"ranks": ranks,
"url": item.url or "",
"mobileUrl": item.mobile_url or "",
}
return all_results, final_id_to_name, title_info
except Exception as e:
print(f"[存储] 从存储后端读取数据失败: {e}")
return {}, {}, {}
def read_all_today_titles(
storage_manager,
current_platform_ids: Optional[List[str]] = None,
) -> Tuple[Dict, Dict, Dict]:
"""
读取当天所有标题(从存储后端)
Args:
storage_manager: 存储管理器实例
current_platform_ids: 当前监控的平台 ID 列表用于过滤
Returns:
Tuple[Dict, Dict, Dict]: (all_results, id_to_name, title_info)
"""
all_results, final_id_to_name, title_info = read_all_today_titles_from_storage(
storage_manager, current_platform_ids
)
if all_results:
total_count = sum(len(titles) for titles in all_results.values())
print(f"[存储] 已从存储后端读取 {total_count} 条标题")
else:
print("[存储] 当天暂无数据")
return all_results, final_id_to_name, title_info
def detect_latest_new_titles_from_storage(
storage_manager,
current_platform_ids: Optional[List[str]] = None,
) -> Dict:
"""
从存储后端检测最新批次的新增标题
Args:
storage_manager: 存储管理器实例
current_platform_ids: 当前监控的平台 ID 列表用于过滤
Returns:
Dict: 新增标题 {source_id: {title: title_data}}
"""
try:
# 获取最新抓取数据
latest_data = storage_manager.get_latest_crawl_data()
if not latest_data or not latest_data.items:
return {}
# 获取所有历史数据
all_data = storage_manager.get_today_all_data()
if not all_data or not all_data.items:
# 没有历史数据(第一次抓取),不应该有"新增"标题
return {}
# 收集历史标题(不包括最新批次的时间)
latest_time = latest_data.crawl_time
historical_titles = {}
for source_id, news_list in all_data.items.items():
if current_platform_ids is not None and source_id not in current_platform_ids:
continue
historical_titles[source_id] = set()
for item in news_list:
# 只统计非最新批次的标题
first_time = getattr(item, 'first_time', item.crawl_time)
if first_time != latest_time:
historical_titles[source_id].add(item.title)
# 检查是否是当天第一次抓取(没有任何历史标题)
# 如果所有平台的历史标题集合都为空,说明只有一个抓取批次,不应该有"新增"标题
has_historical_data = any(len(titles) > 0 for titles in historical_titles.values())
if not has_historical_data:
return {}
# 找出新增标题
new_titles = {}
for source_id, news_list in latest_data.items.items():
if current_platform_ids is not None and source_id not in current_platform_ids:
continue
historical_set = historical_titles.get(source_id, set())
source_new_titles = {}
for item in news_list:
if item.title not in historical_set:
source_new_titles[item.title] = {
"ranks": [item.rank],
"url": item.url or "",
"mobileUrl": item.mobile_url or "",
}
if source_new_titles:
new_titles[source_id] = source_new_titles
return new_titles
except Exception as e:
print(f"[存储] 从存储后端检测新标题失败: {e}")
return {}
def detect_latest_new_titles(
storage_manager,
current_platform_ids: Optional[List[str]] = None,
) -> Dict:
"""
检测当日最新批次的新增标题(从存储后端)
Args:
storage_manager: 存储管理器实例
current_platform_ids: 当前监控的平台 ID 列表用于过滤
Returns:
Dict: 新增标题 {source_id: {title: title_data}}
"""
new_titles = detect_latest_new_titles_from_storage(storage_manager, current_platform_ids)
if new_titles:
total_new = sum(len(titles) for titles in new_titles.values())
print(f"[存储] 从存储后端检测到 {total_new} 条新增标题")
return new_titles
def is_first_crawl_today(output_dir: str, date_folder: str) -> bool:
"""
检测是否是当天第一次爬取
Args:
output_dir: 输出目录
date_folder: 日期文件夹名称
Returns:
bool: 是否是当天第一次爬取
"""
txt_dir = Path(output_dir) / date_folder / "txt"
if not txt_dir.exists():
return True
files = sorted([f for f in txt_dir.iterdir() if f.suffix == ".txt"])
return len(files) <= 1
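
A hedged sketch (not part of this commit) of reading today's titles and detecting the latest new ones through a storage manager obtained from AppContext:

from trendradar.core import load_config
from trendradar.core.data import read_all_today_titles, detect_latest_new_titles
from trendradar.context import AppContext

ctx = AppContext(load_config())
storage = ctx.get_storage_manager()
all_results, id_to_name, title_info = read_all_today_titles(storage, ctx.platform_ids)
new_titles = detect_latest_new_titles(storage, ctx.platform_ids)
print(len(all_results), sum(len(t) for t in new_titles.values()))
ctx.cleanup()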

View File

@ -0,0 +1,194 @@
# coding=utf-8
"""
频率词配置加载模块
负责从配置文件加载频率词规则,支持:
- 普通词组
- 必须词(+ 前缀)
- 过滤词(! 前缀)
- 全局过滤词([GLOBAL_FILTER] 区域)
- 最大显示数量(@ 前缀)
"""
import os
from pathlib import Path
from typing import Dict, List, Tuple, Optional
def load_frequency_words(
frequency_file: Optional[str] = None,
) -> Tuple[List[Dict], List[str], List[str]]:
"""
加载频率词配置
配置文件格式说明:
- 每个词组由空行分隔
- [GLOBAL_FILTER] 区域定义全局过滤词
- [WORD_GROUPS] 区域定义词组(默认)
词组语法:
- 普通词:直接写入,任意匹配即可
- +必须词:所有必须词都要匹配
- !过滤词:匹配则排除
- @数字:该词组最多显示的条数
Args:
frequency_file: 频率词配置文件路径,默认从环境变量 FREQUENCY_WORDS_PATH 获取,或使用 config/frequency_words.txt
Returns:
(词组列表, 词组内过滤词, 全局过滤词)
Raises:
FileNotFoundError: 频率词文件不存在
"""
if frequency_file is None:
frequency_file = os.environ.get(
"FREQUENCY_WORDS_PATH", "config/frequency_words.txt"
)
frequency_path = Path(frequency_file)
if not frequency_path.exists():
raise FileNotFoundError(f"频率词文件 {frequency_file} 不存在")
with open(frequency_path, "r", encoding="utf-8") as f:
content = f.read()
word_groups = [group.strip() for group in content.split("\n\n") if group.strip()]
processed_groups = []
filter_words = []
global_filters = []
# 默认区域(向后兼容)
current_section = "WORD_GROUPS"
for group in word_groups:
lines = [line.strip() for line in group.split("\n") if line.strip()]
if not lines:
continue
# 检查是否为区域标记
if lines[0].startswith("[") and lines[0].endswith("]"):
section_name = lines[0][1:-1].upper()
if section_name in ("GLOBAL_FILTER", "WORD_GROUPS"):
current_section = section_name
lines = lines[1:] # 移除标记行
# 处理全局过滤区域
if current_section == "GLOBAL_FILTER":
# 直接添加所有非空行到全局过滤列表
for line in lines:
# 忽略特殊语法前缀,只提取纯文本
if line.startswith(("!", "+", "@")):
continue # 全局过滤区不支持特殊语法
if line:
global_filters.append(line)
continue
# 处理词组区域
words = lines
group_required_words = []
group_normal_words = []
group_filter_words = []
group_max_count = 0 # 默认不限制
for word in words:
if word.startswith("@"):
# 解析最大显示数量(只接受正整数)
try:
count = int(word[1:])
if count > 0:
group_max_count = count
except (ValueError, IndexError):
pass # 忽略无效的@数字格式
elif word.startswith("!"):
filter_words.append(word[1:])
group_filter_words.append(word[1:])
elif word.startswith("+"):
group_required_words.append(word[1:])
else:
group_normal_words.append(word)
if group_required_words or group_normal_words:
if group_normal_words:
group_key = " ".join(group_normal_words)
else:
group_key = " ".join(group_required_words)
processed_groups.append(
{
"required": group_required_words,
"normal": group_normal_words,
"group_key": group_key,
"max_count": group_max_count,
}
)
return processed_groups, filter_words, global_filters
def matches_word_groups(
title: str,
word_groups: List[Dict],
filter_words: List[str],
global_filters: Optional[List[str]] = None
) -> bool:
"""
检查标题是否匹配词组规则
Args:
title: 标题文本
word_groups: 词组列表
filter_words: 过滤词列表
global_filters: 全局过滤词列表
Returns:
是否匹配
"""
# 防御性类型检查:确保 title 是有效字符串
if not isinstance(title, str):
title = str(title) if title is not None else ""
if not title.strip():
return False
title_lower = title.lower()
# 全局过滤检查(优先级最高)
if global_filters:
if any(global_word.lower() in title_lower for global_word in global_filters):
return False
# 如果没有配置词组,则匹配所有标题(支持显示全部新闻)
if not word_groups:
return True
# 过滤词检查
if any(filter_word.lower() in title_lower for filter_word in filter_words):
return False
# 词组匹配检查
for group in word_groups:
required_words = group["required"]
normal_words = group["normal"]
# 必须词检查
if required_words:
all_required_present = all(
req_word.lower() in title_lower for req_word in required_words
)
if not all_required_present:
continue
# 普通词检查
if normal_words:
any_normal_present = any(
normal_word.lower() in title_lower for normal_word in normal_words
)
if not any_normal_present:
continue
return True
return False
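
An illustrative frequency_words.txt (the file contents below are invented for this sketch) together with the matching behaviour defined above:

from trendradar.core.frequency import load_frequency_words, matches_word_groups

# Hypothetical config/frequency_words.txt, blocks separated by blank lines:
#
#   [GLOBAL_FILTER]
#   广告
#
#   [WORD_GROUPS]
#   AI
#   大模型
#   !招聘
#   @5
#
#   +世界杯
#   +决赛
groups, filters, global_filters = load_frequency_words("config/frequency_words.txt")

matches_word_groups("大模型发布新版本", groups, filters, global_filters)    # True,命中普通词
matches_word_groups("大模型岗位招聘", groups, filters, global_filters)      # False,被 !招聘 过滤
matches_word_groups("广告:AI 课程促销", groups, filters, global_filters)    # False,被全局过滤词拦截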

332
trendradar/core/loader.py Normal file
View File

@ -0,0 +1,332 @@
# coding=utf-8
"""
配置加载模块
负责从 YAML 配置文件和环境变量加载配置
"""
import os
from pathlib import Path
from typing import Dict, Any, Optional
import yaml
from .config import parse_multi_account_config, validate_paired_configs
def _get_env_bool(key: str, default: bool = False) -> Optional[bool]:
"""从环境变量获取布尔值,如果未设置返回 None"""
value = os.environ.get(key, "").strip().lower()
if not value:
return None
return value in ("true", "1")
def _get_env_int(key: str, default: int = 0) -> int:
"""从环境变量获取整数值"""
value = os.environ.get(key, "").strip()
if not value:
return default
try:
return int(value)
except ValueError:
return default
def _get_env_str(key: str, default: str = "") -> str:
"""从环境变量获取字符串值"""
return os.environ.get(key, "").strip() or default
def _load_app_config(config_data: Dict) -> Dict:
"""加载应用配置"""
app_config = config_data.get("app", {})
return {
"VERSION_CHECK_URL": app_config.get("version_check_url", ""),
"SHOW_VERSION_UPDATE": app_config.get("show_version_update", True),
"TIMEZONE": _get_env_str("TIMEZONE") or app_config.get("timezone", "Asia/Shanghai"),
}
def _load_crawler_config(config_data: Dict) -> Dict:
"""加载爬虫配置"""
crawler_config = config_data.get("crawler", {})
enable_crawler_env = _get_env_bool("ENABLE_CRAWLER")
return {
"REQUEST_INTERVAL": crawler_config.get("request_interval", 100),
"USE_PROXY": crawler_config.get("use_proxy", False),
"DEFAULT_PROXY": crawler_config.get("default_proxy", ""),
"ENABLE_CRAWLER": enable_crawler_env if enable_crawler_env is not None else crawler_config.get("enable_crawler", True),
}
def _load_report_config(config_data: Dict) -> Dict:
"""加载报告配置"""
report_config = config_data.get("report", {})
# 环境变量覆盖
sort_by_position_env = _get_env_bool("SORT_BY_POSITION_FIRST")
reverse_content_env = _get_env_bool("REVERSE_CONTENT_ORDER")
max_news_env = _get_env_int("MAX_NEWS_PER_KEYWORD")
return {
"REPORT_MODE": _get_env_str("REPORT_MODE") or report_config.get("mode", "daily"),
"RANK_THRESHOLD": report_config.get("rank_threshold", 10),
"SORT_BY_POSITION_FIRST": sort_by_position_env if sort_by_position_env is not None else report_config.get("sort_by_position_first", False),
"MAX_NEWS_PER_KEYWORD": max_news_env or report_config.get("max_news_per_keyword", 0),
"REVERSE_CONTENT_ORDER": reverse_content_env if reverse_content_env is not None else report_config.get("reverse_content_order", False),
}
def _load_notification_config(config_data: Dict) -> Dict:
"""加载通知配置"""
notification = config_data.get("notification", {})
enable_notification_env = _get_env_bool("ENABLE_NOTIFICATION")
return {
"ENABLE_NOTIFICATION": enable_notification_env if enable_notification_env is not None else notification.get("enable_notification", True),
"MESSAGE_BATCH_SIZE": notification.get("message_batch_size", 4000),
"DINGTALK_BATCH_SIZE": notification.get("dingtalk_batch_size", 20000),
"FEISHU_BATCH_SIZE": notification.get("feishu_batch_size", 29000),
"BARK_BATCH_SIZE": notification.get("bark_batch_size", 3600),
"SLACK_BATCH_SIZE": notification.get("slack_batch_size", 4000),
"BATCH_SEND_INTERVAL": notification.get("batch_send_interval", 1.0),
"FEISHU_MESSAGE_SEPARATOR": notification.get("feishu_message_separator", "---"),
"MAX_ACCOUNTS_PER_CHANNEL": _get_env_int("MAX_ACCOUNTS_PER_CHANNEL") or notification.get("max_accounts_per_channel", 3),
}
def _load_push_window_config(config_data: Dict) -> Dict:
"""加载推送窗口配置"""
notification = config_data.get("notification", {})
push_window = notification.get("push_window", {})
time_range = push_window.get("time_range", {})
enabled_env = _get_env_bool("PUSH_WINDOW_ENABLED")
once_per_day_env = _get_env_bool("PUSH_WINDOW_ONCE_PER_DAY")
return {
"ENABLED": enabled_env if enabled_env is not None else push_window.get("enabled", False),
"TIME_RANGE": {
"START": _get_env_str("PUSH_WINDOW_START") or time_range.get("start", "08:00"),
"END": _get_env_str("PUSH_WINDOW_END") or time_range.get("end", "22:00"),
},
"ONCE_PER_DAY": once_per_day_env if once_per_day_env is not None else push_window.get("once_per_day", True),
}
def _load_weight_config(config_data: Dict) -> Dict:
"""加载权重配置"""
weight = config_data.get("weight", {})
return {
"RANK_WEIGHT": weight.get("rank_weight", 1.0),
"FREQUENCY_WEIGHT": weight.get("frequency_weight", 1.0),
"HOTNESS_WEIGHT": weight.get("hotness_weight", 1.0),
}
def _load_storage_config(config_data: Dict) -> Dict:
"""加载存储配置"""
storage = config_data.get("storage", {})
formats = storage.get("formats", {})
local = storage.get("local", {})
remote = storage.get("remote", {})
pull = storage.get("pull", {})
txt_enabled_env = _get_env_bool("STORAGE_TXT_ENABLED")
html_enabled_env = _get_env_bool("STORAGE_HTML_ENABLED")
pull_enabled_env = _get_env_bool("PULL_ENABLED")
return {
"BACKEND": _get_env_str("STORAGE_BACKEND") or storage.get("backend", "auto"),
"FORMATS": {
"SQLITE": formats.get("sqlite", True),
"TXT": txt_enabled_env if txt_enabled_env is not None else formats.get("txt", True),
"HTML": html_enabled_env if html_enabled_env is not None else formats.get("html", True),
},
"LOCAL": {
"DATA_DIR": local.get("data_dir", "output"),
"RETENTION_DAYS": _get_env_int("LOCAL_RETENTION_DAYS") or local.get("retention_days", 0),
},
"REMOTE": {
"ENDPOINT_URL": _get_env_str("S3_ENDPOINT_URL") or remote.get("endpoint_url", ""),
"BUCKET_NAME": _get_env_str("S3_BUCKET_NAME") or remote.get("bucket_name", ""),
"ACCESS_KEY_ID": _get_env_str("S3_ACCESS_KEY_ID") or remote.get("access_key_id", ""),
"SECRET_ACCESS_KEY": _get_env_str("S3_SECRET_ACCESS_KEY") or remote.get("secret_access_key", ""),
"REGION": _get_env_str("S3_REGION") or remote.get("region", ""),
"RETENTION_DAYS": _get_env_int("REMOTE_RETENTION_DAYS") or remote.get("retention_days", 0),
},
"PULL": {
"ENABLED": pull_enabled_env if pull_enabled_env is not None else pull.get("enabled", False),
"DAYS": _get_env_int("PULL_DAYS") or pull.get("days", 7),
},
}
def _load_webhook_config(config_data: Dict) -> Dict:
"""加载 Webhook 配置"""
notification = config_data.get("notification", {})
webhooks = notification.get("webhooks", {})
return {
# 飞书
"FEISHU_WEBHOOK_URL": _get_env_str("FEISHU_WEBHOOK_URL") or webhooks.get("feishu_url", ""),
# 钉钉
"DINGTALK_WEBHOOK_URL": _get_env_str("DINGTALK_WEBHOOK_URL") or webhooks.get("dingtalk_url", ""),
# 企业微信
"WEWORK_WEBHOOK_URL": _get_env_str("WEWORK_WEBHOOK_URL") or webhooks.get("wework_url", ""),
"WEWORK_MSG_TYPE": _get_env_str("WEWORK_MSG_TYPE") or webhooks.get("wework_msg_type", "markdown"),
# Telegram
"TELEGRAM_BOT_TOKEN": _get_env_str("TELEGRAM_BOT_TOKEN") or webhooks.get("telegram_bot_token", ""),
"TELEGRAM_CHAT_ID": _get_env_str("TELEGRAM_CHAT_ID") or webhooks.get("telegram_chat_id", ""),
# 邮件
"EMAIL_FROM": _get_env_str("EMAIL_FROM") or webhooks.get("email_from", ""),
"EMAIL_PASSWORD": _get_env_str("EMAIL_PASSWORD") or webhooks.get("email_password", ""),
"EMAIL_TO": _get_env_str("EMAIL_TO") or webhooks.get("email_to", ""),
"EMAIL_SMTP_SERVER": _get_env_str("EMAIL_SMTP_SERVER") or webhooks.get("email_smtp_server", ""),
"EMAIL_SMTP_PORT": _get_env_str("EMAIL_SMTP_PORT") or webhooks.get("email_smtp_port", ""),
# ntfy
"NTFY_SERVER_URL": _get_env_str("NTFY_SERVER_URL") or webhooks.get("ntfy_server_url") or "https://ntfy.sh",
"NTFY_TOPIC": _get_env_str("NTFY_TOPIC") or webhooks.get("ntfy_topic", ""),
"NTFY_TOKEN": _get_env_str("NTFY_TOKEN") or webhooks.get("ntfy_token", ""),
# Bark
"BARK_URL": _get_env_str("BARK_URL") or webhooks.get("bark_url", ""),
# Slack
"SLACK_WEBHOOK_URL": _get_env_str("SLACK_WEBHOOK_URL") or webhooks.get("slack_webhook_url", ""),
}
def _print_notification_sources(config: Dict) -> None:
"""打印通知渠道配置来源信息"""
notification_sources = []
max_accounts = config["MAX_ACCOUNTS_PER_CHANNEL"]
if config["FEISHU_WEBHOOK_URL"]:
accounts = parse_multi_account_config(config["FEISHU_WEBHOOK_URL"])
count = min(len(accounts), max_accounts)
source = "环境变量" if os.environ.get("FEISHU_WEBHOOK_URL") else "配置文件"
notification_sources.append(f"飞书({source}, {count}个账号)")
if config["DINGTALK_WEBHOOK_URL"]:
accounts = parse_multi_account_config(config["DINGTALK_WEBHOOK_URL"])
count = min(len(accounts), max_accounts)
source = "环境变量" if os.environ.get("DINGTALK_WEBHOOK_URL") else "配置文件"
notification_sources.append(f"钉钉({source}, {count}个账号)")
if config["WEWORK_WEBHOOK_URL"]:
accounts = parse_multi_account_config(config["WEWORK_WEBHOOK_URL"])
count = min(len(accounts), max_accounts)
source = "环境变量" if os.environ.get("WEWORK_WEBHOOK_URL") else "配置文件"
notification_sources.append(f"企业微信({source}, {count}个账号)")
if config["TELEGRAM_BOT_TOKEN"] and config["TELEGRAM_CHAT_ID"]:
tokens = parse_multi_account_config(config["TELEGRAM_BOT_TOKEN"])
chat_ids = parse_multi_account_config(config["TELEGRAM_CHAT_ID"])
valid, count = validate_paired_configs(
{"bot_token": tokens, "chat_id": chat_ids},
"Telegram",
required_keys=["bot_token", "chat_id"]
)
if valid and count > 0:
count = min(count, max_accounts)
token_source = "环境变量" if os.environ.get("TELEGRAM_BOT_TOKEN") else "配置文件"
notification_sources.append(f"Telegram({token_source}, {count}个账号)")
if config["EMAIL_FROM"] and config["EMAIL_PASSWORD"] and config["EMAIL_TO"]:
from_source = "环境变量" if os.environ.get("EMAIL_FROM") else "配置文件"
notification_sources.append(f"邮件({from_source})")
if config["NTFY_SERVER_URL"] and config["NTFY_TOPIC"]:
topics = parse_multi_account_config(config["NTFY_TOPIC"])
tokens = parse_multi_account_config(config["NTFY_TOKEN"])
if tokens:
valid, count = validate_paired_configs(
{"topic": topics, "token": tokens},
"ntfy"
)
if valid and count > 0:
count = min(count, max_accounts)
server_source = "环境变量" if os.environ.get("NTFY_SERVER_URL") else "配置文件"
notification_sources.append(f"ntfy({server_source}, {count}个账号)")
else:
count = min(len(topics), max_accounts)
server_source = "环境变量" if os.environ.get("NTFY_SERVER_URL") else "配置文件"
notification_sources.append(f"ntfy({server_source}, {count}个账号)")
if config["BARK_URL"]:
accounts = parse_multi_account_config(config["BARK_URL"])
count = min(len(accounts), max_accounts)
bark_source = "环境变量" if os.environ.get("BARK_URL") else "配置文件"
notification_sources.append(f"Bark({bark_source}, {count}个账号)")
if config["SLACK_WEBHOOK_URL"]:
accounts = parse_multi_account_config(config["SLACK_WEBHOOK_URL"])
count = min(len(accounts), max_accounts)
slack_source = "环境变量" if os.environ.get("SLACK_WEBHOOK_URL") else "配置文件"
notification_sources.append(f"Slack({slack_source}, {count}个账号)")
if notification_sources:
print(f"通知渠道配置来源: {', '.join(notification_sources)}")
print(f"每个渠道最大账号数: {max_accounts}")
else:
print("未配置任何通知渠道")
def load_config(config_path: Optional[str] = None) -> Dict[str, Any]:
"""
加载配置文件
Args:
config_path: 配置文件路径默认从环境变量 CONFIG_PATH 获取或使用 config/config.yaml
Returns:
包含所有配置的字典
Raises:
FileNotFoundError: 配置文件不存在
"""
if config_path is None:
config_path = os.environ.get("CONFIG_PATH", "config/config.yaml")
if not Path(config_path).exists():
raise FileNotFoundError(f"配置文件 {config_path} 不存在")
with open(config_path, "r", encoding="utf-8") as f:
config_data = yaml.safe_load(f)
print(f"配置文件加载成功: {config_path}")
# 合并所有配置
config = {}
# 应用配置
config.update(_load_app_config(config_data))
# 爬虫配置
config.update(_load_crawler_config(config_data))
# 报告配置
config.update(_load_report_config(config_data))
# 通知配置
config.update(_load_notification_config(config_data))
# 推送窗口配置
config["PUSH_WINDOW"] = _load_push_window_config(config_data)
# 权重配置
config["WEIGHT_CONFIG"] = _load_weight_config(config_data)
# 平台配置
config["PLATFORMS"] = config_data.get("platforms", [])
# 存储配置
config["STORAGE"] = _load_storage_config(config_data)
# Webhook 配置
config.update(_load_webhook_config(config_data))
# 打印通知渠道配置来源
_print_notification_sources(config)
return config

View File

@ -0,0 +1,8 @@
# coding=utf-8
"""
爬虫模块 - 数据抓取功能
"""
from trendradar.crawler.fetcher import DataFetcher
__all__ = ["DataFetcher"]

View File

@ -0,0 +1,184 @@
# coding=utf-8
"""
数据获取器模块
负责从 NewsNow API 抓取新闻数据支持
- 单个平台数据获取
- 批量平台数据爬取
- 自动重试机制
- 代理支持
"""
import json
import random
import time
from typing import Dict, List, Tuple, Optional, Union
import requests
class DataFetcher:
"""数据获取器"""
# 默认 API 地址
DEFAULT_API_URL = "https://newsnow.busiyi.world/api/s"
# 默认请求头
DEFAULT_HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
"Accept": "application/json, text/plain, */*",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Connection": "keep-alive",
"Cache-Control": "no-cache",
}
def __init__(
self,
proxy_url: Optional[str] = None,
api_url: Optional[str] = None,
):
"""
初始化数据获取器
Args:
proxy_url: 代理服务器 URL可选
api_url: API 基础 URL可选默认使用 DEFAULT_API_URL
"""
self.proxy_url = proxy_url
self.api_url = api_url or self.DEFAULT_API_URL
def fetch_data(
self,
id_info: Union[str, Tuple[str, str]],
max_retries: int = 2,
min_retry_wait: int = 3,
max_retry_wait: int = 5,
) -> Tuple[Optional[str], str, str]:
"""
获取指定ID数据支持重试
Args:
id_info: 平台ID (平台ID, 别名) 元组
max_retries: 最大重试次数
min_retry_wait: 最小重试等待时间
max_retry_wait: 最大重试等待时间
Returns:
(响应文本, 平台ID, 别名) 元组失败时响应文本为 None
"""
if isinstance(id_info, tuple):
id_value, alias = id_info
else:
id_value = id_info
alias = id_value
url = f"{self.api_url}?id={id_value}&latest"
proxies = None
if self.proxy_url:
proxies = {"http": self.proxy_url, "https": self.proxy_url}
retries = 0
while retries <= max_retries:
try:
response = requests.get(
url,
proxies=proxies,
headers=self.DEFAULT_HEADERS,
timeout=10,
)
response.raise_for_status()
data_text = response.text
data_json = json.loads(data_text)
status = data_json.get("status", "未知")
if status not in ["success", "cache"]:
raise ValueError(f"响应状态异常: {status}")
status_info = "最新数据" if status == "success" else "缓存数据"
print(f"获取 {id_value} 成功({status_info}")
return data_text, id_value, alias
except Exception as e:
retries += 1
if retries <= max_retries:
base_wait = random.uniform(min_retry_wait, max_retry_wait)
additional_wait = (retries - 1) * random.uniform(1, 2)
wait_time = base_wait + additional_wait
print(f"请求 {id_value} 失败: {e}. {wait_time:.2f}秒后重试...")
time.sleep(wait_time)
else:
print(f"请求 {id_value} 失败: {e}")
return None, id_value, alias
return None, id_value, alias
def crawl_websites(
self,
ids_list: List[Union[str, Tuple[str, str]]],
request_interval: int = 100,
) -> Tuple[Dict, Dict, List]:
"""
爬取多个网站数据
Args:
ids_list: 平台ID列表每个元素可以是字符串或 (平台ID, 别名) 元组
request_interval: 请求间隔毫秒
Returns:
(结果字典, ID到名称的映射, 失败ID列表) 元组
"""
results = {}
id_to_name = {}
failed_ids = []
for i, id_info in enumerate(ids_list):
if isinstance(id_info, tuple):
id_value, name = id_info
else:
id_value = id_info
name = id_value
id_to_name[id_value] = name
response, _, _ = self.fetch_data(id_info)
if response:
try:
data = json.loads(response)
results[id_value] = {}
for index, item in enumerate(data.get("items", []), 1):
title = item.get("title")
# 跳过无效标题None、float、空字符串
if title is None or isinstance(title, float) or not str(title).strip():
continue
title = str(title).strip()
url = item.get("url", "")
mobile_url = item.get("mobileUrl", "")
if title in results[id_value]:
results[id_value][title]["ranks"].append(index)
else:
results[id_value][title] = {
"ranks": [index],
"url": url,
"mobileUrl": mobile_url,
}
except json.JSONDecodeError:
print(f"解析 {id_value} 响应失败")
failed_ids.append(id_value)
except Exception as e:
print(f"处理 {id_value} 数据出错: {e}")
failed_ids.append(id_value)
else:
failed_ids.append(id_value)
# 请求间隔(除了最后一个)
if i < len(ids_list) - 1:
actual_interval = request_interval + random.randint(-10, 20)
actual_interval = max(50, actual_interval)
time.sleep(actual_interval / 1000)
print(f"成功: {list(results.keys())}, 失败: {failed_ids}")
return results, id_to_name, failed_ids

View File

@ -0,0 +1,81 @@
# coding=utf-8
"""
通知推送模块
提供多渠道通知推送功能包括
- 飞书钉钉企业微信
- TelegramSlack
- EmailntfyBark
模块结构
- push_manager: 推送记录管理
- formatters: 内容格式转换
- batch: 批次处理工具
- renderer: 通知内容渲染
- splitter: 消息分批拆分
- senders: 消息发送器各渠道发送函数
- dispatcher: 多账号通知调度器
"""
from trendradar.notification.push_manager import PushRecordManager
from trendradar.notification.formatters import (
strip_markdown,
convert_markdown_to_mrkdwn,
)
from trendradar.notification.batch import (
get_batch_header,
get_max_batch_header_size,
truncate_to_bytes,
add_batch_headers,
)
from trendradar.notification.renderer import (
render_feishu_content,
render_dingtalk_content,
)
from trendradar.notification.splitter import (
split_content_into_batches,
DEFAULT_BATCH_SIZES,
)
from trendradar.notification.senders import (
send_to_feishu,
send_to_dingtalk,
send_to_wework,
send_to_telegram,
send_to_email,
send_to_ntfy,
send_to_bark,
send_to_slack,
SMTP_CONFIGS,
)
from trendradar.notification.dispatcher import NotificationDispatcher
__all__ = [
# 推送记录管理
"PushRecordManager",
# 格式转换
"strip_markdown",
"convert_markdown_to_mrkdwn",
# 批次处理
"get_batch_header",
"get_max_batch_header_size",
"truncate_to_bytes",
"add_batch_headers",
# 内容渲染
"render_feishu_content",
"render_dingtalk_content",
# 消息分批
"split_content_into_batches",
"DEFAULT_BATCH_SIZES",
# 消息发送器
"send_to_feishu",
"send_to_dingtalk",
"send_to_wework",
"send_to_telegram",
"send_to_email",
"send_to_ntfy",
"send_to_bark",
"send_to_slack",
"SMTP_CONFIGS",
# 通知调度器
"NotificationDispatcher",
]

View File

@ -0,0 +1,115 @@
# coding=utf-8
"""
批次处理模块
提供消息分批发送的辅助函数
"""
from typing import List
def get_batch_header(format_type: str, batch_num: int, total_batches: int) -> str:
"""根据 format_type 生成对应格式的批次头部
Args:
format_type: 推送类型telegram, slack, wework_text, bark, feishu, dingtalk, ntfy, wework
batch_num: 当前批次编号
total_batches: 总批次数
Returns:
格式化的批次头部字符串
"""
if format_type == "telegram":
return f"<b>[第 {batch_num}/{total_batches} 批次]</b>\n\n"
elif format_type == "slack":
return f"*[第 {batch_num}/{total_batches} 批次]*\n\n"
elif format_type in ("wework_text", "bark"):
# 企业微信文本模式和 Bark 使用纯文本格式
return f"[第 {batch_num}/{total_batches} 批次]\n\n"
else:
# 飞书、钉钉、ntfy、企业微信 markdown 模式
return f"**[第 {batch_num}/{total_batches} 批次]**\n\n"
def get_max_batch_header_size(format_type: str) -> int:
"""估算批次头部的最大字节数(假设最多 99 批次)
用于在分批时预留空间避免事后截断破坏内容完整性
Args:
format_type: 推送类型
Returns:
最大头部字节数
"""
# 生成最坏情况的头部99/99 批次)
max_header = get_batch_header(format_type, 99, 99)
return len(max_header.encode("utf-8"))
def truncate_to_bytes(text: str, max_bytes: int) -> str:
"""安全截断字符串到指定字节数,避免截断多字节字符
Args:
text: 要截断的文本
max_bytes: 最大字节数
Returns:
截断后的文本
"""
text_bytes = text.encode("utf-8")
if len(text_bytes) <= max_bytes:
return text
# 截断到指定字节数
truncated = text_bytes[:max_bytes]
# 处理可能的不完整 UTF-8 字符
for i in range(min(4, len(truncated))):
try:
return truncated[: len(truncated) - i].decode("utf-8")
except UnicodeDecodeError:
continue
# 极端情况:返回空字符串
return ""
def add_batch_headers(
batches: List[str], format_type: str, max_bytes: int
) -> List[str]:
"""为批次添加头部,动态计算确保总大小不超过限制
Args:
batches: 原始批次列表
format_type: 推送类型bark, telegram, feishu
max_bytes: 该推送类型的最大字节限制
Returns:
添加头部后的批次列表
"""
if len(batches) <= 1:
return batches
total = len(batches)
result = []
for i, content in enumerate(batches, 1):
# 生成批次头部
header = get_batch_header(format_type, i, total)
header_size = len(header.encode("utf-8"))
# 动态计算允许的最大内容大小
max_content_size = max_bytes - header_size
content_size = len(content.encode("utf-8"))
# 如果超出,截断到安全大小
if content_size > max_content_size:
print(
f"警告:{format_type}{i}/{total} 批次内容({content_size}字节) + 头部({header_size}字节) 超出限制({max_bytes}字节),截断到 {max_content_size} 字节"
)
content = truncate_to_bytes(content, max_content_size)
result.append(header + content)
return result

View File

@ -0,0 +1,420 @@
# coding=utf-8
"""
通知调度器模块
提供统一的通知分发接口
支持所有通知渠道的多账号配置使用 `;` 分隔多个账号
使用示例:
dispatcher = NotificationDispatcher(config, get_time_func, split_content_func)
results = dispatcher.dispatch_all(report_data, report_type, ...)
"""
from typing import Any, Callable, Dict, List, Optional
from trendradar.core.config import (
get_account_at_index,
limit_accounts,
parse_multi_account_config,
validate_paired_configs,
)
from .senders import (
send_to_bark,
send_to_dingtalk,
send_to_email,
send_to_feishu,
send_to_ntfy,
send_to_slack,
send_to_telegram,
send_to_wework,
)
class NotificationDispatcher:
"""
统一的多账号通知调度器
将多账号发送逻辑封装提供简洁的 dispatch_all 接口
内部处理账号解析数量限制配对验证等逻辑
"""
def __init__(
self,
config: Dict[str, Any],
get_time_func: Callable,
split_content_func: Callable,
):
"""
初始化通知调度器
Args:
config: 完整的配置字典包含所有通知渠道的配置
get_time_func: 获取当前时间的函数
split_content_func: 内容分批函数
"""
self.config = config
self.get_time_func = get_time_func
self.split_content_func = split_content_func
self.max_accounts = config.get("MAX_ACCOUNTS_PER_CHANNEL", 3)
def dispatch_all(
self,
report_data: Dict,
report_type: str,
update_info: Optional[Dict] = None,
proxy_url: Optional[str] = None,
mode: str = "daily",
html_file_path: Optional[str] = None,
) -> Dict[str, bool]:
"""
分发通知到所有已配置的渠道
Args:
report_data: 报告数据 prepare_report_data 生成
report_type: 报告类型 "当日汇总""实时增量"
update_info: 版本更新信息可选
proxy_url: 代理 URL可选
mode: 报告模式 (daily/current/incremental)
html_file_path: HTML 报告文件路径邮件使用
Returns:
Dict[str, bool]: 每个渠道的发送结果key 为渠道名value 为是否成功
"""
results = {}
# 飞书
if self.config.get("FEISHU_WEBHOOK_URL"):
results["feishu"] = self._send_feishu(
report_data, report_type, update_info, proxy_url, mode
)
# 钉钉
if self.config.get("DINGTALK_WEBHOOK_URL"):
results["dingtalk"] = self._send_dingtalk(
report_data, report_type, update_info, proxy_url, mode
)
# 企业微信
if self.config.get("WEWORK_WEBHOOK_URL"):
results["wework"] = self._send_wework(
report_data, report_type, update_info, proxy_url, mode
)
# Telegram需要配对验证
if self.config.get("TELEGRAM_BOT_TOKEN") and self.config.get("TELEGRAM_CHAT_ID"):
results["telegram"] = self._send_telegram(
report_data, report_type, update_info, proxy_url, mode
)
# ntfy需要配对验证
if self.config.get("NTFY_SERVER_URL") and self.config.get("NTFY_TOPIC"):
results["ntfy"] = self._send_ntfy(
report_data, report_type, update_info, proxy_url, mode
)
# Bark
if self.config.get("BARK_URL"):
results["bark"] = self._send_bark(
report_data, report_type, update_info, proxy_url, mode
)
# Slack
if self.config.get("SLACK_WEBHOOK_URL"):
results["slack"] = self._send_slack(
report_data, report_type, update_info, proxy_url, mode
)
# 邮件(保持原有逻辑,已支持多收件人)
if (
self.config.get("EMAIL_FROM")
and self.config.get("EMAIL_PASSWORD")
and self.config.get("EMAIL_TO")
):
results["email"] = self._send_email(report_type, html_file_path)
return results
def _send_to_multi_accounts(
self,
channel_name: str,
config_value: str,
send_func: Callable[..., bool],
**kwargs,
) -> bool:
"""
通用多账号发送逻辑
Args:
channel_name: 渠道名称用于日志和账号数量限制提示
config_value: 配置值可能包含多个账号 ; 分隔
send_func: 发送函数签名为 (account, account_label=..., **kwargs) -> bool
**kwargs: 传递给发送函数的其他参数
Returns:
bool: 任一账号发送成功则返回 True
"""
accounts = parse_multi_account_config(config_value)
if not accounts:
return False
accounts = limit_accounts(accounts, self.max_accounts, channel_name)
results = []
for i, account in enumerate(accounts):
if account:
account_label = f"账号{i+1}" if len(accounts) > 1 else ""
result = send_func(account, account_label=account_label, **kwargs)
results.append(result)
return any(results) if results else False
def _send_feishu(
self,
report_data: Dict,
report_type: str,
update_info: Optional[Dict],
proxy_url: Optional[str],
mode: str,
) -> bool:
"""发送到飞书(多账号)"""
return self._send_to_multi_accounts(
channel_name="飞书",
config_value=self.config["FEISHU_WEBHOOK_URL"],
send_func=lambda url, account_label: send_to_feishu(
webhook_url=url,
report_data=report_data,
report_type=report_type,
update_info=update_info,
proxy_url=proxy_url,
mode=mode,
account_label=account_label,
batch_size=self.config.get("FEISHU_BATCH_SIZE", 29000),
batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
split_content_func=self.split_content_func,
get_time_func=self.get_time_func,
),
)
def _send_dingtalk(
self,
report_data: Dict,
report_type: str,
update_info: Optional[Dict],
proxy_url: Optional[str],
mode: str,
) -> bool:
"""发送到钉钉(多账号)"""
return self._send_to_multi_accounts(
channel_name="钉钉",
config_value=self.config["DINGTALK_WEBHOOK_URL"],
send_func=lambda url, account_label: send_to_dingtalk(
webhook_url=url,
report_data=report_data,
report_type=report_type,
update_info=update_info,
proxy_url=proxy_url,
mode=mode,
account_label=account_label,
batch_size=self.config.get("DINGTALK_BATCH_SIZE", 20000),
batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
split_content_func=self.split_content_func,
),
)
def _send_wework(
self,
report_data: Dict,
report_type: str,
update_info: Optional[Dict],
proxy_url: Optional[str],
mode: str,
) -> bool:
"""发送到企业微信(多账号)"""
return self._send_to_multi_accounts(
channel_name="企业微信",
config_value=self.config["WEWORK_WEBHOOK_URL"],
send_func=lambda url, account_label: send_to_wework(
webhook_url=url,
report_data=report_data,
report_type=report_type,
update_info=update_info,
proxy_url=proxy_url,
mode=mode,
account_label=account_label,
batch_size=self.config.get("MESSAGE_BATCH_SIZE", 4000),
batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
msg_type=self.config.get("WEWORK_MSG_TYPE", "markdown"),
split_content_func=self.split_content_func,
),
)
def _send_telegram(
self,
report_data: Dict,
report_type: str,
update_info: Optional[Dict],
proxy_url: Optional[str],
mode: str,
) -> bool:
"""发送到 Telegram多账号需验证 token 和 chat_id 配对)"""
telegram_tokens = parse_multi_account_config(self.config["TELEGRAM_BOT_TOKEN"])
telegram_chat_ids = parse_multi_account_config(self.config["TELEGRAM_CHAT_ID"])
if not telegram_tokens or not telegram_chat_ids:
return False
# 验证配对
valid, count = validate_paired_configs(
{"bot_token": telegram_tokens, "chat_id": telegram_chat_ids},
"Telegram",
required_keys=["bot_token", "chat_id"],
)
if not valid or count == 0:
return False
# 限制账号数量
telegram_tokens = limit_accounts(telegram_tokens, self.max_accounts, "Telegram")
telegram_chat_ids = telegram_chat_ids[: len(telegram_tokens)]
results = []
for i in range(len(telegram_tokens)):
token = telegram_tokens[i]
chat_id = telegram_chat_ids[i]
if token and chat_id:
account_label = f"账号{i+1}" if len(telegram_tokens) > 1 else ""
result = send_to_telegram(
bot_token=token,
chat_id=chat_id,
report_data=report_data,
report_type=report_type,
update_info=update_info,
proxy_url=proxy_url,
mode=mode,
account_label=account_label,
batch_size=self.config.get("MESSAGE_BATCH_SIZE", 4000),
batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
split_content_func=self.split_content_func,
)
results.append(result)
return any(results) if results else False
def _send_ntfy(
self,
report_data: Dict,
report_type: str,
update_info: Optional[Dict],
proxy_url: Optional[str],
mode: str,
) -> bool:
"""发送到 ntfy多账号需验证 topic 和 token 配对)"""
ntfy_server_url = self.config["NTFY_SERVER_URL"]
ntfy_topics = parse_multi_account_config(self.config["NTFY_TOPIC"])
ntfy_tokens = parse_multi_account_config(self.config.get("NTFY_TOKEN", ""))
if not ntfy_server_url or not ntfy_topics:
return False
# 验证 token 和 topic 数量一致(如果配置了 token
if ntfy_tokens and len(ntfy_tokens) != len(ntfy_topics):
print(
f"❌ ntfy 配置错误topic 数量({len(ntfy_topics)})与 token 数量({len(ntfy_tokens)})不一致,跳过 ntfy 推送"
)
return False
# 限制账号数量
ntfy_topics = limit_accounts(ntfy_topics, self.max_accounts, "ntfy")
if ntfy_tokens:
ntfy_tokens = ntfy_tokens[: len(ntfy_topics)]
results = []
for i, topic in enumerate(ntfy_topics):
if topic:
token = get_account_at_index(ntfy_tokens, i, "") if ntfy_tokens else ""
account_label = f"账号{i+1}" if len(ntfy_topics) > 1 else ""
result = send_to_ntfy(
server_url=ntfy_server_url,
topic=topic,
token=token,
report_data=report_data,
report_type=report_type,
update_info=update_info,
proxy_url=proxy_url,
mode=mode,
account_label=account_label,
batch_size=3800,
split_content_func=self.split_content_func,
)
results.append(result)
return any(results) if results else False
def _send_bark(
self,
report_data: Dict,
report_type: str,
update_info: Optional[Dict],
proxy_url: Optional[str],
mode: str,
) -> bool:
"""发送到 Bark多账号"""
return self._send_to_multi_accounts(
channel_name="Bark",
config_value=self.config["BARK_URL"],
send_func=lambda url, account_label: send_to_bark(
bark_url=url,
report_data=report_data,
report_type=report_type,
update_info=update_info,
proxy_url=proxy_url,
mode=mode,
account_label=account_label,
batch_size=self.config.get("BARK_BATCH_SIZE", 3600),
batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
split_content_func=self.split_content_func,
),
)
def _send_slack(
self,
report_data: Dict,
report_type: str,
update_info: Optional[Dict],
proxy_url: Optional[str],
mode: str,
) -> bool:
"""发送到 Slack多账号"""
return self._send_to_multi_accounts(
channel_name="Slack",
config_value=self.config["SLACK_WEBHOOK_URL"],
send_func=lambda url, account_label: send_to_slack(
webhook_url=url,
report_data=report_data,
report_type=report_type,
update_info=update_info,
proxy_url=proxy_url,
mode=mode,
account_label=account_label,
batch_size=self.config.get("SLACK_BATCH_SIZE", 4000),
batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
split_content_func=self.split_content_func,
),
)
def _send_email(
self,
report_type: str,
html_file_path: Optional[str],
) -> bool:
"""发送邮件(保持原有逻辑,已支持多收件人)"""
return send_to_email(
from_email=self.config["EMAIL_FROM"],
password=self.config["EMAIL_PASSWORD"],
to_email=self.config["EMAIL_TO"],
report_type=report_type,
html_file_path=html_file_path,
custom_smtp_server=self.config.get("EMAIL_SMTP_SERVER", ""),
custom_smtp_port=self.config.get("EMAIL_SMTP_PORT", ""),
get_time_func=self.get_time_func,
)

View File

@ -0,0 +1,80 @@
# coding=utf-8
"""
通知内容格式转换模块
提供不同推送平台间的格式转换功能
"""
import re
def strip_markdown(text: str) -> str:
"""去除文本中的 markdown 语法格式,用于个人微信推送
Args:
text: 包含 markdown 格式的文本
Returns:
纯文本内容
"""
# 去除粗体 **text** 或 __text__
text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
text = re.sub(r'__(.+?)__', r'\1', text)
# 去除斜体 *text* 或 _text_
text = re.sub(r'\*(.+?)\*', r'\1', text)
text = re.sub(r'_(.+?)_', r'\1', text)
# 去除删除线 ~~text~~
text = re.sub(r'~~(.+?)~~', r'\1', text)
# 转换链接 [text](url) -> text url保留 URL
text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1 \2', text)
# 去除图片 ![alt](url) -> alt
text = re.sub(r'!\[(.+?)\]\(.+?\)', r'\1', text)
# 去除行内代码 `code`
text = re.sub(r'`(.+?)`', r'\1', text)
# 去除引用符号 >
text = re.sub(r'^>\s*', '', text, flags=re.MULTILINE)
# 去除标题符号 # ## ### 等
text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)
# 去除水平分割线 --- 或 ***
text = re.sub(r'^[\-\*]{3,}\s*$', '', text, flags=re.MULTILINE)
# 去除 HTML 标签 <font color='xxx'>text</font> -> text
text = re.sub(r'<font[^>]*>(.+?)</font>', r'\1', text)
text = re.sub(r'<[^>]+>', '', text)
# 清理多余的空行(保留最多两个连续空行)
text = re.sub(r'\n{3,}', '\n\n', text)
return text.strip()
def convert_markdown_to_mrkdwn(content: str) -> str:
"""
将标准 Markdown 转换为 Slack mrkdwn 格式
转换规则
- **粗体** *粗体*
- [文本](url) <url|文本>
- 保留其他格式代码块列表等
Args:
content: Markdown 格式的内容
Returns:
Slack mrkdwn 格式的内容
"""
# 1. 转换链接格式: [文本](url) → <url|文本>
content = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<\2|\1>', content)
# 2. 转换粗体: **文本** → *文本*
content = re.sub(r'\*\*([^*]+)\*\*', r'*\1*', content)
return content

View File

@ -0,0 +1,109 @@
# coding=utf-8
"""
推送记录管理模块
管理推送记录支持每日只推送一次和时间窗口控制
通过 storage_backend 统一存储支持本地 SQLite 和远程云存储
"""
from datetime import datetime
from typing import Callable, Optional, Any
import pytz
class PushRecordManager:
"""
推送记录管理器
通过 storage_backend 统一管理推送记录
- 本地环境使用 LocalStorageBackend数据存储在本地 SQLite
- GitHub Actions使用 RemoteStorageBackend数据存储在云端
这样 once_per_day 功能在 GitHub Actions 上也能正常工作
"""
def __init__(
self,
storage_backend: Any,
get_time_func: Optional[Callable[[], datetime]] = None,
):
"""
初始化推送记录管理器
Args:
storage_backend: 存储后端实例LocalStorageBackend RemoteStorageBackend
get_time_func: 获取当前时间的函数应使用配置的时区
"""
self.storage_backend = storage_backend
self.get_time = get_time_func or self._default_get_time
print(f"[推送记录] 使用 {storage_backend.backend_name} 存储后端")
def _default_get_time(self) -> datetime:
"""默认时间获取函数UTC+8"""
return datetime.now(pytz.timezone("Asia/Shanghai"))
def has_pushed_today(self) -> bool:
"""
检查今天是否已经推送过
Returns:
是否已推送
"""
return self.storage_backend.has_pushed_today()
def record_push(self, report_type: str) -> bool:
"""
记录推送
Args:
report_type: 报告类型
Returns:
是否记录成功
"""
return self.storage_backend.record_push(report_type)
def is_in_time_range(self, start_time: str, end_time: str) -> bool:
"""
检查当前时间是否在指定时间范围内
Args:
start_time: 开始时间格式HH:MM
end_time: 结束时间格式HH:MM
Returns:
是否在时间范围内
"""
now = self.get_time()
current_time = now.strftime("%H:%M")
def normalize_time(time_str: str) -> str:
"""将时间字符串标准化为 HH:MM 格式"""
try:
parts = time_str.strip().split(":")
if len(parts) != 2:
raise ValueError(f"时间格式错误: {time_str}")
hour = int(parts[0])
minute = int(parts[1])
if not (0 <= hour <= 23 and 0 <= minute <= 59):
raise ValueError(f"时间范围错误: {time_str}")
return f"{hour:02d}:{minute:02d}"
except Exception as e:
print(f"时间格式化错误 '{time_str}': {e}")
return time_str
normalized_start = normalize_time(start_time)
normalized_end = normalize_time(end_time)
normalized_current = normalize_time(current_time)
result = normalized_start <= normalized_current <= normalized_end
if not result:
print(f"时间窗口判断:当前 {normalized_current},窗口 {normalized_start}-{normalized_end}")
return result

View File

@ -0,0 +1,260 @@
# coding=utf-8
"""
通知内容渲染模块
提供多平台通知内容渲染功能生成格式化的推送消息
"""
from datetime import datetime
from typing import Dict, List, Optional, Callable
from trendradar.report.formatter import format_title_for_platform
def render_feishu_content(
report_data: Dict,
update_info: Optional[Dict] = None,
mode: str = "daily",
separator: str = "---",
reverse_content_order: bool = False,
get_time_func: Optional[Callable[[], datetime]] = None,
) -> str:
"""渲染飞书通知内容
Args:
report_data: 报告数据字典包含 stats, new_titles, failed_ids, total_new_count
update_info: 版本更新信息可选
mode: 报告模式 ("daily", "incremental", "current")
separator: 内容分隔符
reverse_content_order: 是否反转内容顺序新增在前
get_time_func: 获取当前时间的函数可选默认使用 datetime.now()
Returns:
格式化的飞书消息内容
"""
# 生成热点词汇统计部分
stats_content = ""
if report_data["stats"]:
stats_content += "📊 **热点词汇统计**\n\n"
total_count = len(report_data["stats"])
for i, stat in enumerate(report_data["stats"]):
word = stat["word"]
count = stat["count"]
sequence_display = f"<font color='grey'>[{i + 1}/{total_count}]</font>"
if count >= 10:
stats_content += f"🔥 {sequence_display} **{word}** : <font color='red'>{count}</font> 条\n\n"
elif count >= 5:
stats_content += f"📈 {sequence_display} **{word}** : <font color='orange'>{count}</font> 条\n\n"
else:
stats_content += f"📌 {sequence_display} **{word}** : {count}\n\n"
for j, title_data in enumerate(stat["titles"], 1):
formatted_title = format_title_for_platform(
"feishu", title_data, show_source=True
)
stats_content += f" {j}. {formatted_title}\n"
if j < len(stat["titles"]):
stats_content += "\n"
if i < len(report_data["stats"]) - 1:
stats_content += f"\n{separator}\n\n"
# 生成新增新闻部分
new_titles_content = ""
if report_data["new_titles"]:
new_titles_content += (
f"🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
)
for source_data in report_data["new_titles"]:
new_titles_content += (
f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n"
)
for j, title_data in enumerate(source_data["titles"], 1):
title_data_copy = title_data.copy()
title_data_copy["is_new"] = False
formatted_title = format_title_for_platform(
"feishu", title_data_copy, show_source=False
)
new_titles_content += f" {j}. {formatted_title}\n"
new_titles_content += "\n"
# 根据配置决定内容顺序
text_content = ""
if reverse_content_order:
# 新增热点在前,热点词汇统计在后
if new_titles_content:
text_content += new_titles_content
if stats_content:
text_content += f"\n{separator}\n\n"
if stats_content:
text_content += stats_content
else:
# 默认:热点词汇统计在前,新增热点在后
if stats_content:
text_content += stats_content
if new_titles_content:
text_content += f"\n{separator}\n\n"
if new_titles_content:
text_content += new_titles_content
if not text_content:
if mode == "incremental":
mode_text = "增量模式下暂无新增匹配的热点词汇"
elif mode == "current":
mode_text = "当前榜单模式下暂无匹配的热点词汇"
else:
mode_text = "暂无匹配的热点词汇"
text_content = f"📭 {mode_text}\n\n"
if report_data["failed_ids"]:
if text_content and "暂无匹配" not in text_content:
text_content += f"\n{separator}\n\n"
text_content += "⚠️ **数据获取失败的平台:**\n\n"
for i, id_value in enumerate(report_data["failed_ids"], 1):
text_content += f" • <font color='red'>{id_value}</font>\n"
# 获取当前时间
now = get_time_func() if get_time_func else datetime.now()
text_content += (
f"\n\n<font color='grey'>更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}</font>"
)
if update_info:
text_content += f"\n<font color='grey'>TrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}</font>"
return text_content
def render_dingtalk_content(
report_data: Dict,
update_info: Optional[Dict] = None,
mode: str = "daily",
reverse_content_order: bool = False,
get_time_func: Optional[Callable[[], datetime]] = None,
) -> str:
"""渲染钉钉通知内容
Args:
report_data: 报告数据字典包含 stats, new_titles, failed_ids, total_new_count
update_info: 版本更新信息可选
mode: 报告模式 ("daily", "incremental", "current")
reverse_content_order: 是否反转内容顺序新增在前
get_time_func: 获取当前时间的函数可选默认使用 datetime.now()
Returns:
格式化的钉钉消息内容
"""
total_titles = sum(
len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0
)
now = get_time_func() if get_time_func else datetime.now()
# 头部信息
header_content = f"**总新闻数:** {total_titles}\n\n"
header_content += f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
header_content += "**类型:** 热点分析报告\n\n"
header_content += "---\n\n"
# 生成热点词汇统计部分
stats_content = ""
if report_data["stats"]:
stats_content += "📊 **热点词汇统计**\n\n"
total_count = len(report_data["stats"])
for i, stat in enumerate(report_data["stats"]):
word = stat["word"]
count = stat["count"]
sequence_display = f"[{i + 1}/{total_count}]"
if count >= 10:
stats_content += f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
elif count >= 5:
stats_content += f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
else:
stats_content += f"📌 {sequence_display} **{word}** : {count}\n\n"
for j, title_data in enumerate(stat["titles"], 1):
formatted_title = format_title_for_platform(
"dingtalk", title_data, show_source=True
)
stats_content += f" {j}. {formatted_title}\n"
if j < len(stat["titles"]):
stats_content += "\n"
if i < len(report_data["stats"]) - 1:
stats_content += "\n---\n\n"
# 生成新增新闻部分
new_titles_content = ""
if report_data["new_titles"]:
new_titles_content += (
f"🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
)
for source_data in report_data["new_titles"]:
new_titles_content += f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
for j, title_data in enumerate(source_data["titles"], 1):
title_data_copy = title_data.copy()
title_data_copy["is_new"] = False
formatted_title = format_title_for_platform(
"dingtalk", title_data_copy, show_source=False
)
new_titles_content += f" {j}. {formatted_title}\n"
new_titles_content += "\n"
# 根据配置决定内容顺序
text_content = header_content
if reverse_content_order:
# 新增热点在前,热点词汇统计在后
if new_titles_content:
text_content += new_titles_content
if stats_content:
text_content += "\n---\n\n"
if stats_content:
text_content += stats_content
else:
# 默认:热点词汇统计在前,新增热点在后
if stats_content:
text_content += stats_content
if new_titles_content:
text_content += "\n---\n\n"
if new_titles_content:
text_content += new_titles_content
if not stats_content and not new_titles_content:
if mode == "incremental":
mode_text = "增量模式下暂无新增匹配的热点词汇"
elif mode == "current":
mode_text = "当前榜单模式下暂无匹配的热点词汇"
else:
mode_text = "暂无匹配的热点词汇"
text_content += f"📭 {mode_text}\n\n"
if report_data["failed_ids"]:
if "暂无匹配" not in text_content:
text_content += "\n---\n\n"
text_content += "⚠️ **数据获取失败的平台:**\n\n"
for i, id_value in enumerate(report_data["failed_ids"], 1):
text_content += f" • **{id_value}**\n"
text_content += f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
if update_info:
text_content += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
return text_content

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,580 @@
# coding=utf-8
"""
消息分批处理模块
提供消息内容分批拆分功能确保消息大小不超过各平台限制
"""
from datetime import datetime
from typing import Dict, List, Optional, Callable
from trendradar.report.formatter import format_title_for_platform
# 默认批次大小配置
DEFAULT_BATCH_SIZES = {
"dingtalk": 20000,
"feishu": 29000,
"ntfy": 3800,
"default": 4000,
}
def split_content_into_batches(
report_data: Dict,
format_type: str,
update_info: Optional[Dict] = None,
max_bytes: Optional[int] = None,
mode: str = "daily",
batch_sizes: Optional[Dict[str, int]] = None,
feishu_separator: str = "---",
reverse_content_order: bool = False,
get_time_func: Optional[Callable[[], datetime]] = None,
) -> List[str]:
"""分批处理消息内容,确保词组标题+至少第一条新闻的完整性
Args:
report_data: 报告数据字典包含 stats, new_titles, failed_ids, total_new_count
format_type: 格式类型 (feishu, dingtalk, wework, telegram, ntfy, bark, slack)
update_info: 版本更新信息可选
max_bytes: 最大字节数可选如果不指定则使用默认配置
mode: 报告模式 (daily, incremental, current)
batch_sizes: 批次大小配置字典可选
feishu_separator: 飞书消息分隔符
reverse_content_order: 是否反转内容顺序新增在前
get_time_func: 获取当前时间的函数可选
Returns:
分批后的消息内容列表
"""
# 合并批次大小配置
sizes = {**DEFAULT_BATCH_SIZES, **(batch_sizes or {})}
if max_bytes is None:
if format_type == "dingtalk":
max_bytes = sizes.get("dingtalk", 20000)
elif format_type == "feishu":
max_bytes = sizes.get("feishu", 29000)
elif format_type == "ntfy":
max_bytes = sizes.get("ntfy", 3800)
else:
max_bytes = sizes.get("default", 4000)
batches = []
total_titles = sum(
len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0
)
now = get_time_func() if get_time_func else datetime.now()
base_header = ""
if format_type in ("wework", "bark"):
base_header = f"**总新闻数:** {total_titles}\n\n\n\n"
elif format_type == "telegram":
base_header = f"总新闻数: {total_titles}\n\n"
elif format_type == "ntfy":
base_header = f"**总新闻数:** {total_titles}\n\n"
elif format_type == "feishu":
base_header = ""
elif format_type == "dingtalk":
base_header = f"**总新闻数:** {total_titles}\n\n"
base_header += f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
base_header += f"**类型:** 热点分析报告\n\n"
base_header += "---\n\n"
elif format_type == "slack":
base_header = f"*总新闻数:* {total_titles}\n\n"
base_footer = ""
if format_type in ("wework", "bark"):
base_footer = f"\n\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
if update_info:
base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
elif format_type == "telegram":
base_footer = f"\n\n更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
if update_info:
base_footer += f"\nTrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}"
elif format_type == "ntfy":
base_footer = f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
if update_info:
base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
elif format_type == "feishu":
base_footer = f"\n\n<font color='grey'>更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}</font>"
if update_info:
base_footer += f"\n<font color='grey'>TrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}</font>"
elif format_type == "dingtalk":
base_footer = f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
if update_info:
base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
elif format_type == "slack":
base_footer = f"\n\n_更新时间{now.strftime('%Y-%m-%d %H:%M:%S')}_"
if update_info:
base_footer += f"\n_TrendRadar 发现新版本 *{update_info['remote_version']}*,当前 *{update_info['current_version']}_"
stats_header = ""
if report_data["stats"]:
if format_type in ("wework", "bark"):
stats_header = f"📊 **热点词汇统计**\n\n"
elif format_type == "telegram":
stats_header = f"📊 热点词汇统计\n\n"
elif format_type == "ntfy":
stats_header = f"📊 **热点词汇统计**\n\n"
elif format_type == "feishu":
stats_header = f"📊 **热点词汇统计**\n\n"
elif format_type == "dingtalk":
stats_header = f"📊 **热点词汇统计**\n\n"
elif format_type == "slack":
stats_header = f"📊 *热点词汇统计*\n\n"
current_batch = base_header
current_batch_has_content = False
if (
not report_data["stats"]
and not report_data["new_titles"]
and not report_data["failed_ids"]
):
if mode == "incremental":
mode_text = "增量模式下暂无新增匹配的热点词汇"
elif mode == "current":
mode_text = "当前榜单模式下暂无匹配的热点词汇"
else:
mode_text = "暂无匹配的热点词汇"
simple_content = f"📭 {mode_text}\n\n"
final_content = base_header + simple_content + base_footer
batches.append(final_content)
return batches
# 定义处理热点词汇统计的函数
def process_stats_section(current_batch, current_batch_has_content, batches):
"""处理热点词汇统计"""
if not report_data["stats"]:
return current_batch, current_batch_has_content, batches
total_count = len(report_data["stats"])
# 添加统计标题
test_content = current_batch + stats_header
if (
len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
< max_bytes
):
current_batch = test_content
current_batch_has_content = True
else:
if current_batch_has_content:
batches.append(current_batch + base_footer)
current_batch = base_header + stats_header
current_batch_has_content = True
# 逐个处理词组(确保词组标题+第一条新闻的原子性)
for i, stat in enumerate(report_data["stats"]):
word = stat["word"]
count = stat["count"]
sequence_display = f"[{i + 1}/{total_count}]"
# 构建词组标题
word_header = ""
if format_type in ("wework", "bark"):
if count >= 10:
word_header = (
f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
)
elif count >= 5:
word_header = (
f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
)
else:
word_header = f"📌 {sequence_display} **{word}** : {count}\n\n"
elif format_type == "telegram":
if count >= 10:
word_header = f"🔥 {sequence_display} {word} : {count}\n\n"
elif count >= 5:
word_header = f"📈 {sequence_display} {word} : {count}\n\n"
else:
word_header = f"📌 {sequence_display} {word} : {count}\n\n"
elif format_type == "ntfy":
if count >= 10:
word_header = (
f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
)
elif count >= 5:
word_header = (
f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
)
else:
word_header = f"📌 {sequence_display} **{word}** : {count}\n\n"
elif format_type == "feishu":
if count >= 10:
word_header = f"🔥 <font color='grey'>{sequence_display}</font> **{word}** : <font color='red'>{count}</font> 条\n\n"
elif count >= 5:
word_header = f"📈 <font color='grey'>{sequence_display}</font> **{word}** : <font color='orange'>{count}</font> 条\n\n"
else:
word_header = f"📌 <font color='grey'>{sequence_display}</font> **{word}** : {count}\n\n"
elif format_type == "dingtalk":
if count >= 10:
word_header = (
f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
)
elif count >= 5:
word_header = (
f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
)
else:
word_header = f"📌 {sequence_display} **{word}** : {count}\n\n"
elif format_type == "slack":
if count >= 10:
word_header = (
f"🔥 {sequence_display} *{word}* : *{count}* 条\n\n"
)
elif count >= 5:
word_header = (
f"📈 {sequence_display} *{word}* : *{count}* 条\n\n"
)
else:
word_header = f"📌 {sequence_display} *{word}* : {count}\n\n"
# 构建第一条新闻
first_news_line = ""
if stat["titles"]:
first_title_data = stat["titles"][0]
if format_type in ("wework", "bark"):
formatted_title = format_title_for_platform(
"wework", first_title_data, show_source=True
)
elif format_type == "telegram":
formatted_title = format_title_for_platform(
"telegram", first_title_data, show_source=True
)
elif format_type == "ntfy":
formatted_title = format_title_for_platform(
"ntfy", first_title_data, show_source=True
)
elif format_type == "feishu":
formatted_title = format_title_for_platform(
"feishu", first_title_data, show_source=True
)
elif format_type == "dingtalk":
formatted_title = format_title_for_platform(
"dingtalk", first_title_data, show_source=True
)
elif format_type == "slack":
formatted_title = format_title_for_platform(
"slack", first_title_data, show_source=True
)
else:
formatted_title = f"{first_title_data['title']}"
first_news_line = f" 1. {formatted_title}\n"
if len(stat["titles"]) > 1:
first_news_line += "\n"
# 原子性检查:词组标题+第一条新闻必须一起处理
word_with_first_news = word_header + first_news_line
test_content = current_batch + word_with_first_news
if (
len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
>= max_bytes
):
# 当前批次容纳不下,开启新批次
if current_batch_has_content:
batches.append(current_batch + base_footer)
current_batch = base_header + stats_header + word_with_first_news
current_batch_has_content = True
start_index = 1
else:
current_batch = test_content
current_batch_has_content = True
start_index = 1
# 处理剩余新闻条目
for j in range(start_index, len(stat["titles"])):
title_data = stat["titles"][j]
if format_type in ("wework", "bark"):
formatted_title = format_title_for_platform(
"wework", title_data, show_source=True
)
elif format_type == "telegram":
formatted_title = format_title_for_platform(
"telegram", title_data, show_source=True
)
elif format_type == "ntfy":
formatted_title = format_title_for_platform(
"ntfy", title_data, show_source=True
)
elif format_type == "feishu":
formatted_title = format_title_for_platform(
"feishu", title_data, show_source=True
)
elif format_type == "dingtalk":
formatted_title = format_title_for_platform(
"dingtalk", title_data, show_source=True
)
elif format_type == "slack":
formatted_title = format_title_for_platform(
"slack", title_data, show_source=True
)
else:
formatted_title = f"{title_data['title']}"
news_line = f" {j + 1}. {formatted_title}\n"
if j < len(stat["titles"]) - 1:
news_line += "\n"
test_content = current_batch + news_line
if (
len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
>= max_bytes
):
if current_batch_has_content:
batches.append(current_batch + base_footer)
current_batch = base_header + stats_header + word_header + news_line
current_batch_has_content = True
else:
current_batch = test_content
current_batch_has_content = True
# 词组间分隔符
if i < len(report_data["stats"]) - 1:
separator = ""
if format_type in ("wework", "bark"):
separator = f"\n\n\n\n"
elif format_type == "telegram":
separator = f"\n\n"
elif format_type == "ntfy":
separator = f"\n\n"
elif format_type == "feishu":
separator = f"\n{feishu_separator}\n\n"
elif format_type == "dingtalk":
separator = f"\n---\n\n"
elif format_type == "slack":
separator = f"\n\n"
test_content = current_batch + separator
if (
len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
< max_bytes
):
current_batch = test_content
return current_batch, current_batch_has_content, batches
# 定义处理新增新闻的函数
def process_new_titles_section(current_batch, current_batch_has_content, batches):
"""处理新增新闻"""
if not report_data["new_titles"]:
return current_batch, current_batch_has_content, batches
new_header = ""
if format_type in ("wework", "bark"):
new_header = f"\n\n\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
elif format_type == "telegram":
new_header = (
f"\n\n🆕 本次新增热点新闻 (共 {report_data['total_new_count']} 条)\n\n"
)
elif format_type == "ntfy":
new_header = f"\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
elif format_type == "feishu":
new_header = f"\n{feishu_separator}\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
elif format_type == "dingtalk":
new_header = f"\n---\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
elif format_type == "slack":
new_header = f"\n\n🆕 *本次新增热点新闻* (共 {report_data['total_new_count']} 条)\n\n"
test_content = current_batch + new_header
if (
len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
>= max_bytes
):
if current_batch_has_content:
batches.append(current_batch + base_footer)
current_batch = base_header + new_header
current_batch_has_content = True
else:
current_batch = test_content
current_batch_has_content = True
# 逐个处理新增新闻来源
for source_data in report_data["new_titles"]:
source_header = ""
if format_type in ("wework", "bark"):
source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
elif format_type == "telegram":
source_header = f"{source_data['source_name']} ({len(source_data['titles'])} 条):\n\n"
elif format_type == "ntfy":
source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
elif format_type == "feishu":
source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
elif format_type == "dingtalk":
source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
elif format_type == "slack":
source_header = f"*{source_data['source_name']}* ({len(source_data['titles'])} 条):\n\n"
# 构建第一条新增新闻
first_news_line = ""
if source_data["titles"]:
first_title_data = source_data["titles"][0]
title_data_copy = first_title_data.copy()
title_data_copy["is_new"] = False
if format_type in ("wework", "bark"):
formatted_title = format_title_for_platform(
"wework", title_data_copy, show_source=False
)
elif format_type == "telegram":
formatted_title = format_title_for_platform(
"telegram", title_data_copy, show_source=False
)
elif format_type == "feishu":
formatted_title = format_title_for_platform(
"feishu", title_data_copy, show_source=False
)
elif format_type == "dingtalk":
formatted_title = format_title_for_platform(
"dingtalk", title_data_copy, show_source=False
)
elif format_type == "slack":
formatted_title = format_title_for_platform(
"slack", title_data_copy, show_source=False
)
else:
formatted_title = f"{title_data_copy['title']}"
first_news_line = f" 1. {formatted_title}\n"
# 原子性检查:来源标题+第一条新闻
source_with_first_news = source_header + first_news_line
test_content = current_batch + source_with_first_news
if (
len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
>= max_bytes
):
if current_batch_has_content:
batches.append(current_batch + base_footer)
current_batch = base_header + new_header + source_with_first_news
current_batch_has_content = True
start_index = 1
else:
current_batch = test_content
current_batch_has_content = True
start_index = 1
# 处理剩余新增新闻
for j in range(start_index, len(source_data["titles"])):
title_data = source_data["titles"][j]
title_data_copy = title_data.copy()
title_data_copy["is_new"] = False
if format_type == "wework":
formatted_title = format_title_for_platform(
"wework", title_data_copy, show_source=False
)
elif format_type == "telegram":
formatted_title = format_title_for_platform(
"telegram", title_data_copy, show_source=False
)
elif format_type == "feishu":
formatted_title = format_title_for_platform(
"feishu", title_data_copy, show_source=False
)
elif format_type == "dingtalk":
formatted_title = format_title_for_platform(
"dingtalk", title_data_copy, show_source=False
)
elif format_type == "slack":
formatted_title = format_title_for_platform(
"slack", title_data_copy, show_source=False
)
else:
formatted_title = f"{title_data_copy['title']}"
news_line = f" {j + 1}. {formatted_title}\n"
test_content = current_batch + news_line
if (
len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
>= max_bytes
):
if current_batch_has_content:
batches.append(current_batch + base_footer)
current_batch = base_header + new_header + source_header + news_line
current_batch_has_content = True
else:
current_batch = test_content
current_batch_has_content = True
current_batch += "\n"
return current_batch, current_batch_has_content, batches
# 根据配置决定处理顺序
if reverse_content_order:
# 新增热点在前,热点词汇统计在后
current_batch, current_batch_has_content, batches = process_new_titles_section(
current_batch, current_batch_has_content, batches
)
current_batch, current_batch_has_content, batches = process_stats_section(
current_batch, current_batch_has_content, batches
)
else:
# 默认:热点词汇统计在前,新增热点在后
current_batch, current_batch_has_content, batches = process_stats_section(
current_batch, current_batch_has_content, batches
)
current_batch, current_batch_has_content, batches = process_new_titles_section(
current_batch, current_batch_has_content, batches
)
if report_data["failed_ids"]:
failed_header = ""
if format_type == "wework":
failed_header = f"\n\n\n\n⚠️ **数据获取失败的平台:**\n\n"
elif format_type == "telegram":
failed_header = f"\n\n⚠️ 数据获取失败的平台:\n\n"
elif format_type == "ntfy":
failed_header = f"\n\n⚠️ **数据获取失败的平台:**\n\n"
elif format_type == "feishu":
failed_header = f"\n{feishu_separator}\n\n⚠️ **数据获取失败的平台:**\n\n"
elif format_type == "dingtalk":
failed_header = f"\n---\n\n⚠️ **数据获取失败的平台:**\n\n"
test_content = current_batch + failed_header
if (
len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
>= max_bytes
):
if current_batch_has_content:
batches.append(current_batch + base_footer)
current_batch = base_header + failed_header
current_batch_has_content = True
else:
current_batch = test_content
current_batch_has_content = True
for i, id_value in enumerate(report_data["failed_ids"], 1):
if format_type == "feishu":
failed_line = f" • <font color='red'>{id_value}</font>\n"
elif format_type == "dingtalk":
failed_line = f" • **{id_value}**\n"
else:
failed_line = f"{id_value}\n"
test_content = current_batch + failed_line
if (
len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
>= max_bytes
):
if current_batch_has_content:
batches.append(current_batch + base_footer)
current_batch = base_header + failed_header + failed_line
current_batch_has_content = True
else:
current_batch = test_content
current_batch_has_content = True
# 完成最后批次
if current_batch_has_content:
batches.append(current_batch + base_footer)
return batches

View File

@ -0,0 +1,40 @@
# coding=utf-8
"""
报告生成模块
提供报告生成和格式化功能包括
- HTML 报告生成
- 标题格式化工具
模块结构
- helpers: 报告辅助函数清理转义格式化
- formatter: 平台标题格式化
- html: HTML 报告渲染
- generator: 报告生成器
"""
from trendradar.report.helpers import (
clean_title,
html_escape,
format_rank_display,
)
from trendradar.report.formatter import format_title_for_platform
from trendradar.report.html import render_html_content
from trendradar.report.generator import (
prepare_report_data,
generate_html_report,
)
__all__ = [
# 辅助函数
"clean_title",
"html_escape",
"format_rank_display",
# 格式化函数
"format_title_for_platform",
# HTML 渲染
"render_html_content",
# 报告生成器
"prepare_report_data",
"generate_html_report",
]

View File

@ -0,0 +1,223 @@
# coding=utf-8
"""
平台标题格式化模块
提供多平台标题格式化功能
"""
from typing import Dict
from trendradar.report.helpers import clean_title, html_escape, format_rank_display
def format_title_for_platform(
platform: str, title_data: Dict, show_source: bool = True
) -> str:
"""统一的标题格式化方法
为不同平台生成对应格式的标题字符串
Args:
platform: 目标平台支持:
- "feishu": 飞书
- "dingtalk": 钉钉
- "wework": 企业微信
- "bark": Bark
- "telegram": Telegram
- "ntfy": ntfy
- "slack": Slack
- "html": HTML 报告
title_data: 标题数据字典包含以下字段:
- title: 标题文本
- source_name: 来源名称
- time_display: 时间显示
- count: 出现次数
- ranks: 排名列表
- rank_threshold: 高亮阈值
- url: PC端链接
- mobile_url: 移动端链接优先使用
- is_new: 是否为新增标题可选
show_source: 是否显示来源名称
Returns:
格式化后的标题字符串
"""
rank_display = format_rank_display(
title_data["ranks"], title_data["rank_threshold"], platform
)
link_url = title_data["mobile_url"] or title_data["url"]
cleaned_title = clean_title(title_data["title"])
if platform == "feishu":
if link_url:
formatted_title = f"[{cleaned_title}]({link_url})"
else:
formatted_title = cleaned_title
title_prefix = "🆕 " if title_data.get("is_new") else ""
if show_source:
result = f"<font color='grey'>[{title_data['source_name']}]</font> {title_prefix}{formatted_title}"
else:
result = f"{title_prefix}{formatted_title}"
if rank_display:
result += f" {rank_display}"
if title_data["time_display"]:
result += f" <font color='grey'>- {title_data['time_display']}</font>"
if title_data["count"] > 1:
result += f" <font color='green'>({title_data['count']}次)</font>"
return result
elif platform == "dingtalk":
if link_url:
formatted_title = f"[{cleaned_title}]({link_url})"
else:
formatted_title = cleaned_title
title_prefix = "🆕 " if title_data.get("is_new") else ""
if show_source:
result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
else:
result = f"{title_prefix}{formatted_title}"
if rank_display:
result += f" {rank_display}"
if title_data["time_display"]:
result += f" - {title_data['time_display']}"
if title_data["count"] > 1:
result += f" ({title_data['count']}次)"
return result
elif platform in ("wework", "bark"):
# WeWork 和 Bark 使用 markdown 格式
if link_url:
formatted_title = f"[{cleaned_title}]({link_url})"
else:
formatted_title = cleaned_title
title_prefix = "🆕 " if title_data.get("is_new") else ""
if show_source:
result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
else:
result = f"{title_prefix}{formatted_title}"
if rank_display:
result += f" {rank_display}"
if title_data["time_display"]:
result += f" - {title_data['time_display']}"
if title_data["count"] > 1:
result += f" ({title_data['count']}次)"
return result
elif platform == "telegram":
if link_url:
formatted_title = f'<a href="{link_url}">{html_escape(cleaned_title)}</a>'
else:
formatted_title = cleaned_title
title_prefix = "🆕 " if title_data.get("is_new") else ""
if show_source:
result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
else:
result = f"{title_prefix}{formatted_title}"
if rank_display:
result += f" {rank_display}"
if title_data["time_display"]:
result += f" <code>- {title_data['time_display']}</code>"
if title_data["count"] > 1:
result += f" <code>({title_data['count']}次)</code>"
return result
elif platform == "ntfy":
if link_url:
formatted_title = f"[{cleaned_title}]({link_url})"
else:
formatted_title = cleaned_title
title_prefix = "🆕 " if title_data.get("is_new") else ""
if show_source:
result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
else:
result = f"{title_prefix}{formatted_title}"
if rank_display:
result += f" {rank_display}"
if title_data["time_display"]:
result += f" `- {title_data['time_display']}`"
if title_data["count"] > 1:
result += f" `({title_data['count']}次)`"
return result
elif platform == "slack":
# Slack 使用 mrkdwn 格式
if link_url:
# Slack 链接格式: <url|text>
formatted_title = f"<{link_url}|{cleaned_title}>"
else:
formatted_title = cleaned_title
title_prefix = "🆕 " if title_data.get("is_new") else ""
if show_source:
result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
else:
result = f"{title_prefix}{formatted_title}"
# 排名(使用 * 加粗)
rank_display = format_rank_display(
title_data["ranks"], title_data["rank_threshold"], "slack"
)
if rank_display:
result += f" {rank_display}"
if title_data["time_display"]:
result += f" `- {title_data['time_display']}`"
if title_data["count"] > 1:
result += f" `({title_data['count']}次)`"
return result
elif platform == "html":
rank_display = format_rank_display(
title_data["ranks"], title_data["rank_threshold"], "html"
)
link_url = title_data["mobile_url"] or title_data["url"]
escaped_title = html_escape(cleaned_title)
escaped_source_name = html_escape(title_data["source_name"])
if link_url:
escaped_url = html_escape(link_url)
formatted_title = f'[{escaped_source_name}] <a href="{escaped_url}" target="_blank" class="news-link">{escaped_title}</a>'
else:
formatted_title = (
f'[{escaped_source_name}] <span class="no-link">{escaped_title}</span>'
)
if rank_display:
formatted_title += f" {rank_display}"
if title_data["time_display"]:
escaped_time = html_escape(title_data["time_display"])
formatted_title += f" <font color='grey'>- {escaped_time}</font>"
if title_data["count"] > 1:
formatted_title += f" <font color='green'>({title_data['count']}次)</font>"
if title_data.get("is_new"):
formatted_title = f"<div class='new-title'>🆕 {formatted_title}</div>"
return formatted_title
else:
return cleaned_title

View File

@ -0,0 +1,235 @@
# coding=utf-8
"""
报告生成模块
提供报告数据准备和 HTML 生成功能
- prepare_report_data: 准备报告数据
- generate_html_report: 生成 HTML 报告
"""
from pathlib import Path
from typing import Dict, List, Optional, Callable
def prepare_report_data(
stats: List[Dict],
failed_ids: Optional[List] = None,
new_titles: Optional[Dict] = None,
id_to_name: Optional[Dict] = None,
mode: str = "daily",
rank_threshold: int = 3,
matches_word_groups_func: Optional[Callable] = None,
load_frequency_words_func: Optional[Callable] = None,
) -> Dict:
"""
准备报告数据
Args:
stats: 统计结果列表
failed_ids: 失败的 ID 列表
new_titles: 新增标题
id_to_name: ID 到名称的映射
mode: 报告模式 (daily/incremental/current)
rank_threshold: 排名阈值
matches_word_groups_func: 词组匹配函数
load_frequency_words_func: 加载频率词函数
Returns:
Dict: 准备好的报告数据
"""
processed_new_titles = []
# 在增量模式下隐藏新增新闻区域
hide_new_section = mode == "incremental"
# 只有在非隐藏模式下才处理新增新闻部分
if not hide_new_section:
filtered_new_titles = {}
if new_titles and id_to_name:
# 如果提供了匹配函数,使用它过滤
if matches_word_groups_func and load_frequency_words_func:
word_groups, filter_words, global_filters = load_frequency_words_func()
for source_id, titles_data in new_titles.items():
filtered_titles = {}
for title, title_data in titles_data.items():
if matches_word_groups_func(title, word_groups, filter_words, global_filters):
filtered_titles[title] = title_data
if filtered_titles:
filtered_new_titles[source_id] = filtered_titles
else:
# 没有匹配函数时,使用全部
filtered_new_titles = new_titles
# 打印过滤后的新增热点数(与推送显示一致)
original_new_count = sum(len(titles) for titles in new_titles.values()) if new_titles else 0
filtered_new_count = sum(len(titles) for titles in filtered_new_titles.values()) if filtered_new_titles else 0
if original_new_count > 0:
print(f"频率词过滤后:{filtered_new_count} 条新增热点匹配(原始 {original_new_count} 条)")
if filtered_new_titles and id_to_name:
for source_id, titles_data in filtered_new_titles.items():
source_name = id_to_name.get(source_id, source_id)
source_titles = []
for title, title_data in titles_data.items():
url = title_data.get("url", "")
mobile_url = title_data.get("mobileUrl", "")
ranks = title_data.get("ranks", [])
processed_title = {
"title": title,
"source_name": source_name,
"time_display": "",
"count": 1,
"ranks": ranks,
"rank_threshold": rank_threshold,
"url": url,
"mobile_url": mobile_url,
"is_new": True,
}
source_titles.append(processed_title)
if source_titles:
processed_new_titles.append(
{
"source_id": source_id,
"source_name": source_name,
"titles": source_titles,
}
)
processed_stats = []
for stat in stats:
if stat["count"] <= 0:
continue
processed_titles = []
for title_data in stat["titles"]:
processed_title = {
"title": title_data["title"],
"source_name": title_data["source_name"],
"time_display": title_data["time_display"],
"count": title_data["count"],
"ranks": title_data["ranks"],
"rank_threshold": title_data["rank_threshold"],
"url": title_data.get("url", ""),
"mobile_url": title_data.get("mobileUrl", ""),
"is_new": title_data.get("is_new", False),
}
processed_titles.append(processed_title)
processed_stats.append(
{
"word": stat["word"],
"count": stat["count"],
"percentage": stat.get("percentage", 0),
"titles": processed_titles,
}
)
return {
"stats": processed_stats,
"new_titles": processed_new_titles,
"failed_ids": failed_ids or [],
"total_new_count": sum(
len(source["titles"]) for source in processed_new_titles
),
}
def generate_html_report(
stats: List[Dict],
total_titles: int,
failed_ids: Optional[List] = None,
new_titles: Optional[Dict] = None,
id_to_name: Optional[Dict] = None,
mode: str = "daily",
is_daily_summary: bool = False,
update_info: Optional[Dict] = None,
rank_threshold: int = 3,
output_dir: str = "output",
date_folder: str = "",
time_filename: str = "",
render_html_func: Optional[Callable] = None,
matches_word_groups_func: Optional[Callable] = None,
load_frequency_words_func: Optional[Callable] = None,
enable_index_copy: bool = True,
) -> str:
"""
生成 HTML 报告
Args:
stats: 统计结果列表
total_titles: 总标题数
failed_ids: 失败的 ID 列表
new_titles: 新增标题
id_to_name: ID 到名称的映射
mode: 报告模式 (daily/incremental/current)
is_daily_summary: 是否是每日汇总
update_info: 更新信息
rank_threshold: 排名阈值
output_dir: 输出目录
date_folder: 日期文件夹名称
time_filename: 时间文件名
render_html_func: HTML 渲染函数
matches_word_groups_func: 词组匹配函数
load_frequency_words_func: 加载频率词函数
enable_index_copy: 是否复制到 index.html
Returns:
str: 生成的 HTML 文件路径
"""
if is_daily_summary:
if mode == "current":
filename = "当前榜单汇总.html"
elif mode == "incremental":
filename = "当日增量.html"
else:
filename = "当日汇总.html"
else:
filename = f"{time_filename}.html"
# 构建输出路径
output_path = Path(output_dir) / date_folder / "html"
output_path.mkdir(parents=True, exist_ok=True)
file_path = str(output_path / filename)
# 准备报告数据
report_data = prepare_report_data(
stats,
failed_ids,
new_titles,
id_to_name,
mode,
rank_threshold,
matches_word_groups_func,
load_frequency_words_func,
)
# 渲染 HTML 内容
if render_html_func:
html_content = render_html_func(
report_data, total_titles, is_daily_summary, mode, update_info
)
else:
# 默认简单 HTML
html_content = f"<html><body><h1>Report</h1><pre>{report_data}</pre></body></html>"
# 写入文件
with open(file_path, "w", encoding="utf-8") as f:
f.write(html_content)
# 如果是每日汇总且启用 index 复制
if is_daily_summary and enable_index_copy:
# 生成到根目录(供 GitHub Pages 访问)
root_index_path = Path("index.html")
with open(root_index_path, "w", encoding="utf-8") as f:
f.write(html_content)
# 同时生成到 output 目录(供 Docker Volume 挂载访问)
output_index_path = Path(output_dir) / "index.html"
Path(output_dir).mkdir(parents=True, exist_ok=True)
with open(output_index_path, "w", encoding="utf-8") as f:
f.write(html_content)
return file_path
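A minimal sketch of driving generate_html_report directly with hand-built stats; all values here are illustrative, and with render_html_func left as None the fallback template above is used.
stats = [{
    "word": "AI",
    "count": 2,
    "percentage": 50.0,
    "titles": [{
        "title": "示例标题", "source_name": "示例平台", "time_display": "08:00",
        "count": 1, "ranks": [1], "rank_threshold": 3,
        "url": "https://example.com", "mobileUrl": "", "is_new": False,
    }],
}]
path = generate_html_report(
    stats=stats,
    total_titles=4,
    mode="daily",
    is_daily_summary=True,
    date_folder="2025-12-13",
    enable_index_copy=False,  # keep the repo-root index.html untouched
)
print(path)  # output/2025-12-13/html/当日汇总.html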

View File

@@ -0,0 +1,125 @@
# coding=utf-8
"""
报告辅助函数模块
提供报告生成相关的通用辅助函数
"""
import re
from typing import List
def clean_title(title: str) -> str:
"""清理标题中的特殊字符
清理规则:
- 将换行符(\n, \r)替换为空格
- 将多个连续空白字符合并为单个空格
- 去除首尾空白
Args:
title: 原始标题字符串
Returns:
清理后的标题字符串
"""
if not isinstance(title, str):
title = str(title)
cleaned_title = title.replace("\n", " ").replace("\r", " ")
cleaned_title = re.sub(r"\s+", " ", cleaned_title)
cleaned_title = cleaned_title.strip()
return cleaned_title
def html_escape(text: str) -> str:
"""HTML特殊字符转义
转义规则(按顺序):
- & → &amp;
- < → &lt;
- > → &gt;
- " → &quot;
- ' → &#x27;
Args:
text: 原始文本
Returns:
转义后的文本
"""
if not isinstance(text, str):
text = str(text)
return (
text.replace("&", "&amp;")
.replace("<", "&lt;")
.replace(">", "&gt;")
.replace('"', "&quot;")
.replace("'", "&#x27;")
)
def format_rank_display(ranks: List[int], rank_threshold: int, format_type: str) -> str:
"""格式化排名显示
根据不同平台类型生成对应格式的排名字符串,
当最小排名小于等于阈值时使用高亮格式
Args:
ranks: 排名列表(可能包含重复值)
rank_threshold: 高亮阈值,小于等于此值的排名会高亮显示
format_type: 平台类型,支持:
- "html": HTML格式
- "feishu": 飞书格式
- "dingtalk": 钉钉格式
- "wework": 企业微信格式
- "telegram": Telegram格式
- "slack": Slack格式
- 其他: 默认markdown格式
Returns:
格式化后的排名字符串,如 "[1]" 或 "[1 - 5]"
如果排名列表为空,返回空字符串
"""
if not ranks:
return ""
unique_ranks = sorted(set(ranks))
min_rank = unique_ranks[0]
max_rank = unique_ranks[-1]
# 根据平台类型选择高亮格式
if format_type == "html":
highlight_start = "<font color='red'><strong>"
highlight_end = "</strong></font>"
elif format_type == "feishu":
highlight_start = "<font color='red'>**"
highlight_end = "**</font>"
elif format_type == "dingtalk":
highlight_start = "**"
highlight_end = "**"
elif format_type == "wework":
highlight_start = "**"
highlight_end = "**"
elif format_type == "telegram":
highlight_start = "<b>"
highlight_end = "</b>"
elif format_type == "slack":
highlight_start = "*"
highlight_end = "*"
else:
# 默认 markdown 格式
highlight_start = "**"
highlight_end = "**"
# 生成排名显示
if min_rank <= rank_threshold:
if min_rank == max_rank:
return f"{highlight_start}[{min_rank}]{highlight_end}"
else:
return f"{highlight_start}[{min_rank} - {max_rank}]{highlight_end}"
else:
if min_rank == max_rank:
return f"[{min_rank}]"
else:
return f"[{min_rank} - {max_rank}]"

1050
trendradar/report/html.py Normal file

File diff suppressed because it is too large

View File

@@ -0,0 +1,44 @@
# coding=utf-8
"""
存储模块 - 支持多种存储后端
支持的存储后端:
- local: 本地 SQLite + TXT/HTML 文件
- remote: 远程云存储(S3 兼容协议:R2/OSS/COS/S3)
- auto: 根据环境自动选择(GitHub Actions 用 remote,其他用 local)
"""
from trendradar.storage.base import (
StorageBackend,
NewsItem,
NewsData,
convert_crawl_results_to_news_data,
convert_news_data_to_results,
)
from trendradar.storage.local import LocalStorageBackend
from trendradar.storage.manager import StorageManager, get_storage_manager
# 远程后端可选导入(需要 boto3)
try:
from trendradar.storage.remote import RemoteStorageBackend
HAS_REMOTE = True
except ImportError:
RemoteStorageBackend = None
HAS_REMOTE = False
__all__ = [
# 基础类
"StorageBackend",
"NewsItem",
"NewsData",
# 转换函数
"convert_crawl_results_to_news_data",
"convert_news_data_to_results",
# 后端实现
"LocalStorageBackend",
"RemoteStorageBackend",
"HAS_REMOTE",
# 管理器
"StorageManager",
"get_storage_manager",
]
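A small sketch of how the package is intended to be consumed; the local backend and default output directory are assumptions for illustration.
from trendradar.storage import get_storage_manager, HAS_REMOTE

manager = get_storage_manager(backend_type="local", data_dir="output")
print(manager.backend_name)  # "local"
if not HAS_REMOTE:
    print("boto3 is not installed; the remote backend is unavailable")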

457
trendradar/storage/base.py Normal file
View File

@@ -0,0 +1,457 @@
# coding=utf-8
"""
存储后端抽象基类和数据模型
定义统一的存储接口所有存储后端都需要实现这些方法
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from typing import Dict, List, Optional, Any
import json
@dataclass
class NewsItem:
"""新闻条目数据模型"""
title: str # 新闻标题
source_id: str # 来源平台ID(如 toutiao, baidu)
source_name: str = "" # 来源平台名称(运行时使用,数据库不存储)
rank: int = 0 # 排名
url: str = "" # 链接 URL
mobile_url: str = "" # 移动端 URL
crawl_time: str = "" # 抓取时间HH:MM 格式)
# 统计信息(用于分析)
ranks: List[int] = field(default_factory=list) # 历史排名列表
first_time: str = "" # 首次出现时间
last_time: str = "" # 最后出现时间
count: int = 1 # 出现次数
def to_dict(self) -> Dict[str, Any]:
"""转换为字典"""
return {
"title": self.title,
"source_id": self.source_id,
"source_name": self.source_name,
"rank": self.rank,
"url": self.url,
"mobile_url": self.mobile_url,
"crawl_time": self.crawl_time,
"ranks": self.ranks,
"first_time": self.first_time,
"last_time": self.last_time,
"count": self.count,
}
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "NewsItem":
"""从字典创建"""
return cls(
title=data.get("title", ""),
source_id=data.get("source_id", ""),
source_name=data.get("source_name", ""),
rank=data.get("rank", 0),
url=data.get("url", ""),
mobile_url=data.get("mobile_url", ""),
crawl_time=data.get("crawl_time", ""),
ranks=data.get("ranks", []),
first_time=data.get("first_time", ""),
last_time=data.get("last_time", ""),
count=data.get("count", 1),
)
@dataclass
class NewsData:
"""
新闻数据集合
结构:
- date: 日期(YYYY-MM-DD)
- crawl_time: 抓取时间(HH时MM分)
- items: 按来源ID分组的新闻条目
- id_to_name: 来源ID到名称的映射
- failed_ids: 失败的来源ID列表
"""
date: str # 日期
crawl_time: str # 抓取时间
items: Dict[str, List[NewsItem]] # 按来源分组的新闻
id_to_name: Dict[str, str] = field(default_factory=dict) # ID到名称映射
failed_ids: List[str] = field(default_factory=list) # 失败的ID
def to_dict(self) -> Dict[str, Any]:
"""转换为字典"""
items_dict = {}
for source_id, news_list in self.items.items():
items_dict[source_id] = [item.to_dict() for item in news_list]
return {
"date": self.date,
"crawl_time": self.crawl_time,
"items": items_dict,
"id_to_name": self.id_to_name,
"failed_ids": self.failed_ids,
}
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "NewsData":
"""从字典创建"""
items = {}
items_data = data.get("items", {})
for source_id, news_list in items_data.items():
items[source_id] = [NewsItem.from_dict(item) for item in news_list]
return cls(
date=data.get("date", ""),
crawl_time=data.get("crawl_time", ""),
items=items,
id_to_name=data.get("id_to_name", {}),
failed_ids=data.get("failed_ids", []),
)
def get_total_count(self) -> int:
"""获取新闻总数"""
return sum(len(news_list) for news_list in self.items.values())
def merge_with(self, other: "NewsData") -> "NewsData":
"""
合并另一个 NewsData 到当前数据
合并规则:
- 相同 source_id + title 的新闻,合并排名历史
- 更新 last_time 和 count
- 保留较早的 first_time
"""
merged_items = {}
# 复制当前数据
for source_id, news_list in self.items.items():
merged_items[source_id] = {item.title: item for item in news_list}
# 合并其他数据
for source_id, news_list in other.items.items():
if source_id not in merged_items:
merged_items[source_id] = {}
for item in news_list:
if item.title in merged_items[source_id]:
# 合并已存在的新闻
existing = merged_items[source_id][item.title]
# 合并排名
existing_ranks = set(existing.ranks) if existing.ranks else set()
new_ranks = set(item.ranks) if item.ranks else set()
merged_ranks = sorted(existing_ranks | new_ranks)
existing.ranks = merged_ranks
# 更新时间
if item.first_time and (not existing.first_time or item.first_time < existing.first_time):
existing.first_time = item.first_time
if item.last_time and (not existing.last_time or item.last_time > existing.last_time):
existing.last_time = item.last_time
# 更新计数
existing.count += 1
# 保留URL(如果原来没有)
if not existing.url and item.url:
existing.url = item.url
if not existing.mobile_url and item.mobile_url:
existing.mobile_url = item.mobile_url
else:
# 添加新新闻
merged_items[source_id][item.title] = item
# 转换回列表格式
final_items = {}
for source_id, items_dict in merged_items.items():
final_items[source_id] = list(items_dict.values())
# 合并 id_to_name
merged_id_to_name = {**self.id_to_name, **other.id_to_name}
# 合并 failed_ids(去重)
merged_failed_ids = list(set(self.failed_ids + other.failed_ids))
return NewsData(
date=self.date or other.date,
crawl_time=other.crawl_time, # 使用较新的抓取时间
items=final_items,
id_to_name=merged_id_to_name,
failed_ids=merged_failed_ids,
)
class StorageBackend(ABC):
"""
存储后端抽象基类
所有存储后端都需要实现这些方法以支持:
- 保存新闻数据
- 读取当天所有数据
- 检测新增新闻
- 生成报告文件(TXT/HTML)
"""
@abstractmethod
def save_news_data(self, data: NewsData) -> bool:
"""
保存新闻数据
Args:
data: 新闻数据
Returns:
是否保存成功
"""
pass
@abstractmethod
def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
"""
获取指定日期的所有新闻数据
Args:
date: 日期字符串(YYYY-MM-DD),默认为今天
Returns:
合并后的新闻数据如果没有数据返回 None
"""
pass
@abstractmethod
def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
"""
获取最新一次抓取的数据
Args:
date: 日期字符串默认为今天
Returns:
最新抓取的新闻数据
"""
pass
@abstractmethod
def detect_new_titles(self, current_data: NewsData) -> Dict[str, Dict]:
"""
检测新增的标题
Args:
current_data: 当前抓取的数据
Returns:
新增的标题数据,格式: {source_id: {title: title_data}}
"""
pass
@abstractmethod
def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
"""
保存 TXT 快照(可选功能,本地环境可用)
Args:
data: 新闻数据
Returns:
保存的文件路径,如果不支持返回 None
"""
pass
@abstractmethod
def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
"""
保存 HTML 报告
Args:
html_content: HTML 内容
filename: 文件名
is_summary: 是否为汇总报告
Returns:
保存的文件路径
"""
pass
@abstractmethod
def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
"""
检查是否是当天第一次抓取
Args:
date: 日期字符串默认为今天
Returns:
是否是第一次抓取
"""
pass
@abstractmethod
def cleanup(self) -> None:
"""
清理资源(如临时文件、数据库连接等)
"""
pass
@abstractmethod
def cleanup_old_data(self, retention_days: int) -> int:
"""
清理过期数据
Args:
retention_days: 保留天数,0 表示不清理
Returns:
删除的日期目录数量
"""
pass
@property
@abstractmethod
def backend_name(self) -> str:
"""
存储后端名称
"""
pass
@property
@abstractmethod
def supports_txt(self) -> bool:
"""
是否支持生成 TXT 快照
"""
pass
# === 推送记录相关方法 ===
@abstractmethod
def has_pushed_today(self, date: Optional[str] = None) -> bool:
"""
检查指定日期是否已推送过
Args:
date: 日期字符串(YYYY-MM-DD),默认为今天
Returns:
是否已推送
"""
pass
@abstractmethod
def record_push(self, report_type: str, date: Optional[str] = None) -> bool:
"""
记录推送
Args:
report_type: 报告类型
date: 日期字符串(YYYY-MM-DD),默认为今天
Returns:
是否记录成功
"""
pass
def convert_crawl_results_to_news_data(
results: Dict[str, Dict],
id_to_name: Dict[str, str],
failed_ids: List[str],
crawl_time: str,
crawl_date: str,
) -> NewsData:
"""
将爬虫结果转换为 NewsData 格式
Args:
results: 爬虫返回的结果 {source_id: {title: {ranks: [], url: "", mobileUrl: ""}}}
id_to_name: 来源ID到名称的映射
failed_ids: 失败的来源ID
crawl_time: 抓取时间(HH:MM)
crawl_date: 抓取日期(YYYY-MM-DD)
Returns:
NewsData 对象
"""
items = {}
for source_id, titles_data in results.items():
source_name = id_to_name.get(source_id, source_id)
news_list = []
for title, data in titles_data.items():
if isinstance(data, dict):
ranks = data.get("ranks", [])
url = data.get("url", "")
mobile_url = data.get("mobileUrl", "")
else:
# 兼容旧格式
ranks = data if isinstance(data, list) else []
url = ""
mobile_url = ""
rank = ranks[0] if ranks else 99
news_item = NewsItem(
title=title,
source_id=source_id,
source_name=source_name,
rank=rank,
url=url,
mobile_url=mobile_url,
crawl_time=crawl_time,
ranks=ranks,
first_time=crawl_time,
last_time=crawl_time,
count=1,
)
news_list.append(news_item)
items[source_id] = news_list
return NewsData(
date=crawl_date,
crawl_time=crawl_time,
items=items,
id_to_name=id_to_name,
failed_ids=failed_ids,
)
def convert_news_data_to_results(data: NewsData) -> tuple:
"""
NewsData 转换回原有的 results 格式用于兼容现有代码
Args:
data: NewsData 对象
Returns:
(results, id_to_name, title_info) 元组
"""
results = {}
title_info = {}
for source_id, news_list in data.items.items():
results[source_id] = {}
title_info[source_id] = {}
for item in news_list:
results[source_id][item.title] = {
"ranks": item.ranks,
"url": item.url,
"mobileUrl": item.mobile_url,
}
title_info[source_id][item.title] = {
"first_time": item.first_time,
"last_time": item.last_time,
"count": item.count,
"ranks": item.ranks,
"url": item.url,
"mobileUrl": item.mobile_url,
}
return results, data.id_to_name, title_info
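A minimal round trip through the converters above, using made-up sample data:
raw_results = {
    "toutiao": {"示例标题": {"ranks": [3, 1], "url": "https://example.com/a", "mobileUrl": ""}},
}
news_data = convert_crawl_results_to_news_data(
    results=raw_results,
    id_to_name={"toutiao": "今日头条"},
    failed_ids=["baidu"],
    crawl_time="08:30",
    crawl_date="2025-12-13",
)
print(news_data.get_total_count())                      # 1
results, id_to_name, title_info = convert_news_data_to_results(news_data)
print(title_info["toutiao"]["示例标题"]["count"])         # 1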

869
trendradar/storage/local.py Normal file
View File

@@ -0,0 +1,869 @@
# coding=utf-8
"""
本地存储后端 - SQLite + TXT/HTML
使用 SQLite 作为主存储,支持可选的 TXT 快照和 HTML 报告
"""
import sqlite3
import os
import shutil
import pytz
import re
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Any
from trendradar.storage.base import StorageBackend, NewsItem, NewsData
from trendradar.utils.time import (
get_configured_time,
format_date_folder,
format_time_filename,
)
class LocalStorageBackend(StorageBackend):
"""
本地存储后端
使用 SQLite 数据库存储新闻数据,支持:
- 按日期组织的 SQLite 数据库文件
- 可选的 TXT 快照(用于调试)
- HTML 报告生成
"""
def __init__(
self,
data_dir: str = "output",
enable_txt: bool = True,
enable_html: bool = True,
timezone: str = "Asia/Shanghai",
):
"""
初始化本地存储后端
Args:
data_dir: 数据目录路径
enable_txt: 是否启用 TXT 快照
enable_html: 是否启用 HTML 报告
timezone: 时区配置,默认 Asia/Shanghai
"""
self.data_dir = Path(data_dir)
self.enable_txt = enable_txt
self.enable_html = enable_html
self.timezone = timezone
self._db_connections: Dict[str, sqlite3.Connection] = {}
@property
def backend_name(self) -> str:
return "local"
@property
def supports_txt(self) -> bool:
return self.enable_txt
def _get_configured_time(self) -> datetime:
"""获取配置时区的当前时间"""
return get_configured_time(self.timezone)
def _format_date_folder(self, date: Optional[str] = None) -> str:
"""格式化日期文件夹名 (ISO 格式: YYYY-MM-DD)"""
return format_date_folder(date, self.timezone)
def _format_time_filename(self) -> str:
"""格式化时间文件名 (格式: HH-MM)"""
return format_time_filename(self.timezone)
def _get_db_path(self, date: Optional[str] = None) -> Path:
"""获取 SQLite 数据库路径"""
date_folder = self._format_date_folder(date)
db_dir = self.data_dir / date_folder
db_dir.mkdir(parents=True, exist_ok=True)
return db_dir / "news.db"
def _get_connection(self, date: Optional[str] = None) -> sqlite3.Connection:
"""获取数据库连接(带缓存)"""
db_path = str(self._get_db_path(date))
if db_path not in self._db_connections:
conn = sqlite3.connect(db_path)
conn.row_factory = sqlite3.Row
self._init_tables(conn)
self._db_connections[db_path] = conn
return self._db_connections[db_path]
def _get_schema_path(self) -> Path:
"""获取 schema.sql 文件路径"""
return Path(__file__).parent / "schema.sql"
def _init_tables(self, conn: sqlite3.Connection) -> None:
"""从 schema.sql 初始化数据库表结构"""
schema_path = self._get_schema_path()
if schema_path.exists():
with open(schema_path, "r", encoding="utf-8") as f:
schema_sql = f.read()
conn.executescript(schema_sql)
else:
raise FileNotFoundError(f"Schema file not found: {schema_path}")
conn.commit()
def save_news_data(self, data: NewsData) -> bool:
"""
保存新闻数据到 SQLite(以 URL 为唯一标识,支持标题更新检测)
Args:
data: 新闻数据
Returns:
是否保存成功
"""
try:
conn = self._get_connection(data.date)
cursor = conn.cursor()
# 获取配置时区的当前时间
now_str = self._get_configured_time().strftime("%Y-%m-%d %H:%M:%S")
# 首先同步平台信息到 platforms 表
for source_id, source_name in data.id_to_name.items():
cursor.execute("""
INSERT INTO platforms (id, name, updated_at)
VALUES (?, ?, ?)
ON CONFLICT(id) DO UPDATE SET
name = excluded.name,
updated_at = excluded.updated_at
""", (source_id, source_name, now_str))
# 统计计数器
new_count = 0
updated_count = 0
title_changed_count = 0
success_sources = []
for source_id, news_list in data.items.items():
success_sources.append(source_id)
for item in news_list:
try:
# 检查是否已存在(通过 URL + platform_id)
if item.url:
cursor.execute("""
SELECT id, title FROM news_items
WHERE url = ? AND platform_id = ?
""", (item.url, source_id))
existing = cursor.fetchone()
if existing:
# 已存在,更新记录
existing_id, existing_title = existing
# 检查标题是否变化
if existing_title != item.title:
# 记录标题变更
cursor.execute("""
INSERT INTO title_changes
(news_item_id, old_title, new_title, changed_at)
VALUES (?, ?, ?, ?)
""", (existing_id, existing_title, item.title, now_str))
title_changed_count += 1
# 记录排名历史
cursor.execute("""
INSERT INTO rank_history
(news_item_id, rank, crawl_time, created_at)
VALUES (?, ?, ?, ?)
""", (existing_id, item.rank, data.crawl_time, now_str))
# 更新现有记录
cursor.execute("""
UPDATE news_items SET
title = ?,
rank = ?,
mobile_url = ?,
last_crawl_time = ?,
crawl_count = crawl_count + 1,
updated_at = ?
WHERE id = ?
""", (item.title, item.rank, item.mobile_url,
data.crawl_time, now_str, existing_id))
updated_count += 1
else:
# 不存在,插入新记录
cursor.execute("""
INSERT INTO news_items
(title, platform_id, rank, url, mobile_url,
first_crawl_time, last_crawl_time, crawl_count,
created_at, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?)
""", (item.title, source_id, item.rank, item.url,
item.mobile_url, data.crawl_time, data.crawl_time,
now_str, now_str))
new_id = cursor.lastrowid
# 记录初始排名
cursor.execute("""
INSERT INTO rank_history
(news_item_id, rank, crawl_time, created_at)
VALUES (?, ?, ?, ?)
""", (new_id, item.rank, data.crawl_time, now_str))
new_count += 1
else:
# URL 为空的情况,直接插入(不做去重)
cursor.execute("""
INSERT INTO news_items
(title, platform_id, rank, url, mobile_url,
first_crawl_time, last_crawl_time, crawl_count,
created_at, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?)
""", (item.title, source_id, item.rank, item.url,
item.mobile_url, data.crawl_time, data.crawl_time,
now_str, now_str))
new_id = cursor.lastrowid
# 记录初始排名
cursor.execute("""
INSERT INTO rank_history
(news_item_id, rank, crawl_time, created_at)
VALUES (?, ?, ?, ?)
""", (new_id, item.rank, data.crawl_time, now_str))
new_count += 1
except sqlite3.Error as e:
print(f"保存新闻条目失败 [{item.title[:30]}...]: {e}")
total_items = new_count + updated_count
# 记录抓取信息
cursor.execute("""
INSERT OR REPLACE INTO crawl_records
(crawl_time, total_items, created_at)
VALUES (?, ?, ?)
""", (data.crawl_time, total_items, now_str))
# 获取刚插入的 crawl_record 的 ID
cursor.execute("""
SELECT id FROM crawl_records WHERE crawl_time = ?
""", (data.crawl_time,))
record_row = cursor.fetchone()
if record_row:
crawl_record_id = record_row[0]
# 记录成功的来源
for source_id in success_sources:
cursor.execute("""
INSERT OR REPLACE INTO crawl_source_status
(crawl_record_id, platform_id, status)
VALUES (?, ?, 'success')
""", (crawl_record_id, source_id))
# 记录失败的来源
for failed_id in data.failed_ids:
# 确保失败的平台也在 platforms 表中
cursor.execute("""
INSERT OR IGNORE INTO platforms (id, name, updated_at)
VALUES (?, ?, ?)
""", (failed_id, failed_id, now_str))
cursor.execute("""
INSERT OR REPLACE INTO crawl_source_status
(crawl_record_id, platform_id, status)
VALUES (?, ?, 'failed')
""", (crawl_record_id, failed_id))
conn.commit()
# 输出详细的存储统计日志
log_parts = [f"[本地存储] 处理完成:新增 {new_count}"]
if updated_count > 0:
log_parts.append(f"更新 {updated_count}")
if title_changed_count > 0:
log_parts.append(f"标题变更 {title_changed_count}")
print("".join(log_parts))
return True
except Exception as e:
print(f"[本地存储] 保存失败: {e}")
return False
def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
"""
获取指定日期的所有新闻数据(合并后)
Args:
date: 日期字符串默认为今天
Returns:
合并后的新闻数据
"""
try:
db_path = self._get_db_path(date)
if not db_path.exists():
return None
conn = self._get_connection(date)
cursor = conn.cursor()
# 获取所有新闻数据(包含 id 用于查询排名历史)
cursor.execute("""
SELECT n.id, n.title, n.platform_id, p.name as platform_name,
n.rank, n.url, n.mobile_url,
n.first_crawl_time, n.last_crawl_time, n.crawl_count
FROM news_items n
LEFT JOIN platforms p ON n.platform_id = p.id
ORDER BY n.platform_id, n.last_crawl_time
""")
rows = cursor.fetchall()
if not rows:
return None
# 收集所有 news_item_id
news_ids = [row[0] for row in rows]
# 批量查询排名历史
rank_history_map: Dict[int, List[int]] = {}
if news_ids:
placeholders = ",".join("?" * len(news_ids))
cursor.execute(f"""
SELECT news_item_id, rank FROM rank_history
WHERE news_item_id IN ({placeholders})
ORDER BY news_item_id, crawl_time
""", news_ids)
for rh_row in cursor.fetchall():
news_id, rank = rh_row[0], rh_row[1]
if news_id not in rank_history_map:
rank_history_map[news_id] = []
if rank not in rank_history_map[news_id]:
rank_history_map[news_id].append(rank)
# 按 platform_id 分组
items: Dict[str, List[NewsItem]] = {}
id_to_name: Dict[str, str] = {}
crawl_date = self._format_date_folder(date)
for row in rows:
news_id = row[0]
platform_id = row[2]
title = row[1]
platform_name = row[3] or platform_id
id_to_name[platform_id] = platform_name
if platform_id not in items:
items[platform_id] = []
# 获取排名历史,如果没有则使用当前排名
ranks = rank_history_map.get(news_id, [row[4]])
items[platform_id].append(NewsItem(
title=title,
source_id=platform_id,
source_name=platform_name,
rank=row[4],
url=row[5] or "",
mobile_url=row[6] or "",
crawl_time=row[8], # last_crawl_time
ranks=ranks,
first_time=row[7], # first_crawl_time
last_time=row[8], # last_crawl_time
count=row[9], # crawl_count
))
final_items = items
# 获取失败的来源
cursor.execute("""
SELECT DISTINCT css.platform_id
FROM crawl_source_status css
JOIN crawl_records cr ON css.crawl_record_id = cr.id
WHERE css.status = 'failed'
""")
failed_ids = [row[0] for row in cursor.fetchall()]
# 获取最新的抓取时间
cursor.execute("""
SELECT crawl_time FROM crawl_records
ORDER BY crawl_time DESC
LIMIT 1
""")
time_row = cursor.fetchone()
crawl_time = time_row[0] if time_row else self._format_time_filename()
return NewsData(
date=crawl_date,
crawl_time=crawl_time,
items=final_items,
id_to_name=id_to_name,
failed_ids=failed_ids,
)
except Exception as e:
print(f"[本地存储] 读取数据失败: {e}")
return None
def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
"""
获取最新一次抓取的数据
Args:
date: 日期字符串默认为今天
Returns:
最新抓取的新闻数据
"""
try:
db_path = self._get_db_path(date)
if not db_path.exists():
return None
conn = self._get_connection(date)
cursor = conn.cursor()
# 获取最新的抓取时间
cursor.execute("""
SELECT crawl_time FROM crawl_records
ORDER BY crawl_time DESC
LIMIT 1
""")
time_row = cursor.fetchone()
if not time_row:
return None
latest_time = time_row[0]
# 获取该时间的新闻数据(包含 id 用于查询排名历史)
cursor.execute("""
SELECT n.id, n.title, n.platform_id, p.name as platform_name,
n.rank, n.url, n.mobile_url,
n.first_crawl_time, n.last_crawl_time, n.crawl_count
FROM news_items n
LEFT JOIN platforms p ON n.platform_id = p.id
WHERE n.last_crawl_time = ?
""", (latest_time,))
rows = cursor.fetchall()
if not rows:
return None
# 收集所有 news_item_id
news_ids = [row[0] for row in rows]
# 批量查询排名历史
rank_history_map: Dict[int, List[int]] = {}
if news_ids:
placeholders = ",".join("?" * len(news_ids))
cursor.execute(f"""
SELECT news_item_id, rank FROM rank_history
WHERE news_item_id IN ({placeholders})
ORDER BY news_item_id, crawl_time
""", news_ids)
for rh_row in cursor.fetchall():
news_id, rank = rh_row[0], rh_row[1]
if news_id not in rank_history_map:
rank_history_map[news_id] = []
if rank not in rank_history_map[news_id]:
rank_history_map[news_id].append(rank)
items: Dict[str, List[NewsItem]] = {}
id_to_name: Dict[str, str] = {}
crawl_date = self._format_date_folder(date)
for row in rows:
news_id = row[0]
platform_id = row[2]
platform_name = row[3] or platform_id
id_to_name[platform_id] = platform_name
if platform_id not in items:
items[platform_id] = []
# 获取排名历史,如果没有则使用当前排名
ranks = rank_history_map.get(news_id, [row[4]])
items[platform_id].append(NewsItem(
title=row[1],
source_id=platform_id,
source_name=platform_name,
rank=row[4],
url=row[5] or "",
mobile_url=row[6] or "",
crawl_time=row[8], # last_crawl_time
ranks=ranks,
first_time=row[7], # first_crawl_time
last_time=row[8], # last_crawl_time
count=row[9], # crawl_count
))
# 获取失败的来源(针对最新一次抓取)
cursor.execute("""
SELECT css.platform_id
FROM crawl_source_status css
JOIN crawl_records cr ON css.crawl_record_id = cr.id
WHERE cr.crawl_time = ? AND css.status = 'failed'
""", (latest_time,))
failed_ids = [row[0] for row in cursor.fetchall()]
return NewsData(
date=crawl_date,
crawl_time=latest_time,
items=items,
id_to_name=id_to_name,
failed_ids=failed_ids,
)
except Exception as e:
print(f"[本地存储] 获取最新数据失败: {e}")
return None
def detect_new_titles(self, current_data: NewsData) -> Dict[str, Dict]:
"""
检测新增的标题
Args:
current_data: 当前抓取的数据
Returns:
新增的标题数据 {source_id: {title: NewsItem}}
"""
try:
# 获取历史数据
historical_data = self.get_today_all_data(current_data.date)
if not historical_data:
# 没有历史数据,所有都是新的
new_titles = {}
for source_id, news_list in current_data.items.items():
new_titles[source_id] = {item.title: item for item in news_list}
return new_titles
# 收集历史标题
historical_titles: Dict[str, set] = {}
for source_id, news_list in historical_data.items.items():
historical_titles[source_id] = {item.title for item in news_list}
# 检测新增
new_titles = {}
for source_id, news_list in current_data.items.items():
hist_set = historical_titles.get(source_id, set())
for item in news_list:
if item.title not in hist_set:
if source_id not in new_titles:
new_titles[source_id] = {}
new_titles[source_id][item.title] = item
return new_titles
except Exception as e:
print(f"[本地存储] 检测新标题失败: {e}")
return {}
def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
"""
保存 TXT 快照
Args:
data: 新闻数据
Returns:
保存的文件路径
"""
if not self.enable_txt:
return None
try:
date_folder = self._format_date_folder(data.date)
txt_dir = self.data_dir / date_folder / "txt"
txt_dir.mkdir(parents=True, exist_ok=True)
file_path = txt_dir / f"{data.crawl_time}.txt"
with open(file_path, "w", encoding="utf-8") as f:
for source_id, news_list in data.items.items():
source_name = data.id_to_name.get(source_id, source_id)
# 写入来源标题
if source_name and source_name != source_id:
f.write(f"{source_id} | {source_name}\n")
else:
f.write(f"{source_id}\n")
# 按排名排序
sorted_news = sorted(news_list, key=lambda x: x.rank)
for item in sorted_news:
line = f"{item.rank}. {item.title}"
if item.url:
line += f" [URL:{item.url}]"
if item.mobile_url:
line += f" [MOBILE:{item.mobile_url}]"
f.write(line + "\n")
f.write("\n")
# 写入失败的来源
if data.failed_ids:
f.write("==== 以下ID请求失败 ====\n")
for failed_id in data.failed_ids:
f.write(f"{failed_id}\n")
print(f"[本地存储] TXT 快照已保存: {file_path}")
return str(file_path)
except Exception as e:
print(f"[本地存储] 保存 TXT 快照失败: {e}")
return None
def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
"""
保存 HTML 报告
Args:
html_content: HTML 内容
filename: 文件名
is_summary: 是否为汇总报告
Returns:
保存的文件路径
"""
if not self.enable_html:
return None
try:
date_folder = self._format_date_folder()
html_dir = self.data_dir / date_folder / "html"
html_dir.mkdir(parents=True, exist_ok=True)
file_path = html_dir / filename
with open(file_path, "w", encoding="utf-8") as f:
f.write(html_content)
print(f"[本地存储] HTML 报告已保存: {file_path}")
return str(file_path)
except Exception as e:
print(f"[本地存储] 保存 HTML 报告失败: {e}")
return None
def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
"""
检查是否是当天第一次抓取
Args:
date: 日期字符串默认为今天
Returns:
是否是第一次抓取
"""
try:
db_path = self._get_db_path(date)
if not db_path.exists():
return True
conn = self._get_connection(date)
cursor = conn.cursor()
cursor.execute("""
SELECT COUNT(*) as count FROM crawl_records
""")
row = cursor.fetchone()
count = row[0] if row else 0
# 如果只有一条或没有记录,视为第一次抓取
return count <= 1
except Exception as e:
print(f"[本地存储] 检查首次抓取失败: {e}")
return True
def get_crawl_times(self, date: Optional[str] = None) -> List[str]:
"""
获取指定日期的所有抓取时间列表
Args:
date: 日期字符串默认为今天
Returns:
抓取时间列表按时间排序
"""
try:
db_path = self._get_db_path(date)
if not db_path.exists():
return []
conn = self._get_connection(date)
cursor = conn.cursor()
cursor.execute("""
SELECT crawl_time FROM crawl_records
ORDER BY crawl_time
""")
rows = cursor.fetchall()
return [row[0] for row in rows]
except Exception as e:
print(f"[本地存储] 获取抓取时间列表失败: {e}")
return []
def cleanup(self) -> None:
"""清理资源(关闭数据库连接)"""
for db_path, conn in self._db_connections.items():
try:
conn.close()
print(f"[本地存储] 关闭数据库连接: {db_path}")
except Exception as e:
print(f"[本地存储] 关闭连接失败 {db_path}: {e}")
self._db_connections.clear()
def cleanup_old_data(self, retention_days: int) -> int:
"""
清理过期数据
Args:
retention_days: 保留天数,0 表示不清理
Returns:
删除的日期目录数量
"""
if retention_days <= 0:
return 0
deleted_count = 0
cutoff_date = self._get_configured_time() - timedelta(days=retention_days)
try:
if not self.data_dir.exists():
return 0
for date_folder in self.data_dir.iterdir():
if not date_folder.is_dir() or date_folder.name.startswith('.'):
continue
# 解析日期文件夹名(支持两种格式)
folder_date = None
try:
# ISO 格式: YYYY-MM-DD
date_match = re.match(r'(\d{4})-(\d{2})-(\d{2})', date_folder.name)
if date_match:
folder_date = datetime(
int(date_match.group(1)),
int(date_match.group(2)),
int(date_match.group(3)),
tzinfo=pytz.timezone("Asia/Shanghai")
)
else:
# 旧中文格式: YYYY年MM月DD日
date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_folder.name)
if date_match:
folder_date = datetime(
int(date_match.group(1)),
int(date_match.group(2)),
int(date_match.group(3)),
tzinfo=pytz.timezone("Asia/Shanghai")
)
except Exception:
continue
if folder_date and folder_date < cutoff_date:
# 先关闭该日期的数据库连接
db_path = str(self._get_db_path(date_folder.name))
if db_path in self._db_connections:
try:
self._db_connections[db_path].close()
del self._db_connections[db_path]
except Exception:
pass
# 删除整个日期目录
try:
shutil.rmtree(date_folder)
deleted_count += 1
print(f"[本地存储] 清理过期数据: {date_folder.name}")
except Exception as e:
print(f"[本地存储] 删除目录失败 {date_folder.name}: {e}")
if deleted_count > 0:
print(f"[本地存储] 共清理 {deleted_count} 个过期日期目录")
return deleted_count
except Exception as e:
print(f"[本地存储] 清理过期数据失败: {e}")
return deleted_count
def has_pushed_today(self, date: Optional[str] = None) -> bool:
"""
检查指定日期是否已推送过
Args:
date: 日期字符串(YYYY-MM-DD),默认为今天
Returns:
是否已推送
"""
try:
conn = self._get_connection(date)
cursor = conn.cursor()
target_date = self._format_date_folder(date)
cursor.execute("""
SELECT pushed FROM push_records WHERE date = ?
""", (target_date,))
row = cursor.fetchone()
if row:
return bool(row[0])
return False
except Exception as e:
print(f"[本地存储] 检查推送记录失败: {e}")
return False
def record_push(self, report_type: str, date: Optional[str] = None) -> bool:
"""
记录推送
Args:
report_type: 报告类型
date: 日期字符串(YYYY-MM-DD),默认为今天
Returns:
是否记录成功
"""
try:
conn = self._get_connection(date)
cursor = conn.cursor()
target_date = self._format_date_folder(date)
now_str = self._get_configured_time().strftime("%Y-%m-%d %H:%M:%S")
cursor.execute("""
INSERT INTO push_records (date, pushed, push_time, report_type, created_at)
VALUES (?, 1, ?, ?, ?)
ON CONFLICT(date) DO UPDATE SET
pushed = 1,
push_time = excluded.push_time,
report_type = excluded.report_type
""", (target_date, now_str, report_type, now_str))
conn.commit()
print(f"[本地存储] 推送记录已保存: {report_type} at {now_str}")
return True
except Exception as e:
print(f"[本地存储] 记录推送失败: {e}")
return False
def __del__(self):
"""析构函数,确保关闭连接"""
self.cleanup()
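A minimal usage sketch for the backend above; paths and data are illustrative, and the news_data would normally come from convert_crawl_results_to_news_data in base.py.
backend = LocalStorageBackend(data_dir="output", enable_txt=True)
print(backend.is_first_crawl_today())   # True while today's news.db has at most one crawl record
# backend.save_news_data(news_data)     # persist one crawl into output/<date>/news.db
# backend.save_txt_snapshot(news_data)  # optional TXT snapshot under output/<date>/txt/
# merged = backend.get_today_all_data() # merged view of every crawl stored today
backend.cleanup()                       # close cached SQLite connections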

View File

@@ -0,0 +1,316 @@
# coding=utf-8
"""
存储管理器 - 统一管理存储后端
根据环境和配置自动选择合适的存储后端
"""
import os
from typing import Optional
from trendradar.storage.base import StorageBackend, NewsData
# 存储管理器单例
_storage_manager: Optional["StorageManager"] = None
class StorageManager:
"""
存储管理器
功能
- 自动检测运行环境(GitHub Actions / Docker / 本地)
- 根据配置选择存储后端(local / remote / auto)
- 提供统一的存储接口
- 支持从远程拉取数据到本地
"""
def __init__(
self,
backend_type: str = "auto",
data_dir: str = "output",
enable_txt: bool = True,
enable_html: bool = True,
remote_config: Optional[dict] = None,
local_retention_days: int = 0,
remote_retention_days: int = 0,
pull_enabled: bool = False,
pull_days: int = 0,
timezone: str = "Asia/Shanghai",
):
"""
初始化存储管理器
Args:
backend_type: 存储后端类型 (local / remote / auto)
data_dir: 本地数据目录
enable_txt: 是否启用 TXT 快照
enable_html: 是否启用 HTML 报告
remote_config: 远程存储配置(endpoint_url, bucket_name, access_key_id)
local_retention_days: 本地数据保留天数(0 = 无限制)
remote_retention_days: 远程数据保留天数(0 = 无限制)
pull_enabled: 是否启用启动时自动拉取
pull_days: 拉取最近 N 天的数据
timezone: 时区配置,默认 Asia/Shanghai
"""
self.backend_type = backend_type
self.data_dir = data_dir
self.enable_txt = enable_txt
self.enable_html = enable_html
self.remote_config = remote_config or {}
self.local_retention_days = local_retention_days
self.remote_retention_days = remote_retention_days
self.pull_enabled = pull_enabled
self.pull_days = pull_days
self.timezone = timezone
self._backend: Optional[StorageBackend] = None
self._remote_backend: Optional[StorageBackend] = None
@staticmethod
def is_github_actions() -> bool:
"""检测是否在 GitHub Actions 环境中运行"""
return os.environ.get("GITHUB_ACTIONS") == "true"
@staticmethod
def is_docker() -> bool:
"""检测是否在 Docker 容器中运行"""
# 方法1: 检查 /.dockerenv 文件
if os.path.exists("/.dockerenv"):
return True
# 方法2: 检查 cgroupLinux
try:
with open("/proc/1/cgroup", "r") as f:
return "docker" in f.read()
except (FileNotFoundError, PermissionError):
pass
# 方法3: 检查环境变量
return os.environ.get("DOCKER_CONTAINER") == "true"
def _resolve_backend_type(self) -> str:
"""解析实际使用的后端类型"""
if self.backend_type == "auto":
if self.is_github_actions():
# GitHub Actions 环境,检查是否配置了远程存储
if self._has_remote_config():
return "remote"
else:
print("[存储管理器] GitHub Actions 环境但未配置远程存储,使用本地存储")
return "local"
else:
return "local"
return self.backend_type
def _has_remote_config(self) -> bool:
"""检查是否有有效的远程存储配置"""
# 检查配置或环境变量
bucket_name = self.remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME")
access_key = self.remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID")
secret_key = self.remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY")
endpoint = self.remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL")
# 调试日志
has_config = bool(bucket_name and access_key and secret_key and endpoint)
if not has_config:
print(f"[存储管理器] 远程存储配置检查失败:")
print(f" - bucket_name: {'已配置' if bucket_name else '未配置'}")
print(f" - access_key_id: {'已配置' if access_key else '未配置'}")
print(f" - secret_access_key: {'已配置' if secret_key else '未配置'}")
print(f" - endpoint_url: {'已配置' if endpoint else '未配置'}")
return has_config
def _create_remote_backend(self) -> Optional[StorageBackend]:
"""创建远程存储后端"""
try:
from trendradar.storage.remote import RemoteStorageBackend
return RemoteStorageBackend(
bucket_name=self.remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME", ""),
access_key_id=self.remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID", ""),
secret_access_key=self.remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY", ""),
endpoint_url=self.remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL", ""),
region=self.remote_config.get("region") or os.environ.get("S3_REGION", ""),
enable_txt=self.enable_txt,
enable_html=self.enable_html,
timezone=self.timezone,
)
except ImportError as e:
print(f"[存储管理器] 远程后端导入失败: {e}")
print("[存储管理器] 请确保已安装 boto3: pip install boto3")
return None
except Exception as e:
print(f"[存储管理器] 远程后端初始化失败: {e}")
return None
def get_backend(self) -> StorageBackend:
"""获取存储后端实例"""
if self._backend is None:
resolved_type = self._resolve_backend_type()
if resolved_type == "remote":
self._backend = self._create_remote_backend()
if self._backend:
print(f"[存储管理器] 使用远程存储后端")
else:
print("[存储管理器] 回退到本地存储")
resolved_type = "local"
if resolved_type == "local" or self._backend is None:
from trendradar.storage.local import LocalStorageBackend
self._backend = LocalStorageBackend(
data_dir=self.data_dir,
enable_txt=self.enable_txt,
enable_html=self.enable_html,
timezone=self.timezone,
)
print(f"[存储管理器] 使用本地存储后端 (数据目录: {self.data_dir})")
return self._backend
def pull_from_remote(self) -> int:
"""
从远程拉取数据到本地
Returns:
成功拉取的文件数量
"""
if not self.pull_enabled or self.pull_days <= 0:
return 0
if not self._has_remote_config():
print("[存储管理器] 未配置远程存储,无法拉取")
return 0
# 创建远程后端(如果还没有)
if self._remote_backend is None:
self._remote_backend = self._create_remote_backend()
if self._remote_backend is None:
print("[存储管理器] 无法创建远程后端,拉取失败")
return 0
# 调用拉取方法
return self._remote_backend.pull_recent_days(self.pull_days, self.data_dir)
def save_news_data(self, data: NewsData) -> bool:
"""保存新闻数据"""
return self.get_backend().save_news_data(data)
def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
"""获取当天所有数据"""
return self.get_backend().get_today_all_data(date)
def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
"""获取最新抓取数据"""
return self.get_backend().get_latest_crawl_data(date)
def detect_new_titles(self, current_data: NewsData) -> dict:
"""检测新增标题"""
return self.get_backend().detect_new_titles(current_data)
def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
"""保存 TXT 快照"""
return self.get_backend().save_txt_snapshot(data)
def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
"""保存 HTML 报告"""
return self.get_backend().save_html_report(html_content, filename, is_summary)
def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
"""检查是否是当天第一次抓取"""
return self.get_backend().is_first_crawl_today(date)
def cleanup(self) -> None:
"""清理资源"""
if self._backend:
self._backend.cleanup()
if self._remote_backend:
self._remote_backend.cleanup()
def cleanup_old_data(self) -> int:
"""
清理过期数据
Returns:
删除的日期目录数量
"""
total_deleted = 0
# 清理本地数据
if self.local_retention_days > 0:
total_deleted += self.get_backend().cleanup_old_data(self.local_retention_days)
# 清理远程数据(如果配置了)
if self.remote_retention_days > 0 and self._has_remote_config():
if self._remote_backend is None:
self._remote_backend = self._create_remote_backend()
if self._remote_backend:
total_deleted += self._remote_backend.cleanup_old_data(self.remote_retention_days)
return total_deleted
@property
def backend_name(self) -> str:
"""获取当前后端名称"""
return self.get_backend().backend_name
@property
def supports_txt(self) -> bool:
"""是否支持 TXT 快照"""
return self.get_backend().supports_txt
def get_storage_manager(
backend_type: str = "auto",
data_dir: str = "output",
enable_txt: bool = True,
enable_html: bool = True,
remote_config: Optional[dict] = None,
local_retention_days: int = 0,
remote_retention_days: int = 0,
pull_enabled: bool = False,
pull_days: int = 0,
timezone: str = "Asia/Shanghai",
force_new: bool = False,
) -> StorageManager:
"""
获取存储管理器单例
Args:
backend_type: 存储后端类型
data_dir: 本地数据目录
enable_txt: 是否启用 TXT 快照
enable_html: 是否启用 HTML 报告
remote_config: 远程存储配置
local_retention_days: 本地数据保留天数(0 = 无限制)
remote_retention_days: 远程数据保留天数(0 = 无限制)
pull_enabled: 是否启用启动时自动拉取
pull_days: 拉取最近 N 天的数据
timezone: 时区配置,默认 Asia/Shanghai
force_new: 是否强制创建新实例
Returns:
StorageManager 实例
"""
global _storage_manager
if _storage_manager is None or force_new:
_storage_manager = StorageManager(
backend_type=backend_type,
data_dir=data_dir,
enable_txt=enable_txt,
enable_html=enable_html,
remote_config=remote_config,
local_retention_days=local_retention_days,
remote_retention_days=remote_retention_days,
pull_enabled=pull_enabled,
pull_days=pull_days,
timezone=timezone,
)
return _storage_manager
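A sketch of wiring the manager at startup; the retention value is a placeholder and backend selection follows the auto rules above.
manager = get_storage_manager(
    backend_type="auto",        # local / remote / auto
    data_dir="output",
    local_retention_days=30,    # prune local date folders older than 30 days
)
manager.cleanup_old_data()
print(manager.backend_name)     # "local" outside GitHub Actions without S3 config
manager.cleanup()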

1071
trendradar/storage/remote.py Normal file

File diff suppressed because it is too large

View File

@@ -0,0 +1,117 @@
-- TrendRadar 数据库表结构
-- ============================================
-- 平台信息表
-- 核心:id 不变,name 可变
-- ============================================
CREATE TABLE IF NOT EXISTS platforms (
id TEXT PRIMARY KEY,
name TEXT NOT NULL,
is_active INTEGER DEFAULT 1,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- ============================================
-- 新闻条目表
-- 以 URL + platform_id 为唯一标识,支持去重存储
-- ============================================
CREATE TABLE IF NOT EXISTS news_items (
id INTEGER PRIMARY KEY AUTOINCREMENT,
title TEXT NOT NULL,
platform_id TEXT NOT NULL,
rank INTEGER NOT NULL,
url TEXT DEFAULT '',
mobile_url TEXT DEFAULT '',
first_crawl_time TEXT NOT NULL, -- 首次抓取时间
last_crawl_time TEXT NOT NULL, -- 最后抓取时间
crawl_count INTEGER DEFAULT 1, -- 抓取次数
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (platform_id) REFERENCES platforms(id)
);
-- ============================================
-- 标题变更历史表
-- 记录同一 URL 下标题的变化
-- ============================================
CREATE TABLE IF NOT EXISTS title_changes (
id INTEGER PRIMARY KEY AUTOINCREMENT,
news_item_id INTEGER NOT NULL,
old_title TEXT NOT NULL,
new_title TEXT NOT NULL,
changed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (news_item_id) REFERENCES news_items(id)
);
-- ============================================
-- 排名历史表
-- 记录每次抓取时的排名变化
-- ============================================
CREATE TABLE IF NOT EXISTS rank_history (
id INTEGER PRIMARY KEY AUTOINCREMENT,
news_item_id INTEGER NOT NULL,
rank INTEGER NOT NULL,
crawl_time TEXT NOT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (news_item_id) REFERENCES news_items(id)
);
-- ============================================
-- 抓取记录表
-- 记录每次抓取的时间和数量
-- ============================================
CREATE TABLE IF NOT EXISTS crawl_records (
id INTEGER PRIMARY KEY AUTOINCREMENT,
crawl_time TEXT NOT NULL UNIQUE,
total_items INTEGER DEFAULT 0,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- ============================================
-- 抓取来源状态表
-- 记录每次抓取各平台的成功/失败状态
-- ============================================
CREATE TABLE IF NOT EXISTS crawl_source_status (
crawl_record_id INTEGER NOT NULL,
platform_id TEXT NOT NULL,
status TEXT NOT NULL CHECK(status IN ('success', 'failed')),
PRIMARY KEY (crawl_record_id, platform_id),
FOREIGN KEY (crawl_record_id) REFERENCES crawl_records(id),
FOREIGN KEY (platform_id) REFERENCES platforms(id)
);
-- ============================================
-- 推送记录表
-- 用于 push_window once_per_day 功能
-- ============================================
CREATE TABLE IF NOT EXISTS push_records (
id INTEGER PRIMARY KEY AUTOINCREMENT,
date TEXT NOT NULL UNIQUE,
pushed INTEGER DEFAULT 0,
push_time TEXT,
report_type TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- ============================================
-- 索引定义
-- ============================================
-- 平台索引
CREATE INDEX IF NOT EXISTS idx_news_platform ON news_items(platform_id);
-- 时间索引(用于查询最新数据)
CREATE INDEX IF NOT EXISTS idx_news_crawl_time ON news_items(last_crawl_time);
-- 标题索引(用于标题搜索)
CREATE INDEX IF NOT EXISTS idx_news_title ON news_items(title);
-- URL + platform_id 唯一索引(仅对非空 URL),实现去重
CREATE UNIQUE INDEX IF NOT EXISTS idx_news_url_platform
ON news_items(url, platform_id) WHERE url != '';
-- 抓取状态索引
CREATE INDEX IF NOT EXISTS idx_crawl_status_record ON crawl_source_status(crawl_record_id);
-- 排名历史索引
CREATE INDEX IF NOT EXISTS idx_rank_history_news ON rank_history(news_item_id);
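As an illustration of how the tables fit together, a query like the following (run against one day's news.db, path assumed from LocalStorageBackend) lists items whose best recorded rank reached the top 3:
import sqlite3

conn = sqlite3.connect("output/2025-12-13/news.db")
rows = conn.execute("""
    SELECT p.name, n.title, MIN(r.rank) AS best_rank, n.crawl_count
    FROM news_items n
    JOIN platforms p ON p.id = n.platform_id
    JOIN rank_history r ON r.news_item_id = n.id
    GROUP BY n.id
    HAVING MIN(r.rank) <= 3
    ORDER BY best_rank
""").fetchall()
for name, title, best_rank, crawl_count in rows:
    print(f"[{name}] #{best_rank} {title} ({crawl_count} crawls)")
conn.close()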

View File

@@ -0,0 +1,20 @@
# coding=utf-8
"""
工具模块 - 公共工具函数
"""
from trendradar.utils.time import (
get_configured_time,
format_date_folder,
format_time_filename,
get_current_time_display,
convert_time_for_display,
)
__all__ = [
"get_configured_time",
"format_date_folder",
"format_time_filename",
"get_current_time_display",
"convert_time_for_display",
]

91
trendradar/utils/time.py Normal file
View File

@@ -0,0 +1,91 @@
# coding=utf-8
"""
时间工具模块 - 统一时间处理函数
"""
from datetime import datetime
from typing import Optional
import pytz
# 默认时区
DEFAULT_TIMEZONE = "Asia/Shanghai"
def get_configured_time(timezone: str = DEFAULT_TIMEZONE) -> datetime:
"""
获取配置时区的当前时间
Args:
timezone: 时区名称 'Asia/Shanghai', 'America/Los_Angeles'
Returns:
带时区信息的当前时间
"""
try:
tz = pytz.timezone(timezone)
except pytz.UnknownTimeZoneError:
print(f"[警告] 未知时区 '{timezone}',使用默认时区 {DEFAULT_TIMEZONE}")
tz = pytz.timezone(DEFAULT_TIMEZONE)
return datetime.now(tz)
def format_date_folder(
date: Optional[str] = None, timezone: str = DEFAULT_TIMEZONE
) -> str:
"""
格式化日期文件夹名 (ISO 格式: YYYY-MM-DD)
Args:
date: 指定日期字符串,None 则使用当前日期
timezone: 时区名称
Returns:
格式化后的日期字符串 '2025-12-09'
"""
if date:
return date
return get_configured_time(timezone).strftime("%Y-%m-%d")
def format_time_filename(timezone: str = DEFAULT_TIMEZONE) -> str:
"""
格式化时间文件名 (格式: HH-MM,用于文件名)
Windows 系统不支持冒号作为文件名,因此使用连字符
Args:
timezone: 时区名称
Returns:
格式化后的时间字符串 '15-30'
"""
return get_configured_time(timezone).strftime("%H-%M")
def get_current_time_display(timezone: str = DEFAULT_TIMEZONE) -> str:
"""
获取当前时间显示 (格式: HH:MM,用于显示)
Args:
timezone: 时区名称
Returns:
格式化后的时间字符串 '15:30'
"""
return get_configured_time(timezone).strftime("%H:%M")
def convert_time_for_display(time_str: str) -> str:
"""
HH-MM 格式转换为 HH:MM 格式用于显示
Args:
time_str: 输入时间字符串 '15-30'
Returns:
转换后的时间字符串 '15:30'
"""
if time_str and "-" in time_str and len(time_str) == 5:
return time_str.replace("-", ":")
return time_str
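A quick illustration of the helpers above (outputs depend on the current time):
print(format_date_folder(timezone="Asia/Shanghai"))   # e.g. "2025-12-13"
print(format_time_filename())                         # e.g. "15-30"  (file-name safe)
print(get_current_time_display())                     # e.g. "15:30"
print(convert_time_for_display("15-30"))              # "15:30"
print(convert_time_for_display("2025-12-13"))         # unchanged (length != 5)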

View File

@@ -1 +1 @@
3.5.0 4.0.0