Mirror of https://gitee.com/houhuan/TrendRadar.git (synced 2025-12-21 14:47:16 +08:00)

Commit c7bacdfff7 (parent 97c05aa33c): v4.0.0 大大大更新 ("v4.0.0, a big, big, big update")
.github/ISSUE_TEMPLATE/01-bug-report.yml (vendored, 2 changes)

@@ -4,8 +4,6 @@ name: 🐛 遇到问题了
 description: 程序运行不正常或出现错误
 title: "[问题] "
 labels: ["bug"]
-assignees:
-  - sansan0
 body:
   - type: markdown
     attributes:
(second issue template; file header not captured in this view)

@@ -4,8 +4,6 @@ name: 💡 我有个想法
 description: 建议新功能或改进现有功能
 title: "[建议] "
 labels: ["enhancement"]
-assignees:
-  - sansan0
 body:
   - type: markdown
     attributes:
.github/ISSUE_TEMPLATE/03-config-help.yml (vendored, 2 changes)

@@ -4,8 +4,6 @@ name: ⚙️ 设置遇到困难
 description: 配置相关的问题或需要帮助
 title: "[设置] "
 labels: ["配置", "帮助"]
-assignees:
-  - sansan0
 body:
   - type: markdown
     attributes:
.github/workflows/clean-crawler.yml (vendored, new file, 28 lines)

@@ -0,0 +1,28 @@
name: Check In

# ✅ 签到续期:运行此 workflow 可重置 7 天计时,保持 "Get Hot News" 正常运行
# ✅ Renewal: Run this workflow to reset the 7-day timer and keep "Get Hot News" active
#
# 📌 操作方法 / How to use:
# 1. 点击 "Run workflow" 按钮 / Click "Run workflow" button
# 2. 每 7 天内至少运行一次 / Run at least once every 7 days

on:
  workflow_dispatch:

jobs:
  del_runs:
    runs-on: ubuntu-latest
    permissions:
      actions: write
      contents: read
    steps:
      - name: Delete all workflow runs
        uses: Mattraks/delete-workflow-runs@v2
        with:
          token: ${{ github.token }}
          repository: ${{ github.repository }}
          retain_days: 0
          keep_minimum_runs: 0
          delete_workflow_by_state_pattern: "ALL"
          delete_run_by_conclusion_pattern: "ALL"
.github/workflows/crawler.yml (vendored, new file, 163 lines)

@@ -0,0 +1,163 @@
name: Get Hot News

on:
  schedule:
    # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
    # ⚠️ 试用版说明 / Trial Mode
    # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
    #
    # 🔄 运行机制 / How it works:
    # - 每个周期为 7 天,届时自动停止
    # - 运行 "Check In" 会重置周期(重新开始 7 天倒计时,而非累加)
    # - Each cycle is 7 days, then auto-stops
    # - "Check In" resets the cycle (restarts 7-day countdown, not cumulative)
    #
    # 💡 设计初衷 / Why this design:
    # 如果 7 天都忘了签到,或许这些资讯对你来说并非刚需
    # 适时的暂停,能帮你从信息流中抽离,给大脑留出喘息的空间
    # If you forget for 7 days, maybe you don't really need it
    # A timely pause helps you detach from the stream and gives your mind space
    #
    # 🙏 珍惜资源 / Respect shared resources:
    # GitHub Actions 是平台提供的公共资源,每次运行都会消耗算力
    # 签到机制确保资源分配给真正需要的用户,感谢你的理解与配合
    # GitHub Actions is a shared public resource provided by the platform
    # Check-in ensures resources go to those who truly need it — thank you
    #
    # 🚀 长期使用请部署 Docker 版本 / For long-term use, deploy Docker version
    #
    # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
    #
    # 📝 修改运行时间:只改第一个数字(0-59),表示每小时第几分钟运行
    # 📝 Change time: Only modify the first number (0-59) = minute of each hour
    #
    # 示例 / Examples:
    # "15 * * * *"    → 每小时第15分钟 / minute 15 every hour
    # "30 0-14 * * *" → 北京时间 8:00-22:00 每小时第30分钟 / Beijing 8am-10pm
    #
    - cron: "33 * * * *"

  workflow_dispatch:

concurrency:
  group: crawler-${{ github.ref_name }}
  cancel-in-progress: true

permissions:
  contents: read
  actions: write

jobs:
  crawl:
    runs-on: ubuntu-latest
    timeout-minutes: 15

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 1
          clean: true

      - name: Check Expiration
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          WORKFLOW_FILE="crawler.yml"
          API_URL="repos/${{ github.repository }}/actions/workflows/$WORKFLOW_FILE/runs"

          TOTAL=$(gh api "$API_URL" --jq '.total_count')
          if [ -z "$TOTAL" ] || [ "$TOTAL" -eq 0 ]; then
            echo "No previous runs found, skipping expiration check"
            exit 0
          fi

          LAST_PAGE=$(( (TOTAL + 99) / 100 ))
          FIRST_RUN_DATE=$(gh api "$API_URL?per_page=100&page=$LAST_PAGE" --jq '.workflow_runs[-1].created_at')

          if [ -n "$FIRST_RUN_DATE" ]; then
            CURRENT_TIMESTAMP=$(date +%s)
            FIRST_RUN_TIMESTAMP=$(date -d "$FIRST_RUN_DATE" +%s)
            DIFF_SECONDS=$((CURRENT_TIMESTAMP - FIRST_RUN_TIMESTAMP))
            LIMIT_SECONDS=604800

            if [ $DIFF_SECONDS -gt $LIMIT_SECONDS ]; then
              echo "⚠️ 试用期已结束,请运行 'Check In' 签到续期"
              echo "⚠️ Trial expired. Run 'Check In' to renew."
              gh workflow disable "$WORKFLOW_FILE"
              exit 1
            else
              DAYS_LEFT=$(( (LIMIT_SECONDS - DIFF_SECONDS) / 86400 ))
              echo "✅ 试用期剩余 ${DAYS_LEFT} 天,到期前请运行 'Check In' 签到续期"
              echo "✅ Trial: ${DAYS_LEFT} days left. Run 'Check In' before expiry to renew."
            fi
          fi

      # --------------------------------------------------------------------------------
      # 🚦 TRAFFIC CONTROL / 流量控制
      # --------------------------------------------------------------------------------
      # EN: Generates a random delay between 1 and 300 seconds (5 minutes).
      #     Critical for load balancing.
      #
      # CN: 生成 1 到 300 秒(5分钟)之间的随机延迟。
      #     这对负载均衡至关重要。
      - name: Random Delay (Traffic Control)
        if: success()
        run: |
          echo "🎲 Traffic Control: Generating random delay..."
          DELAY=$(( ( RANDOM % 300 ) + 1 ))
          echo "⏸️ Sleeping for ${DELAY} seconds to spread the load..."
          sleep ${DELAY}s
          echo "▶️ Delay finished. Starting crawler..."

      - name: Set up Python
        if: success()
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"
          cache: "pip"

      - name: Install dependencies
        if: success()
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Verify required files
        if: success()
        run: |
          if [ ! -f config/config.yaml ]; then
            echo "Error: Config missing"
            exit 1
          fi

      - name: Run crawler
        if: success()
        env:
          FEISHU_WEBHOOK_URL: ${{ secrets.FEISHU_WEBHOOK_URL }}
          TELEGRAM_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
          TELEGRAM_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }}
          DINGTALK_WEBHOOK_URL: ${{ secrets.DINGTALK_WEBHOOK_URL }}
          WEWORK_WEBHOOK_URL: ${{ secrets.WEWORK_WEBHOOK_URL }}
          WEWORK_MSG_TYPE: ${{ secrets.WEWORK_MSG_TYPE }}
          EMAIL_FROM: ${{ secrets.EMAIL_FROM }}
          EMAIL_PASSWORD: ${{ secrets.EMAIL_PASSWORD }}
          EMAIL_TO: ${{ secrets.EMAIL_TO }}
          EMAIL_SMTP_SERVER: ${{ secrets.EMAIL_SMTP_SERVER }}
          EMAIL_SMTP_PORT: ${{ secrets.EMAIL_SMTP_PORT }}
          NTFY_TOPIC: ${{ secrets.NTFY_TOPIC }}
          NTFY_SERVER_URL: ${{ secrets.NTFY_SERVER_URL }}
          NTFY_TOKEN: ${{ secrets.NTFY_TOKEN }}
          BARK_URL: ${{ secrets.BARK_URL }}
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
          STORAGE_BACKEND: auto
          LOCAL_RETENTION_DAYS: ${{ secrets.LOCAL_RETENTION_DAYS }}
          REMOTE_RETENTION_DAYS: ${{ secrets.REMOTE_RETENTION_DAYS }}
          S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }}
          S3_ACCESS_KEY_ID: ${{ secrets.S3_ACCESS_KEY_ID }}
          S3_SECRET_ACCESS_KEY: ${{ secrets.S3_SECRET_ACCESS_KEY }}
          S3_ENDPOINT_URL: ${{ secrets.S3_ENDPOINT_URL }}
          S3_REGION: ${{ secrets.S3_REGION }}
          GITHUB_ACTIONS: true
        run: python -m trendradar
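The two workflows interlock: `Check In` deletes the entire run history with `retain_days: 0`, so the oldest surviving run of `Get Hot News` becomes recent again and the expiry step's 7-day countdown restarts. For readers who want to reproduce that expiry computation outside CI, here is a minimal Python sketch under stated assumptions: the repository slug is a placeholder, a token is expected in `GH_TOKEN`, and the endpoint is the same public GitHub REST API the `gh api` call above hits.

```python
# Minimal sketch of the 7-day expiry check, mirroring the bash step above.
# Assumptions: GH_TOKEN holds a valid token; "owner/TrendRadar" is a placeholder slug.
import os
from datetime import datetime, timezone

import requests  # pip install requests

repo = "owner/TrendRadar"  # placeholder; use your own repository
url = f"https://api.github.com/repos/{repo}/actions/workflows/crawler.yml/runs"
headers = {"Authorization": f"Bearer {os.environ['GH_TOKEN']}"}

total = requests.get(url, headers=headers).json()["total_count"]
if total == 0:
    print("No previous runs found, skipping expiration check")
else:
    last_page = (total + 99) // 100  # ceiling division: the API pages runs 100 at a time
    runs = requests.get(url, headers=headers,
                        params={"per_page": 100, "page": last_page}).json()["workflow_runs"]
    oldest = datetime.fromisoformat(runs[-1]["created_at"].replace("Z", "+00:00"))
    elapsed = datetime.now(timezone.utc) - oldest
    days_left = 7 - elapsed.days
    print("Trial expired" if days_left <= 0 else f"{days_left} days left")
```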
README-EN.md (456 changes)

@@ -1,6 +1,6 @@
 <div align="center" id="trendradar">

-> **📢 Announcement:** After communicating with GitHub officials, "One-Click Fork Deployment" will be restored after compliance adjustments are completed. Please stay tuned for **v4.0.0** update
+> **📢 Announcement:** **v4.0.0** has been released! Including storage architecture refactoring, database optimization, modularization improvements, and more major updates

 <a href="https://github.com/sansan0/TrendRadar" title="TrendRadar">
 <img src="/_image/banner.webp" alt="TrendRadar Banner" width="80%">
@@ -16,8 +16,8 @@
 [](https://github.com/sansan0/TrendRadar/stargazers)
 [](https://github.com/sansan0/TrendRadar/network/members)
 [](LICENSE)
 [](https://github.com/sansan0/TrendRadar)
 [](https://github.com/sansan0/TrendRadar)

 [](https://work.weixin.qq.com/)
 [](https://weixin.qq.com/)
@@ -48,62 +48,61 @@
 <br>

 <details>
-<summary>🚨 <strong>【MUST READ】Important Announcement: The Correct Way to Deploy This Project</strong></summary>
+<summary>🚨 <strong>【Must Read】Important Announcement: v4.0.0 Deployment & Storage Architecture Changes</strong></summary>

 <br>

-> **⚠️ December 2025 Urgent Notice**
->
-> Due to a surge in Fork numbers causing excessive load on GitHub servers, **GitHub Actions and GitHub Pages deployments are currently restricted**. Please read the following instructions carefully to ensure successful deployment.
+### 🛠️ Choose the Deployment Method That Fits You

-### 1. ✅ Only Recommended Deployment Method: Docker
+#### 🅰️ Option 1: Docker Deployment (Recommended 🔥)

-**This is currently the most stable solution, free from GitHub restrictions.** Data is stored locally and won't be affected by GitHub policy changes.
+* **Features**: Most stable and simplest. Data is stored in **local SQLite**, fully under your control.
+* **Best for**: Users with their own server, NAS, or an always-on PC.

 * 👉 [Jump to Docker Deployment Tutorial](#6-docker-deployment)

 ---

-### 2. If You Were Planning to Fork This Project...
+#### 🅱️ Option 2: GitHub Actions Deployment (Restored ✅)

-To reduce pressure on GitHub servers, **please DO NOT directly click the "Fork" button!**
-Please use the **"Use this template"** feature instead of Fork:
+* **Features**: Data is no longer committed directly to the repo. Instead, it is stored in **Remote Cloud Storage** (supports S3-compatible protocols: Cloudflare R2, Alibaba Cloud OSS, Tencent Cloud COS, etc.).
+* **Requirement**: You **must** configure an S3-compatible object storage service (Cloudflare R2 recommended, it's free).
+
+> **⚠️ Note**: If you choose this option, you must complete the following two configuration steps:
+
+#### 1. 🚀 Recommended Start: Use this template
+
+To keep the repository clean and avoid inheriting redundant history, I **recommend** using Template mode:

-1. **Click** the green **[Use this template]** button in the top right corner of the original repository page.
+1. **Click** the green **[Use this template]** button at the top right of the original repository page.
 2. **Select** "Create a new repository".

-**Why do this?**
-* **❌ Fork**: Copies complete history records. Many forks running simultaneously will trigger GitHub risk control.
-* **✅ Use this template**: Creates a completely new independent repository without historical baggage, more server-friendly.
+> **💡 Why do this?**
+> * **Use this template**: Creates a brand new, clean repository with no historical baggage.
+> * **Fork**: Retains the complete commit history and relationships, consuming more GitHub resources.

----
+#### 2. ☁️ About the Mandatory Remote Storage for GitHub Actions

-### 3. About New Data Storage
+If you choose **Option 2 (GitHub Actions)**, you must configure an S3-compatible object storage service.

-The new version will use **Cloudflare R2** to store news data, ensuring data persistence.
+**Supported Storage Services:**
+- **Cloudflare R2** (Recommended, generous free tier)
+- Other S3-compatible services

-**⚠️ Configuration Prerequisites:**
+**⚠️ Configuration Prerequisites (Using Cloudflare R2 as Example):**

-According to Cloudflare platform rules, activating R2 requires binding a payment method.
+According to Cloudflare platform rules, enabling R2 requires binding a payment method.

-- **Purpose:** Identity verification only (Verify Only), no charges will be incurred.
-- **Payment:** Supports credit cards or PayPal (China region).
-- **Usage:** R2's free tier is sufficient to cover this project's daily operation, no payment required.
+* **Purpose**: Identity verification only (Verify Only). **No charges will be incurred**.
+* **Payment**: Supports international credit cards or PayPal.
+* **Usage**: The R2 free tier (10GB storage/month) is sufficient to cover the daily operation of this project. No need to worry about costs.

----
-
-### 4. 📅 Future Plans & Documentation Reading Notes
-
-> **Future Plans:**
-> - Exploring new approach: keep Actions for fetching and pushing, but no longer save data to the repository; use external storage instead.
-
-**⚠️ Reading Note:**
-Given that the above plans mean **Fork deployment mode may return in a new form in the future**, and the workload to fully revise documentation is massive, we have temporarily retained the old descriptions.
-
-**At the current stage, if "Fork" related expressions still appear in subsequent tutorials, please ignore them or understand them as "Use this template"**.
-
-👉 **[Click here to view TrendRadar's latest official documentation](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)**
+👉 **[Click to View Detailed Configuration Tutorial](#-quick-start)**

 </details>
@@ -287,10 +286,32 @@ Supports **WeWork** (+ WeChat push solution), **Feishu**, **DingTalk**, **Telegr
 - ⚠️ **Paired Configuration**: Telegram and ntfy require paired parameter quantities to match (e.g., token and chat_id both have 2 values)
 - ⚠️ **Quantity Limit**: Default maximum 3 accounts per channel, exceeded values will be truncated

-### **Multi-Platform Support**
-- **GitHub Pages**: Auto-generate beautiful web reports, PC/mobile adapted
+### **Flexible Storage Architecture (v4.0.0 Major Update)**
+
+**Multi-Backend Support**:
+- ☁️ **Remote Cloud Storage**: GitHub Actions environment default, supports S3-compatible protocols (R2/OSS/COS, etc.), data stored in cloud, keeping repository clean
+- 💾 **Local SQLite**: Traditional SQLite database, stable and efficient (Docker/local deployment)
+- 🔀 **Auto Selection**: Auto-selects appropriate backend based on runtime environment
+
+**Data Format Hierarchy**:
+
+| Format | Role | Description |
+|--------|------|-------------|
+| **SQLite** | Primary storage | Complete data with statistics information |
+| **TXT** | Human-readable backup | Optional text records for manual viewing |
+| **HTML** | Web report | Beautiful visual report (GitHub Pages) |
+
+**Data Management Features**:
+- Auto data cleanup (configurable retention period)
+- Timezone support (configurable IANA time zone)
+- Cloud/local seamless switching
+
+> 💡 For storage configuration details, see [Configuration Details - Storage Configuration](#11-storage-configuration-v400-new)
+
+### **Multi-Platform Deployment**
+- **GitHub Actions**: Cloud automated operations (7-day check-in cycle + remote cloud storage)
 - **Docker Deployment**: Supports multi-architecture containerized operation
-- **Data Persistence**: HTML/TXT multi-format history saving
+- **Local Running**: Python environment direct execution

 ### **AI Smart Analysis (v3.0.0 New)**
@@ -341,10 +362,32 @@ Transform from "algorithm recommendation captivity" to "actively getting the inf
 >**Upgrade Instructions**:
 - **📌 Check Latest Updates**: **[Original Repository Changelog](https://github.com/sansan0/TrendRadar?tab=readme-ov-file#-changelog)**
 - **Tip**: Do NOT update this project via **Sync fork**. Check [Changelog] to understand specific [Upgrade Methods] and [Features]
-- **Minor Version Update**: Upgrading from v2.x to v2.y, replace `main.py` in your forked repo with the latest version
 - **Major Version Upgrade**: Upgrading from v1.x to v2.y, recommend deleting existing fork and re-forking to save effort and avoid config conflicts

+### 2025/12/13 - v4.0.0
+
+**🎉 Major Update: Comprehensive Refactoring of Storage and Core Architecture**
+
+- **Multi-Storage Backend Support**: Introduced a brand new storage module supporting local SQLite and remote cloud storage (S3-compatible protocols, Cloudflare R2 recommended for free tier), adaptable to GitHub Actions, Docker, and local environments.
+- **Database Structure Optimization**: Refactored SQLite database table structures to improve data efficiency and query performance.
+- **Enhanced Features**: Implemented date format standardization, data retention policies, timezone configuration support, and optimized time display. Fixed remote storage data persistence issues to ensure accurate data merging.
+- **Cleanup and Compatibility**: Removed most legacy compatibility code and unified data storage and retrieval methods.
+
+### 2025/12/13 - mcp-v1.1.0
+
+**MCP Module Update:**
+- Adapted for v4.0.0, while maintaining compatibility with v3.x data.
+- Added storage sync tools:
+  - `sync_from_remote`: Pull data from remote storage to local
+  - `get_storage_status`: Get storage configuration and status
+  - `list_available_dates`: List available dates in local/remote storage
+
+<details>
+<summary>👉 Click to expand: <strong>Historical Updates</strong></summary>

 ### 2025/12/03 - v3.5.0

 **🎉 Core Feature Enhancements**
@@ -397,7 +440,7 @@ Transform from "algorithm recommendation captivity" to "actively getting the inf
 **🔧 Upgrade Instructions**:
 - **GitHub Fork Users**: Update `main.py`, `config/config.yaml` (Added multi-account push support, existing single-account configuration unaffected)
-- **Docker Users**: Update `.env`, `docker compose.yml` or set environment variables `REVERSE_CONTENT_ORDER`, `MAX_ACCOUNTS_PER_CHANNEL`
+- **Docker Users**: Update `.env`, `docker-compose.yml` or set environment variables `REVERSE_CONTENT_ORDER`, `MAX_ACCOUNTS_PER_CHANNEL`
 - **Multi-Account Push**: New feature, disabled by default, existing single-account configuration unaffected
@@ -431,10 +474,6 @@ Transform from "algorithm recommendation captivity" to "actively getting the inf
 - Tool count increased from 13 to 14

-<details>
-<summary>👉 Click to expand: <strong>Historical Updates</strong></summary>
-
-
 ### 2025/11/25 - v3.4.0

 **🎉 Added Slack Push Support**
@@ -819,11 +858,44 @@ frequency_words.txt file added **required word** feature, using + sign
 > **📖 Reminder**: Fork users should first **[check the latest official documentation](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)** to ensure the configuration steps are up to date.

+### ⚠️ GitHub Actions Usage Instructions
+
+**v4.0.0 Important Change**: Introduced "Activity Detection" mechanism; GitHub Actions now requires periodic check-in to maintain operation.
+
+#### 🔄 Check-In Renewal Mechanism
+
+- **Running Cycle**: Valid for **7 days**; the service will automatically suspend when the countdown ends.
+- **Renewal Method**: Manually trigger the "Check In" workflow on the Actions page to reset the 7-day validity period.
+- **Operation Path**: `Actions` → `Check In` → `Run workflow`
+- **Design Philosophy**:
+  - If you forget for 7 days, maybe you don't really need it. Letting it stop is a digital detox, freeing you from the constant impact.
+  - GitHub Actions is a valuable public computing resource. The check-in mechanism aims to prevent wasted computing cycles, ensuring resources are allocated to truly active users who need them. Thank you for your understanding and support.
+
+#### 📦 Data Storage (Required Configuration)
+
+In the GitHub Actions environment, data is stored in **Remote Cloud Storage** (supports S3-compatible protocols, Cloudflare R2 recommended for free tier), keeping your repository clean (see **Required Configuration: Remote Cloud Storage** below).
+
+#### 🚀 Recommended: Docker Deployment
+
+For long-term stable operation, we recommend [Docker Deployment](#6-docker-deployment), with data stored locally and no check-in required, though it does require purchasing a cloud server.
+
+<br>
+
+> 🎉 **Now Supported: Multi-Cloud Storage Options**
+>
+> This project now supports S3-compatible protocols. You can choose:
+> - **Cloudflare R2** (Recommended, generous free tier)
+> - Other S3-compatible storage services
+>
+> Simply configure the corresponding `S3_ENDPOINT_URL`, `S3_BUCKET_NAME`, and other environment variables to switch.
+
+---

 1. **Fork this project** to your GitHub account

 - Click the "Fork" button at the top right of this page

-2. **Setup GitHub Secrets (Choose your needed platforms)**:
+2. **Setup GitHub Secrets (Required + Optional Platforms)**:

 In your forked repo, go to `Settings` > `Secrets and variables` > `Actions` > `New repository secret`
@@ -862,6 +934,35 @@ frequency_words.txt file added **required word** feature, using + sign
 <br>

+<details>
+<summary>⚠️ <strong>Required Configuration: Remote Cloud Storage</strong> (Required for GitHub Actions Environment, Cloudflare R2 Recommended)</summary>
+<br>
+
+**GitHub Secret Configuration (⚠️ All 4 configuration items below are required):**
+
+| Name | Secret (Value) Description |
+|------|----------------------------|
+| `S3_BUCKET_NAME` | Bucket name (e.g., `trendradar-data`) |
+| `S3_ACCESS_KEY_ID` | Access key ID |
+| `S3_SECRET_ACCESS_KEY` | Access key |
+| `S3_ENDPOINT_URL` | S3 API endpoint (e.g., R2: `https://<account-id>.r2.cloudflarestorage.com`) |
+
+<br>
+
+**How to Get Credentials (Using Cloudflare R2 as Example):**
+
+1. Visit the [Cloudflare Dashboard](https://dash.cloudflare.com/) and log in
+2. Select `R2` in the left menu → Click `Create Bucket` → Enter a name (e.g., `trendradar-data`)
+3. Click `Manage R2 API Tokens` at the top right → `Create API Token`
+4. Select the `Object Read & Write` permission → After creation, it will display the `Access Key ID` and `Secret Access Key`
+5. The endpoint URL can be found on the bucket details page (format: `https://<account-id>.r2.cloudflarestorage.com`)
+
+**Notes**:
+- R2 free tier: 10GB storage + 1 million reads per month, sufficient for this project
+- Activation requires binding a payment method (identity verification only, no charges)
+- Data stored in cloud, keeps GitHub repository clean
+
+</details>
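Before wiring the four values into Secrets, you can sanity-check them locally. The sketch below is an illustration, not project code: it uses boto3 (an assumption; any S3-compatible client works) and placeholder credential strings. A successful listing proves the endpoint, keys, and bucket name all line up.

```python
# Local sanity check for the four S3/R2 secrets (all values below are placeholders).
import boto3  # pip install boto3

s3 = boto3.client(
    "s3",
    endpoint_url="https://<account-id>.r2.cloudflarestorage.com",  # S3_ENDPOINT_URL
    aws_access_key_id="<S3_ACCESS_KEY_ID>",
    aws_secret_access_key="<S3_SECRET_ACCESS_KEY>",
)
# A successful listing proves the credentials can reach the bucket.
resp = s3.list_objects_v2(Bucket="trendradar-data", MaxKeys=5)  # S3_BUCKET_NAME
print([obj["Key"] for obj in resp.get("Contents", [])])
```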

 <details>
 <summary> <strong>👉 Click to expand: WeWork Bot</strong> (Simplest and fastest configuration)</summary>
@@ -2041,7 +2142,7 @@ TrendRadar provides two independent Docker images, deploy according to your need

 # Download docker compose config
 wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/.env -P docker/
-wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker compose.yml -P docker/
+wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker-compose.yml -P docker/
 ```

 > 💡 **Note**: Key directory structure required for Docker deployment:
@@ -2052,7 +2153,7 @@ current directory/
 │   └── frequency_words.txt
 └── docker/
     ├── .env
-    └── docker compose.yml
+    └── docker-compose.yml
 ```

 2. **Config File Description**:
@@ -2146,7 +2247,7 @@ vim config/frequency_words.txt

 # Use build version docker compose
 cd docker
-cp docker compose-build.yml docker compose.yml
+cp docker-compose-build.yml docker-compose.yml
 ```

 **Build and Start Services**:
@@ -2232,7 +2333,7 @@ docker rm trend-radar

 > 💡 **Web Server Notes**:
 > - After starting, access latest report at `http://localhost:8080`
-> - Access historical reports via directory navigation (e.g., `http://localhost:8080/2025年xx月xx日/`)
+> - Access historical reports via directory navigation (e.g., `http://localhost:8080/2025-xx-xx/`)
 > - Port can be configured in `.env` file with `WEBSERVER_PORT` parameter
 > - Auto-start: Set `ENABLE_WEBSERVER=true` in `.env`
 > - Security: Static files only, limited to output directory, localhost binding only
@@ -2249,7 +2350,7 @@ TrendRadar generates daily summary HTML reports to two locations simultaneously:
 |--------------|---------------|----------|
 | `output/index.html` | Direct host access | **Docker Deployment** (via Volume mount, visible on host) |
 | `index.html` | Root directory access | **GitHub Pages** (repository root, auto-detected by Pages) |
-| `output/YYYY年MM月DD日/html/当日汇总.html` | Historical reports | All environments (archived by date) |
+| `output/YYYY-MM-DD/html/当日汇总.html` | Historical reports | All environments (archived by date) |

 **Local Access Examples**:
 ```bash
@@ -2258,8 +2359,8 @@ TrendRadar generates daily summary HTML reports to two locations simultaneously:
 docker exec -it trend-radar python manage.py start_webserver
 # 2. Access in browser
 http://localhost:8080                   # Access latest report (default index.html)
-http://localhost:8080/2025年xx月xx日/       # Access reports for specific date
-http://localhost:8080/2025年xx月xx日/html/  # Browse all HTML files for that date
+http://localhost:8080/2025-xx-xx/       # Access reports for specific date
+http://localhost:8080/2025-xx-xx/html/  # Browse all HTML files for that date

 # Method 2: Direct file access (local environment)
 open ./output/index.html     # macOS
@@ -2267,7 +2368,7 @@ start ./output/index.html    # Windows
 xdg-open ./output/index.html # Linux

 # Method 3: Access historical archives
-open ./output/2025年xx月xx日/html/当日汇总.html
+open ./output/2025-xx-xx/html/当日汇总.html
 ```

 **Why two index.html files?**
@@ -2324,10 +2425,20 @@ flowchart TB
 Use docker compose to start both news push and MCP services:

 ```bash
-# Download latest docker compose.yml (includes MCP service config)
-wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker compose.yml
+# Method 1: Clone project (Recommended)
+git clone https://github.com/sansan0/TrendRadar.git
+cd TrendRadar/docker
+docker compose up -d

-# Start all services
+# Method 2: Download docker-compose.yml separately
+mkdir trendradar && cd trendradar
+wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker-compose.yml
+wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/.env
+mkdir -p config output
+# Download config files
+wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/config/config.yaml -P config/
+wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/config/frequency_words.txt -P config/
+# Modify volume paths in docker-compose.yml: ../config -> ./config, ../output -> ./output
 docker compose up -d

 # Check running status
@@ -2337,18 +2448,29 @@ docker ps | grep trend-radar
 **Start MCP Service Separately**:

 ```bash
+# Linux/Mac
 docker run -d --name trend-radar-mcp \
   -p 127.0.0.1:3333:3333 \
-  -v ./config:/app/config:ro \
-  -v ./output:/app/output:ro \
+  -v $(pwd)/config:/app/config:ro \
+  -v $(pwd)/output:/app/output:ro \
   -e TZ=Asia/Shanghai \
   wantcat/trendradar-mcp:latest
+
+# Windows PowerShell
+docker run -d --name trend-radar-mcp `
+  -p 127.0.0.1:3333:3333 `
+  -v ${PWD}/config:/app/config:ro `
+  -v ${PWD}/output:/app/output:ro `
+  -e TZ=Asia/Shanghai `
+  wantcat/trendradar-mcp:latest
 ```

+> ⚠️ **Note**: Ensure `config/` and `output/` folders exist in the current directory with config files and news data before running.

 **Verify Service**:

 ```bash
-# Check if MCP service is running properly
+# Check MCP service health
 curl http://127.0.0.1:3333/mcp

 # View MCP service logs
@@ -2357,14 +2479,20 @@ docker logs -f trend-radar-mcp

 **Configure in AI Clients**:

-After MCP service starts, configure in Claude Desktop, Cherry Studio, Cursor, etc.:
+After the MCP service starts, configure based on your client:
+
+**Cherry Studio** (Recommended, GUI config):
+- Settings → MCP Server → Add
+- Type: `streamableHttp`
+- URL: `http://127.0.0.1:3333/mcp`
+
+**Claude Desktop / Cline** (JSON config):
 ```json
 {
   "mcpServers": {
     "trendradar": {
       "url": "http://127.0.0.1:3333/mcp",
-      "description": "TrendRadar News Trending Analysis"
+      "type": "streamableHttp"
     }
   }
 }
@@ -2452,7 +2580,6 @@ notification:
     start: "20:00"   # Start time (Beijing time)
     end: "22:00"     # End time (Beijing time)
   once_per_day: true # Push only once per day
-  push_record_retention_days: 7 # Push record retention days
 ```

 #### Configuration Details
@@ -2463,7 +2590,6 @@ notification:
 | `time_range.start` | string | `"20:00"` | Push window start time (Beijing time, HH:MM format) |
 | `time_range.end` | string | `"22:00"` | Push window end time (Beijing time, HH:MM format) |
 | `once_per_day` | bool | `true` | `true`=push only once per day within window, `false`=push every execution within window |
-| `push_record_retention_days` | int | `7` | Push record retention days (used to determine if already pushed) |

 #### Use Cases
@@ -2487,7 +2613,6 @@ PUSH_WINDOW_ENABLED=true
 PUSH_WINDOW_START=09:00
 PUSH_WINDOW_END=18:00
 PUSH_WINDOW_ONCE_PER_DAY=false
-PUSH_WINDOW_RETENTION_DAYS=7
 ```

 #### Complete Configuration Examples
@@ -2502,7 +2627,6 @@ notification:
     start: "20:00"
     end: "22:00"
   once_per_day: true
-  push_record_retention_days: 7
 ```

 **Scenario: Push every hour during working hours**
@@ -2515,7 +2639,6 @@ notification:
     start: "09:00"
     end: "18:00"
   once_per_day: false
-  push_record_retention_days: 7
 ```

 </details>
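The push-window options shown in these hunks boil down to a clock comparison plus a per-day flag. Here is a minimal sketch of that logic, assuming Beijing time and an in-memory flag standing in for the persisted push record; it is an illustration, not the project's actual implementation.

```python
# Illustrative sketch of the push-window check (not the project's actual code).
from datetime import datetime
from zoneinfo import ZoneInfo  # standard library, Python 3.9+

def should_push(start: str = "20:00", end: str = "22:00",
                once_per_day: bool = True, already_pushed_today: bool = False) -> bool:
    """True when the current Beijing time is inside [start, end] and not yet pushed."""
    now = datetime.now(ZoneInfo("Asia/Shanghai")).strftime("%H:%M")
    if once_per_day and already_pushed_today:
        return False
    return start <= now <= end  # zero-padded HH:MM strings compare correctly as text

print(should_push())  # False outside the 20:00-22:00 Beijing window
```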
@@ -2811,6 +2934,207 @@ notification:
 <br>

+### 11. Storage Configuration (v4.0.0 New)
+
+<details>
+<summary>👉 Click to expand: <strong>Storage Configuration Guide</strong></summary>
+<br>
+
+#### Storage Backend Selection
+
+TrendRadar v4.0.0 introduces a **multi-backend storage architecture**, supporting automatic backend selection or manual specification:
+
+| Configuration Value | Description | Applicable Scenarios |
+|---------------------|-------------|---------------------|
+| `auto` (default) | Auto-select backend: GitHub Actions → R2, other environments → local | Most users (recommended) |
+| `local` | Force use of local SQLite | Docker/local deployment |
+| `r2` | Force use of Cloudflare R2 | Cloud storage required |
+
+**Configuration Location**:
+- GitHub Actions: Set the `STORAGE_BACKEND` environment variable in GitHub Secrets
+- Docker: Configure `STORAGE_BACKEND=local` in the `.env` file
+- Local: Add `STORAGE_BACKEND` to environment variables or use auto mode
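A minimal sketch of what `auto` amounts to, assuming the choice is driven by the `STORAGE_BACKEND` and `GITHUB_ACTIONS` environment variables that the crawler workflow above sets (the real module may weigh more signals):

```python
# Illustrative sketch of "auto" backend selection (not the project's actual code).
import os

def pick_backend() -> str:
    configured = os.environ.get("STORAGE_BACKEND", "auto").lower()
    if configured in ("local", "r2"):
        return configured  # an explicit setting always wins
    # auto: GitHub Actions runners get remote storage, everything else stays local
    return "r2" if os.environ.get("GITHUB_ACTIONS") == "true" else "local"

print(pick_backend())  # "local" on a typical Docker or desktop setup
```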
+
+---
+
+#### Database Structure Optimization (v4.0.0)
+
+v4.0.0 made significant optimizations to the database structure, removing redundant fields and improving data normalization:
+
+##### 1. Removed Redundant Fields
+
+Removed the following redundant fields from the `news` table:
+
+| Field Name | Removal Reason | Alternative |
+|------------|----------------|------------|
+| `source_name` | Duplicates the platform name | Get via `platforms` table JOIN query |
+| `crawl_date` | Duplicates the file path date | Infer from the file path timestamp |
+
+**Migration Notes**: Old databases are incompatible, see the [Breaking Changes](#breaking-changes-v400) section
+
+##### 2. New Platforms Table
+
+Added a `platforms` table for unified management of platform information:
+
+```sql
+CREATE TABLE IF NOT EXISTS platforms (
+    id TEXT PRIMARY KEY,      -- Platform ID (immutable, e.g., 'zhihu', 'weibo')
+    name TEXT NOT NULL,       -- Platform display name (mutable, e.g., 'Zhihu', 'Weibo')
+    enabled INTEGER DEFAULT 1 -- Whether enabled (1=enabled, 0=disabled)
+);
+```
+
+**Design Advantages**:
+- The `id` field is immutable, maintaining data consistency
+- The `name` field is mutable, supporting internationalization and customization
+- Historical data remains valid when modifying platform names
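With `source_name` gone, display names come from a JOIN against `platforms`. A sketch of the replacement query follows; the `news` columns used here (`title`, `platform_id`) are assumptions inferred from the schema description above, not confirmed names.

```python
# Sketch: resolve platform display names via JOIN instead of the removed source_name.
import sqlite3

conn = sqlite3.connect("output/2025-12-09/news.db")  # example path in the new layout
rows = conn.execute(
    """
    SELECT n.title, p.name
    FROM news AS n
    JOIN platforms AS p ON p.id = n.platform_id  -- platform_id is an assumed column name
    LIMIT 10
    """
).fetchall()
for title, platform_name in rows:
    print(f"[{platform_name}] {title}")
conn.close()
```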
+
+##### 3. Crawl Source Status Normalization
+
+Replaced the original comma-separated string `successful_sources` field with a normalized `crawl_source_status` table:
+
+```sql
+CREATE TABLE IF NOT EXISTS crawl_source_status (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    file_path TEXT NOT NULL,   -- File path (e.g., 'output/2025-12-09/news.db')
+    platform_id TEXT NOT NULL, -- Platform ID (foreign key to platforms.id)
+    success INTEGER NOT NULL,  -- Whether crawl succeeded (1=success, 0=failed)
+    crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    FOREIGN KEY (platform_id) REFERENCES platforms(id)
+);
+```
+
+**Design Advantages**:
+- Supports efficient SQL queries (e.g., calculate success rate by platform)
+- Easy statistics and analysis (no string splitting required)
+- Normalized structure, avoids data redundancy
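The "success rate by platform" example becomes a single aggregate once the flags live in their own rows. A sketch, under the same schema assumptions as above:

```python
# Sketch: per-platform crawl success rate over the normalized status table.
import sqlite3

conn = sqlite3.connect("output/2025-12-09/news.db")  # example path
for platform_id, rate in conn.execute(
    """
    SELECT platform_id, AVG(success)  -- success is 0/1, so AVG is the success rate
    FROM crawl_source_status
    GROUP BY platform_id
    """
):
    print(f"{platform_id}: {rate:.0%}")
conn.close()
```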
+
+##### 4. File Path Format Standardization
+
+**Old Format**: `output/2025年12月09日/news_14-30.txt`
+**New Format**: `output/2025-12-09/news.db`
+
+**Changes**:
+- Date format: Chinese format → ISO 8601 standard format
+- Filename: Multiple time-stamped TXT files → single SQLite database file
+- Extension: `.txt` → `.db`
+
+**Advantages**:
+- Cross-platform compatibility (avoids Chinese path issues)
+- Easier programmatic parsing
+- International standard, better maintainability
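"Easier programmatic parsing" is concrete: the ISO directory name round-trips through the standard library with no custom logic, which the old Chinese-formatted name did not.

```python
# The ISO-dated layout parses with the standard library alone.
from datetime import date
from pathlib import Path

day_dir = Path("output/2025-12-09")
parsed = date.fromisoformat(day_dir.name)  # would raise ValueError on "2025年12月09日"
print(parsed.isoformat(), parsed.weekday())
```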
+
+---
+
+#### Remote Cloud Storage Configuration
+
+When using remote cloud storage (required for the GitHub Actions environment), configure the following environment variables:
+
+| Environment Variable | Description | Required | Example Value |
+|----------------------|-------------|----------|--------------|
+| `S3_BUCKET_NAME` | Bucket name | ✅ Yes | `trendradar-data` |
+| `S3_ACCESS_KEY_ID` | Access key ID | ✅ Yes | `abc123...` |
+| `S3_SECRET_ACCESS_KEY` | Access key | ✅ Yes | `xyz789...` |
+| `S3_ENDPOINT_URL` | S3 API endpoint | ✅ Yes | `https://<account-id>.r2.cloudflarestorage.com` |
+| `S3_REGION` | Region (optional) | ❌ No | `auto` |
+
+**Configuration Method**:
+- GitHub Actions: Configure in GitHub Secrets (see [Quick Start - Remote Storage Configuration](#2-setup-github-secrets-required--optional-platforms))
+- Docker/Local: Configure in the `.env` file (remote storage is optional)
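As an illustration of how these variables compose, here is a hedged sketch that builds an S3-compatible client from them and uploads one day's database. boto3 and the `news/<date>.db` object key are assumptions for the example; the key format merely matches the `news/2025-11-10.db` illustration in the cleanup rules below.

```python
# Sketch: building an S3-compatible client from the variables in the table above.
# boto3 is an assumption for illustration; any S3-compatible SDK would do.
import os
import boto3  # pip install boto3

s3 = boto3.client(
    "s3",
    endpoint_url=os.environ["S3_ENDPOINT_URL"],
    aws_access_key_id=os.environ["S3_ACCESS_KEY_ID"],
    aws_secret_access_key=os.environ["S3_SECRET_ACCESS_KEY"],
    region_name=os.environ.get("S3_REGION", "auto"),  # optional; R2 accepts "auto"
)
# Example: push one day's database to the bucket (object key is illustrative).
s3.upload_file("output/2025-12-09/news.db",
               os.environ["S3_BUCKET_NAME"], "news/2025-12-09.db")
```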
+
+---
+
+#### Data Cleanup Strategy
+
+v4.0.0 added an automatic data cleanup feature, supporting scheduled cleanup of old data:
+
+**Configuration Items**: `LOCAL_RETENTION_DAYS` and `REMOTE_RETENTION_DAYS`
+
+| Configuration Value | Description |
+|---------------------|-------------|
+| `0` (default) | Disable cleanup, keep all data |
+| Positive integer (e.g., `30`) | Only keep the most recent N days of data, auto-delete old data |
+
+**Configuration Method**:
+```bash
+# GitHub Actions: Configure in GitHub Secrets
+LOCAL_RETENTION_DAYS=30
+REMOTE_RETENTION_DAYS=30
+
+# Docker: Configure in .env file
+LOCAL_RETENTION_DAYS=30
+REMOTE_RETENTION_DAYS=30
+
+# Local: Add to environment variables
+export LOCAL_RETENTION_DAYS=30
+```
+
+**Cleanup Rules**:
+- Cleanup executes during each crawl task
+- Local: Deletes `output/YYYY-MM-DD/` directories older than N days
+- Remote: Deletes cloud objects older than N days (e.g., `news/2025-11-10.db`)
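A minimal sketch of the local half of these rules, assuming ISO-dated directories under `output/` as described above:

```python
# Sketch: delete output/YYYY-MM-DD/ directories older than LOCAL_RETENTION_DAYS.
import os
import shutil
from datetime import date, timedelta
from pathlib import Path

retention = int(os.environ.get("LOCAL_RETENTION_DAYS", "0"))
if retention > 0:  # 0 (the default) disables cleanup entirely
    cutoff = date.today() - timedelta(days=retention)
    for day_dir in Path("output").iterdir():
        try:
            day = date.fromisoformat(day_dir.name)
        except ValueError:
            continue  # skip non-dated entries such as index.html
        if day < cutoff and day_dir.is_dir():
            shutil.rmtree(day_dir)
```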
+
+---
+
+#### Timezone Configuration
+
+v4.0.0 added timezone configuration support, using IANA standard time zone names:
+
+**Configuration Item**: `TIMEZONE`
+
+| Configuration Value | Description | Example |
+|---------------------|-------------|---------|
+| Not set (default) | Use UTC+0 | - |
+| IANA time zone name | Specify a time zone | `Asia/Shanghai`, `America/New_York`, `Europe/London` |
+
+**Configuration Method**:
+```bash
+# GitHub Actions: Configure in GitHub Secrets
+TIMEZONE=Asia/Shanghai
+
+# Docker: Configure in .env file
+TIMEZONE=Asia/Shanghai
+
+# Local: Add to environment variables
+export TIMEZONE=Asia/Shanghai
+```
+
+**Common IANA Time Zones**:
+- China: `Asia/Shanghai`
+- US East: `America/New_York`
+- US West: `America/Los_Angeles`
+- United Kingdom: `Europe/London`
+- Japan: `Asia/Tokyo`
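The `TIMEZONE` value plugs directly into the IANA database via `zoneinfo` (standard library from Python 3.9). A small sketch of the difference it makes to rendered timestamps:

```python
# Sketch: how an IANA TIMEZONE value changes rendered timestamps.
import os
from datetime import datetime, timezone
from zoneinfo import ZoneInfo

tz_name = os.environ.get("TIMEZONE")  # unset -> UTC+0, per the table above
tz = ZoneInfo(tz_name) if tz_name else timezone.utc
print(datetime.now(tz).strftime("%Y-%m-%d %H:%M %Z"))
# TIMEZONE=Asia/Shanghai prints Beijing time; leaving it unset prints UTC
```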
+
+---
+
+#### Breaking Changes (v4.0.0)
+
+**⚠️ Important Notice**: v4.0.0 made breaking changes to the database structure; **old databases are incompatible**.
+
+**Impact**:
+- Cannot directly read v3.x data
+- Need to re-crawl data to build the new database
+- **No automatic migration tool is provided**
+
+**Recommendations**:
+1. **Fresh Start**: Recommended; start from scratch to accumulate data
+2. **Keep Historical Data**: If you need to preserve v3.x historical data, rename the old `output/` directory (e.g., to `output_v3_backup/`) before running the new version
+
+**Data Format Comparison**:
+
+| Item | v3.x | v4.0.0 |
+|------|------|--------|
+| File path format | `output/2025年12月09日/` | `output/2025-12-09/` |
+| Data file | Multiple `news_HH-MM.txt` files | Single `news.db` file |
+| Database fields | Contains `source_name`, `crawl_date` | Redundant fields removed |
+| Platform management | No independent table | Added `platforms` table |
+| Crawl status | Comma-separated string | Normalized `crawl_source_status` table |
+
+</details>
+
+<br>

 ## 🤖 AI Analysis

 TrendRadar v3.0.0 added **MCP (Model Context Protocol)** based AI analysis feature, allowing natural language conversations with news data for deep analysis.
(English MCP FAQ document; file header not captured in this view)

@@ -450,7 +450,89 @@ AI: (date_range={"start": "2024-12-01", "end": "2024-12-31"})

 ---

-### Q14: How to parse natural language date expressions? (Recommended to use first)
+## Storage Sync
+
+### Q14: How to sync data from remote storage to local?
+
+**You can ask like this:**
+
+- "Sync last 7 days data from remote"
+- "Pull data from remote storage to local"
+- "Sync last 30 days of news data"
+
+**Tool called:** `sync_from_remote`
+
+**Use cases:**
+
+- Crawler deployed in the cloud (e.g., GitHub Actions), data stored remotely (e.g., Cloudflare R2)
+- MCP Server deployed locally, needs to pull data from remote for analysis
+
+**Return information:**
+
+- synced_files: Number of successfully synced files
+- synced_dates: List of successfully synced dates
+- skipped_dates: Skipped dates (already exist locally)
+- failed_dates: Failed dates and error information
+
+**Prerequisites:**
+
+Remote storage must be configured in `config/config.yaml` or via environment variables:
+- `S3_ENDPOINT_URL`: Service endpoint
+- `S3_BUCKET_NAME`: Bucket name
+- `S3_ACCESS_KEY_ID`: Access key ID
+- `S3_SECRET_ACCESS_KEY`: Secret access key
+
+---
+
+### Q15: How to view storage status?
+
+**You can ask like this:**
+
+- "View current storage status"
+- "What's the storage configuration"
+- "How much data is stored locally"
+- "Is remote storage configured"
+
+**Tool called:** `get_storage_status`
+
+**Return information:**
+
+| Category | Information |
+|----------|-------------|
+| **Local Storage** | Data directory, total size, date count, date range |
+| **Remote Storage** | Whether configured, endpoint URL, bucket name, date count |
+| **Pull Config** | Whether auto-pull is enabled, pull days |
+
+---
+
+### Q16: How to view available data dates?
+
+**You can ask like this:**
+
+- "What dates are available locally"
+- "What dates are in remote storage"
+- "Compare local and remote data dates"
+- "Which dates only exist remotely"
+
+**Tool called:** `list_available_dates`
+
+**Three query modes:**
+
+| Mode | Description | Example Question |
+|------|-------------|------------------|
+| **local** | View local only | "What dates are available locally" |
+| **remote** | View remote only | "What dates are in remote" |
+| **both** | Compare both (default) | "Compare local and remote data" |
+
+**Return information (both mode):**
+
+- only_local: Dates existing only locally
+- only_remote: Dates existing only remotely (useful for deciding which dates to sync)
+- both: Dates existing in both places
+
+---
+
+### Q17: How to parse natural language date expressions? (Recommended to use first)

 **You can ask like this:**
(Chinese MCP FAQ document, the same change as above; file header not captured in this view)

@@ -450,7 +450,89 @@ AI:(date_range={"start": "2024-12-01", "end": "2024-12-31"})

 ---

-### Q14: 如何解析自然语言日期表达式?(推荐优先使用)
+## 存储同步
+
+### Q14: 如何从远程存储同步数据到本地?
+
+**你可以这样问:**
+
+- "从远程同步最近 7 天的数据"
+- "拉取远程存储的数据到本地"
+- "同步最近 30 天的新闻数据"
+
+**调用的工具:** `sync_from_remote`
+
+**使用场景:**
+
+- 爬虫部署在云端(如 GitHub Actions),数据存储到远程(如 Cloudflare R2)
+- MCP Server 部署在本地,需要从远程拉取数据进行分析
+
+**返回信息:**
+
+- synced_files: 成功同步的文件数量
+- synced_dates: 成功同步的日期列表
+- skipped_dates: 跳过的日期(本地已存在)
+- failed_dates: 失败的日期及错误信息
+
+**前提条件:**
+
+需要在 `config/config.yaml` 中配置远程存储或设置环境变量:
+- `S3_ENDPOINT_URL`: 服务端点
+- `S3_BUCKET_NAME`: 存储桶名称
+- `S3_ACCESS_KEY_ID`: 访问密钥 ID
+- `S3_SECRET_ACCESS_KEY`: 访问密钥
+
+---
+
+### Q15: 如何查看存储状态?
+
+**你可以这样问:**
+
+- "查看当前存储状态"
+- "存储配置是什么"
+- "本地有多少数据"
+- "远程存储配置了吗"
+
+**调用的工具:** `get_storage_status`
+
+**返回信息:**
+
+| 类别 | 信息 |
+|------|------|
+| **本地存储** | 数据目录、总大小、日期数量、日期范围 |
+| **远程存储** | 是否配置、端点地址、存储桶名称、日期数量 |
+| **拉取配置** | 是否启用自动拉取、拉取天数 |
+
+---
+
+### Q16: 如何查看可用的数据日期?
+
+**你可以这样问:**
+
+- "本地有哪些日期的数据"
+- "远程存储有哪些日期"
+- "对比本地和远程的数据日期"
+- "哪些日期只在远程有"
+
+**调用的工具:** `list_available_dates`
+
+**三种查询模式:**
+
+| 模式 | 说明 | 示例问法 |
+|------|------|---------|
+| **local** | 仅查看本地 | "本地有哪些日期" |
+| **remote** | 仅查看远程 | "远程有哪些日期" |
+| **both** | 对比两者(默认) | "对比本地和远程的数据" |
+
+**返回信息(both 模式):**
+
+- only_local: 仅本地存在的日期
+- only_remote: 仅远程存在的日期(可用于决定同步哪些日期)
+- both: 两边都存在的日期
+
+---
+
+### Q17: 如何解析自然语言日期表达式?(推荐优先使用)

 **你可以这样问:**
394
README.md
394
README.md
@ -1,6 +1,6 @@
|
|||||||
<div align="center" id="trendradar">
|
<div align="center" id="trendradar">
|
||||||
|
|
||||||
> **📢 公告:** 经与 GitHub 官方沟通,完成合规调整后将恢复"一键 Fork 部署",请关注 **v4.0.0** 版本的更新
|
> **📢 公告:** **v4.0.0** 版本已发布!包含存储架构重构、数据库优化、模块化改进等重大更新
|
||||||
|
|
||||||
<a href="https://github.com/sansan0/TrendRadar" title="TrendRadar">
|
<a href="https://github.com/sansan0/TrendRadar" title="TrendRadar">
|
||||||
<img src="/_image/banner.webp" alt="TrendRadar Banner" width="80%">
|
<img src="/_image/banner.webp" alt="TrendRadar Banner" width="80%">
|
||||||
@ -16,8 +16,8 @@
|
|||||||
[](https://github.com/sansan0/TrendRadar/stargazers)
|
[](https://github.com/sansan0/TrendRadar/stargazers)
|
||||||
[](https://github.com/sansan0/TrendRadar/network/members)
|
[](https://github.com/sansan0/TrendRadar/network/members)
|
||||||
[](LICENSE)
|
[](LICENSE)
|
||||||
[](https://github.com/sansan0/TrendRadar)
|
[](https://github.com/sansan0/TrendRadar)
|
||||||
[](https://github.com/sansan0/TrendRadar)
|
[](https://github.com/sansan0/TrendRadar)
|
||||||
|
|
||||||
[](https://work.weixin.qq.com/)
|
[](https://work.weixin.qq.com/)
|
||||||
[](https://weixin.qq.com/)
|
[](https://weixin.qq.com/)
|
||||||
@ -48,62 +48,61 @@
 <br>

 <details>
-<summary>🚨 <strong>[Must read] Important notice: how to deploy this project correctly</strong></summary>
+<summary>🚨 <strong>[Must read] Important notice: v4.0.0 deployment and storage architecture changes</strong></summary>

 <br>

-> **⚠️ Urgent notice, December 2025**
->
-> A surge of forks has put heavy load on GitHub's servers, and **GitHub Actions and GitHub Pages deployments are currently restricted.** Please read the notes below before deploying.
+### 🛠️ Pick the deployment option that fits you

-### 1. ✅ The only recommended deployment: Docker
+#### 🅰️ Option A: Docker deployment (recommended 🔥)

-**This is currently the most stable option and is not subject to GitHub limits.** Data stays local and will not break when GitHub policy changes.
+* **Highlights**: the most stable and simplest option; data lives in a **local SQLite** database, fully under your control.
+* **Best for**: your own server, a NAS, or a machine that stays on long-term.

 * 👉 [Jump to the Docker deployment guide](#6-docker-部署)

 ---

-### 2. If you were planning to fork this project...
+#### 🅱️ Option B: GitHub Actions deployment (restored ✅)

-To reduce load on GitHub's servers, **please do not click the "Fork" button!**
-Use **"Use this template"** instead of forking:
+* **Highlights**: data is no longer committed into the repository; it is stored in **remote cloud storage** (S3-compatible protocol: Cloudflare R2, Alibaba Cloud OSS, Tencent Cloud COS, etc.).
+* **Requirement**: you **must** configure an S3-compatible object store (the free Cloudflare R2 is recommended).

+> **⚠️ Note**: if you choose this option, complete both configuration steps below:

+#### 1. 🚀 Recommended starting point: Use this template

+To keep your repository clean and avoid inheriting redundant history, I **recommend** the template flow:

 1. **Click** the green **[Use this template]** button at the top right of the original repository page.
 2. **Choose** "Create a new repository".

-**Why do it this way?**
-* **❌ Fork**: copies the full history; large numbers of forks running at once trip GitHub's abuse controls.
-* **✅ Use this template**: creates a brand-new standalone repository with no historical baggage, which is friendlier to the servers.
+> **💡 Why do it this way?**
+> * **Use this template**: creates a brand-new, clean repository with no historical baggage.
+> * **Fork**: keeps the full commit history and upstream links, consuming more GitHub resources.

----

-### 3. About the new data storage
+#### 2. ☁️ Remote storage required for GitHub Actions

-The new version will store news data in **Cloudflare R2** to keep it persistent.
+If you choose **Option B (GitHub Actions)**, you must configure an S3-compatible object store.

+**Supported storage services:**
+- **Cloudflare R2** (recommended; the free tier is generous)
+- any other S3-compatible service

-**⚠️ Prerequisites:**
+**⚠️ Prerequisites (using Cloudflare R2 as the example):**

 Per Cloudflare's platform rules, enabling R2 requires a payment method on file.

-- **Purpose:** identity verification only (verify only); nothing is charged.
-- **Payment:** credit card or China-region PayPal.
-- **Usage:** R2's free tier easily covers this project's daily usage; no payment needed.
+* **Purpose**: identity verification only (verify only); **nothing is charged**.
+* **Payment**: dual-currency credit card or China-region PayPal.
+* **Usage**: R2's free tier (10 GB storage per month) easily covers this project's daily usage; no need to worry about charges.

----
-### 4. 📅 Roadmap and notes on reading the docs
-> **Planned next:**
-> - Explore a new approach: keep Actions for crawling and pushing, but stop saving data into the repository and use external storage instead.
-**⚠️ Reading note:**
-Since the plan above means **fork-based deployment may return in a new form**, and rewriting all docs now would be a large effort, the old wording is kept for the moment.
-**For now, whenever "Fork" appears in the guides below, ignore it or read it as "Use this template"**.
-👉 **[Click here for the latest official TrendRadar docs](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)**
+👉 **[See the detailed configuration guide](#-快速开始)**

 </details>
@ -335,10 +334,30 @@
 - ⚠️ **Paired settings**: Telegram and ntfy need matching counts of paired parameters (e.g. 2 tokens and 2 chat_ids)
 - ⚠️ **Account limit**: by default each channel supports at most 3 accounts; extras are truncated

-### **Multi-platform support**
-- **GitHub Pages**: auto-generated polished web reports, desktop/mobile friendly
-- **Docker deployment**: multi-architecture containers
-- **Data persistence**: HTML/TXT history in multiple formats
+### **Flexible storage architecture** (major v4.0.0 update)
+
+**Multiple storage backends**:
+- ☁️ **Remote cloud storage**: default in GitHub Actions; S3-compatible protocol (R2/OSS/COS, etc.); data lives in the cloud and stays out of the repository
+- 💾 **Local SQLite database**: default for Docker/local runs; data fully under your control
+- 🔄 **Automatic backend selection**: picks the storage backend based on the runtime environment
+
+**Data formats**:
+
+| Format | Role | Notes |
+|------|------|------|
+| **SQLite** | primary store | single-file database, fast queries, powers MCP AI analysis |
+| **TXT** | optional snapshot | readable plain text for quick inspection |
+| **HTML** | report output | polished visual pages, desktop/mobile friendly |
+
+**Data management**:
+- ✅ automatic cleanup of expired data (configurable retention days)
+- ✅ timezone support (worldwide)
+
+> 💡 See [Configuration - storage settings](#9-存储配置) for details
+
+### **Multi-environment deployment**
+- **GitHub Actions**: scheduled crawling + remote cloud storage (check-in renewal required)
+- **Docker**: multi-architecture containers, local data storage
+- **Local runs**: Windows/Mac/Linux

 ### **AI-powered analysis (added in v3.0.0)**
@ -389,10 +408,34 @@ Works from a one-click GitHub fork; no programming background needed.
 > **Upgrade notes**:
 - **📌 Latest updates**: **[upstream changelog](https://github.com/sansan0/TrendRadar?tab=readme-ov-file#-更新日志)**
 - **Tip**: do not update via **Sync fork**; check the history entries for the exact upgrade path and feature content
-- **Minor upgrades**: from v2.x to v2.y, replace `main.py` in your fork with this project's copy
 - **Major upgrades**: from v1.x to v2.y, delete your fork and re-fork; it is less work and avoids config conflicts

+### 2025/12/13 - v4.0.0
+
+**🎉 Major release: storage and core architecture rebuilt**
+
+- **Multiple storage backends**: a new storage module supports local SQLite and remote cloud storage (S3-compatible; the free Cloudflare R2 is recommended), covering GitHub Actions, Docker, and local runs.
+- **Database schema rework**: the SQLite schema was restructured for better data efficiency and query power.
+- **Core code modularization**: the main program was split into modules under the trendradar package, greatly improving maintainability.
+- **Enhancements**: standardized date formats, data retention policies, timezone support, better time display, and a fix for remote-storage persistence so data merges stay accurate.
+- **Cleanup and compatibility**: most legacy compatibility code was removed; data storage and reads are now unified.
+
+### 2025/12/13 - mcp-v1.1.0
+
+**MCP module updates:**
+- works with v4.0.0 while staying compatible with v3.x data
+- new storage-sync tools:
+  - `sync_from_remote`: pull data from remote storage to local
+  - `get_storage_status`: report storage configuration and status
+  - `list_available_dates`: list available date ranges locally/remotely
+
+<details>
+<summary>👉 Expand: <strong>history</strong></summary>

 ### 2025/12/03 - v3.5.0

 **🎉 Core feature enhancements**
@ -456,10 +499,6 @@
 - tool count grew from 13 to 14

-<details>
-<summary>👉 Expand: <strong>history</strong></summary>

 ### 2025/11/28 - v3.4.1

 **🔧 Formatting fixes**
@ -857,11 +896,44 @@ frequency_words.txt gained a "required word" feature using the + sign

 > **📖 Reminder**: fork users should first **[check the latest official docs](https://github.com/sansan0/TrendRadar?tab=readme-ov-file)** to make sure the setup steps are current.

+### ⚠️ GitHub Actions usage notes
+
+**Important v4.0.0 change**: an "activity check" was introduced; GitHub Actions now needs periodic check-ins to keep running.
+
+#### 🔄 Check-in renewal
+
+- **Cycle**: each cycle is valid for **7 days**; when the countdown ends the service suspends itself.
+- **Renewal**: manually trigger the "Check In" workflow on the Actions page to reset the 7-day window.
+- **Path**: `Actions` → `Check In` → `Run workflow`
+- **Rationale**:
+  - If you forget to check in for 7 straight days, maybe this feed isn't essential for you. A timely pause helps you step out of the stream and gives your mind room to breathe.
+  - GitHub Actions is a valuable shared compute resource. The check-in exists to avoid idle churn and keep capacity for users who actively need it. Thanks for your understanding.
+
+#### 📦 Data storage (required setup)
+
+In GitHub Actions, data is stored in **remote cloud storage** (S3-compatible; the free Cloudflare R2 is recommended) and never pollutes the repository (see **Required setup: remote cloud storage** below)
+
+#### 🚀 Recommended: Docker deployment
+
+For long-term stable operation, use [Docker deployment](#6-docker-部署): data stays local and no check-in is needed, though you do have to pay for your own server.
+
+<br>
+
+> 🎉 **Now supported: multiple cloud storage options**
+>
+> The project speaks the S3-compatible protocol, so you can choose:
+> - **Cloudflare R2** (recommended; the free tier is generous)
+> - any other S3-compatible storage service
+>
+> Switching is just a matter of setting `S3_ENDPOINT_URL`, `S3_BUCKET_NAME`, and the related environment variables.

 ---

 1. **Fork this project** to your GitHub account
    - click the "Fork" button at the top right of this page

-2. **Set GitHub Secrets (pick the platforms you need)**:
+2. **Set GitHub Secrets (required + optional platforms)**:

 In your forked repository, go to `Settings` > `Secrets and variables` > `Actions` > `New repository secret`
@ -900,6 +972,53 @@ frequency_words.txt gained a "required word" feature using the + sign

 <br>

+<details>
+<summary>⚠️ <strong>Required setup: remote cloud storage</strong> (mandatory for GitHub Actions; Cloudflare R2 recommended)</summary>
+<br>
+
+**GitHub Secret settings (⚠️ all 4 entries are required):**
+
+| Name | Secret (value) |
+|-------------|-----------------|
+| `S3_BUCKET_NAME` | bucket name (e.g. `trendradar-data`) |
+| `S3_ACCESS_KEY_ID` | Access Key ID |
+| `S3_SECRET_ACCESS_KEY` | Secret Access Key |
+| `S3_ENDPOINT_URL` | S3 API endpoint (for R2: `https://<account-id>.r2.cloudflarestorage.com`) |
+
+<br>
+
+**Getting the credentials (Cloudflare R2 example):**
+
+1. **Open the R2 overview**:
+   - Sign in to the [Cloudflare Dashboard](https://dash.cloudflare.com/).
+   - In the left sidebar, click `R2 Object Storage`.
+
+<br>
+
+2. **Create a bucket**:
+   - Click `Overview`
+   - Click `Create bucket` at the top right.
+   - Enter a name (e.g. `trendradar-data`) and click `Create bucket`.
+
+<br>
+
+3. **Create an API token**:
+   - Return to the **Overview** page.
+   - Under `Account Details` at the **bottom right**, click `Manage` (Manage R2 API Tokens).
+   - You will also see the `S3 API` endpoint `https://<account-id>.r2.cloudflarestorage.com` there (this is S3_ENDPOINT_URL)
+   - Click `Create Account API Token`.
+   - **⚠️ Key settings**:
+     - **Token name**: anything you like (e.g. `github-action-write`).
+     - **Permissions**: choose `Admin Read & Write`.
+     - **Bucket scope**: for safety, pick `Apply to specific buckets only` and select your bucket (e.g. `trendradar-data`).
+   - Click `Create API Token` and **immediately copy** the displayed `Access Key ID` and `Secret Access Key` (shown only once!).
+
+<br>
+
+- **R2 free tier**: 10 GB of storage plus 1M reads per month, plenty for this project.
+- **Payment verification**: even on the free tier, Cloudflare requires a PayPal account or credit card for identity verification (no actual charges unless you exceed the quota).
+
+</details>

 <details>
 <summary>👉 Expand: <strong>WeCom (WeChat Work) bot</strong> (simplest and fastest to configure)</summary>
@ -1489,10 +1608,11 @@ frequency_words.txt gained a "required word" feature using the + sign

 **Test steps**:
 1. Open your project's Actions page
-2. Click into **"Hot News Crawler"**
+2. Click into **"Get Hot News"** (it must be exactly this name), then click the **"Run workflow"** button on the right
    - If you don't see it, follow [#109](https://github.com/sansan0/TrendRadar/issues/109)
-3. Click the **"Run workflow"** button on the right
-4. Wait about 1 minute; the message is pushed to your configured platforms
+3. In about 3 minutes the message is pushed to your configured platforms
+
+<br>

 > ⏱️ **Testing tips**:
 > - don't trigger manual tests too often, to avoid hitting GitHub Actions limits
@ -2069,7 +2189,7 @@ TrendRadar provides two separate Docker images; deploy whichever you need:

 # Download the docker compose configuration
 wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/.env -P docker/
-wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker compose.yml -P docker/
+wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker-compose.yml -P docker/
 ```

 > 💡 **Note**: the key directory layout for Docker deployment:
@ -2080,7 +2200,7 @@ TrendRadar provides two separate Docker images; deploy whichever you need:
 │   └── frequency_words.txt
 └── docker/
     ├── .env
-    └── docker compose.yml
+    └── docker-compose.yml
 ```

 2. **Configuration notes**:
@ -2174,7 +2294,7 @@ vim config/frequency_words.txt

 # Use the build variant of docker compose
 cd docker
-cp docker compose-build.yml docker compose.yml
+cp docker-compose-build.yml docker-compose.yml
 ```

 **Build and start the services**:
@ -2260,7 +2380,7 @@ docker rm trend-radar

 > 💡 **Web server notes**:
 > - once started, open `http://localhost:8080` in a browser to see the latest report
-> - browse historical reports via directory navigation (e.g. `http://localhost:8080/2025年xx月xx日/`)
+> - browse historical reports via directory navigation (e.g. `http://localhost:8080/2025-xx-xx/`)
 > - the port is configurable via `WEBSERVER_PORT` in the `.env` file
 > - auto-start: set `ENABLE_WEBSERVER=true` in `.env`
 > - security: serves static files only, restricted to the output directory, bound to localhost only
@ -2277,7 +2397,7 @@ TrendRadar saves the daily summary HTML report to two places at once:

 |---------|---------|---------|
 | `output/index.html` | direct host access | **Docker deployment** (visible on the host via the volume mount) |
 | `index.html` | repository root | **GitHub Pages** (repo root; Pages picks it up automatically) |
-| `output/YYYY年MM月DD日/html/当日汇总.html` | historical reports | all environments (archived by date) |
+| `output/YYYY-MM-DD/html/当日汇总.html` | historical reports | all environments (archived by date) |

 **Local access examples**:
 ```bash
@ -2286,8 +2406,8 @@
 docker exec -it trend-radar python manage.py start_webserver
 # 2. Open in a browser
 http://localhost:8080                      # latest report (index.html by default)
-http://localhost:8080/2025年xx月xx日/       # report for a given date
-http://localhost:8080/2025年xx月xx日/html/  # browse that date's HTML files
+http://localhost:8080/2025-xx-xx/          # report for a given date
+http://localhost:8080/2025-xx-xx/html/     # browse that date's HTML files

 # Option 2: open the files directly (local runs)
 open ./output/index.html      # macOS
@ -2295,7 +2415,7 @@ start ./output/index.html     # Windows
 xdg-open ./output/index.html  # Linux

 # Option 3: open the historical archive
-open ./output/2025年xx月xx日/html/当日汇总.html
+open ./output/2025-xx-xx/html/当日汇总.html
 ```

 **Why are there two index.html files?**
@ -2349,34 +2469,42 @@ flowchart TB

 **Quick start**:

-Use docker compose to start the news pusher and the MCP service together:
+If you already deployed via [Option 1: docker compose](#方式一使用-docker-compose推荐), just start the MCP service:

 ```bash
-# Download the latest docker compose.yml (already includes the MCP service)
-wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker compose.yml
-
-# Start all services
-docker compose up -d
+cd TrendRadar/docker
+docker compose up -d trend-radar-mcp

 # Check the status
-docker ps | grep trend-radar
+docker ps | grep trend-radar-mcp
 ```

-**Start the MCP service on its own**:
+**Start the MCP service on its own** (without docker compose):

 ```bash
+# Linux/Mac
 docker run -d --name trend-radar-mcp \
   -p 127.0.0.1:3333:3333 \
-  -v ./config:/app/config:ro \
-  -v ./output:/app/output:ro \
+  -v $(pwd)/config:/app/config:ro \
+  -v $(pwd)/output:/app/output:ro \
   -e TZ=Asia/Shanghai \
   wantcat/trendradar-mcp:latest
+
+# Windows PowerShell
+docker run -d --name trend-radar-mcp `
+  -p 127.0.0.1:3333:3333 `
+  -v ${PWD}/config:/app/config:ro `
+  -v ${PWD}/output:/app/output:ro `
+  -e TZ=Asia/Shanghai `
+  wantcat/trendradar-mcp:latest
 ```

+> ⚠️ **Note**: when running standalone, make sure the current directory contains `config/` and `output/` folders with the config files and news data.

 **Verify the service**:

 ```bash
-# Check that the MCP service is running
+# Check the MCP service health
 curl http://127.0.0.1:3333/mcp

 # Tail the MCP service logs
@ -2385,14 +2513,20 @@ docker logs -f trend-radar-mcp

 **Configure it in an AI client**:

-Once the MCP service is up, configure it in Claude Desktop, Cherry Studio, Cursor, etc.:
+Once the MCP service is up, configure it per client:

+**Cherry Studio** (recommended, GUI setup):
+- Settings → MCP Servers → Add
+- Type: `streamableHttp`
+- URL: `http://127.0.0.1:3333/mcp`
+
+**Claude Desktop / Cline** (JSON config):
 ```json
 {
   "mcpServers": {
     "trendradar": {
       "url": "http://127.0.0.1:3333/mcp",
-      "description": "TrendRadar 新闻热点分析"
+      "type": "streamableHttp"
     }
   }
 }
```
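To confirm the wiring works end to end you can also call the server programmatically. A hedged sketch using the `fastmcp` client package (an assumption on my part; the client API may differ across versions, so treat this as a shape, not the definitive call sequence):

```python
import asyncio

from fastmcp import Client  # assumed client library; adjust to whatever MCP client you use


async def main() -> None:
    # Connect over streamable HTTP, same URL as in the client configs above
    async with Client("http://127.0.0.1:3333/mcp") as client:
        tools = await client.list_tools()
        print([t.name for t in tools])  # expect the 16 TrendRadar tools


asyncio.run(main())
```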
@ -2480,7 +2614,6 @@ notification:
     start: "20:00"  # window start (Beijing time)
     end: "22:00"    # window end (Beijing time)
   once_per_day: true  # push only once per day
-  push_record_retention_days: 7  # days to keep push records
 ```

 #### Configuration reference
@ -2491,7 +2624,6 @@ notification:
 | `time_range.start` | string | `"20:00"` | push window start (Beijing time, HH:MM) |
 | `time_range.end` | string | `"22:00"` | push window end (Beijing time, HH:MM) |
 | `once_per_day` | bool | `true` | `true` = push once per day inside the window; `false` = push on every run inside the window |
-| `push_record_retention_days` | int | `7` | days to keep push records (used to decide whether a push already happened) |

 #### Usage scenarios
@ -2515,7 +2647,6 @@ PUSH_WINDOW_ENABLED=true
 PUSH_WINDOW_START=09:00
 PUSH_WINDOW_END=18:00
 PUSH_WINDOW_ONCE_PER_DAY=false
-PUSH_WINDOW_RETENTION_DAYS=7
 ```

 #### Full configuration examples
@ -2530,7 +2661,6 @@ notification:
     start: "20:00"
     end: "22:00"
   once_per_day: true
-  push_record_retention_days: 7
 ```

 **Scenario: hourly pushes during work hours**
@ -2543,7 +2673,6 @@ notification:
     start: "09:00"
     end: "18:00"
   once_per_day: false
-  push_record_retention_days: 7
 ```

 </details>
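The window check itself is simple clock arithmetic; a sketch of the decision these settings drive (illustrative only; the real implementation also persists push records to know whether today's push already happened):

```python
from datetime import datetime, time


def should_push(now: datetime, start: str, end: str,
                once_per_day: bool, already_pushed_today: bool) -> bool:
    """Window check plus the once_per_day gate described above."""
    in_window = time.fromisoformat(start) <= now.time() <= time.fromisoformat(end)
    if not in_window:
        return False
    if once_per_day and already_pushed_today:
        return False
    return True
```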
@ -2829,6 +2958,123 @@ notification:

 </details>

+### 11. Storage configuration
+
+<details id="storage-config">
+<summary>👉 Expand: <strong>storage architecture configuration</strong></summary>
+<br>
+
+#### Choosing a storage backend
+
+**Location**: the `storage` section of `config/config.yaml`
+
+v4.0.0 reworked the storage architecture and supports several backends:
+
+```yaml
+storage:
+  backend: auto  # backend: auto (pick automatically) / local (local SQLite) / remote (remote cloud storage)
+
+  formats:
+    sqlite: true  # enable SQLite storage
+    txt: true     # generate TXT snapshots
+    html: true    # generate HTML reports
+
+  local:
+    data_dir: "output"  # local storage directory
+    retention_days: 0   # days to keep local data; 0 keeps it forever
+
+  remote:
+    endpoint_url: ""       # S3 API endpoint
+    bucket_name: ""        # bucket name
+    access_key_id: ""      # access key ID
+    secret_access_key: ""  # secret access key
+    region: ""             # region (optional)
+    retention_days: 0      # days to keep remote data; 0 keeps it forever
+
+  pull:
+    enabled: false  # pull data from remote on startup
+    days: 7         # pull the most recent N days
+```
+
+#### Backend selection strategy
+
+| backend value | Meaning | Use case |
+|-----------|------|---------|
+| `auto` | **automatic** (recommended) | picks by runtime environment:<br>• GitHub Actions → remote<br>• Docker/local → local |
+| `local` | local SQLite database | Docker deployments, local development |
+| `remote` | remote cloud storage (S3-compatible, e.g. Cloudflare R2) | GitHub Actions, multi-machine sync |
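A rough sketch of what `auto` selection could look like (illustrative only; the real logic lives in the trendradar storage module and may differ). `GITHUB_ACTIONS` is the environment variable GitHub sets in its runners:

```python
import os


def pick_backend(configured: str = "auto") -> str:
    """Hypothetical sketch of the auto strategy described in the table above."""
    if configured in ("local", "remote"):
        return configured  # an explicit setting wins
    s3_ready = all(
        os.environ.get(k)
        for k in ("S3_ENDPOINT_URL", "S3_BUCKET_NAME",
                  "S3_ACCESS_KEY_ID", "S3_SECRET_ACCESS_KEY")
    )
    # GitHub Actions runners set GITHUB_ACTIONS=true
    if os.environ.get("GITHUB_ACTIONS") == "true" and s3_ready:
        return "remote"
    return "local"
```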
+#### Remote cloud storage settings
+
+**Environment variables** (recommended):
+
+```bash
+# GitHub Actions / Docker environment variables
+STORAGE_BACKEND=remote  # or auto
+
+# Local/remote retention in days (0 = keep forever)
+LOCAL_RETENTION_DAYS=0
+REMOTE_RETENTION_DAYS=0
+
+# S3-compatible storage settings (Cloudflare R2 shown)
+S3_BUCKET_NAME=your-bucket-name
+S3_ACCESS_KEY_ID=your-access-key-id
+S3_SECRET_ACCESS_KEY=your-secret-access-key
+S3_ENDPOINT_URL=https://<account-id>.r2.cloudflarestorage.com
+S3_REGION=auto
+
+# Pull settings (optional; sync remote data to local)
+PULL_ENABLED=false
+PULL_DAYS=7
+```
+
+**Getting credentials**: see [Quick start - remote storage setup](#-快速开始)
+
+#### Data cleanup policy
+
+**Automatic cleanup**: at the end of every run, data older than the retention window is deleted.
+
+```yaml
+storage:
+  local:
+    retention_days: 30   # keep the last 30 days locally
+  remote:
+    retention_days: 30   # keep the last 30 days remotely
+```
+
+**Cleanup behavior**:
+- local storage: expired date folders are removed (e.g. `output/2025-11-10/`)
+- remote storage: expired cloud objects are batch-deleted (e.g. `news/2025-11-10.db`)
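The local half of that policy amounts to comparing each date-named folder against a cutoff. A minimal sketch, under the assumption that folders use the ISO `YYYY-MM-DD` names mentioned above (not the project's actual cleanup code):

```python
import shutil
from datetime import date, datetime, timedelta
from pathlib import Path


def clean_local(data_dir: Path, retention_days: int) -> None:
    """Delete date folders older than the retention window; 0 disables cleanup."""
    if retention_days <= 0:
        return
    cutoff = date.today() - timedelta(days=retention_days)
    for folder in data_dir.iterdir():
        if not folder.is_dir():
            continue
        try:
            folder_date = datetime.strptime(folder.name, "%Y-%m-%d").date()
        except ValueError:
            continue  # skip folders that are not date-named
        if folder_date < cutoff:
            shutil.rmtree(folder)
```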
+#### Timezone configuration (new in v4.0.0)
+
+**Worldwide timezone support**: fixes push-window timing for users outside China.
+
+```yaml
+app:
+  timezone: "Asia/Shanghai"  # default: China Standard Time
+  # Other examples:
+  # timezone: "America/Los_Angeles"  # US Pacific
+  # timezone: "Europe/London"        # UK
+```
+
+**All IANA timezone names are supported**: [timezone list](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones)
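Since the value is an IANA name, the standard library can validate it directly; a quick, illustrative check (assuming Python 3.9+ for `zoneinfo`):

```python
from datetime import datetime
from zoneinfo import ZoneInfo  # stdlib since Python 3.9


def now_in(tz_name: str) -> datetime:
    """Return the current time in the configured timezone; raises on a bad IANA name."""
    return datetime.now(ZoneInfo(tz_name))


print(now_in("Asia/Shanghai"))         # e.g. 2025-12-13 20:00:00+08:00
print(now_in("America/Los_Angeles"))
```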
+#### Breaking changes
+
+⚠️ **v4.0.0 is not compatible with v3.x data**:
+
+1. the database schema was fully reworked and cannot read old data
+2. the file path format changed (ISO dates)
+
+**Migration advice**:
+- start collecting fresh data from v4.0.0
+- if you must keep old data, rename the directories to the new format by hand (not recommended)
+
+</details>

 <br>

 ## 🤖 AI-powered analysis
@ -2846,7 +3092,7 @@ The AI analysis does **not** query live web data; it analyzes your **local** data

 #### Usage notes:

-1. **Bundled test data**: the `output` directory ships with news data for **Nov 1 to Nov 15, 2025** so you can try the AI features right away
+1. **Bundled test data**: the `output` directory ships with news data for **2025-11-01 to 2025-11-15** so you can try the AI features right away

 2. **Query limits**:
    - ✅ only dates within the bundled range (Nov 1-15) can be queried
---

@ -1,12 +1,60 @@
 app:
   version_check_url: "https://raw.githubusercontent.com/sansan0/TrendRadar/refs/heads/master/version"
   show_version_update: true  # show update notices; false suppresses new-version prompts
+  # Timezone (affects all time display, push-window checks, and data storage)
+  # Common zones:
+  #   - Asia/Shanghai      (Beijing time, UTC+8)
+  #   - America/New_York   (US Eastern, UTC-5/-4)
+  #   - Europe/London      (London, UTC+0/+1)
+  # Full list: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
+  timezone: "Asia/Shanghai"
+
+# Storage settings
+storage:
+  # Backend: local / remote / auto
+  #   - local: local SQLite + TXT/HTML files
+  #   - remote: remote cloud storage (S3-compatible protocol; R2/OSS/COS, etc.)
+  #   - auto: pick automatically (remote when running in GitHub Actions with remote storage configured, local otherwise)
+  backend: "auto"
+
+  # Data format options
+  formats:
+    sqlite: true  # primary store (must stay enabled)
+    txt: false    # generate TXT snapshots
+    html: false   # generate HTML reports
+
+  # Local storage
+  local:
+    data_dir: "output"  # data directory
+    retention_days: 0   # days to keep local data (0 = never clean up)
+
+  # Remote storage (S3-compatible protocol)
+  # Works with: Cloudflare R2, Alibaba Cloud OSS, Tencent Cloud COS, AWS S3, MinIO, ...
+  # Keep secrets in GitHub Secrets or environment variables
+  remote:
+    # Days to keep remote data (0 = never clean up)
+    retention_days: 0
+    # S3-compatible settings
+    endpoint_url: ""  # service endpoint (or env var S3_ENDPOINT_URL)
+    # Cloudflare R2:     https://<account_id>.r2.cloudflarestorage.com
+    # Alibaba Cloud OSS: https://oss-cn-hangzhou.aliyuncs.com
+    # Tencent Cloud COS: https://cos.ap-guangzhou.myqcloud.com
+    bucket_name: ""        # bucket name (or env var S3_BUCKET_NAME)
+    access_key_id: ""      # access key ID (or env var S3_ACCESS_KEY_ID)
+    secret_access_key: ""  # secret access key (or env var S3_SECRET_ACCESS_KEY)
+    region: ""             # region (optional; some providers need it; or env var S3_REGION)
+
+  # Pull settings (sync remote data to local)
+  # For scenarios like the MCP Server: the crawler writes to remote storage, the MCP pulls locally for analysis
+  pull:
+    enabled: false  # pull automatically on startup
+    days: 7         # pull the most recent N days (0 = don't pull)

 crawler:
   request_interval: 1000  # request interval (ms)
   enable_crawler: true    # enable crawling; false stops the program immediately
   use_proxy: false        # enable the proxy; false means off
-  default_proxy: "http://127.0.0.1:10086"
+  default_proxy: "http://127.0.0.1:10801"

 # 🔸 daily (daily summary mode)
 #   • push timing: on schedule (hourly by default)
@ -55,7 +103,6 @@ notification:
     start: "20:00"  # push window start (Beijing time)
     end: "22:00"    # push window end (Beijing time)
   once_per_day: true  # push once per day inside the window; false pushes on every run inside the window
-  push_record_retention_days: 7  # days to keep push records

 # ⚠️⚠️⚠️ IMPORTANT SECURITY WARNING ⚠️⚠️⚠️
 #
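The comments above say each S3 field can come from the YAML file or an environment variable. One plausible precedence (illustrative; check the storage module for the authoritative order) is environment variable over file value:

```python
import os


def resolve(yaml_value: str, env_name: str) -> str:
    """Prefer the environment variable, then the YAML value, else empty."""
    return os.environ.get(env_name) or yaml_value or ""


# Example (names are for illustration):
# endpoint = resolve(cfg["storage"]["remote"]["endpoint_url"], "S3_ENDPOINT_URL")
```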
docker/.env (35 changed lines)

@ -40,8 +40,6 @@ PUSH_WINDOW_START=
 PUSH_WINDOW_END=
 # Push only once per day (true/false)
 PUSH_WINDOW_ONCE_PER_DAY=
-# Days to keep push records (number, e.g. 7)
-PUSH_WINDOW_RETENTION_DAYS=

 # ============================================
 # Multi-account settings
@ -87,6 +85,39 @@ BARK_URL=
 # Slack push settings (separate multiple accounts with ;)
 SLACK_WEBHOOK_URL=

+# ============================================
+# Storage settings
+# ============================================
+
+# Storage backend (local/remote/auto)
+#   - local: local SQLite + TXT/HTML files
+#   - remote: remote cloud storage (S3-compatible protocol)
+#   - auto: pick automatically (remote on GitHub Actions, local otherwise)
+STORAGE_BACKEND=auto
+
+# Days to keep local data (0 = unlimited; history is never cleaned)
+LOCAL_RETENTION_DAYS=0
+
+# Days to keep remote data (0 = unlimited; history is never cleaned)
+REMOTE_RETENTION_DAYS=0
+
+# Generate TXT snapshots (true/false)
+STORAGE_TXT_ENABLED=
+
+# Generate HTML reports (true/false)
+STORAGE_HTML_ENABLED=
+
+# Remote storage (S3-compatible: R2/OSS/COS/S3, etc.)
+S3_ENDPOINT_URL=
+S3_BUCKET_NAME=
+S3_ACCESS_KEY_ID=
+S3_SECRET_ACCESS_KEY=
+S3_REGION=
+
+# Pull settings (sync remote data to local)
+PULL_ENABLED=false
+PULL_DAYS=7

 # ============================================
 # Run settings
 # ============================================
---

@ -53,8 +53,8 @@ RUN set -ex && \
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt

-COPY main.py .
 COPY docker/manage.py .
+COPY trendradar/ ./trendradar/

 # Copy entrypoint.sh and force-convert it to LF line endings
 COPY docker/entrypoint.sh /entrypoint.sh.tmp
---

@ -8,6 +8,8 @@ RUN pip install --no-cache-dir -r requirements.txt

 # Copy the MCP server code
 COPY mcp_server/ ./mcp_server/
+# Copy the trendradar package (the MCP service reads the SQLite data)
+COPY trendradar/ ./trendradar/

 # Create required directories
 RUN mkdir -p /app/config /app/output
---

@ -32,7 +32,6 @@ services:
       - PUSH_WINDOW_START=${PUSH_WINDOW_START:-}
       - PUSH_WINDOW_END=${PUSH_WINDOW_END:-}
       - PUSH_WINDOW_ONCE_PER_DAY=${PUSH_WINDOW_ONCE_PER_DAY:-}
-      - PUSH_WINDOW_RETENTION_DAYS=${PUSH_WINDOW_RETENTION_DAYS:-}
       # Notification channels
       - FEISHU_WEBHOOK_URL=${FEISHU_WEBHOOK_URL:-}
       - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN:-}
@ -54,6 +53,21 @@ services:
       - BARK_URL=${BARK_URL:-}
       # Slack settings
       - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL:-}
+      # Storage settings
+      - STORAGE_BACKEND=${STORAGE_BACKEND:-auto}
+      - LOCAL_RETENTION_DAYS=${LOCAL_RETENTION_DAYS:-0}
+      - REMOTE_RETENTION_DAYS=${REMOTE_RETENTION_DAYS:-0}
+      - STORAGE_TXT_ENABLED=${STORAGE_TXT_ENABLED:-true}
+      - STORAGE_HTML_ENABLED=${STORAGE_HTML_ENABLED:-true}
+      # Remote storage (S3-compatible protocol)
+      - S3_ENDPOINT_URL=${S3_ENDPOINT_URL:-}
+      - S3_BUCKET_NAME=${S3_BUCKET_NAME:-}
+      - S3_ACCESS_KEY_ID=${S3_ACCESS_KEY_ID:-}
+      - S3_SECRET_ACCESS_KEY=${S3_SECRET_ACCESS_KEY:-}
+      - S3_REGION=${S3_REGION:-}
+      # Pull settings
+      - PULL_ENABLED=${PULL_ENABLED:-false}
+      - PULL_DAYS=${PULL_DAYS:-7}
       # Run mode
       - CRON_SCHEDULE=${CRON_SCHEDULE:-*/5 * * * *}
       - RUN_MODE=${RUN_MODE:-cron}
@ -71,7 +85,7 @@ services:

     volumes:
       - ../config:/app/config:ro
-      - ../output:/app/output:ro
+      - ../output:/app/output

     environment:
       - TZ=Asia/Shanghai
---

@ -30,7 +30,6 @@ services:
       - PUSH_WINDOW_START=${PUSH_WINDOW_START:-}
       - PUSH_WINDOW_END=${PUSH_WINDOW_END:-}
       - PUSH_WINDOW_ONCE_PER_DAY=${PUSH_WINDOW_ONCE_PER_DAY:-}
-      - PUSH_WINDOW_RETENTION_DAYS=${PUSH_WINDOW_RETENTION_DAYS:-}
       # Notification channels
       - FEISHU_WEBHOOK_URL=${FEISHU_WEBHOOK_URL:-}
       - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN:-}
@ -52,6 +51,21 @@ services:
       - BARK_URL=${BARK_URL:-}
       # Slack settings
       - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL:-}
+      # Storage settings
+      - STORAGE_BACKEND=${STORAGE_BACKEND:-auto}
+      - LOCAL_RETENTION_DAYS=${LOCAL_RETENTION_DAYS:-0}
+      - REMOTE_RETENTION_DAYS=${REMOTE_RETENTION_DAYS:-0}
+      - STORAGE_TXT_ENABLED=${STORAGE_TXT_ENABLED:-true}
+      - STORAGE_HTML_ENABLED=${STORAGE_HTML_ENABLED:-true}
+      # Remote storage (S3-compatible protocol)
+      - S3_ENDPOINT_URL=${S3_ENDPOINT_URL:-}
+      - S3_BUCKET_NAME=${S3_BUCKET_NAME:-}
+      - S3_ACCESS_KEY_ID=${S3_ACCESS_KEY_ID:-}
+      - S3_SECRET_ACCESS_KEY=${S3_SECRET_ACCESS_KEY:-}
+      - S3_REGION=${S3_REGION:-}
+      # Pull settings
+      - PULL_ENABLED=${PULL_ENABLED:-false}
+      - PULL_DAYS=${PULL_DAYS:-7}
       # Run mode
       - CRON_SCHEDULE=${CRON_SCHEDULE:-*/5 * * * *}
       - RUN_MODE=${RUN_MODE:-cron}
@ -67,7 +81,7 @@ services:

     volumes:
       - ../config:/app/config:ro
-      - ../output:/app/output:ro
+      - ../output:/app/output

     environment:
      - TZ=Asia/Shanghai
---

@ -13,11 +13,11 @@ env >> /etc/environment
 case "${RUN_MODE:-cron}" in
   "once")
     echo "🔄 Single run"
-    exec /usr/local/bin/python main.py
+    exec /usr/local/bin/python -m trendradar
     ;;
   "cron")
     # Generate the crontab
-    echo "${CRON_SCHEDULE:-*/30 * * * *} cd /app && /usr/local/bin/python main.py" > /tmp/crontab
+    echo "${CRON_SCHEDULE:-*/30 * * * *} cd /app && /usr/local/bin/python -m trendradar" > /tmp/crontab

     echo "📅 Generated crontab:"
     cat /tmp/crontab
@ -30,7 +30,7 @@ case "${RUN_MODE:-cron}" in
     # Run immediately once (if configured)
     if [ "${IMMEDIATE_RUN:-false}" = "true" ]; then
       echo "▶️ Running once immediately"
-      /usr/local/bin/python main.py
+      /usr/local/bin/python -m trendradar
     fi

     # Start the web server (if configured)
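The switch from `python main.py` to `python -m trendradar` implies the package is runnable as a module, which in Python means a `trendradar/__main__.py` exists. A hypothetical minimal shape (the real entry point and import path may differ):

```python
# trendradar/__main__.py -- hypothetical sketch of a module entry point.
# `python -m trendradar` imports the package and executes this file.

from trendradar.app import main  # assumed location of the CLI entry function

if __name__ == "__main__":
    main()
```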
---

@ -33,7 +33,7 @@ def manual_run():
     print("🔄 Running the crawler manually...")
     try:
         result = subprocess.run(
-            ["python", "main.py"], cwd="/app", capture_output=False, text=True
+            ["python", "-m", "trendradar"], cwd="/app", capture_output=False, text=True
         )
         if result.returncode == 0:
             print("✅ Done")
@ -285,12 +285,24 @@ def show_config():
         "TELEGRAM_CHAT_ID",
         "CONFIG_PATH",
         "FREQUENCY_WORDS_PATH",
+        # Storage settings
+        "STORAGE_BACKEND",
+        "LOCAL_RETENTION_DAYS",
+        "REMOTE_RETENTION_DAYS",
+        "STORAGE_TXT_ENABLED",
+        "STORAGE_HTML_ENABLED",
+        "S3_BUCKET_NAME",
+        "S3_ACCESS_KEY_ID",
+        "S3_ENDPOINT_URL",
+        "S3_REGION",
+        "PULL_ENABLED",
+        "PULL_DAYS",
     ]

     for var in env_vars:
         value = os.environ.get(var, "未设置")
         # Mask sensitive values
-        if any(sensitive in var for sensitive in ["WEBHOOK", "TOKEN", "KEY"]):
+        if any(sensitive in var for sensitive in ["WEBHOOK", "TOKEN", "KEY", "SECRET"]):
             if value and value != "未设置":
                 masked_value = value[:10] + "***" if len(value) > 10 else "***"
             print(f"  {var}: {masked_value}")
@ -331,6 +343,17 @@ def show_files():
     # Show files from the most recent 2 days
     for date_dir in date_dirs[:2]:
         print(f"  📅 {date_dir.name}:")
+
+        # Check for SQLite database files
+        db_files = list(date_dir.glob("*.db"))
+        if db_files:
+            print(f"    💾 SQLite: {len(db_files)} database(s)")
+            for db_file in db_files[:3]:
+                mtime = time.ctime(db_file.stat().st_mtime)
+                size_kb = db_file.stat().st_size // 1024
+                print(f"      📀 {db_file.name} ({size_kb}KB, {mtime.split()[3][:5]})")
+
+        # Check the subdirectories (html, txt)
         for subdir in ["html", "txt"]:
             sub_path = date_dir / subdir
             if sub_path.exists():
---

@ -4,4 +4,4 @@ TrendRadar MCP Server
 Provides MCP-protocol query and management APIs for the aggregated news data.
 """

-__version__ = "1.0.0"
+__version__ = "1.1.0"
---

@ -15,6 +15,7 @@ from .tools.analytics import AnalyticsTools
 from .tools.search_tools import SearchTools
 from .tools.config_mgmt import ConfigManagementTools
 from .tools.system import SystemManagementTools
+from .tools.storage_sync import StorageSyncTools
 from .utils.date_parser import DateParser
 from .utils.errors import MCPError

@ -34,6 +35,7 @@ def _get_tools(project_root: Optional[str] = None):
     _tools_instances['search'] = SearchTools(project_root)
     _tools_instances['config'] = ConfigManagementTools(project_root)
     _tools_instances['system'] = SystemManagementTools(project_root)
+    _tools_instances['storage'] = StorageSyncTools(project_root)
     return _tools_instances

@ -657,6 +659,127 @@ async def trigger_crawl(
     return json.dumps(result, ensure_ascii=False, indent=2)

+# ==================== Storage sync tools ====================
+
+@mcp.tool
+async def sync_from_remote(
+    days: int = 7
+) -> str:
+    """
+    Pull data from remote storage to local.
+
+    For setups like the MCP Server: the crawler writes to remote cloud
+    storage (e.g. Cloudflare R2) and the MCP Server pulls the data down
+    locally for analysis and queries.
+
+    Args:
+        days: pull the most recent N days, default 7
+            - 0: pull nothing
+            - 7: pull the last week
+            - 30: pull the last month
+
+    Returns:
+        JSON-formatted sync result containing:
+        - success: whether the sync succeeded
+        - synced_files: number of files synced
+        - synced_dates: list of dates synced
+        - skipped_dates: dates skipped (already present locally)
+        - failed_dates: dates that failed, with error messages
+        - message: result description
+
+    Examples:
+        - sync_from_remote()         # pull the last 7 days
+        - sync_from_remote(days=30)  # pull the last 30 days
+
+    Note:
+        Remote storage must be configured in config/config.yaml
+        (storage.remote) or via environment variables:
+        - S3_ENDPOINT_URL: service endpoint
+        - S3_BUCKET_NAME: bucket name
+        - S3_ACCESS_KEY_ID: access key ID
+        - S3_SECRET_ACCESS_KEY: secret access key
+    """
+    tools = _get_tools()
+    result = tools['storage'].sync_from_remote(days=days)
+    return json.dumps(result, ensure_ascii=False, indent=2)
+
+
+@mcp.tool
+async def get_storage_status() -> str:
+    """
+    Report storage configuration and status.
+
+    Shows the configured backend plus local and remote storage state.
+
+    Returns:
+        JSON-formatted status containing:
+        - backend: backend in use (local/remote/auto)
+        - local: local storage state
+            - data_dir: data directory
+            - retention_days: retention window
+            - total_size: total size
+            - date_count: number of dates
+            - earliest_date / latest_date: date range
+        - remote: remote storage state
+            - configured: whether remote storage is configured
+            - endpoint_url: service endpoint
+            - bucket_name: bucket name
+            - date_count: number of remote dates
+        - pull: pull settings
+            - enabled: whether auto-pull is on
+            - days: auto-pull window
+
+    Examples:
+        - get_storage_status()  # inspect all storage state
+    """
+    tools = _get_tools()
+    result = tools['storage'].get_storage_status()
+    return json.dumps(result, ensure_ascii=False, indent=2)
+
+
+@mcp.tool
+async def list_available_dates(
+    source: str = "both"
+) -> str:
+    """
+    List the available date ranges locally and/or remotely.
+
+    Shows which dates have data in local and remote storage, clarifying
+    data coverage and sync state.
+
+    Args:
+        source: data source, one of:
+            - "local": local dates only
+            - "remote": remote dates only
+            - "both": list and compare both (default)
+
+    Returns:
+        JSON-formatted date listing containing:
+        - local: local date info (when source includes local)
+            - dates: date list (newest first)
+            - count / earliest / latest
+        - remote: remote date info (when source includes remote)
+            - configured: whether remote storage is configured
+            - dates / count / earliest / latest
+        - comparison: comparison result (only when source="both")
+            - only_local: dates that exist only locally
+            - only_remote: dates that exist only remotely
+            - both: dates that exist on both sides
+
+    Examples:
+        - list_available_dates()                 # compare local and remote
+        - list_available_dates(source="local")   # local only
+        - list_available_dates(source="remote")  # remote only
+    """
+    tools = _get_tools()
+    result = tools['storage'].list_available_dates(source=source)
+    return json.dumps(result, ensure_ascii=False, indent=2)
+

 # ==================== Startup entry ====================

 def run_server(
@ -721,6 +844,11 @@ def run_server(
     print("  11. get_current_config - fetch the current system configuration")
     print("  12. get_system_status - fetch system runtime status")
     print("  13. trigger_crawl - manually trigger a crawl")
+    print()
+    print("  === Storage sync tools ===")
+    print("  14. sync_from_remote - pull data from remote storage to local")
+    print("  15. get_storage_status - report storage configuration and status")
+    print("  16. list_available_dates - list available dates locally/remotely")
     print("=" * 60)
     print()
---

@ -517,24 +517,55 @@ class DataService:
         # Walk the date folders
         for date_folder in output_dir.iterdir():
             if date_folder.is_dir() and not date_folder.name.startswith('.'):
-                # Parse the date (format: YYYY年MM月DD日)
-                try:
-                    date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_folder.name)
-                    if date_match:
-                        folder_date = datetime(
-                            int(date_match.group(1)),
-                            int(date_match.group(2)),
-                            int(date_match.group(3))
-                        )
-                        available_dates.append(folder_date)
-                except Exception:
-                    pass
+                folder_date = self._parse_date_folder_name(date_folder.name)
+                if folder_date:
+                    available_dates.append(folder_date)

         if not available_dates:
             return (None, None)

         return (min(available_dates), max(available_dates))

+    def _parse_date_folder_name(self, folder_name: str) -> Optional[datetime]:
+        """
+        Parse a date folder name (accepts both Chinese and ISO formats).
+
+        Two formats are supported:
+        - Chinese: YYYY年MM月DD日
+        - ISO: YYYY-MM-DD
+
+        Args:
+            folder_name: folder name
+
+        Returns:
+            datetime object, or None if parsing fails
+        """
+        # Try the Chinese format: YYYY年MM月DD日
+        chinese_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', folder_name)
+        if chinese_match:
+            try:
+                return datetime(
+                    int(chinese_match.group(1)),
+                    int(chinese_match.group(2)),
+                    int(chinese_match.group(3))
+                )
+            except ValueError:
+                pass
+
+        # Try the ISO format: YYYY-MM-DD
+        iso_match = re.match(r'(\d{4})-(\d{2})-(\d{2})', folder_name)
+        if iso_match:
+            try:
+                return datetime(
+                    int(iso_match.group(1)),
+                    int(iso_match.group(2)),
+                    int(iso_match.group(3))
+                )
+            except ValueError:
+                pass
+
+        return None
+
     def get_system_status(self) -> Dict:
         """
         Fetch system runtime status
@ -553,26 +584,14 @@ class DataService:
         if output_dir.exists():
             # Walk the date folders
             for date_folder in output_dir.iterdir():
-                if date_folder.is_dir():
-                    # Parse the date
-                    try:
-                        date_str = date_folder.name
-                        # Format: YYYY年MM月DD日
-                        date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_str)
-                        if date_match:
-                            folder_date = datetime(
-                                int(date_match.group(1)),
-                                int(date_match.group(2)),
-                                int(date_match.group(3))
-                            )
-
-                            if oldest_record is None or folder_date < oldest_record:
-                                oldest_record = folder_date
-                            if latest_record is None or folder_date > latest_record:
-                                latest_record = folder_date
-                    except:
-                        pass
+                if date_folder.is_dir() and not date_folder.name.startswith('.'):
+                    # Parse the date (accepts Chinese and ISO formats)
+                    folder_date = self._parse_date_folder_name(date_folder.name)
+                    if folder_date:
+                        if oldest_record is None or folder_date < oldest_record:
+                            oldest_record = folder_date
+                        if latest_record is None or folder_date > latest_record:
+                            latest_record = folder_date

                     # Compute storage size
                     for item in date_folder.rglob("*"):
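The dual-format acceptance above is easy to sanity-check in isolation; a minimal standalone rendition of the same two regexes (illustrative, not the class method itself):

```python
import re
from datetime import datetime
from typing import Optional


def parse_date_folder(name: str) -> Optional[datetime]:
    """Accept both YYYY年MM月DD日 and YYYY-MM-DD folder names."""
    for pattern in (r'(\d{4})年(\d{2})月(\d{2})日', r'(\d{4})-(\d{2})-(\d{2})'):
        m = re.match(pattern, name)
        if m:
            try:
                return datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)))
            except ValueError:
                return None
    return None


assert parse_date_folder("2025年11月10日") == parse_date_folder("2025-11-10")
```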
|||||||
@ -2,9 +2,12 @@
|
|||||||
文件解析服务
|
文件解析服务
|
||||||
|
|
||||||
提供txt格式新闻数据和YAML配置文件的解析功能。
|
提供txt格式新闻数据和YAML配置文件的解析功能。
|
||||||
|
支持从 SQLite 数据库和 TXT 文件两种数据源读取。
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
import re
|
import re
|
||||||
|
import sqlite3
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, List, Tuple, Optional
|
from typing import Dict, List, Tuple, Optional
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
@ -145,17 +148,310 @@ class ParserService:
|
|||||||
|
|
||||||
def get_date_folder_name(self, date: datetime = None) -> str:
|
def get_date_folder_name(self, date: datetime = None) -> str:
|
||||||
"""
|
"""
|
||||||
获取日期文件夹名称
|
获取日期文件夹名称(兼容中文和ISO格式)
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
date: 日期对象,默认为今天
|
date: 日期对象,默认为今天
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
文件夹名称,格式: YYYY年MM月DD日
|
实际存在的文件夹名称,优先返回中文格式(YYYY年MM月DD日),
|
||||||
|
若不存在则返回 ISO 格式(YYYY-MM-DD)
|
||||||
"""
|
"""
|
||||||
if date is None:
|
if date is None:
|
||||||
date = datetime.now()
|
date = datetime.now()
|
||||||
return date.strftime("%Y年%m月%d日")
|
return self._find_date_folder(date)
|
||||||
|
|
||||||
|
def _get_date_folder_name(self, date: datetime = None) -> str:
|
||||||
|
"""
|
||||||
|
获取日期文件夹名称(兼容中文和ISO格式)
|
||||||
|
|
||||||
|
Args:
|
||||||
|
date: 日期对象,默认为今天
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
实际存在的文件夹名称,优先返回中文格式(YYYY年MM月DD日),
|
||||||
|
若不存在则返回 ISO 格式(YYYY-MM-DD)
|
||||||
|
"""
|
||||||
|
if date is None:
|
||||||
|
date = datetime.now()
|
||||||
|
return self._find_date_folder(date)
|
||||||
|
|
||||||
|
def _find_date_folder(self, date: datetime) -> str:
|
||||||
|
"""
|
||||||
|
查找实际存在的日期文件夹
|
||||||
|
|
||||||
|
支持两种格式:
|
||||||
|
- 中文格式:YYYY年MM月DD日(优先)
|
||||||
|
- ISO格式:YYYY-MM-DD
|
||||||
|
|
||||||
|
Args:
|
||||||
|
date: 日期对象
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
实际存在的文件夹名称,若都不存在则返回中文格式
|
||||||
|
"""
|
||||||
|
output_dir = self.project_root / "output"
|
||||||
|
|
||||||
|
# 中文格式:YYYY年MM月DD日
|
||||||
|
chinese_format = date.strftime("%Y年%m月%d日")
|
||||||
|
# ISO格式:YYYY-MM-DD
|
||||||
|
iso_format = date.strftime("%Y-%m-%d")
|
||||||
|
|
||||||
|
# 优先检查中文格式
|
||||||
|
if (output_dir / chinese_format).exists():
|
||||||
|
return chinese_format
|
||||||
|
# 其次检查 ISO 格式
|
||||||
|
if (output_dir / iso_format).exists():
|
||||||
|
return iso_format
|
||||||
|
|
||||||
|
# 都不存在,返回中文格式(与项目现有风格一致)
|
||||||
|
return chinese_format
|
||||||
|
|
||||||
|
def _get_sqlite_db_path(self, date: datetime = None) -> Optional[Path]:
|
||||||
|
"""
|
||||||
|
获取 SQLite 数据库文件路径
|
||||||
|
|
||||||
|
Args:
|
||||||
|
date: 日期对象,默认为今天
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
数据库文件路径,如果不存在则返回 None
|
||||||
|
"""
|
||||||
|
date_folder = self._get_date_folder_name(date)
|
||||||
|
db_path = self.project_root / "output" / date_folder / "news.db"
|
||||||
|
if db_path.exists():
|
||||||
|
return db_path
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _get_txt_folder_path(self, date: datetime = None) -> Optional[Path]:
|
||||||
|
"""
|
||||||
|
获取 TXT 文件夹路径
|
||||||
|
|
||||||
|
Args:
|
||||||
|
date: 日期对象,默认为今天
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
TXT 文件夹路径,如果不存在则返回 None
|
||||||
|
"""
|
||||||
|
date_folder = self._get_date_folder_name(date)
|
||||||
|
txt_path = self.project_root / "output" / date_folder / "txt"
|
||||||
|
if txt_path.exists() and txt_path.is_dir():
|
||||||
|
return txt_path
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _read_from_txt(
|
||||||
|
self,
|
||||||
|
date: datetime = None,
|
||||||
|
platform_ids: Optional[List[str]] = None
|
||||||
|
) -> Optional[Tuple[Dict, Dict, Dict]]:
|
||||||
|
"""
|
||||||
|
从 TXT 文件夹读取新闻数据
|
||||||
|
|
||||||
|
Args:
|
||||||
|
date: 日期对象,默认为今天
|
||||||
|
platform_ids: 平台ID列表,None表示所有平台
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
(all_titles, id_to_name, all_timestamps) 元组,如果不存在返回 None
|
||||||
|
"""
|
||||||
|
txt_folder = self._get_txt_folder_path(date)
|
||||||
|
if txt_folder is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# 获取所有 TXT 文件并按时间排序
|
||||||
|
txt_files = sorted(txt_folder.glob("*.txt"))
|
||||||
|
if not txt_files:
|
||||||
|
return None
|
||||||
|
|
||||||
|
all_titles = {}
|
||||||
|
id_to_name = {}
|
||||||
|
all_timestamps = {}
|
||||||
|
|
||||||
|
for txt_file in txt_files:
|
||||||
|
try:
|
||||||
|
titles_by_id, file_id_to_name = self.parse_txt_file(txt_file)
|
||||||
|
|
||||||
|
# 记录时间戳
|
||||||
|
all_timestamps[txt_file.name] = txt_file.stat().st_mtime
|
||||||
|
|
||||||
|
# 合并 id_to_name
|
||||||
|
id_to_name.update(file_id_to_name)
|
||||||
|
|
||||||
|
# 合并标题数据
|
||||||
|
for source_id, titles in titles_by_id.items():
|
||||||
|
# 如果指定了 platform_ids,过滤
|
||||||
|
if platform_ids and source_id not in platform_ids:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if source_id not in all_titles:
|
||||||
|
all_titles[source_id] = {}
|
||||||
|
|
||||||
|
for title, data in titles.items():
|
||||||
|
if title not in all_titles[source_id]:
|
||||||
|
# 新标题
|
||||||
|
all_titles[source_id][title] = {
|
||||||
|
"ranks": data.get("ranks", []),
|
||||||
|
"url": data.get("url", ""),
|
||||||
|
"mobileUrl": data.get("mobileUrl", ""),
|
||||||
|
"first_time": txt_file.stem, # 使用文件名作为时间
|
||||||
|
"last_time": txt_file.stem,
|
||||||
|
"count": 1,
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
# 合并已存在的标题
|
||||||
|
existing = all_titles[source_id][title]
|
||||||
|
# 合并排名
|
||||||
|
for rank in data.get("ranks", []):
|
||||||
|
if rank not in existing["ranks"]:
|
||||||
|
existing["ranks"].append(rank)
|
||||||
|
# 更新 last_time
|
||||||
|
existing["last_time"] = txt_file.stem
|
||||||
|
existing["count"] += 1
|
||||||
|
# 保留 URL
|
||||||
|
if not existing["url"] and data.get("url"):
|
||||||
|
existing["url"] = data["url"]
|
||||||
|
if not existing["mobileUrl"] and data.get("mobileUrl"):
|
||||||
|
existing["mobileUrl"] = data["mobileUrl"]
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Warning: 解析 TXT 文件失败 {txt_file}: {e}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
if not all_titles:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return (all_titles, id_to_name, all_timestamps)
|
||||||
|
|
||||||
|
def _read_from_sqlite(
|
||||||
|
self,
|
||||||
|
date: datetime = None,
|
||||||
|
platform_ids: Optional[List[str]] = None
|
||||||
|
) -> Optional[Tuple[Dict, Dict, Dict]]:
|
||||||
|
"""
|
||||||
|
从 SQLite 数据库读取新闻数据
|
||||||
|
|
||||||
|
新表结构数据已按 URL 去重,包含:
|
||||||
|
- first_crawl_time: 首次抓取时间
|
||||||
|
- last_crawl_time: 最后抓取时间
|
||||||
|
- crawl_count: 抓取次数
|
||||||
|
|
||||||
|
Args:
|
||||||
|
date: 日期对象,默认为今天
|
||||||
|
platform_ids: 平台ID列表,None表示所有平台
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
(all_titles, id_to_name, all_timestamps) 元组,如果数据库不存在返回 None
|
||||||
|
"""
|
||||||
|
db_path = self._get_sqlite_db_path(date)
|
        if db_path is None:
            return None

        all_titles = {}
        id_to_name = {}
        all_timestamps = {}

        try:
            conn = sqlite3.connect(str(db_path))
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()

            # Check that the table exists
            cursor.execute("""
                SELECT name FROM sqlite_master
                WHERE type='table' AND name='news_items'
            """)
            if not cursor.fetchone():
                conn.close()
                return None

            # Build the query
            if platform_ids:
                placeholders = ','.join(['?' for _ in platform_ids])
                query = f"""
                    SELECT n.id, n.platform_id, p.name as platform_name, n.title,
                           n.rank, n.url, n.mobile_url,
                           n.first_crawl_time, n.last_crawl_time, n.crawl_count
                    FROM news_items n
                    LEFT JOIN platforms p ON n.platform_id = p.id
                    WHERE n.platform_id IN ({placeholders})
                """
                cursor.execute(query, platform_ids)
            else:
                cursor.execute("""
                    SELECT n.id, n.platform_id, p.name as platform_name, n.title,
                           n.rank, n.url, n.mobile_url,
                           n.first_crawl_time, n.last_crawl_time, n.crawl_count
                    FROM news_items n
                    LEFT JOIN platforms p ON n.platform_id = p.id
                """)

            rows = cursor.fetchall()

            # Collect all news_item_ids for the rank-history lookup
            news_ids = [row['id'] for row in rows]
            rank_history_map = {}

            if news_ids:
                placeholders = ",".join("?" * len(news_ids))
                cursor.execute(f"""
                    SELECT news_item_id, rank FROM rank_history
                    WHERE news_item_id IN ({placeholders})
                    ORDER BY news_item_id, crawl_time
                """, news_ids)

                for rh_row in cursor.fetchall():
                    news_id = rh_row['news_item_id']
                    rank = rh_row['rank']
                    if news_id not in rank_history_map:
                        rank_history_map[news_id] = []
                    rank_history_map[news_id].append(rank)

            for row in rows:
                news_id = row['id']
                platform_id = row['platform_id']
                platform_name = row['platform_name'] or platform_id
                title = row['title']

                # Update id_to_name
                if platform_id not in id_to_name:
                    id_to_name[platform_id] = platform_name

                # Initialize the per-platform dict
                if platform_id not in all_titles:
                    all_titles[platform_id] = {}

                # Use the rank history; fall back to the current rank if it is empty
                ranks = rank_history_map.get(news_id, [row['rank']])

                # Use the data directly (already deduplicated)
                all_titles[platform_id][title] = {
                    "ranks": ranks,
                    "url": row['url'] or "",
                    "mobileUrl": row['mobile_url'] or "",
                    "first_time": row['first_crawl_time'] or "",
                    "last_time": row['last_crawl_time'] or "",
                    "count": row['crawl_count'] or 1,
                }

            # Use crawl times as the timestamps
            cursor.execute("""
                SELECT crawl_time FROM crawl_records
                ORDER BY crawl_time
            """)
            for row in cursor.fetchall():
                crawl_time = row['crawl_time']
                all_timestamps[f"{crawl_time}.db"] = 0  # dummy timestamp

            conn.close()

            if not all_titles:
                return None

            return (all_titles, id_to_name, all_timestamps)

        except Exception as e:
            print(f"Warning: failed to read data from SQLite: {e}")
            return None
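For orientation, a minimal sketch of the tuple shape this SQLite reader returns, with field names taken from the code above; the platform, title, and time values are made-up placeholders:

# Hypothetical illustration of the (all_titles, id_to_name, all_timestamps) shape
all_titles = {
    "weibo": {
        "Some headline": {
            "ranks": [3, 5, 4],              # rank history, oldest first
            "url": "https://example.com/item",
            "mobileUrl": "",
            "first_time": "08:00",           # placeholder; actual format comes from crawl records
            "last_time": "12:30",
            "count": 3,
        }
    }
}
id_to_name = {"weibo": "Weibo"}
all_timestamps = {"08:00.db": 0}             # dummy timestamps keyed by crawl time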
@ -163,7 +459,7 @@ class ParserService:
     def read_all_titles_for_date(
         self,
         platform_ids: Optional[List[str]] = None
     ) -> Tuple[Dict, Dict, Dict]:
         """
-        Read all title files for the given date (cached)
+        Read all titles for the given date (cached)

         Args:
             date: date object; defaults to today
@ -193,71 +489,23 @@ class ParserService:
         if cached:
             return cached

-        # Cache miss: read the files
-        date_folder = self.get_date_folder_name(date)
-        txt_dir = self.project_root / "output" / date_folder / "txt"
-
-        if not txt_dir.exists():
-            raise DataNotFoundError(
-                f"No data directory found for {date_folder}",
-                suggestion="Run the crawler first, or check that the date is correct"
-            )
-
-        all_titles = {}
-        id_to_name = {}
-        all_timestamps = {}
-
-        # Read all txt files
-        txt_files = sorted(txt_dir.glob("*.txt"))
-
-        if not txt_files:
-            raise DataNotFoundError(
-                f"No data files for {date_folder}",
-                suggestion="Wait for the crawler task to finish"
-            )
-
-        for txt_file in txt_files:
-            try:
-                titles_by_id, file_id_to_name = self.parse_txt_file(txt_file)
-
-                # Update id_to_name
-                id_to_name.update(file_id_to_name)
-
-                # Merge title data
-                for platform_id, titles in titles_by_id.items():
-                    # Apply the platform filter, if one was given
-                    if platform_ids and platform_id not in platform_ids:
-                        continue
-
-                    if platform_id not in all_titles:
-                        all_titles[platform_id] = {}
-
-                    for title, info in titles.items():
-                        if title in all_titles[platform_id]:
-                            # Merge ranks
-                            all_titles[platform_id][title]["ranks"].extend(info["ranks"])
-                        else:
-                            all_titles[platform_id][title] = info.copy()
-
-                # Record the file timestamp
-                all_timestamps[txt_file.name] = txt_file.stat().st_mtime
-
-            except Exception as e:
-                # Ignore parse errors in a single file and keep processing the rest
-                print(f"Warning: failed to parse file {txt_file}: {e}")
-                continue
-
-        if not all_titles:
-            raise DataNotFoundError(
-                f"No valid data for {date_folder}",
-                suggestion="Check the data file format or rerun the crawler"
-            )
-
-        # Cache the result
-        result = (all_titles, id_to_name, all_timestamps)
-        self.cache.set(cache_key, result)
-
-        return result
+        # Prefer reading from SQLite
+        sqlite_result = self._read_from_sqlite(date, platform_ids)
+        if sqlite_result:
+            self.cache.set(cache_key, sqlite_result)
+            return sqlite_result
+
+        # No SQLite data; try reading from TXT
+        txt_result = self._read_from_txt(date, platform_ids)
+        if txt_result:
+            self.cache.set(cache_key, txt_result)
+            return txt_result
+
+        # Neither data source exists
+        raise DataNotFoundError(
+            f"No data found for {date_str}",
+            suggestion="Run the crawler first, or check that the date is correct"
+        )

     def parse_yaml_config(self, config_path: str = None) -> dict:
         """
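The rewritten read path is a three-tier fallback: cache, then SQLite, then the legacy TXT snapshots. A minimal standalone sketch of the same pattern; the loader callables are hypothetical stand-ins for the methods used above:

_cache = {}

def read_with_fallback(key, load_from_sqlite, load_from_txt):
    # 1. In-memory cache
    if key in _cache:
        return _cache[key]
    # 2. Primary store (SQLite), then 3. legacy snapshots (TXT)
    for loader in (load_from_sqlite, load_from_txt):
        result = loader(key)
        if result:
            _cache[key] = result  # populate the cache on the way out
            return result
    raise LookupError(f"no data for {key}")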
@ -25,7 +25,6 @@ def calculate_news_weight(news_data: Dict, rank_threshold: int = 5) -> float:
     """
     Calculate a news item's weight (used for sorting)

-    Implements the weighting algorithm from main.py, combining:
     - Rank weight (60%): the item's position on the hot list
     - Frequency weight (30%): how many times the item appeared
     - Heat weight (10%): the share of appearances at high ranks
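The docstring fixes the split at 60/30/10, which reduces to a weighted sum; a short sketch, assuming each component score has already been normalized to [0, 1]:

# Hypothetical component scores, each normalized to [0, 1]
rank_score, frequency_score, heat_score = 0.9, 0.5, 0.7
weight = 0.6 * rank_score + 0.3 * frequency_score + 0.1 * heat_score  # -> 0.76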
468
mcp_server/tools/storage_sync.py
Normal file
@ -0,0 +1,468 @@
# coding=utf-8
"""
Storage sync tools

Implements pulling data from remote storage to local, reporting storage status,
and listing available dates.
"""

import os
import re
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Optional

import yaml

from ..utils.errors import MCPError


class StorageSyncTools:
    """Storage sync tool class"""

    def __init__(self, project_root: str = None):
        """
        Initialize the storage sync tools

        Args:
            project_root: project root directory
        """
        if project_root:
            self.project_root = Path(project_root)
        else:
            current_file = Path(__file__)
            self.project_root = current_file.parent.parent.parent

        self._config = None
        self._remote_backend = None

    def _load_config(self) -> dict:
        """Load the configuration file"""
        if self._config is None:
            config_path = self.project_root / "config" / "config.yaml"
            if config_path.exists():
                with open(config_path, "r", encoding="utf-8") as f:
                    self._config = yaml.safe_load(f)
            else:
                self._config = {}
        return self._config

    def _get_storage_config(self) -> dict:
        """Get the storage configuration"""
        config = self._load_config()
        return config.get("storage", {})

    def _get_remote_config(self) -> dict:
        """
        Get the remote storage configuration (config file merged with environment variables)
        """
        storage_config = self._get_storage_config()
        remote_config = storage_config.get("remote", {})

        return {
            "endpoint_url": remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL", ""),
            "bucket_name": remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME", ""),
            "access_key_id": remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID", ""),
            "secret_access_key": remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY", ""),
            "region": remote_config.get("region") or os.environ.get("S3_REGION", ""),
        }

    def _has_remote_config(self) -> bool:
        """Check whether a valid remote storage configuration exists"""
        config = self._get_remote_config()
        return bool(
            config.get("bucket_name") and
            config.get("access_key_id") and
            config.get("secret_access_key") and
            config.get("endpoint_url")
        )

    def _get_remote_backend(self):
        """Get the remote storage backend instance"""
        if self._remote_backend is not None:
            return self._remote_backend

        if not self._has_remote_config():
            return None

        try:
            from trendradar.storage.remote import RemoteStorageBackend

            remote_config = self._get_remote_config()
            config = self._load_config()
            timezone = config.get("app", {}).get("timezone", "Asia/Shanghai")

            self._remote_backend = RemoteStorageBackend(
                bucket_name=remote_config["bucket_name"],
                access_key_id=remote_config["access_key_id"],
                secret_access_key=remote_config["secret_access_key"],
                endpoint_url=remote_config["endpoint_url"],
                region=remote_config.get("region", ""),
                timezone=timezone,
            )
            return self._remote_backend
        except ImportError:
            print("[Storage Sync] The remote storage backend requires boto3: pip install boto3")
            return None
        except Exception as e:
            print(f"[Storage Sync] Failed to create the remote backend: {e}")
            return None

    def _get_local_data_dir(self) -> Path:
        """Get the local data directory"""
        storage_config = self._get_storage_config()
        local_config = storage_config.get("local", {})
        data_dir = local_config.get("data_dir", "output")
        return self.project_root / data_dir

    def _parse_date_folder_name(self, folder_name: str) -> Optional[datetime]:
        """
        Parse a date folder name (accepts both Chinese and ISO formats)

        Two formats are supported:
        - Chinese format: YYYY年MM月DD日
        - ISO format: YYYY-MM-DD
        """
        # Try the ISO format
        iso_match = re.match(r'(\d{4})-(\d{2})-(\d{2})', folder_name)
        if iso_match:
            try:
                return datetime(
                    int(iso_match.group(1)),
                    int(iso_match.group(2)),
                    int(iso_match.group(3))
                )
            except ValueError:
                pass

        # Try the Chinese format
        chinese_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', folder_name)
        if chinese_match:
            try:
                return datetime(
                    int(chinese_match.group(1)),
                    int(chinese_match.group(2)),
                    int(chinese_match.group(3))
                )
            except ValueError:
                pass

        return None

    def _get_local_dates(self) -> List[str]:
        """Get the list of dates available locally"""
        local_dir = self._get_local_data_dir()
        dates = []

        if not local_dir.exists():
            return dates

        for item in local_dir.iterdir():
            if item.is_dir() and not item.name.startswith('.'):
                folder_date = self._parse_date_folder_name(item.name)
                if folder_date:
                    dates.append(folder_date.strftime("%Y-%m-%d"))

        return sorted(dates, reverse=True)

    def _calculate_dir_size(self, path: Path) -> int:
        """Calculate a directory's size (bytes)"""
        total_size = 0
        if path.exists():
            for item in path.rglob("*"):
                if item.is_file():
                    total_size += item.stat().st_size
        return total_size

    def sync_from_remote(self, days: int = 7) -> Dict:
        """
        Pull data from remote storage to local

        Args:
            days: pull the most recent N days of data (default 7)

        Returns:
            Sync result dict
        """
        try:
            # Check the remote configuration
            if not self._has_remote_config():
                return {
                    "success": False,
                    "error": {
                        "code": "REMOTE_NOT_CONFIGURED",
                        "message": "Remote storage is not configured",
                        "suggestion": "Configure storage.remote in config/config.yaml or set the environment variables"
                    }
                }

            # Get the remote backend
            remote_backend = self._get_remote_backend()
            if remote_backend is None:
                return {
                    "success": False,
                    "error": {
                        "code": "REMOTE_BACKEND_FAILED",
                        "message": "Could not create the remote storage backend",
                        "suggestion": "Check the remote storage configuration and that boto3 is installed"
                    }
                }

            # Get the local data directory
            local_dir = self._get_local_data_dir()
            local_dir.mkdir(parents=True, exist_ok=True)

            # Get the dates available remotely
            remote_dates = remote_backend.list_remote_dates()

            # Get the dates already present locally
            local_dates = set(self._get_local_dates())

            # Work out which dates to pull (the most recent N days)
            from trendradar.utils.time import get_configured_time
            config = self._load_config()
            timezone = config.get("app", {}).get("timezone", "Asia/Shanghai")
            now = get_configured_time(timezone)

            target_dates = []
            for i in range(days):
                date = now - timedelta(days=i)
                date_str = date.strftime("%Y-%m-%d")
                if date_str in remote_dates:
                    target_dates.append(date_str)

            # Run the pull
            synced_dates = []
            skipped_dates = []
            failed_dates = []

            for date_str in target_dates:
                # Skip dates that already exist locally
                if date_str in local_dates:
                    skipped_dates.append(date_str)
                    continue

                # Pull a single date
                try:
                    local_date_dir = local_dir / date_str
                    local_db_path = local_date_dir / "news.db"
                    remote_key = f"news/{date_str}.db"

                    local_date_dir.mkdir(parents=True, exist_ok=True)
                    remote_backend.s3_client.download_file(
                        remote_backend.bucket_name,
                        remote_key,
                        str(local_db_path)
                    )
                    synced_dates.append(date_str)
                    print(f"[Storage Sync] Pulled: {date_str}")
                except Exception as e:
                    failed_dates.append({"date": date_str, "error": str(e)})
                    print(f"[Storage Sync] Pull failed ({date_str}): {e}")

            return {
                "success": True,
                "synced_files": len(synced_dates),
                "synced_dates": synced_dates,
                "skipped_dates": skipped_dates,
                "failed_dates": failed_dates,
                "message": f"Synced {len(synced_dates)} day(s) of data" + (
                    f", skipped {len(skipped_dates)} day(s) (already present locally)" if skipped_dates else ""
                ) + (
                    f", failed {len(failed_dates)} day(s)" if failed_dates else ""
                )
            }

        except MCPError as e:
            return {
                "success": False,
                "error": e.to_dict()
            }
        except Exception as e:
            return {
                "success": False,
                "error": {
                    "code": "INTERNAL_ERROR",
                    "message": str(e)
                }
            }

    def get_storage_status(self) -> Dict:
        """
        Get the storage configuration and status

        Returns:
            Storage status dict
        """
        try:
            storage_config = self._get_storage_config()
            config = self._load_config()

            # Local storage status
            local_config = storage_config.get("local", {})
            local_dir = self._get_local_data_dir()
            local_size = self._calculate_dir_size(local_dir)
            local_dates = self._get_local_dates()

            local_status = {
                "data_dir": local_config.get("data_dir", "output"),
                "retention_days": local_config.get("retention_days", 0),
                "total_size": f"{local_size / 1024 / 1024:.2f} MB",
                "total_size_bytes": local_size,
                "date_count": len(local_dates),
                "earliest_date": local_dates[-1] if local_dates else None,
                "latest_date": local_dates[0] if local_dates else None,
            }

            # Remote storage status
            remote_config = storage_config.get("remote", {})
            has_remote = self._has_remote_config()

            remote_status = {
                "configured": has_remote,
                "retention_days": remote_config.get("retention_days", 0),
            }

            if has_remote:
                merged_config = self._get_remote_config()
                # Redacted display
                endpoint = merged_config.get("endpoint_url", "")
                bucket = merged_config.get("bucket_name", "")
                remote_status["endpoint_url"] = endpoint
                remote_status["bucket_name"] = bucket

                # Try to fetch the remote date list
                remote_backend = self._get_remote_backend()
                if remote_backend:
                    try:
                        remote_dates = remote_backend.list_remote_dates()
                        remote_status["date_count"] = len(remote_dates)
                        remote_status["earliest_date"] = remote_dates[-1] if remote_dates else None
                        remote_status["latest_date"] = remote_dates[0] if remote_dates else None
                    except Exception as e:
                        remote_status["error"] = str(e)

            # Pull configuration status
            pull_config = storage_config.get("pull", {})
            pull_status = {
                "enabled": pull_config.get("enabled", False),
                "days": pull_config.get("days", 7),
            }

            return {
                "success": True,
                "backend": storage_config.get("backend", "auto"),
                "local": local_status,
                "remote": remote_status,
                "pull": pull_status,
            }

        except MCPError as e:
            return {
                "success": False,
                "error": e.to_dict()
            }
        except Exception as e:
            return {
                "success": False,
                "error": {
                    "code": "INTERNAL_ERROR",
                    "message": str(e)
                }
            }

    def list_available_dates(self, source: str = "both") -> Dict:
        """
        List the available date ranges

        Args:
            source: data source
                - "local": local only
                - "remote": remote only
                - "both": list both (default)

        Returns:
            Date list dict
        """
        try:
            result = {
                "success": True,
            }

            # Local dates
            if source in ("local", "both"):
                local_dates = self._get_local_dates()
                result["local"] = {
                    "dates": local_dates,
                    "count": len(local_dates),
                    "earliest": local_dates[-1] if local_dates else None,
                    "latest": local_dates[0] if local_dates else None,
                }

            # Remote dates
            if source in ("remote", "both"):
                if not self._has_remote_config():
                    result["remote"] = {
                        "configured": False,
                        "dates": [],
                        "count": 0,
                        "earliest": None,
                        "latest": None,
                        "error": "Remote storage is not configured"
                    }
                else:
                    remote_backend = self._get_remote_backend()
                    if remote_backend:
                        try:
                            remote_dates = remote_backend.list_remote_dates()
                            result["remote"] = {
                                "configured": True,
                                "dates": remote_dates,
                                "count": len(remote_dates),
                                "earliest": remote_dates[-1] if remote_dates else None,
                                "latest": remote_dates[0] if remote_dates else None,
                            }
                        except Exception as e:
                            result["remote"] = {
                                "configured": True,
                                "dates": [],
                                "count": 0,
                                "earliest": None,
                                "latest": None,
                                "error": str(e)
                            }
                    else:
                        result["remote"] = {
                            "configured": True,
                            "dates": [],
                            "count": 0,
                            "earliest": None,
                            "latest": None,
                            "error": "Could not create the remote storage backend"
                        }

            # When querying both, compute the differences
            if source == "both" and "local" in result and "remote" in result:
                local_set = set(result["local"]["dates"])
                remote_set = set(result["remote"].get("dates", []))

                result["comparison"] = {
                    "only_local": sorted(list(local_set - remote_set), reverse=True),
                    "only_remote": sorted(list(remote_set - local_set), reverse=True),
                    "both": sorted(list(local_set & remote_set), reverse=True),
                }

            return result

        except MCPError as e:
            return {
                "success": False,
                "error": e.to_dict()
            }
        except Exception as e:
            return {
                "success": False,
                "error": {
                    "code": "INTERNAL_ERROR",
                    "message": str(e)
                }
            }
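A quick usage sketch of the class above; the day count is illustrative, and the output depends on the local config/config.yaml plus the S3 environment variables it reads:

tools = StorageSyncTools()
status = tools.get_storage_status()
print(status["local"]["latest_date"])

result = tools.sync_from_remote(days=3)
if result["success"]:
    print(result["message"], result["synced_dates"])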
@ -87,13 +87,13 @@ class SystemManagementTools:
         >>> print(result['saved_files'])
         """
         try:
-            import json
             import time
-            import random
-            import requests
-            from datetime import datetime
-            import pytz
             import yaml
+            from trendradar.crawler.fetcher import DataFetcher
+            from trendradar.storage.local import LocalStorageBackend
+            from trendradar.storage.base import convert_crawl_results_to_news_data
+            from trendradar.utils.time import get_configured_time, format_date_folder, format_time_filename
+            from ..services.cache_service import get_cache

             # Parameter validation
             platforms = validate_platforms(platforms)
@ -129,9 +129,6 @@ class SystemManagementTools:
             else:
                 target_platforms = all_platforms

-            # Get the request interval
-            request_interval = config_data.get("crawler", {}).get("request_interval", 100)
-
             # Build the platform ID list
             ids = []
             for platform in target_platforms:
@ -142,87 +139,82 @@ class SystemManagementTools:

             print(f"Starting ad-hoc crawl, platforms: {[p.get('name', p['id']) for p in target_platforms]}")

-            # Crawl the data
-            results = {}
-            id_to_name = {}
-            failed_ids = []
-
-            for i, id_info in enumerate(ids):
-                if isinstance(id_info, tuple):
-                    id_value, name = id_info
-                else:
-                    id_value = id_info
-                    name = id_value
-
-                id_to_name[id_value] = name
-
-                # Build the request URL
-                url = f"https://newsnow.busiyi.world/api/s?id={id_value}&latest"
-
-                headers = {
-                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
-                    "Accept": "application/json, text/plain, */*",
-                    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
-                    "Connection": "keep-alive",
-                    "Cache-Control": "no-cache",
-                }
-
-                # Retry mechanism
-                max_retries = 2
-                retries = 0
-                success = False
-
-                while retries <= max_retries and not success:
-                    try:
-                        response = requests.get(url, headers=headers, timeout=10)
-                        response.raise_for_status()
-
-                        data_text = response.text
-                        data_json = json.loads(data_text)
-
-                        status = data_json.get("status", "unknown")
-                        if status not in ["success", "cache"]:
-                            raise ValueError(f"Unexpected response status: {status}")
-
-                        status_info = "fresh data" if status == "success" else "cached data"
-                        print(f"Fetched {id_value} successfully ({status_info})")
-
-                        # Parse the data
-                        results[id_value] = {}
-                        for index, item in enumerate(data_json.get("items", []), 1):
-                            title = item["title"]
-                            url_link = item.get("url", "")
-                            mobile_url = item.get("mobileUrl", "")
-
-                            if title in results[id_value]:
-                                results[id_value][title]["ranks"].append(index)
-                            else:
-                                results[id_value][title] = {
-                                    "ranks": [index],
-                                    "url": url_link,
-                                    "mobileUrl": mobile_url,
-                                }
-
-                        success = True
-
-                    except Exception as e:
-                        retries += 1
-                        if retries <= max_retries:
-                            wait_time = random.uniform(3, 5)
-                            print(f"Request for {id_value} failed: {e}. Retrying in {wait_time:.2f}s...")
-                            time.sleep(wait_time)
-                        else:
-                            print(f"Request for {id_value} failed: {e}")
-                            failed_ids.append(id_value)
-
-                # Request interval
-                if i < len(ids) - 1:
-                    actual_interval = request_interval + random.randint(-10, 20)
-                    actual_interval = max(50, actual_interval)
-                    time.sleep(actual_interval / 1000)
-
-            # Format the response data
-            news_data = []
+            # Initialize the data fetcher
+            crawler_config = config_data.get("crawler", {})
+            proxy_url = None
+            if crawler_config.get("use_proxy"):
+                proxy_url = crawler_config.get("proxy_url")
+
+            fetcher = DataFetcher(proxy_url=proxy_url)
+            request_interval = crawler_config.get("request_interval", 100)
+
+            # Run the crawl
+            results, id_to_name, failed_ids = fetcher.crawl_websites(
+                ids_list=ids,
+                request_interval=request_interval
+            )
+
+            # Get the current time (via the shared trendradar time helpers)
+            # Read the timezone from config, defaulting to Asia/Shanghai
+            timezone = config_data.get("app", {}).get("timezone", "Asia/Shanghai")
+            current_time = get_configured_time(timezone)
+            crawl_date = format_date_folder(None, timezone)
+            crawl_time_str = format_time_filename(timezone)
+
+            # Convert to the standard data model
+            news_data = convert_crawl_results_to_news_data(
+                results=results,
+                id_to_name=id_to_name,
+                failed_ids=failed_ids,
+                crawl_time=crawl_time_str,
+                crawl_date=crawl_date
+            )
+
+            # Initialize the storage backend
+            storage = LocalStorageBackend(
+                data_dir=str(self.project_root / "output"),
+                enable_txt=True,
+                enable_html=True,
+                timezone=timezone
+            )
+
+            # Try to persist the data
+            save_success = False
+            save_error_msg = ""
+            saved_files = {}
+
+            try:
+                # 1. Save to SQLite (the core persistence step)
+                if storage.save_news_data(news_data):
+                    save_success = True
+
+                # 2. If a local save was requested, generate TXT/HTML snapshots
+                if save_to_local:
+                    # Save TXT
+                    txt_path = storage.save_txt_snapshot(news_data)
+                    if txt_path:
+                        saved_files["txt"] = txt_path
+
+                    # Save HTML (using the simplified generator)
+                    html_content = self._generate_simple_html(results, id_to_name, failed_ids, current_time)
+                    html_filename = f"{crawl_time_str}.html"
+                    html_path = storage.save_html_report(html_content, html_filename)
+                    if html_path:
+                        saved_files["html"] = html_path
+
+            except Exception as e:
+                # Catch every save error (notably PermissionError from read-only Docker volumes)
+                print(f"[System] Failed to save data: {e}")
+                save_success = False
+                save_error_msg = str(e)
+
+            # 3. Clear the cache so the next query sees fresh data
+            # Even if the save failed, in-memory data may have been updated elsewhere or be transient
+            get_cache().clear()
+            print("[System] Cache cleared")
+
+            # Build the response
+            news_response_data = []

             for platform_id, titles_data in results.items():
                 platform_name = id_to_name.get(platform_id, platform_id)
                 for title, info in titles_data.items():
@ -230,131 +222,42 @@ class SystemManagementTools:
                         "platform_id": platform_id,
                         "platform_name": platform_name,
                         "title": title,
-                        "ranks": info["ranks"]
+                        "ranks": info.get("ranks", [])
                     }

-                    # Conditionally add the URL fields
                     if include_url:
                         news_item["url"] = info.get("url", "")
                         news_item["mobile_url"] = info.get("mobileUrl", "")

-                    news_data.append(news_item)
-
-            # Get Beijing time
-            beijing_tz = pytz.timezone("Asia/Shanghai")
-            now = datetime.now(beijing_tz)
-
-            # Build the result
+                    news_response_data.append(news_item)

             result = {
                 "success": True,
                 "task_id": f"crawl_{int(time.time())}",
                 "status": "completed",
-                "crawl_time": now.strftime("%Y-%m-%d %H:%M:%S"),
+                "crawl_time": current_time.strftime("%Y-%m-%d %H:%M:%S"),
                 "platforms": list(results.keys()),
-                "total_news": len(news_data),
+                "total_news": len(news_response_data),
                 "failed_platforms": failed_ids,
-                "data": news_data,
+                "data": news_response_data,
-                "saved_to_local": save_to_local
+                "saved_to_local": save_success and save_to_local
             }

-            # If persistence was requested, run the save logic
-            if save_to_local:
-                try:
-                    import re
-
-                    # Helper: clean a title
-                    def clean_title(title: str) -> str:
-                        """Strip special characters from a title"""
-                        if not isinstance(title, str):
-                            title = str(title)
-                        cleaned_title = title.replace("\n", " ").replace("\r", " ")
-                        cleaned_title = re.sub(r"\s+", " ", cleaned_title)
-                        cleaned_title = cleaned_title.strip()
-                        return cleaned_title
-
-                    # Helper: create a directory
-                    def ensure_directory_exists(directory: str):
-                        """Make sure the directory exists"""
-                        Path(directory).mkdir(parents=True, exist_ok=True)
-
-                    # Format the date and time
-                    date_folder = now.strftime("%Y年%m月%d日")
-                    time_filename = now.strftime("%H时%M分")
-
-                    # Build the txt file path
-                    txt_dir = self.project_root / "output" / date_folder / "txt"
-                    ensure_directory_exists(str(txt_dir))
-                    txt_file_path = txt_dir / f"{time_filename}.txt"
-
-                    # Build the html file path
-                    html_dir = self.project_root / "output" / date_folder / "html"
-                    ensure_directory_exists(str(html_dir))
-                    html_file_path = html_dir / f"{time_filename}.html"
-
-                    # Save the txt file (matching main.py's format)
-                    with open(txt_file_path, "w", encoding="utf-8") as f:
-                        for id_value, title_data in results.items():
-                            # id | name, or just id
-                            name = id_to_name.get(id_value)
-                            if name and name != id_value:
-                                f.write(f"{id_value} | {name}\n")
-                            else:
-                                f.write(f"{id_value}\n")
-
-                            # Sort titles by rank
-                            sorted_titles = []
-                            for title, info in title_data.items():
-                                cleaned = clean_title(title)
-                                if isinstance(info, dict):
-                                    ranks = info.get("ranks", [])
-                                    url = info.get("url", "")
-                                    mobile_url = info.get("mobileUrl", "")
-                                else:
-                                    ranks = info if isinstance(info, list) else []
-                                    url = ""
-                                    mobile_url = ""
-
-                                rank = ranks[0] if ranks else 1
-                                sorted_titles.append((rank, cleaned, url, mobile_url))
-
-                            sorted_titles.sort(key=lambda x: x[0])
-
-                            for rank, cleaned, url, mobile_url in sorted_titles:
-                                line = f"{rank}. {cleaned}"
-                                if url:
-                                    line += f" [URL:{url}]"
-                                if mobile_url:
-                                    line += f" [MOBILE:{mobile_url}]"
-                                f.write(line + "\n")
-
-                            f.write("\n")
-
-                        if failed_ids:
-                            f.write("==== The following IDs failed ====\n")
-                            for id_value in failed_ids:
-                                f.write(f"{id_value}\n")
-
-                    # Save the html file (simplified)
-                    html_content = self._generate_simple_html(results, id_to_name, failed_ids, now)
-                    with open(html_file_path, "w", encoding="utf-8") as f:
-                        f.write(html_content)
-
-                    print(f"Data saved to:")
-                    print(f"  TXT: {txt_file_path}")
-                    print(f"  HTML: {html_file_path}")
-
-                    result["saved_files"] = {
-                        "txt": str(txt_file_path),
-                        "html": str(html_file_path)
-                    }
-                    result["note"] = "Data persisted to the output folder"
-
-                except Exception as e:
-                    print(f"Failed to save files: {e}")
-                    result["save_error"] = str(e)
-                    result["note"] = "Crawl succeeded but saving failed; data exists only in memory"
+            if save_success:
+                if save_to_local:
+                    result["saved_files"] = saved_files
+                    result["note"] = "Data saved to the SQLite database and the output folder"
+                else:
+                    result["note"] = "Data saved to the SQLite database (results returned in memory only; no TXT snapshot generated)"
             else:
-                result["note"] = "Ad-hoc crawl result; not persisted to the output folder"
+                # Tell the user explicitly that the save failed
+                result["saved_to_local"] = False
+                result["save_error"] = save_error_msg
+                if "Read-only file system" in save_error_msg or "Permission denied" in save_error_msg:
+                    result["note"] = "Crawl succeeded, but the database could not be written (read-only Docker mode). The data is only valid in this response."
+                else:
+                    result["note"] = f"Crawl succeeded but saving failed: {save_error_msg}"
+
+            # Clean up resources
+            storage.cleanup()

             return result
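Callers of the refactored method can branch on the new save flags; a minimal sketch, assuming `result` is the response dict assembled above:

# Assuming `result` is the response dict assembled above
if result["saved_to_local"]:
    print("snapshots written:", result.get("saved_files", {}))
else:
    print("not persisted:", result.get("save_error") or result["note"])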
@ -283,13 +283,13 @@ class DateParser:
             date: a datetime object

         Returns:
-            Folder name, format: YYYY年MM月DD日
+            Folder name, format: YYYY-MM-DD

         Examples:
             >>> DateParser.format_date_folder(datetime(2025, 10, 11))
-            '2025年10月11日'
+            '2025-10-11'
         """
-        return date.strftime("%Y年%m月%d日")
+        return date.strftime("%Y-%m-%d")

     @staticmethod
     def validate_date_not_future(date: datetime) -> None:
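Existing output folders still use the Chinese date layout, so readers have to accept both; a small sketch mirroring `_parse_date_folder_name` from storage_sync.py above (the function name `parse_folder` is a hypothetical stand-in):

import re
from datetime import datetime

def parse_folder(name: str):
    # Try the new ISO layout first, then the legacy Chinese layout
    for pattern in (r"(\d{4})-(\d{2})-(\d{2})", r"(\d{4})年(\d{2})月(\d{2})日"):
        m = re.match(pattern, name)
        if m:
            return datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)))
    return None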
@ -1,6 +1,6 @@
 [project]
 name = "trendradar-mcp"
-version = "1.0.3"
+version = "1.1.0"
 description = "TrendRadar MCP Server - hot news aggregation tool"
 requires-python = ">=3.10"
 dependencies = [
@ -3,3 +3,4 @@ pytz>=2025.2,<2026.0
 PyYAML>=6.0.3,<7.0.0
 fastmcp>=2.12.0,<2.14.0
 websockets>=13.0,<14.0
+boto3>=1.35.0,<2.0.0
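boto3 joins the dependencies for the S3-compatible remote backend. A minimal sketch of the client setup and per-date download that `sync_from_remote` above implies; the endpoint, bucket, and credentials are placeholder assumptions, while the `news/<YYYY-MM-DD>.db` key layout comes from that code:

import boto3

# Assumed S3-compatible credentials/endpoint, as read by _get_remote_config()
s3 = boto3.client(
    "s3",
    endpoint_url="https://example-endpoint",   # assumption
    aws_access_key_id="AKIA...",               # assumption
    aws_secret_access_key="...",               # assumption
)
# Key layout taken from sync_from_remote(): news/<YYYY-MM-DD>.db
s3.download_file("my-bucket", "news/2025-12-01.db", "output/2025-12-01/news.db")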
13
trendradar/__init__.py
Normal file
@ -0,0 +1,13 @@
# coding=utf-8
"""
TrendRadar - hot news aggregation and analysis tool

Usage:
    python -m trendradar   # run as a module
    trendradar             # run after installation
"""

from trendradar.context import AppContext

__version__ = "4.0.0"
__all__ = ["AppContext", "__version__"]
719
trendradar/__main__.py
Normal file
@ -0,0 +1,719 @@
# coding=utf-8
"""
TrendRadar main program

Hot news aggregation and analysis tool
Supports: python -m trendradar
"""

import os
import webbrowser
from pathlib import Path
from typing import Dict, List, Tuple, Optional

import requests

from trendradar.context import AppContext

# Version defined directly here to avoid a circular import
VERSION = "4.0.0"
from trendradar.core import load_config
from trendradar.crawler import DataFetcher
from trendradar.storage import convert_crawl_results_to_news_data


def check_version_update(
    current_version: str, version_url: str, proxy_url: Optional[str] = None
) -> Tuple[bool, Optional[str]]:
    """Check for a version update"""
    try:
        proxies = None
        if proxy_url:
            proxies = {"http": proxy_url, "https": proxy_url}

        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            "Accept": "text/plain, */*",
            "Cache-Control": "no-cache",
        }

        response = requests.get(
            version_url, proxies=proxies, headers=headers, timeout=10
        )
        response.raise_for_status()

        remote_version = response.text.strip()
        print(f"Current version: {current_version}, remote version: {remote_version}")

        # Compare versions
        def parse_version(version_str):
            try:
                parts = version_str.strip().split(".")
                if len(parts) != 3:
                    raise ValueError("Malformed version string")
                return int(parts[0]), int(parts[1]), int(parts[2])
            except:
                return 0, 0, 0

        current_tuple = parse_version(current_version)
        remote_tuple = parse_version(remote_version)

        need_update = current_tuple < remote_tuple
        return need_update, remote_version if need_update else None

    except Exception as e:
        print(f"Version check failed: {e}")
        return False, None


# === Main analyzer ===
class NewsAnalyzer:
    """News analyzer"""

    # Mode strategy definitions
    MODE_STRATEGIES = {
        "incremental": {
            "mode_name": "incremental mode",
            "description": "Incremental mode (only watch newly added news; no push when nothing is new)",
            "realtime_report_type": "realtime incremental",
            "summary_report_type": "daily summary",
            "should_send_realtime": True,
            "should_generate_summary": True,
            "summary_mode": "daily",
        },
        "current": {
            "mode_name": "current list mode",
            "description": "Current list mode (news matched on the current list + new-items section + scheduled push)",
            "realtime_report_type": "realtime current list",
            "summary_report_type": "current list summary",
            "should_send_realtime": True,
            "should_generate_summary": True,
            "summary_mode": "current",
        },
        "daily": {
            "mode_name": "daily summary mode",
            "description": "Daily summary mode (all matched news + new-items section + scheduled push)",
            "realtime_report_type": "",
            "summary_report_type": "daily summary",
            "should_send_realtime": False,
            "should_generate_summary": True,
            "summary_mode": "daily",
        },
    }

    def __init__(self):
        # Load the configuration
        print("Loading configuration...")
        config = load_config()
        print(f"TrendRadar v{VERSION} configuration loaded")
        print(f"Number of monitored platforms: {len(config['PLATFORMS'])}")
        print(f"Timezone: {config.get('TIMEZONE', 'Asia/Shanghai')}")

        # Create the application context
        self.ctx = AppContext(config)

        self.request_interval = self.ctx.config["REQUEST_INTERVAL"]
        self.report_mode = self.ctx.config["REPORT_MODE"]
        self.rank_threshold = self.ctx.rank_threshold
        self.is_github_actions = os.environ.get("GITHUB_ACTIONS") == "true"
        self.is_docker_container = self._detect_docker_environment()
        self.update_info = None
        self.proxy_url = None
        self._setup_proxy()
        self.data_fetcher = DataFetcher(self.proxy_url)

        # Initialize the storage manager (via AppContext)
        self._init_storage_manager()

        if self.is_github_actions:
            self._check_version_update()

    def _init_storage_manager(self) -> None:
        """Initialize the storage manager (via AppContext)"""
        # Data retention days (overridable via environment variable)
        env_retention = os.environ.get("STORAGE_RETENTION_DAYS", "").strip()
        if env_retention:
            # The environment variable overrides the config
            self.ctx.config["STORAGE"]["RETENTION_DAYS"] = int(env_retention)

        self.storage_manager = self.ctx.get_storage_manager()
        print(f"Storage backend: {self.storage_manager.backend_name}")

        retention_days = self.ctx.config.get("STORAGE", {}).get("RETENTION_DAYS", 0)
        if retention_days > 0:
            print(f"Data retention: {retention_days} days")

    def _detect_docker_environment(self) -> bool:
        """Detect whether we are running inside a Docker container"""
        try:
            if os.environ.get("DOCKER_CONTAINER") == "true":
                return True

            if os.path.exists("/.dockerenv"):
                return True

            return False
        except Exception:
            return False

    def _should_open_browser(self) -> bool:
        """Decide whether a browser should be opened"""
        return not self.is_github_actions and not self.is_docker_container

    def _setup_proxy(self) -> None:
        """Configure the proxy"""
        if not self.is_github_actions and self.ctx.config["USE_PROXY"]:
            self.proxy_url = self.ctx.config["DEFAULT_PROXY"]
            print("Local environment, using proxy")
        elif not self.is_github_actions and not self.ctx.config["USE_PROXY"]:
            print("Local environment, proxy disabled")
        else:
            print("GitHub Actions environment, no proxy")

    def _check_version_update(self) -> None:
        """Check for a version update"""
        try:
            need_update, remote_version = check_version_update(
                VERSION, self.ctx.config["VERSION_CHECK_URL"], self.proxy_url
            )

            if need_update and remote_version:
                self.update_info = {
                    "current_version": VERSION,
                    "remote_version": remote_version,
                }
                print(f"New version available: {remote_version} (current: {VERSION})")
            else:
                print("Version check complete; already up to date")
        except Exception as e:
            print(f"Version check error: {e}")

    def _get_mode_strategy(self) -> Dict:
        """Get the strategy configuration for the current mode"""
        return self.MODE_STRATEGIES.get(self.report_mode, self.MODE_STRATEGIES["daily"])

    def _has_notification_configured(self) -> bool:
        """Check whether any notification channel is configured"""
        cfg = self.ctx.config
        return any(
            [
                cfg["FEISHU_WEBHOOK_URL"],
                cfg["DINGTALK_WEBHOOK_URL"],
                cfg["WEWORK_WEBHOOK_URL"],
                (cfg["TELEGRAM_BOT_TOKEN"] and cfg["TELEGRAM_CHAT_ID"]),
                (
                    cfg["EMAIL_FROM"]
                    and cfg["EMAIL_PASSWORD"]
                    and cfg["EMAIL_TO"]
                ),
                (cfg["NTFY_SERVER_URL"] and cfg["NTFY_TOPIC"]),
                cfg["BARK_URL"],
                cfg["SLACK_WEBHOOK_URL"],
            ]
        )

    def _has_valid_content(
        self, stats: List[Dict], new_titles: Optional[Dict] = None
    ) -> bool:
        """Check whether there is any valid news content"""
        if self.report_mode in ["incremental", "current"]:
            # In incremental and current modes, non-empty stats mean matched news
            return any(stat["count"] > 0 for stat in stats)
        else:
            # In daily summary mode, check for matched frequency-word news or new items
            has_matched_news = any(stat["count"] > 0 for stat in stats)
            has_new_news = bool(
                new_titles and any(len(titles) > 0 for titles in new_titles.values())
            )
            return has_matched_news or has_new_news

    def _load_analysis_data(
        self,
    ) -> Optional[Tuple[Dict, Dict, Dict, Dict, List, List, List]]:
        """Unified data loading and preprocessing; filters history by the current platform list"""
        try:
            # IDs of the currently monitored platforms
            current_platform_ids = self.ctx.platform_ids
            print(f"Currently monitored platforms: {current_platform_ids}")

            all_results, id_to_name, title_info = self.ctx.read_today_titles(
                current_platform_ids
            )

            if not all_results:
                print("No data found for today")
                return None

            total_titles = sum(len(titles) for titles in all_results.values())
            print(f"Read {total_titles} titles (filtered by the current platforms)")

            new_titles = self.ctx.detect_new_titles(current_platform_ids)
            word_groups, filter_words, global_filters = self.ctx.load_frequency_words()

            return (
                all_results,
                id_to_name,
                title_info,
                new_titles,
                word_groups,
                filter_words,
                global_filters,
            )
        except Exception as e:
            print(f"Data loading failed: {e}")
            return None

    def _prepare_current_title_info(self, results: Dict, time_info: str) -> Dict:
        """Build title info from the current crawl results"""
        title_info = {}
        for source_id, titles_data in results.items():
            title_info[source_id] = {}
            for title, title_data in titles_data.items():
                ranks = title_data.get("ranks", [])
                url = title_data.get("url", "")
                mobile_url = title_data.get("mobileUrl", "")

                title_info[source_id][title] = {
                    "first_time": time_info,
                    "last_time": time_info,
                    "count": 1,
                    "ranks": ranks,
                    "url": url,
                    "mobileUrl": mobile_url,
                }
        return title_info

    def _run_analysis_pipeline(
        self,
        data_source: Dict,
        mode: str,
        title_info: Dict,
        new_titles: Dict,
        word_groups: List[Dict],
        filter_words: List[str],
        id_to_name: Dict,
        failed_ids: Optional[List] = None,
        is_daily_summary: bool = False,
        global_filters: Optional[List[str]] = None,
    ) -> Tuple[List[Dict], Optional[str]]:
        """Unified analysis pipeline: data processing → statistics → HTML generation"""

        # Statistics (via AppContext)
        stats, total_titles = self.ctx.count_frequency(
            data_source,
            word_groups,
            filter_words,
            id_to_name,
            title_info,
            new_titles,
            mode=mode,
            global_filters=global_filters,
        )

        # HTML generation (if enabled)
        html_file = None
        if self.ctx.config["STORAGE"]["FORMATS"]["HTML"]:
            html_file = self.ctx.generate_html(
                stats,
                total_titles,
                failed_ids=failed_ids,
                new_titles=new_titles,
                id_to_name=id_to_name,
                mode=mode,
                is_daily_summary=is_daily_summary,
                update_info=self.update_info if self.ctx.config["SHOW_VERSION_UPDATE"] else None,
            )

        return stats, html_file

    def _send_notification_if_needed(
        self,
        stats: List[Dict],
        report_type: str,
        mode: str,
        failed_ids: Optional[List] = None,
        new_titles: Optional[Dict] = None,
        id_to_name: Optional[Dict] = None,
        html_file_path: Optional[str] = None,
    ) -> bool:
        """Unified notification logic, including all gating conditions"""
        has_notification = self._has_notification_configured()
        cfg = self.ctx.config

        if (
            cfg["ENABLE_NOTIFICATION"]
            and has_notification
            and self._has_valid_content(stats, new_titles)
        ):
            # Push window control
            if cfg["PUSH_WINDOW"]["ENABLED"]:
                push_manager = self.ctx.create_push_manager()
                time_range_start = cfg["PUSH_WINDOW"]["TIME_RANGE"]["START"]
                time_range_end = cfg["PUSH_WINDOW"]["TIME_RANGE"]["END"]

                if not push_manager.is_in_time_range(time_range_start, time_range_end):
                    now = self.ctx.get_time()
                    print(
                        f"Push window control: current time {now.strftime('%H:%M')} is outside the push window {time_range_start}-{time_range_end}, skipping push"
                    )
                    return False

                if cfg["PUSH_WINDOW"]["ONCE_PER_DAY"]:
                    if push_manager.has_pushed_today():
                        print("Push window control: already pushed today, skipping this push")
                        return False
                    else:
                        print("Push window control: first push of the day")

            # Prepare the report data
            report_data = self.ctx.prepare_report(stats, failed_ids, new_titles, id_to_name, mode)

            # Whether to include version update info
            update_info_to_send = self.update_info if cfg["SHOW_VERSION_UPDATE"] else None

            # Send to all channels via the NotificationDispatcher
            dispatcher = self.ctx.create_notification_dispatcher()
            results = dispatcher.dispatch_all(
                report_data=report_data,
                report_type=report_type,
                update_info=update_info_to_send,
                proxy_url=self.proxy_url,
                mode=mode,
                html_file_path=html_file_path,
            )

            if not results:
                print("No notification channel configured, skipping notification")
                return False

            # If anything was sent and once-per-day is enabled, record the push
            if (
                cfg["PUSH_WINDOW"]["ENABLED"]
                and cfg["PUSH_WINDOW"]["ONCE_PER_DAY"]
                and any(results.values())
            ):
                push_manager = self.ctx.create_push_manager()
                push_manager.record_push(report_type)

            return True

        elif cfg["ENABLE_NOTIFICATION"] and not has_notification:
            print("⚠️ Warning: notifications are enabled but no channel is configured; skipping notification")
        elif not cfg["ENABLE_NOTIFICATION"]:
            print(f"Skipping {report_type} notification: notifications are disabled")
        elif (
            cfg["ENABLE_NOTIFICATION"]
            and has_notification
            and not self._has_valid_content(stats, new_titles)
        ):
            mode_strategy = self._get_mode_strategy()
            if "realtime" in report_type:
                print(
                    f"Skipping realtime push: no matched news detected in {mode_strategy['mode_name']}"
                )
            else:
                print(
                    f"Skipping {mode_strategy['summary_report_type']} notification: no valid news content matched"
                )

        return False

    def _generate_summary_report(self, mode_strategy: Dict) -> Optional[str]:
        """Generate the summary report (with notification)"""
        summary_type = (
            "current list summary" if mode_strategy["summary_mode"] == "current" else "daily summary"
        )
        print(f"Generating {summary_type} report...")

        # Load the analysis data
        analysis_data = self._load_analysis_data()
        if not analysis_data:
            return None

        all_results, id_to_name, title_info, new_titles, word_groups, filter_words, global_filters = (
            analysis_data
        )

        # Run the analysis pipeline
        stats, html_file = self._run_analysis_pipeline(
            all_results,
            mode_strategy["summary_mode"],
            title_info,
            new_titles,
            word_groups,
            filter_words,
            id_to_name,
            is_daily_summary=True,
            global_filters=global_filters,
        )

        if html_file:
            print(f"{summary_type} report generated: {html_file}")

        # Send the notification
        self._send_notification_if_needed(
            stats,
            mode_strategy["summary_report_type"],
            mode_strategy["summary_mode"],
            failed_ids=[],
            new_titles=new_titles,
            id_to_name=id_to_name,
            html_file_path=html_file,
        )

        return html_file

    def _generate_summary_html(self, mode: str = "daily") -> Optional[str]:
        """Generate the summary HTML"""
        summary_type = "current list summary" if mode == "current" else "daily summary"
        print(f"Generating {summary_type} HTML...")

        # Load the analysis data
        analysis_data = self._load_analysis_data()
        if not analysis_data:
            return None

        all_results, id_to_name, title_info, new_titles, word_groups, filter_words, global_filters = (
            analysis_data
        )

        # Run the analysis pipeline
        _, html_file = self._run_analysis_pipeline(
            all_results,
            mode,
            title_info,
            new_titles,
            word_groups,
            filter_words,
            id_to_name,
            is_daily_summary=True,
            global_filters=global_filters,
        )

        if html_file:
            print(f"{summary_type} HTML generated: {html_file}")
        return html_file

    def _initialize_and_check_config(self) -> None:
        """Common initialization and configuration checks"""
        now = self.ctx.get_time()
        print(f"Current Beijing time: {now.strftime('%Y-%m-%d %H:%M:%S')}")

        if not self.ctx.config["ENABLE_CRAWLER"]:
            print("Crawler disabled (ENABLE_CRAWLER=False), exiting")
            return

        has_notification = self._has_notification_configured()
        if not self.ctx.config["ENABLE_NOTIFICATION"]:
            print("Notifications disabled (ENABLE_NOTIFICATION=False); will only crawl data")
        elif not has_notification:
            print("No notification channel configured; will only crawl data, no notifications")
        else:
            print("Notifications enabled; will send notifications")

        mode_strategy = self._get_mode_strategy()
        print(f"Report mode: {self.report_mode}")
        print(f"Run mode: {mode_strategy['description']}")

    def _crawl_data(self) -> Tuple[Dict, Dict, List]:
        """Run the data crawl"""
        ids = []
        for platform in self.ctx.platforms:
            if "name" in platform:
                ids.append((platform["id"], platform["name"]))
            else:
                ids.append(platform["id"])

        print(
            f"Configured monitored platforms: {[p.get('name', p['id']) for p in self.ctx.platforms]}"
        )
        print(f"Starting crawl, request interval {self.request_interval} ms")
        Path("output").mkdir(parents=True, exist_ok=True)

        results, id_to_name, failed_ids = self.data_fetcher.crawl_websites(
            ids, self.request_interval
        )

        # Convert to NewsData and save to the storage backend
        crawl_time = self.ctx.format_time()
        crawl_date = self.ctx.format_date()
        news_data = convert_crawl_results_to_news_data(
            results, id_to_name, failed_ids, crawl_time, crawl_date
        )

        # Save to the storage backend (SQLite)
        if self.storage_manager.save_news_data(news_data):
            print(f"Data saved to storage backend: {self.storage_manager.backend_name}")

            # Save a TXT snapshot (if enabled)
            txt_file = self.storage_manager.save_txt_snapshot(news_data)
            if txt_file:
                print(f"TXT snapshot saved: {txt_file}")

        # Compatibility: also save the legacy TXT format (to stay backward compatible)
        if self.ctx.config["STORAGE"]["FORMATS"]["TXT"]:
            title_file = self.ctx.save_titles(results, id_to_name, failed_ids)
            print(f"Titles saved to: {title_file}")

        return results, id_to_name, failed_ids

    def _execute_mode_strategy(
        self, mode_strategy: Dict, results: Dict, id_to_name: Dict, failed_ids: List
    ) -> Optional[str]:
        """Run the mode-specific logic"""
        # IDs of the currently monitored platforms
        current_platform_ids = self.ctx.platform_ids

        new_titles = self.ctx.detect_new_titles(current_platform_ids)
        time_info = self.ctx.format_time()
        if self.ctx.config["STORAGE"]["FORMATS"]["TXT"]:
            self.ctx.save_titles(results, id_to_name, failed_ids)
        word_groups, filter_words, global_filters = self.ctx.load_frequency_words()

        # In current mode, realtime pushes use the full history to keep statistics complete
        if self.report_mode == "current":
            # Load the full history (already filtered by the current platforms)
            analysis_data = self._load_analysis_data()
            if analysis_data:
                (
                    all_results,
                    historical_id_to_name,
                    historical_title_info,
                    historical_new_titles,
                    _,
                    _,
                    _,
                ) = analysis_data

                print(
                    f"current mode: using filtered history covering platforms: {list(all_results.keys())}"
                )

                stats, html_file = self._run_analysis_pipeline(
                    all_results,
                    self.report_mode,
                    historical_title_info,
                    historical_new_titles,
                    word_groups,
                    filter_words,
                    historical_id_to_name,
                    failed_ids=failed_ids,
                    global_filters=global_filters,
                )

                combined_id_to_name = {**historical_id_to_name, **id_to_name}

                if html_file:
                    print(f"HTML report generated: {html_file}")

                # Send the realtime notification (using full-history statistics)
                summary_html = None
                if mode_strategy["should_send_realtime"]:
                    self._send_notification_if_needed(
                        stats,
                        mode_strategy["realtime_report_type"],
                        self.report_mode,
                        failed_ids=failed_ids,
                        new_titles=historical_new_titles,
                        id_to_name=combined_id_to_name,
                        html_file_path=html_file,
                    )
            else:
                print("❌ Fatal error: cannot read back the data that was just saved")
                raise RuntimeError("Data consistency check failed: read-after-save failed")
        else:
            title_info = self._prepare_current_title_info(results, time_info)
            stats, html_file = self._run_analysis_pipeline(
                results,
                self.report_mode,
                title_info,
                new_titles,
                word_groups,
                filter_words,
                id_to_name,
                failed_ids=failed_ids,
                global_filters=global_filters,
            )
            if html_file:
                print(f"HTML report generated: {html_file}")

            # Send the realtime notification (if needed)
            summary_html = None
            if mode_strategy["should_send_realtime"]:
                self._send_notification_if_needed(
                    stats,
                    mode_strategy["realtime_report_type"],
                    self.report_mode,
                    failed_ids=failed_ids,
                    new_titles=new_titles,
                    id_to_name=id_to_name,
                    html_file_path=html_file,
                )

        # Generate the summary report (if needed)
        summary_html = None
        if mode_strategy["should_generate_summary"]:
            if mode_strategy["should_send_realtime"]:
                # A realtime notification was already sent; the summary is HTML only
                summary_html = self._generate_summary_html(
                    mode_strategy["summary_mode"]
                )
            else:
                # daily mode: generate the summary report and send the notification
                summary_html = self._generate_summary_report(mode_strategy)

        # Open the browser (only outside containers)
        if self._should_open_browser() and html_file:
            if summary_html:
                summary_url = "file://" + str(Path(summary_html).resolve())
                print(f"Opening summary report: {summary_url}")
                webbrowser.open(summary_url)
            else:
                file_url = "file://" + str(Path(html_file).resolve())
                print(f"Opening HTML report: {file_url}")
                webbrowser.open(file_url)
        elif self.is_docker_container and html_file:
            if summary_html:
                print(f"Summary report generated (Docker environment): {summary_html}")
            else:
                print(f"HTML report generated (Docker environment): {html_file}")

        return summary_html

    def run(self) -> None:
        """Run the analysis workflow"""
        try:
            self._initialize_and_check_config()

            mode_strategy = self._get_mode_strategy()

            results, id_to_name, failed_ids = self._crawl_data()

            self._execute_mode_strategy(mode_strategy, results, id_to_name, failed_ids)

        except Exception as e:
            print(f"Analysis workflow failed: {e}")
            raise
        finally:
            # Clean up resources (expired-data cleanup and closing the DB connection)
            self.ctx.cleanup()


def main():
    """Program entry point"""
    try:
        analyzer = NewsAnalyzer()
        analyzer.run()
    except FileNotFoundError as e:
        print(f"❌ Configuration file error: {e}")
        print("\nMake sure the following files exist:")
        print("  • config/config.yaml")
        print("  • config/frequency_words.txt")
        print("\nSee the project documentation for proper configuration")
    except Exception as e:
        print(f"❌ Program error: {e}")
        raise


if __name__ == "__main__":
    main()
388
trendradar/context.py
Normal file
@@ -0,0 +1,388 @@
# coding=utf-8
"""
Application context module.

Provides the configuration context class, wrapping every configuration-dependent
operation to eliminate global state and wrapper functions.
"""

from datetime import datetime
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple

from trendradar.utils.time import (
    get_configured_time,
    format_date_folder,
    format_time_filename,
    get_current_time_display,
    convert_time_for_display,
)
from trendradar.core import (
    load_frequency_words,
    matches_word_groups,
    save_titles_to_file,
    read_all_today_titles,
    detect_latest_new_titles,
    is_first_crawl_today,
    count_word_frequency,
)
from trendradar.report import (
    clean_title,
    prepare_report_data,
    generate_html_report,
    render_html_content,
)
from trendradar.notification import (
    render_feishu_content,
    render_dingtalk_content,
    split_content_into_batches,
    NotificationDispatcher,
    PushRecordManager,
)
from trendradar.storage import get_storage_manager


class AppContext:
    """
    Application context class.

    Wraps all configuration-dependent operations behind a unified interface.
    Removes the dependency on a global CONFIG and improves testability.

    Usage example:
        config = load_config()
        ctx = AppContext(config)

        # Time operations
        now = ctx.get_time()
        date_folder = ctx.format_date()

        # Storage operations
        storage = ctx.get_storage_manager()

        # Report generation
        html = ctx.generate_html_report(stats, total_titles, ...)
    """

    def __init__(self, config: Dict[str, Any]):
        """
        Initialize the application context.

        Args:
            config: the complete configuration dictionary
        """
        self.config = config
        self._storage_manager = None

    # === Configuration access ===

    @property
    def timezone(self) -> str:
        """Return the configured timezone."""
        return self.config.get("TIMEZONE", "Asia/Shanghai")

    @property
    def rank_threshold(self) -> int:
        """Return the rank threshold."""
        return self.config.get("RANK_THRESHOLD", 50)

    @property
    def weight_config(self) -> Dict:
        """Return the weight configuration."""
        return self.config.get("WEIGHT_CONFIG", {})

    @property
    def platforms(self) -> List[Dict]:
        """Return the list of platform configurations."""
        return self.config.get("PLATFORMS", [])

    @property
    def platform_ids(self) -> List[str]:
        """Return the list of platform IDs."""
        return [p["id"] for p in self.platforms]

    # === Time operations ===

    def get_time(self) -> datetime:
        """Return the current time in the configured timezone."""
        return get_configured_time(self.timezone)

    def format_date(self) -> str:
        """Format the date folder (YYYY-MM-DD)."""
        return format_date_folder(timezone=self.timezone)

    def format_time(self) -> str:
        """Format the time filename (HH-MM)."""
        return format_time_filename(self.timezone)

    def get_time_display(self) -> str:
        """Return the display time (HH:MM)."""
        return get_current_time_display(self.timezone)

    @staticmethod
    def convert_time_display(time_str: str) -> str:
        """Convert HH-MM to HH:MM."""
        return convert_time_for_display(time_str)

    # === Storage operations ===

    def get_storage_manager(self):
        """Return the storage manager (lazily initialized singleton)."""
        if self._storage_manager is None:
            storage_config = self.config.get("STORAGE", {})
            remote_config = storage_config.get("REMOTE", {})
            local_config = storage_config.get("LOCAL", {})
            pull_config = storage_config.get("PULL", {})

            self._storage_manager = get_storage_manager(
                backend_type=storage_config.get("BACKEND", "auto"),
                data_dir=local_config.get("DATA_DIR", "output"),
                enable_txt=storage_config.get("FORMATS", {}).get("TXT", True),
                enable_html=storage_config.get("FORMATS", {}).get("HTML", True),
                remote_config={
                    "bucket_name": remote_config.get("BUCKET_NAME", ""),
                    "access_key_id": remote_config.get("ACCESS_KEY_ID", ""),
                    "secret_access_key": remote_config.get("SECRET_ACCESS_KEY", ""),
                    "endpoint_url": remote_config.get("ENDPOINT_URL", ""),
                    "region": remote_config.get("REGION", ""),
                },
                local_retention_days=local_config.get("RETENTION_DAYS", 0),
                remote_retention_days=remote_config.get("RETENTION_DAYS", 0),
                pull_enabled=pull_config.get("ENABLED", False),
                pull_days=pull_config.get("DAYS", 7),
                timezone=self.timezone,
            )
        return self._storage_manager

    def get_output_path(self, subfolder: str, filename: str) -> str:
        """Return an output path, creating directories as needed."""
        output_dir = Path("output") / self.format_date() / subfolder
        output_dir.mkdir(parents=True, exist_ok=True)
        return str(output_dir / filename)

    # === Data handling ===

    def save_titles(self, results: Dict, id_to_name: Dict, failed_ids: List) -> str:
        """Save titles to a file."""
        output_path = self.get_output_path("txt", f"{self.format_time()}.txt")
        return save_titles_to_file(results, id_to_name, failed_ids, output_path, clean_title)

    def read_today_titles(
        self, platform_ids: Optional[List[str]] = None
    ) -> Tuple[Dict, Dict, Dict]:
        """Read all of today's titles."""
        return read_all_today_titles(self.get_storage_manager(), platform_ids)

    def detect_new_titles(
        self, platform_ids: Optional[List[str]] = None
    ) -> Dict:
        """Detect titles newly added in the latest crawl batch."""
        return detect_latest_new_titles(self.get_storage_manager(), platform_ids)

    def is_first_crawl(self) -> bool:
        """Check whether this is the first crawl of the day."""
        return is_first_crawl_today("output", self.format_date())

    # === Frequency word handling ===

    def load_frequency_words(
        self, frequency_file: Optional[str] = None
    ) -> Tuple[List[Dict], List[str], List[str]]:
        """Load the frequency word configuration."""
        return load_frequency_words(frequency_file)

    def matches_word_groups(
        self,
        title: str,
        word_groups: List[Dict],
        filter_words: List[str],
        global_filters: Optional[List[str]] = None,
    ) -> bool:
        """Check whether a title matches the word group rules."""
        return matches_word_groups(title, word_groups, filter_words, global_filters)

    # === Statistics ===

    def count_frequency(
        self,
        results: Dict,
        word_groups: List[Dict],
        filter_words: List[str],
        id_to_name: Dict,
        title_info: Optional[Dict] = None,
        new_titles: Optional[Dict] = None,
        mode: str = "daily",
        global_filters: Optional[List[str]] = None,
    ) -> Tuple[List[Dict], int]:
        """Count word frequencies."""
        return count_word_frequency(
            results=results,
            word_groups=word_groups,
            filter_words=filter_words,
            id_to_name=id_to_name,
            title_info=title_info,
            rank_threshold=self.rank_threshold,
            new_titles=new_titles,
            mode=mode,
            global_filters=global_filters,
            weight_config=self.weight_config,
            max_news_per_keyword=self.config.get("MAX_NEWS_PER_KEYWORD", 0),
            sort_by_position_first=self.config.get("SORT_BY_POSITION_FIRST", False),
            is_first_crawl_func=self.is_first_crawl,
            convert_time_func=self.convert_time_display,
        )

    # === Report generation ===

    def prepare_report(
        self,
        stats: List[Dict],
        failed_ids: Optional[List] = None,
        new_titles: Optional[Dict] = None,
        id_to_name: Optional[Dict] = None,
        mode: str = "daily",
    ) -> Dict:
        """Prepare report data."""
        return prepare_report_data(
            stats=stats,
            failed_ids=failed_ids,
            new_titles=new_titles,
            id_to_name=id_to_name,
            mode=mode,
            rank_threshold=self.rank_threshold,
            matches_word_groups_func=self.matches_word_groups,
            load_frequency_words_func=self.load_frequency_words,
        )

    def generate_html(
        self,
        stats: List[Dict],
        total_titles: int,
        failed_ids: Optional[List] = None,
        new_titles: Optional[Dict] = None,
        id_to_name: Optional[Dict] = None,
        mode: str = "daily",
        is_daily_summary: bool = False,
        update_info: Optional[Dict] = None,
    ) -> str:
        """Generate the HTML report."""
        return generate_html_report(
            stats=stats,
            total_titles=total_titles,
            failed_ids=failed_ids,
            new_titles=new_titles,
            id_to_name=id_to_name,
            mode=mode,
            is_daily_summary=is_daily_summary,
            update_info=update_info,
            rank_threshold=self.rank_threshold,
            output_dir="output",
            date_folder=self.format_date(),
            time_filename=self.format_time(),
            render_html_func=lambda *args, **kwargs: self.render_html(*args, **kwargs),
            matches_word_groups_func=self.matches_word_groups,
            load_frequency_words_func=self.load_frequency_words,
            enable_index_copy=True,
        )

    def render_html(
        self,
        report_data: Dict,
        total_titles: int,
        is_daily_summary: bool = False,
        mode: str = "daily",
        update_info: Optional[Dict] = None,
    ) -> str:
        """Render the HTML content."""
        return render_html_content(
            report_data=report_data,
            total_titles=total_titles,
            is_daily_summary=is_daily_summary,
            mode=mode,
            update_info=update_info,
            reverse_content_order=self.config.get("REVERSE_CONTENT_ORDER", False),
            get_time_func=self.get_time,
        )

    # === Notification content rendering ===

    def render_feishu(
        self,
        report_data: Dict,
        update_info: Optional[Dict] = None,
        mode: str = "daily",
    ) -> str:
        """Render Feishu content."""
        return render_feishu_content(
            report_data=report_data,
            update_info=update_info,
            mode=mode,
            separator=self.config.get("FEISHU_MESSAGE_SEPARATOR", "---"),
            reverse_content_order=self.config.get("REVERSE_CONTENT_ORDER", False),
            get_time_func=self.get_time,
        )

    def render_dingtalk(
        self,
        report_data: Dict,
        update_info: Optional[Dict] = None,
        mode: str = "daily",
    ) -> str:
        """Render DingTalk content."""
        return render_dingtalk_content(
            report_data=report_data,
            update_info=update_info,
            mode=mode,
            reverse_content_order=self.config.get("REVERSE_CONTENT_ORDER", False),
            get_time_func=self.get_time,
        )

    def split_content(
        self,
        report_data: Dict,
        format_type: str,
        update_info: Optional[Dict] = None,
        max_bytes: Optional[int] = None,
        mode: str = "daily",
    ) -> List[str]:
        """Split message content into batches."""
        return split_content_into_batches(
            report_data=report_data,
            format_type=format_type,
            update_info=update_info,
            max_bytes=max_bytes,
            mode=mode,
            batch_sizes={
                "dingtalk": self.config.get("DINGTALK_BATCH_SIZE", 20000),
                "feishu": self.config.get("FEISHU_BATCH_SIZE", 29000),
                "default": self.config.get("MESSAGE_BATCH_SIZE", 4000),
            },
            feishu_separator=self.config.get("FEISHU_MESSAGE_SEPARATOR", "---"),
            reverse_content_order=self.config.get("REVERSE_CONTENT_ORDER", False),
            get_time_func=self.get_time,
        )

    # === Notification dispatch ===

    def create_notification_dispatcher(self) -> NotificationDispatcher:
        """Create the notification dispatcher."""
        return NotificationDispatcher(
            config=self.config,
            get_time_func=self.get_time,
            split_content_func=self.split_content,
        )

    def create_push_manager(self) -> PushRecordManager:
        """Create the push record manager."""
        return PushRecordManager(
            storage_backend=self.get_storage_manager(),
            get_time_func=self.get_time,
        )

    # === Resource cleanup ===

    def cleanup(self):
        """Release resources."""
        if self._storage_manager:
            self._storage_manager.cleanup_old_data()
            self._storage_manager.cleanup()
            self._storage_manager = None
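
# A minimal end-to-end sketch of the AppContext lifecycle described above,
# assuming load_config() returns the configuration dict this module expects.
# The platform IDs and report call are illustrative, not part of this file:
#
#     ctx = AppContext(load_config())
#     try:
#         results, id_to_name, title_info = ctx.read_today_titles(ctx.platform_ids)
#         groups, filters, global_filters = ctx.load_frequency_words()
#         stats, total = ctx.count_frequency(results, groups, filters, id_to_name,
#                                            title_info=title_info)
#         ctx.generate_html(stats, total, id_to_name=id_to_name)
#     finally:
#         ctx.cleanup()  # always release the storage backend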
47
trendradar/core/__init__.py
Normal file
@@ -0,0 +1,47 @@
# coding=utf-8
"""
Core module - configuration management and core utilities
"""

from trendradar.core.config import (
    parse_multi_account_config,
    validate_paired_configs,
    limit_accounts,
    get_account_at_index,
)
from trendradar.core.loader import load_config
from trendradar.core.frequency import load_frequency_words, matches_word_groups
from trendradar.core.data import (
    save_titles_to_file,
    read_all_today_titles_from_storage,
    read_all_today_titles,
    detect_latest_new_titles_from_storage,
    detect_latest_new_titles,
    is_first_crawl_today,
)
from trendradar.core.analyzer import (
    calculate_news_weight,
    format_time_display,
    count_word_frequency,
)

__all__ = [
    "parse_multi_account_config",
    "validate_paired_configs",
    "limit_accounts",
    "get_account_at_index",
    "load_config",
    "load_frequency_words",
    "matches_word_groups",
    # Data handling
    "save_titles_to_file",
    "read_all_today_titles_from_storage",
    "read_all_today_titles",
    "detect_latest_new_titles_from_storage",
    "detect_latest_new_titles",
    "is_first_crawl_today",
    # Statistics
    "calculate_news_weight",
    "format_time_display",
    "count_word_frequency",
]
469
trendradar/core/analyzer.py
Normal file
@@ -0,0 +1,469 @@
# coding=utf-8
"""
Statistics and analysis module.

Provides news statistics and analysis helpers:
- calculate_news_weight: compute a news item's weight
- format_time_display: format time for display
- count_word_frequency: count word frequencies
"""

from typing import Dict, List, Tuple, Optional, Callable

from trendradar.core.frequency import matches_word_groups


def calculate_news_weight(
    title_data: Dict,
    rank_threshold: int,
    weight_config: Dict,
) -> float:
    """
    Compute a news item's weight, used for sorting.

    Args:
        title_data: title data containing ranks and count
        rank_threshold: rank threshold
        weight_config: weight configuration {RANK_WEIGHT, FREQUENCY_WEIGHT, HOTNESS_WEIGHT}

    Returns:
        float: the computed weight
    """
    ranks = title_data.get("ranks", [])
    if not ranks:
        return 0.0

    count = title_data.get("count", len(ranks))

    # Rank weight: sum(11 - min(rank, 10)) / number of appearances
    rank_scores = []
    for rank in ranks:
        score = 11 - min(rank, 10)
        rank_scores.append(score)

    rank_weight = sum(rank_scores) / len(ranks) if ranks else 0

    # Frequency weight: min(appearances, 10) x 10
    frequency_weight = min(count, 10) * 10

    # Hotness bonus: high-rank appearances / total appearances x 100
    high_rank_count = sum(1 for rank in ranks if rank <= rank_threshold)
    hotness_ratio = high_rank_count / len(ranks) if ranks else 0
    hotness_weight = hotness_ratio * 100

    total_weight = (
        rank_weight * weight_config["RANK_WEIGHT"]
        + frequency_weight * weight_config["FREQUENCY_WEIGHT"]
        + hotness_weight * weight_config["HOTNESS_WEIGHT"]
    )

    return total_weight
|
||||||
|
first_time: str,
|
||||||
|
last_time: str,
|
||||||
|
convert_time_func: Callable[[str], str],
|
||||||
|
) -> str:
|
||||||
|
"""
|
||||||
|
格式化时间显示(将 HH-MM 转换为 HH:MM)
|
||||||
|
|
||||||
|
Args:
|
||||||
|
first_time: 首次出现时间
|
||||||
|
last_time: 最后出现时间
|
||||||
|
convert_time_func: 时间格式转换函数
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: 格式化后的时间显示字符串
|
||||||
|
"""
|
||||||
|
if not first_time:
|
||||||
|
return ""
|
||||||
|
# 转换为显示格式
|
||||||
|
first_display = convert_time_func(first_time)
|
||||||
|
last_display = convert_time_func(last_time)
|
||||||
|
if first_display == last_display or not last_display:
|
||||||
|
return first_display
|
||||||
|
else:
|
||||||
|
return f"[{first_display} ~ {last_display}]"
|
||||||
|
|
||||||
|
|
||||||
|
def count_word_frequency(
|
||||||
|
results: Dict,
|
||||||
|
word_groups: List[Dict],
|
||||||
|
filter_words: List[str],
|
||||||
|
id_to_name: Dict,
|
||||||
|
title_info: Optional[Dict] = None,
|
||||||
|
rank_threshold: int = 3,
|
||||||
|
new_titles: Optional[Dict] = None,
|
||||||
|
mode: str = "daily",
|
||||||
|
global_filters: Optional[List[str]] = None,
|
||||||
|
weight_config: Optional[Dict] = None,
|
||||||
|
max_news_per_keyword: int = 0,
|
||||||
|
sort_by_position_first: bool = False,
|
||||||
|
is_first_crawl_func: Optional[Callable[[], bool]] = None,
|
||||||
|
convert_time_func: Optional[Callable[[str], str]] = None,
|
||||||
|
) -> Tuple[List[Dict], int]:
|
||||||
|
"""
|
||||||
|
统计词频,支持必须词、频率词、过滤词、全局过滤词,并标记新增标题
|
||||||
|
|
||||||
|
Args:
|
||||||
|
results: 抓取结果 {source_id: {title: title_data}}
|
||||||
|
word_groups: 词组配置列表
|
||||||
|
filter_words: 过滤词列表
|
||||||
|
id_to_name: ID 到名称的映射
|
||||||
|
title_info: 标题统计信息(可选)
|
||||||
|
rank_threshold: 排名阈值
|
||||||
|
new_titles: 新增标题(可选)
|
||||||
|
mode: 报告模式 (daily/incremental/current)
|
||||||
|
global_filters: 全局过滤词(可选)
|
||||||
|
weight_config: 权重配置
|
||||||
|
max_news_per_keyword: 每个关键词最大显示数量
|
||||||
|
sort_by_position_first: 是否优先按配置位置排序
|
||||||
|
is_first_crawl_func: 检测是否是当天第一次爬取的函数
|
||||||
|
convert_time_func: 时间格式转换函数
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple[List[Dict], int]: (统计结果列表, 总标题数)
|
||||||
|
"""
|
||||||
|
# 默认权重配置
|
||||||
|
if weight_config is None:
|
||||||
|
weight_config = {
|
||||||
|
"RANK_WEIGHT": 0.4,
|
||||||
|
"FREQUENCY_WEIGHT": 0.3,
|
||||||
|
"HOTNESS_WEIGHT": 0.3,
|
||||||
|
}
|
||||||
|
|
||||||
|
# 默认时间转换函数
|
||||||
|
if convert_time_func is None:
|
||||||
|
convert_time_func = lambda x: x
|
||||||
|
|
||||||
|
# 默认首次爬取检测函数
|
||||||
|
if is_first_crawl_func is None:
|
||||||
|
is_first_crawl_func = lambda: True
|
||||||
|
|
||||||
|
# 如果没有配置词组,创建一个包含所有新闻的虚拟词组
|
||||||
|
if not word_groups:
|
||||||
|
print("频率词配置为空,将显示所有新闻")
|
||||||
|
word_groups = [{"required": [], "normal": [], "group_key": "全部新闻"}]
|
||||||
|
filter_words = [] # 清空过滤词,显示所有新闻
|
||||||
|
|
||||||
|
is_first_today = is_first_crawl_func()
|
||||||
|
|
||||||
|
# 确定处理的数据源和新增标记逻辑
|
||||||
|
if mode == "incremental":
|
||||||
|
if is_first_today:
|
||||||
|
# 增量模式 + 当天第一次:处理所有新闻,都标记为新增
|
||||||
|
results_to_process = results
|
||||||
|
all_news_are_new = True
|
||||||
|
else:
|
||||||
|
# 增量模式 + 当天非第一次:只处理新增的新闻
|
||||||
|
results_to_process = new_titles if new_titles else {}
|
||||||
|
all_news_are_new = True
|
||||||
|
elif mode == "current":
|
||||||
|
# current 模式:只处理当前时间批次的新闻,但统计信息来自全部历史
|
||||||
|
if title_info:
|
||||||
|
latest_time = None
|
||||||
|
for source_titles in title_info.values():
|
||||||
|
for title_data in source_titles.values():
|
||||||
|
last_time = title_data.get("last_time", "")
|
||||||
|
if last_time:
|
||||||
|
if latest_time is None or last_time > latest_time:
|
||||||
|
latest_time = last_time
|
||||||
|
|
||||||
|
# 只处理 last_time 等于最新时间的新闻
|
||||||
|
if latest_time:
|
||||||
|
results_to_process = {}
|
||||||
|
for source_id, source_titles in results.items():
|
||||||
|
if source_id in title_info:
|
||||||
|
filtered_titles = {}
|
||||||
|
for title, title_data in source_titles.items():
|
||||||
|
if title in title_info[source_id]:
|
||||||
|
info = title_info[source_id][title]
|
||||||
|
if info.get("last_time") == latest_time:
|
||||||
|
filtered_titles[title] = title_data
|
||||||
|
if filtered_titles:
|
||||||
|
results_to_process[source_id] = filtered_titles
|
||||||
|
|
||||||
|
print(
|
||||||
|
f"当前榜单模式:最新时间 {latest_time},筛选出 {sum(len(titles) for titles in results_to_process.values())} 条当前榜单新闻"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
results_to_process = results
|
||||||
|
else:
|
||||||
|
results_to_process = results
|
||||||
|
all_news_are_new = False
|
||||||
|
else:
|
||||||
|
# 当日汇总模式:处理所有新闻
|
||||||
|
results_to_process = results
|
||||||
|
all_news_are_new = False
|
||||||
|
total_input_news = sum(len(titles) for titles in results.values())
|
||||||
|
filter_status = (
|
||||||
|
"全部显示"
|
||||||
|
if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
|
||||||
|
else "频率词过滤"
|
||||||
|
)
|
||||||
|
print(f"当日汇总模式:处理 {total_input_news} 条新闻,模式:{filter_status}")
|
||||||
|
|
||||||
|
word_stats = {}
|
||||||
|
total_titles = 0
|
||||||
|
processed_titles = {}
|
||||||
|
matched_new_count = 0
|
||||||
|
|
||||||
|
if title_info is None:
|
||||||
|
title_info = {}
|
||||||
|
if new_titles is None:
|
||||||
|
new_titles = {}
|
||||||
|
|
||||||
|
for group in word_groups:
|
||||||
|
group_key = group["group_key"]
|
||||||
|
word_stats[group_key] = {"count": 0, "titles": {}}
|
||||||
|
|
||||||
|
for source_id, titles_data in results_to_process.items():
|
||||||
|
total_titles += len(titles_data)
|
||||||
|
|
||||||
|
if source_id not in processed_titles:
|
||||||
|
processed_titles[source_id] = {}
|
||||||
|
|
||||||
|
for title, title_data in titles_data.items():
|
||||||
|
if title in processed_titles.get(source_id, {}):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 使用统一的匹配逻辑
|
||||||
|
matches_frequency_words = matches_word_groups(
|
||||||
|
title, word_groups, filter_words, global_filters
|
||||||
|
)
|
||||||
|
|
||||||
|
if not matches_frequency_words:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 如果是增量模式或 current 模式第一次,统计匹配的新增新闻数量
|
||||||
|
if (mode == "incremental" and all_news_are_new) or (
|
||||||
|
mode == "current" and is_first_today
|
||||||
|
):
|
||||||
|
matched_new_count += 1
|
||||||
|
|
||||||
|
source_ranks = title_data.get("ranks", [])
|
||||||
|
source_url = title_data.get("url", "")
|
||||||
|
source_mobile_url = title_data.get("mobileUrl", "")
|
||||||
|
|
||||||
|
# 找到匹配的词组(防御性转换确保类型安全)
|
||||||
|
title_lower = str(title).lower() if not isinstance(title, str) else title.lower()
|
||||||
|
for group in word_groups:
|
||||||
|
required_words = group["required"]
|
||||||
|
normal_words = group["normal"]
|
||||||
|
|
||||||
|
# 如果是"全部新闻"模式,所有标题都匹配第一个(唯一的)词组
|
||||||
|
if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻":
|
||||||
|
group_key = group["group_key"]
|
||||||
|
word_stats[group_key]["count"] += 1
|
||||||
|
if source_id not in word_stats[group_key]["titles"]:
|
||||||
|
word_stats[group_key]["titles"][source_id] = []
|
||||||
|
else:
|
||||||
|
# 原有的匹配逻辑
|
||||||
|
if required_words:
|
||||||
|
all_required_present = all(
|
||||||
|
req_word.lower() in title_lower
|
||||||
|
for req_word in required_words
|
||||||
|
)
|
||||||
|
if not all_required_present:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if normal_words:
|
||||||
|
any_normal_present = any(
|
||||||
|
normal_word.lower() in title_lower
|
||||||
|
for normal_word in normal_words
|
||||||
|
)
|
||||||
|
if not any_normal_present:
|
||||||
|
continue
|
||||||
|
|
||||||
|
group_key = group["group_key"]
|
||||||
|
word_stats[group_key]["count"] += 1
|
||||||
|
if source_id not in word_stats[group_key]["titles"]:
|
||||||
|
word_stats[group_key]["titles"][source_id] = []
|
||||||
|
|
||||||
|
first_time = ""
|
||||||
|
last_time = ""
|
||||||
|
count_info = 1
|
||||||
|
ranks = source_ranks if source_ranks else []
|
||||||
|
url = source_url
|
||||||
|
mobile_url = source_mobile_url
|
||||||
|
|
||||||
|
# 对于 current 模式,从历史统计信息中获取完整数据
|
||||||
|
if (
|
||||||
|
mode == "current"
|
||||||
|
and title_info
|
||||||
|
and source_id in title_info
|
||||||
|
and title in title_info[source_id]
|
||||||
|
):
|
||||||
|
info = title_info[source_id][title]
|
||||||
|
first_time = info.get("first_time", "")
|
||||||
|
last_time = info.get("last_time", "")
|
||||||
|
count_info = info.get("count", 1)
|
||||||
|
if "ranks" in info and info["ranks"]:
|
||||||
|
ranks = info["ranks"]
|
||||||
|
url = info.get("url", source_url)
|
||||||
|
mobile_url = info.get("mobileUrl", source_mobile_url)
|
||||||
|
elif (
|
||||||
|
title_info
|
||||||
|
and source_id in title_info
|
||||||
|
and title in title_info[source_id]
|
||||||
|
):
|
||||||
|
info = title_info[source_id][title]
|
||||||
|
first_time = info.get("first_time", "")
|
||||||
|
last_time = info.get("last_time", "")
|
||||||
|
count_info = info.get("count", 1)
|
||||||
|
if "ranks" in info and info["ranks"]:
|
||||||
|
ranks = info["ranks"]
|
||||||
|
url = info.get("url", source_url)
|
||||||
|
mobile_url = info.get("mobileUrl", source_mobile_url)
|
||||||
|
|
||||||
|
if not ranks:
|
||||||
|
ranks = [99]
|
||||||
|
|
||||||
|
time_display = format_time_display(first_time, last_time, convert_time_func)
|
||||||
|
|
||||||
|
source_name = id_to_name.get(source_id, source_id)
|
||||||
|
|
||||||
|
# 判断是否为新增
|
||||||
|
is_new = False
|
||||||
|
if all_news_are_new:
|
||||||
|
# 增量模式下所有处理的新闻都是新增,或者当天第一次的所有新闻都是新增
|
||||||
|
is_new = True
|
||||||
|
elif new_titles and source_id in new_titles:
|
||||||
|
# 检查是否在新增列表中
|
||||||
|
new_titles_for_source = new_titles[source_id]
|
||||||
|
is_new = title in new_titles_for_source
|
||||||
|
|
||||||
|
word_stats[group_key]["titles"][source_id].append(
|
||||||
|
{
|
||||||
|
"title": title,
|
||||||
|
"source_name": source_name,
|
||||||
|
"first_time": first_time,
|
||||||
|
"last_time": last_time,
|
||||||
|
"time_display": time_display,
|
||||||
|
"count": count_info,
|
||||||
|
"ranks": ranks,
|
||||||
|
"rank_threshold": rank_threshold,
|
||||||
|
"url": url,
|
||||||
|
"mobileUrl": mobile_url,
|
||||||
|
"is_new": is_new,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
if source_id not in processed_titles:
|
||||||
|
processed_titles[source_id] = {}
|
||||||
|
processed_titles[source_id][title] = True
|
||||||
|
|
||||||
|
break
|
||||||
|
|
||||||
|
# 最后统一打印汇总信息
|
||||||
|
if mode == "incremental":
|
||||||
|
if is_first_today:
|
||||||
|
total_input_news = sum(len(titles) for titles in results.values())
|
||||||
|
filter_status = (
|
||||||
|
"全部显示"
|
||||||
|
if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
|
||||||
|
else "频率词匹配"
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
f"增量模式:当天第一次爬取,{total_input_news} 条新闻中有 {matched_new_count} 条{filter_status}"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
if new_titles:
|
||||||
|
total_new_count = sum(len(titles) for titles in new_titles.values())
|
||||||
|
filter_status = (
|
||||||
|
"全部显示"
|
||||||
|
if len(word_groups) == 1
|
||||||
|
and word_groups[0]["group_key"] == "全部新闻"
|
||||||
|
else "匹配频率词"
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
f"增量模式:{total_new_count} 条新增新闻中,有 {matched_new_count} 条{filter_status}"
|
||||||
|
)
|
||||||
|
if matched_new_count == 0 and len(word_groups) > 1:
|
||||||
|
print("增量模式:没有新增新闻匹配频率词,将不会发送通知")
|
||||||
|
else:
|
||||||
|
print("增量模式:未检测到新增新闻")
|
||||||
|
elif mode == "current":
|
||||||
|
total_input_news = sum(len(titles) for titles in results_to_process.values())
|
||||||
|
if is_first_today:
|
||||||
|
filter_status = (
|
||||||
|
"全部显示"
|
||||||
|
if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
|
||||||
|
else "频率词匹配"
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
f"当前榜单模式:当天第一次爬取,{total_input_news} 条当前榜单新闻中有 {matched_new_count} 条{filter_status}"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
matched_count = sum(stat["count"] for stat in word_stats.values())
|
||||||
|
filter_status = (
|
||||||
|
"全部显示"
|
||||||
|
if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
|
||||||
|
else "频率词匹配"
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
f"当前榜单模式:{total_input_news} 条当前榜单新闻中有 {matched_count} 条{filter_status}"
|
||||||
|
)
|
||||||
|
|
||||||
|
stats = []
|
||||||
|
# 创建 group_key 到位置和最大数量的映射
|
||||||
|
group_key_to_position = {
|
||||||
|
group["group_key"]: idx for idx, group in enumerate(word_groups)
|
||||||
|
}
|
||||||
|
group_key_to_max_count = {
|
||||||
|
group["group_key"]: group.get("max_count", 0) for group in word_groups
|
||||||
|
}
|
||||||
|
|
||||||
|
for group_key, data in word_stats.items():
|
||||||
|
all_titles = []
|
||||||
|
for source_id, title_list in data["titles"].items():
|
||||||
|
all_titles.extend(title_list)
|
||||||
|
|
||||||
|
# 按权重排序
|
||||||
|
sorted_titles = sorted(
|
||||||
|
all_titles,
|
||||||
|
key=lambda x: (
|
||||||
|
-calculate_news_weight(x, rank_threshold, weight_config),
|
||||||
|
min(x["ranks"]) if x["ranks"] else 999,
|
||||||
|
-x["count"],
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
# 应用最大显示数量限制(优先级:单独配置 > 全局配置)
|
||||||
|
group_max_count = group_key_to_max_count.get(group_key, 0)
|
||||||
|
if group_max_count == 0:
|
||||||
|
# 使用全局配置
|
||||||
|
group_max_count = max_news_per_keyword
|
||||||
|
|
||||||
|
if group_max_count > 0:
|
||||||
|
sorted_titles = sorted_titles[:group_max_count]
|
||||||
|
|
||||||
|
stats.append(
|
||||||
|
{
|
||||||
|
"word": group_key,
|
||||||
|
"count": data["count"],
|
||||||
|
"position": group_key_to_position.get(group_key, 999),
|
||||||
|
"titles": sorted_titles,
|
||||||
|
"percentage": (
|
||||||
|
round(data["count"] / total_titles * 100, 2)
|
||||||
|
if total_titles > 0
|
||||||
|
else 0
|
||||||
|
),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
# 根据配置选择排序优先级
|
||||||
|
if sort_by_position_first:
|
||||||
|
# 先按配置位置,再按热点条数
|
||||||
|
stats.sort(key=lambda x: (x["position"], -x["count"]))
|
||||||
|
else:
|
||||||
|
# 先按热点条数,再按配置位置(原逻辑)
|
||||||
|
stats.sort(key=lambda x: (-x["count"], x["position"]))
|
||||||
|
|
||||||
|
# 打印过滤后的匹配新闻数(与推送显示一致)
|
||||||
|
matched_news_count = sum(len(stat["titles"]) for stat in stats if stat["count"] > 0)
|
||||||
|
if mode == "daily":
|
||||||
|
print(f"频率词过滤后:{matched_news_count} 条新闻匹配(将显示在推送中)")
|
||||||
|
|
||||||
|
return stats, total_titles
|
||||||
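
# Illustrative shape of one entry in the returned `stats` list (field values
# are made up; the keys mirror the dicts built above):
#
#     {
#         "word": "AI",          # group_key of the matched word group
#         "count": 12,           # matched titles in this group
#         "position": 0,         # position of the group in the config file
#         "percentage": 4.2,     # count / total_titles * 100
#         "titles": [
#             {"title": "...", "source_name": "...", "time_display": "[09:00 ~ 12:30]",
#              "count": 3, "ranks": [2, 5], "rank_threshold": 10,
#              "url": "...", "mobileUrl": "...", "is_new": False},
#         ],
#     }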
152
trendradar/core/config.py
Normal file
@@ -0,0 +1,152 @@
# coding=utf-8
"""
Configuration utilities - multi-account configuration parsing and validation.

Provides parsing, validation, and limiting of multi-account push configurations.
"""

from typing import Dict, List, Optional, Tuple


def parse_multi_account_config(config_value: str, separator: str = ";") -> List[str]:
    """
    Parse a multi-account configuration value into a list of accounts.

    Args:
        config_value: configuration string; multiple accounts are separated by the separator
        separator: the separator, ";" by default

    Returns:
        List of accounts; empty strings are kept as placeholders

    Examples:
        >>> parse_multi_account_config("url1;url2;url3")
        ['url1', 'url2', 'url3']
        >>> parse_multi_account_config(";token2")  # first account has no token
        ['', 'token2']
        >>> parse_multi_account_config("")
        []
    """
    if not config_value:
        return []
    # Keep empty strings as placeholders (e.g. ";token2" means the first account has no token)
    accounts = [acc.strip() for acc in config_value.split(separator)]
    # Filter out the case where every entry is empty
    if all(not acc for acc in accounts):
        return []
    return accounts


def validate_paired_configs(
    configs: Dict[str, List[str]],
    channel_name: str,
    required_keys: Optional[List[str]] = None
) -> Tuple[bool, int]:
    """
    Validate that paired configuration values have matching account counts.

    For channels that require several paired settings (such as Telegram's
    token and chat_id), check that every setting lists the same number of
    accounts.

    Args:
        configs: configuration dict mapping setting name to account list
        channel_name: channel name, used in log output
        required_keys: settings that must be non-empty

    Returns:
        (whether validation passed, account count)

    Examples:
        >>> validate_paired_configs({
        ...     "token": ["t1", "t2"],
        ...     "chat_id": ["c1", "c2"]
        ... }, "Telegram", ["token", "chat_id"])
        (True, 2)

        >>> validate_paired_configs({
        ...     "token": ["t1", "t2"],
        ...     "chat_id": ["c1"]  # count mismatch
        ... }, "Telegram", ["token", "chat_id"])
        (False, 0)
    """
    # Drop empty lists
    non_empty_configs = {k: v for k, v in configs.items() if v}

    if not non_empty_configs:
        return True, 0

    # Check the required settings
    if required_keys:
        for key in required_keys:
            if key not in non_empty_configs or not non_empty_configs[key]:
                return True, 0  # a required setting is empty: treated as not configured

    # Collect the length of every non-empty setting
    lengths = {k: len(v) for k, v in non_empty_configs.items()}
    unique_lengths = set(lengths.values())

    if len(unique_lengths) > 1:
        print(f"❌ {channel_name} 配置错误:配对配置数量不一致,将跳过该渠道推送")
        for key, length in lengths.items():
            print(f" - {key}: {length} 个")
        return False, 0

    return True, list(unique_lengths)[0] if unique_lengths else 0


def limit_accounts(
    accounts: List[str],
    max_count: int,
    channel_name: str
) -> List[str]:
    """
    Limit the number of accounts.

    When more accounts are configured than the maximum allows, only the first
    N accounts are used and a warning is printed.

    Args:
        accounts: account list
        max_count: maximum number of accounts
        channel_name: channel name, used in log output

    Returns:
        The limited account list

    Examples:
        >>> limit_accounts(["a1", "a2", "a3"], 2, "飞书")
        ⚠️ 飞书 配置了 3 个账号,超过最大限制 2,只使用前 2 个
        ['a1', 'a2']
    """
    if len(accounts) > max_count:
        print(f"⚠️ {channel_name} 配置了 {len(accounts)} 个账号,超过最大限制 {max_count},只使用前 {max_count} 个")
        print(f" ⚠️ 警告:如果您是 fork 用户,过多账号可能导致 GitHub Actions 运行时间过长,存在账号风险")
        return accounts[:max_count]
    return accounts


def get_account_at_index(accounts: List[str], index: int, default: str = "") -> str:
    """
    Safely fetch the account value at the given index.

    Returns the default when the index is out of range or the value is empty.

    Args:
        accounts: account list
        index: index
        default: default value

    Returns:
        The account value or the default

    Examples:
        >>> get_account_at_index(["a", "b", "c"], 1)
        'b'
        >>> get_account_at_index(["a", "", "c"], 1, "default")
        'default'
        >>> get_account_at_index(["a"], 5, "default")
        'default'
    """
    if index < len(accounts):
        return accounts[index] if accounts[index] else default
    return default
291
trendradar/core/data.py
Normal file
@@ -0,0 +1,291 @@
# coding=utf-8
"""
Data handling module.

Provides reading, saving, and detection helpers:
- save_titles_to_file: save titles to a TXT file
- read_all_today_titles: read all of today's titles from the storage backend
- detect_latest_new_titles: detect titles newly added in the latest batch

Author: TrendRadar Team
"""

from pathlib import Path
from typing import Dict, List, Tuple, Optional, Callable


def save_titles_to_file(
    results: Dict,
    id_to_name: Dict,
    failed_ids: List,
    output_path: str,
    clean_title_func: Callable[[str], str],
) -> str:
    """
    Save titles to a TXT file.

    Args:
        results: crawl results {source_id: {title: title_data}}
        id_to_name: ID-to-name mapping
        failed_ids: list of failed IDs
        output_path: output file path
        clean_title_func: title cleaning function

    Returns:
        str: the saved file path
    """
    # Make sure the directory exists
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        for id_value, title_data in results.items():
            # "id | name" or just "id"
            name = id_to_name.get(id_value)
            if name and name != id_value:
                f.write(f"{id_value} | {name}\n")
            else:
                f.write(f"{id_value}\n")

            # Sort titles by rank
            sorted_titles = []
            for title, info in title_data.items():
                cleaned_title = clean_title_func(title)
                if isinstance(info, dict):
                    ranks = info.get("ranks", [])
                    url = info.get("url", "")
                    mobile_url = info.get("mobileUrl", "")
                else:
                    ranks = info if isinstance(info, list) else []
                    url = ""
                    mobile_url = ""

                rank = ranks[0] if ranks else 1
                sorted_titles.append((rank, cleaned_title, url, mobile_url))

            sorted_titles.sort(key=lambda x: x[0])

            for rank, cleaned_title, url, mobile_url in sorted_titles:
                line = f"{rank}. {cleaned_title}"

                if url:
                    line += f" [URL:{url}]"
                if mobile_url:
                    line += f" [MOBILE:{mobile_url}]"
                f.write(line + "\n")

            f.write("\n")

        if failed_ids:
            f.write("==== 以下ID请求失败 ====\n")
            for id_value in failed_ids:
                f.write(f"{id_value}\n")

    return output_path
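
# Illustrative output produced by save_titles_to_file (IDs, titles, and URLs
# are made up; the layout follows the writes above):
#
#     weibo | 微博
#     1. Example headline A [URL:https://example.com/a]
#     2. Example headline B [URL:https://example.com/b] [MOBILE:https://m.example.com/b]
#
#     ==== 以下ID请求失败 ====
#     some-failed-source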

def read_all_today_titles_from_storage(
    storage_manager,
    current_platform_ids: Optional[List[str]] = None,
) -> Tuple[Dict, Dict, Dict]:
    """
    Read all of today's titles from the storage backend (SQLite data).

    Args:
        storage_manager: storage manager instance
        current_platform_ids: IDs of the currently monitored platforms (used for filtering)

    Returns:
        Tuple[Dict, Dict, Dict]: (all_results, id_to_name, title_info)
    """
    try:
        news_data = storage_manager.get_today_all_data()

        if not news_data or not news_data.items:
            return {}, {}, {}

        all_results = {}
        final_id_to_name = {}
        title_info = {}

        for source_id, news_list in news_data.items.items():
            # Filter by platform
            if current_platform_ids is not None and source_id not in current_platform_ids:
                continue

            # Resolve the source name
            source_name = news_data.id_to_name.get(source_id, source_id)
            final_id_to_name[source_id] = source_name

            if source_id not in all_results:
                all_results[source_id] = {}
                title_info[source_id] = {}

            for item in news_list:
                title = item.title
                ranks = getattr(item, 'ranks', [item.rank])
                first_time = getattr(item, 'first_time', item.crawl_time)
                last_time = getattr(item, 'last_time', item.crawl_time)
                count = getattr(item, 'count', 1)

                all_results[source_id][title] = {
                    "ranks": ranks,
                    "url": item.url or "",
                    "mobileUrl": item.mobile_url or "",
                }

                title_info[source_id][title] = {
                    "first_time": first_time,
                    "last_time": last_time,
                    "count": count,
                    "ranks": ranks,
                    "url": item.url or "",
                    "mobileUrl": item.mobile_url or "",
                }

        return all_results, final_id_to_name, title_info

    except Exception as e:
        print(f"[存储] 从存储后端读取数据失败: {e}")
        return {}, {}, {}


def read_all_today_titles(
    storage_manager,
    current_platform_ids: Optional[List[str]] = None,
) -> Tuple[Dict, Dict, Dict]:
    """
    Read all of today's titles (from the storage backend).

    Args:
        storage_manager: storage manager instance
        current_platform_ids: IDs of the currently monitored platforms (used for filtering)

    Returns:
        Tuple[Dict, Dict, Dict]: (all_results, id_to_name, title_info)
    """
    all_results, final_id_to_name, title_info = read_all_today_titles_from_storage(
        storage_manager, current_platform_ids
    )

    if all_results:
        total_count = sum(len(titles) for titles in all_results.values())
        print(f"[存储] 已从存储后端读取 {total_count} 条标题")
    else:
        print("[存储] 当天暂无数据")

    return all_results, final_id_to_name, title_info
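
# Illustrative shapes of the three dicts returned above (keys and values are
# made up, mirroring the construction in read_all_today_titles_from_storage):
#
#     all_results = {"weibo": {"Some headline": {"ranks": [3], "url": "...",
#                                                "mobileUrl": "..."}}}
#     id_to_name  = {"weibo": "微博"}
#     title_info  = {"weibo": {"Some headline": {"first_time": "08-00",
#                                                "last_time": "12-00", "count": 5,
#                                                "ranks": [3], "url": "...",
#                                                "mobileUrl": "..."}}}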

def detect_latest_new_titles_from_storage(
    storage_manager,
    current_platform_ids: Optional[List[str]] = None,
) -> Dict:
    """
    Detect titles newly added in the latest batch, using the storage backend.

    Args:
        storage_manager: storage manager instance
        current_platform_ids: IDs of the currently monitored platforms (used for filtering)

    Returns:
        Dict: new titles {source_id: {title: title_data}}
    """
    try:
        # Fetch the latest crawl data
        latest_data = storage_manager.get_latest_crawl_data()
        if not latest_data or not latest_data.items:
            return {}

        # Fetch all historical data
        all_data = storage_manager.get_today_all_data()
        if not all_data or not all_data.items:
            # No historical data (first crawl): there can be no "new" titles
            return {}

        # Collect historical titles (excluding the latest batch's crawl time)
        latest_time = latest_data.crawl_time
        historical_titles = {}

        for source_id, news_list in all_data.items.items():
            if current_platform_ids is not None and source_id not in current_platform_ids:
                continue

            historical_titles[source_id] = set()
            for item in news_list:
                # Only count titles that are not from the latest batch
                first_time = getattr(item, 'first_time', item.crawl_time)
                if first_time != latest_time:
                    historical_titles[source_id].add(item.title)

        # Check whether this is the day's first crawl (no historical titles at all).
        # If every platform's historical set is empty there is only one crawl
        # batch, so nothing should be marked as "new".
        has_historical_data = any(len(titles) > 0 for titles in historical_titles.values())
        if not has_historical_data:
            return {}

        # Find the newly added titles
        new_titles = {}
        for source_id, news_list in latest_data.items.items():
            if current_platform_ids is not None and source_id not in current_platform_ids:
                continue

            historical_set = historical_titles.get(source_id, set())
            source_new_titles = {}

            for item in news_list:
                if item.title not in historical_set:
                    source_new_titles[item.title] = {
                        "ranks": [item.rank],
                        "url": item.url or "",
                        "mobileUrl": item.mobile_url or "",
                    }

            if source_new_titles:
                new_titles[source_id] = source_new_titles

        return new_titles

    except Exception as e:
        print(f"[存储] 从存储后端检测新标题失败: {e}")
        return {}


def detect_latest_new_titles(
    storage_manager,
    current_platform_ids: Optional[List[str]] = None,
) -> Dict:
    """
    Detect titles newly added in the day's latest batch (from the storage backend).

    Args:
        storage_manager: storage manager instance
        current_platform_ids: IDs of the currently monitored platforms (used for filtering)

    Returns:
        Dict: new titles {source_id: {title: title_data}}
    """
    new_titles = detect_latest_new_titles_from_storage(storage_manager, current_platform_ids)
    if new_titles:
        total_new = sum(len(titles) for titles in new_titles.values())
        print(f"[存储] 从存储后端检测到 {total_new} 条新增标题")
    return new_titles


def is_first_crawl_today(output_dir: str, date_folder: str) -> bool:
    """
    Check whether this is the day's first crawl.

    Args:
        output_dir: output directory
        date_folder: date folder name

    Returns:
        bool: whether this is the day's first crawl
    """
    txt_dir = Path(output_dir) / date_folder / "txt"

    if not txt_dir.exists():
        return True

    files = sorted([f for f in txt_dir.iterdir() if f.suffix == ".txt"])
    return len(files) <= 1
194
trendradar/core/frequency.py
Normal file
@@ -0,0 +1,194 @@
# coding=utf-8
"""
Frequency word configuration loading module.

Loads frequency word rules from the configuration file, supporting:
- plain word groups
- required words ("+" prefix)
- filter words ("!" prefix)
- global filter words ([GLOBAL_FILTER] section)
- maximum display count ("@" prefix)
"""

import os
from pathlib import Path
from typing import Dict, List, Tuple, Optional


def load_frequency_words(
    frequency_file: Optional[str] = None,
) -> Tuple[List[Dict], List[str], List[str]]:
    """
    Load the frequency word configuration.

    Configuration file format:
    - word groups are separated by blank lines
    - the [GLOBAL_FILTER] section defines global filter words
    - the [WORD_GROUPS] section defines word groups (the default)

    Word group syntax:
    - plain word: listed directly; any match counts
    - +word: required word; every required word must match
    - !word: filter word; a match excludes the title
    - @number: maximum number of items displayed for the group

    Args:
        frequency_file: path to the frequency word file; defaults to the
            FREQUENCY_WORDS_PATH environment variable or config/frequency_words.txt

    Returns:
        (word group list, per-group filter words, global filter words)

    Raises:
        FileNotFoundError: the frequency word file does not exist
    """
    if frequency_file is None:
        frequency_file = os.environ.get(
            "FREQUENCY_WORDS_PATH", "config/frequency_words.txt"
        )

    frequency_path = Path(frequency_file)
    if not frequency_path.exists():
        raise FileNotFoundError(f"频率词文件 {frequency_file} 不存在")

    with open(frequency_path, "r", encoding="utf-8") as f:
        content = f.read()

    word_groups = [group.strip() for group in content.split("\n\n") if group.strip()]

    processed_groups = []
    filter_words = []
    global_filters = []

    # Default section (for backwards compatibility)
    current_section = "WORD_GROUPS"

    for group in word_groups:
        lines = [line.strip() for line in group.split("\n") if line.strip()]

        if not lines:
            continue

        # Check for a section marker
        if lines[0].startswith("[") and lines[0].endswith("]"):
            section_name = lines[0][1:-1].upper()
            if section_name in ("GLOBAL_FILTER", "WORD_GROUPS"):
                current_section = section_name
                lines = lines[1:]  # drop the marker line

        # Handle the global filter section
        if current_section == "GLOBAL_FILTER":
            # Add every non-empty line to the global filter list
            for line in lines:
                # Ignore special syntax prefixes; only keep plain text
                if line.startswith(("!", "+", "@")):
                    continue  # the global filter section does not support special syntax
                if line:
                    global_filters.append(line)
            continue

        # Handle the word group section
        words = lines

        group_required_words = []
        group_normal_words = []
        group_filter_words = []
        group_max_count = 0  # no limit by default

        for word in words:
            if word.startswith("@"):
                # Parse the maximum display count (positive integers only)
                try:
                    count = int(word[1:])
                    if count > 0:
                        group_max_count = count
                except (ValueError, IndexError):
                    pass  # ignore invalid @number values
            elif word.startswith("!"):
                filter_words.append(word[1:])
                group_filter_words.append(word[1:])
            elif word.startswith("+"):
                group_required_words.append(word[1:])
            else:
                group_normal_words.append(word)

        if group_required_words or group_normal_words:
            if group_normal_words:
                group_key = " ".join(group_normal_words)
            else:
                group_key = " ".join(group_required_words)

            processed_groups.append(
                {
                    "required": group_required_words,
                    "normal": group_normal_words,
                    "group_key": group_key,
                    "max_count": group_max_count,
                }
            )

    return processed_groups, filter_words, global_filters
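
# A minimal frequency_words.txt sketch matching the syntax documented above
# (the words themselves are placeholders):
#
#     [GLOBAL_FILTER]
#     advertisement
#
#     [WORD_GROUPS]
#     AI
#     chip
#     @5
#
#     +OpenAI
#     GPT
#     !rumor
#
# The first group matches titles containing "AI" or "chip" and shows at most
# 5 items; the second group requires "OpenAI", needs at least one of its plain
# words ("GPT"), and excludes any title containing "rumor".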

def matches_word_groups(
    title: str,
    word_groups: List[Dict],
    filter_words: List[str],
    global_filters: Optional[List[str]] = None
) -> bool:
    """
    Check whether a title matches the word group rules.

    Args:
        title: title text
        word_groups: word group list
        filter_words: filter word list
        global_filters: global filter word list

    Returns:
        Whether the title matches
    """
    # Defensive type check: make sure title is a valid string
    if not isinstance(title, str):
        title = str(title) if title is not None else ""
    if not title.strip():
        return False

    title_lower = title.lower()

    # Global filter check (highest priority)
    if global_filters:
        if any(global_word.lower() in title_lower for global_word in global_filters):
            return False

    # With no word groups configured, match every title (supports showing all news)
    if not word_groups:
        return True

    # Filter word check
    if any(filter_word.lower() in title_lower for filter_word in filter_words):
        return False

    # Word group matching check
    for group in word_groups:
        required_words = group["required"]
        normal_words = group["normal"]

        # Required word check
        if required_words:
            all_required_present = all(
                req_word.lower() in title_lower for req_word in required_words
            )
            if not all_required_present:
                continue

        # Plain word check
        if normal_words:
            any_normal_present = any(
                normal_word.lower() in title_lower for normal_word in normal_words
            )
            if not any_normal_present:
                continue

        return True

    return False
332
trendradar/core/loader.py
Normal file
@@ -0,0 +1,332 @@
# coding=utf-8
"""
Configuration loading module.

Loads configuration from the YAML configuration file and environment variables.
"""

import os
from pathlib import Path
from typing import Dict, Any, Optional

import yaml

from .config import parse_multi_account_config, validate_paired_configs


def _get_env_bool(key: str, default: bool = False) -> Optional[bool]:
    """Read a boolean from an environment variable; return None when unset."""
    value = os.environ.get(key, "").strip().lower()
    if not value:
        return None
    return value in ("true", "1")


def _get_env_int(key: str, default: int = 0) -> int:
    """Read an integer from an environment variable."""
    value = os.environ.get(key, "").strip()
    if not value:
        return default
    try:
        return int(value)
    except ValueError:
        return default


def _get_env_str(key: str, default: str = "") -> str:
    """Read a string from an environment variable."""
    return os.environ.get(key, "").strip() or default
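
# The helpers above implement a consistent override order used by the loaders
# below: an explicitly set environment variable wins, otherwise the YAML value
# applies, otherwise a hard-coded default. A hypothetical illustration:
#
#     os.environ["ENABLE_CRAWLER"] = "false"
#     enable_env = _get_env_bool("ENABLE_CRAWLER")  # -> False
#     enable = enable_env if enable_env is not None else \
#         config_data.get("crawler", {}).get("enable_crawler", True)  # -> False
#
# _get_env_bool returns None (not False) when the variable is unset, so an
# absent variable falls through to the YAML value instead of overriding it.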

def _load_app_config(config_data: Dict) -> Dict:
    """Load the application configuration."""
    app_config = config_data.get("app", {})
    return {
        "VERSION_CHECK_URL": app_config.get("version_check_url", ""),
        "SHOW_VERSION_UPDATE": app_config.get("show_version_update", True),
        "TIMEZONE": _get_env_str("TIMEZONE") or app_config.get("timezone", "Asia/Shanghai"),
    }


def _load_crawler_config(config_data: Dict) -> Dict:
    """Load the crawler configuration."""
    crawler_config = config_data.get("crawler", {})
    enable_crawler_env = _get_env_bool("ENABLE_CRAWLER")
    return {
        "REQUEST_INTERVAL": crawler_config.get("request_interval", 100),
        "USE_PROXY": crawler_config.get("use_proxy", False),
        "DEFAULT_PROXY": crawler_config.get("default_proxy", ""),
        "ENABLE_CRAWLER": enable_crawler_env if enable_crawler_env is not None else crawler_config.get("enable_crawler", True),
    }


def _load_report_config(config_data: Dict) -> Dict:
    """Load the report configuration."""
    report_config = config_data.get("report", {})

    # Environment variable overrides
    sort_by_position_env = _get_env_bool("SORT_BY_POSITION_FIRST")
    reverse_content_env = _get_env_bool("REVERSE_CONTENT_ORDER")
    max_news_env = _get_env_int("MAX_NEWS_PER_KEYWORD")

    return {
        "REPORT_MODE": _get_env_str("REPORT_MODE") or report_config.get("mode", "daily"),
        "RANK_THRESHOLD": report_config.get("rank_threshold", 10),
        "SORT_BY_POSITION_FIRST": sort_by_position_env if sort_by_position_env is not None else report_config.get("sort_by_position_first", False),
        "MAX_NEWS_PER_KEYWORD": max_news_env or report_config.get("max_news_per_keyword", 0),
        "REVERSE_CONTENT_ORDER": reverse_content_env if reverse_content_env is not None else report_config.get("reverse_content_order", False),
    }


def _load_notification_config(config_data: Dict) -> Dict:
    """Load the notification configuration."""
    notification = config_data.get("notification", {})
    enable_notification_env = _get_env_bool("ENABLE_NOTIFICATION")

    return {
        "ENABLE_NOTIFICATION": enable_notification_env if enable_notification_env is not None else notification.get("enable_notification", True),
        "MESSAGE_BATCH_SIZE": notification.get("message_batch_size", 4000),
        "DINGTALK_BATCH_SIZE": notification.get("dingtalk_batch_size", 20000),
        "FEISHU_BATCH_SIZE": notification.get("feishu_batch_size", 29000),
        "BARK_BATCH_SIZE": notification.get("bark_batch_size", 3600),
        "SLACK_BATCH_SIZE": notification.get("slack_batch_size", 4000),
        "BATCH_SEND_INTERVAL": notification.get("batch_send_interval", 1.0),
        "FEISHU_MESSAGE_SEPARATOR": notification.get("feishu_message_separator", "---"),
        "MAX_ACCOUNTS_PER_CHANNEL": _get_env_int("MAX_ACCOUNTS_PER_CHANNEL") or notification.get("max_accounts_per_channel", 3),
    }


def _load_push_window_config(config_data: Dict) -> Dict:
    """Load the push window configuration."""
    notification = config_data.get("notification", {})
    push_window = notification.get("push_window", {})
    time_range = push_window.get("time_range", {})

    enabled_env = _get_env_bool("PUSH_WINDOW_ENABLED")
    once_per_day_env = _get_env_bool("PUSH_WINDOW_ONCE_PER_DAY")

    return {
        "ENABLED": enabled_env if enabled_env is not None else push_window.get("enabled", False),
        "TIME_RANGE": {
            "START": _get_env_str("PUSH_WINDOW_START") or time_range.get("start", "08:00"),
            "END": _get_env_str("PUSH_WINDOW_END") or time_range.get("end", "22:00"),
        },
        "ONCE_PER_DAY": once_per_day_env if once_per_day_env is not None else push_window.get("once_per_day", True),
    }


def _load_weight_config(config_data: Dict) -> Dict:
    """Load the weight configuration."""
    weight = config_data.get("weight", {})
    return {
        "RANK_WEIGHT": weight.get("rank_weight", 1.0),
        "FREQUENCY_WEIGHT": weight.get("frequency_weight", 1.0),
        "HOTNESS_WEIGHT": weight.get("hotness_weight", 1.0),
    }


def _load_storage_config(config_data: Dict) -> Dict:
    """Load the storage configuration."""
    storage = config_data.get("storage", {})
    formats = storage.get("formats", {})
    local = storage.get("local", {})
    remote = storage.get("remote", {})
    pull = storage.get("pull", {})

    txt_enabled_env = _get_env_bool("STORAGE_TXT_ENABLED")
    html_enabled_env = _get_env_bool("STORAGE_HTML_ENABLED")
    pull_enabled_env = _get_env_bool("PULL_ENABLED")

    return {
        "BACKEND": _get_env_str("STORAGE_BACKEND") or storage.get("backend", "auto"),
        "FORMATS": {
            "SQLITE": formats.get("sqlite", True),
            "TXT": txt_enabled_env if txt_enabled_env is not None else formats.get("txt", True),
            "HTML": html_enabled_env if html_enabled_env is not None else formats.get("html", True),
        },
        "LOCAL": {
            "DATA_DIR": local.get("data_dir", "output"),
            "RETENTION_DAYS": _get_env_int("LOCAL_RETENTION_DAYS") or local.get("retention_days", 0),
|
||||||
|
},
|
||||||
|
"REMOTE": {
|
||||||
|
"ENDPOINT_URL": _get_env_str("S3_ENDPOINT_URL") or remote.get("endpoint_url", ""),
|
||||||
|
"BUCKET_NAME": _get_env_str("S3_BUCKET_NAME") or remote.get("bucket_name", ""),
|
||||||
|
"ACCESS_KEY_ID": _get_env_str("S3_ACCESS_KEY_ID") or remote.get("access_key_id", ""),
|
||||||
|
"SECRET_ACCESS_KEY": _get_env_str("S3_SECRET_ACCESS_KEY") or remote.get("secret_access_key", ""),
|
||||||
|
"REGION": _get_env_str("S3_REGION") or remote.get("region", ""),
|
||||||
|
"RETENTION_DAYS": _get_env_int("REMOTE_RETENTION_DAYS") or remote.get("retention_days", 0),
|
||||||
|
},
|
||||||
|
"PULL": {
|
||||||
|
"ENABLED": pull_enabled_env if pull_enabled_env is not None else pull.get("enabled", False),
|
||||||
|
"DAYS": _get_env_int("PULL_DAYS") or pull.get("days", 7),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _load_webhook_config(config_data: Dict) -> Dict:
|
||||||
|
"""加载 Webhook 配置"""
|
||||||
|
notification = config_data.get("notification", {})
|
||||||
|
webhooks = notification.get("webhooks", {})
|
||||||
|
|
||||||
|
return {
|
||||||
|
# 飞书
|
||||||
|
"FEISHU_WEBHOOK_URL": _get_env_str("FEISHU_WEBHOOK_URL") or webhooks.get("feishu_url", ""),
|
||||||
|
# 钉钉
|
||||||
|
"DINGTALK_WEBHOOK_URL": _get_env_str("DINGTALK_WEBHOOK_URL") or webhooks.get("dingtalk_url", ""),
|
||||||
|
# 企业微信
|
||||||
|
"WEWORK_WEBHOOK_URL": _get_env_str("WEWORK_WEBHOOK_URL") or webhooks.get("wework_url", ""),
|
||||||
|
"WEWORK_MSG_TYPE": _get_env_str("WEWORK_MSG_TYPE") or webhooks.get("wework_msg_type", "markdown"),
|
||||||
|
# Telegram
|
||||||
|
"TELEGRAM_BOT_TOKEN": _get_env_str("TELEGRAM_BOT_TOKEN") or webhooks.get("telegram_bot_token", ""),
|
||||||
|
"TELEGRAM_CHAT_ID": _get_env_str("TELEGRAM_CHAT_ID") or webhooks.get("telegram_chat_id", ""),
|
||||||
|
# 邮件
|
||||||
|
"EMAIL_FROM": _get_env_str("EMAIL_FROM") or webhooks.get("email_from", ""),
|
||||||
|
"EMAIL_PASSWORD": _get_env_str("EMAIL_PASSWORD") or webhooks.get("email_password", ""),
|
||||||
|
"EMAIL_TO": _get_env_str("EMAIL_TO") or webhooks.get("email_to", ""),
|
||||||
|
"EMAIL_SMTP_SERVER": _get_env_str("EMAIL_SMTP_SERVER") or webhooks.get("email_smtp_server", ""),
|
||||||
|
"EMAIL_SMTP_PORT": _get_env_str("EMAIL_SMTP_PORT") or webhooks.get("email_smtp_port", ""),
|
||||||
|
# ntfy
|
||||||
|
"NTFY_SERVER_URL": _get_env_str("NTFY_SERVER_URL") or webhooks.get("ntfy_server_url") or "https://ntfy.sh",
|
||||||
|
"NTFY_TOPIC": _get_env_str("NTFY_TOPIC") or webhooks.get("ntfy_topic", ""),
|
||||||
|
"NTFY_TOKEN": _get_env_str("NTFY_TOKEN") or webhooks.get("ntfy_token", ""),
|
||||||
|
# Bark
|
||||||
|
"BARK_URL": _get_env_str("BARK_URL") or webhooks.get("bark_url", ""),
|
||||||
|
# Slack
|
||||||
|
"SLACK_WEBHOOK_URL": _get_env_str("SLACK_WEBHOOK_URL") or webhooks.get("slack_webhook_url", ""),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _print_notification_sources(config: Dict) -> None:
|
||||||
|
"""打印通知渠道配置来源信息"""
|
||||||
|
notification_sources = []
|
||||||
|
max_accounts = config["MAX_ACCOUNTS_PER_CHANNEL"]
|
||||||
|
|
||||||
|
if config["FEISHU_WEBHOOK_URL"]:
|
||||||
|
accounts = parse_multi_account_config(config["FEISHU_WEBHOOK_URL"])
|
||||||
|
count = min(len(accounts), max_accounts)
|
||||||
|
source = "环境变量" if os.environ.get("FEISHU_WEBHOOK_URL") else "配置文件"
|
||||||
|
notification_sources.append(f"飞书({source}, {count}个账号)")
|
||||||
|
|
||||||
|
if config["DINGTALK_WEBHOOK_URL"]:
|
||||||
|
accounts = parse_multi_account_config(config["DINGTALK_WEBHOOK_URL"])
|
||||||
|
count = min(len(accounts), max_accounts)
|
||||||
|
source = "环境变量" if os.environ.get("DINGTALK_WEBHOOK_URL") else "配置文件"
|
||||||
|
notification_sources.append(f"钉钉({source}, {count}个账号)")
|
||||||
|
|
||||||
|
if config["WEWORK_WEBHOOK_URL"]:
|
||||||
|
accounts = parse_multi_account_config(config["WEWORK_WEBHOOK_URL"])
|
||||||
|
count = min(len(accounts), max_accounts)
|
||||||
|
source = "环境变量" if os.environ.get("WEWORK_WEBHOOK_URL") else "配置文件"
|
||||||
|
notification_sources.append(f"企业微信({source}, {count}个账号)")
|
||||||
|
|
||||||
|
if config["TELEGRAM_BOT_TOKEN"] and config["TELEGRAM_CHAT_ID"]:
|
||||||
|
tokens = parse_multi_account_config(config["TELEGRAM_BOT_TOKEN"])
|
||||||
|
chat_ids = parse_multi_account_config(config["TELEGRAM_CHAT_ID"])
|
||||||
|
valid, count = validate_paired_configs(
|
||||||
|
{"bot_token": tokens, "chat_id": chat_ids},
|
||||||
|
"Telegram",
|
||||||
|
required_keys=["bot_token", "chat_id"]
|
||||||
|
)
|
||||||
|
if valid and count > 0:
|
||||||
|
count = min(count, max_accounts)
|
||||||
|
token_source = "环境变量" if os.environ.get("TELEGRAM_BOT_TOKEN") else "配置文件"
|
||||||
|
notification_sources.append(f"Telegram({token_source}, {count}个账号)")
|
||||||
|
|
||||||
|
if config["EMAIL_FROM"] and config["EMAIL_PASSWORD"] and config["EMAIL_TO"]:
|
||||||
|
from_source = "环境变量" if os.environ.get("EMAIL_FROM") else "配置文件"
|
||||||
|
notification_sources.append(f"邮件({from_source})")
|
||||||
|
|
||||||
|
if config["NTFY_SERVER_URL"] and config["NTFY_TOPIC"]:
|
||||||
|
topics = parse_multi_account_config(config["NTFY_TOPIC"])
|
||||||
|
tokens = parse_multi_account_config(config["NTFY_TOKEN"])
|
||||||
|
if tokens:
|
||||||
|
valid, count = validate_paired_configs(
|
||||||
|
{"topic": topics, "token": tokens},
|
||||||
|
"ntfy"
|
||||||
|
)
|
||||||
|
if valid and count > 0:
|
||||||
|
count = min(count, max_accounts)
|
||||||
|
server_source = "环境变量" if os.environ.get("NTFY_SERVER_URL") else "配置文件"
|
||||||
|
notification_sources.append(f"ntfy({server_source}, {count}个账号)")
|
||||||
|
else:
|
||||||
|
count = min(len(topics), max_accounts)
|
||||||
|
server_source = "环境变量" if os.environ.get("NTFY_SERVER_URL") else "配置文件"
|
||||||
|
notification_sources.append(f"ntfy({server_source}, {count}个账号)")
|
||||||
|
|
||||||
|
if config["BARK_URL"]:
|
||||||
|
accounts = parse_multi_account_config(config["BARK_URL"])
|
||||||
|
count = min(len(accounts), max_accounts)
|
||||||
|
bark_source = "环境变量" if os.environ.get("BARK_URL") else "配置文件"
|
||||||
|
notification_sources.append(f"Bark({bark_source}, {count}个账号)")
|
||||||
|
|
||||||
|
if config["SLACK_WEBHOOK_URL"]:
|
||||||
|
accounts = parse_multi_account_config(config["SLACK_WEBHOOK_URL"])
|
||||||
|
count = min(len(accounts), max_accounts)
|
||||||
|
slack_source = "环境变量" if os.environ.get("SLACK_WEBHOOK_URL") else "配置文件"
|
||||||
|
notification_sources.append(f"Slack({slack_source}, {count}个账号)")
|
||||||
|
|
||||||
|
if notification_sources:
|
||||||
|
print(f"通知渠道配置来源: {', '.join(notification_sources)}")
|
||||||
|
print(f"每个渠道最大账号数: {max_accounts}")
|
||||||
|
else:
|
||||||
|
print("未配置任何通知渠道")
|
||||||
|
|
||||||
|
|
||||||
|
def load_config(config_path: Optional[str] = None) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
加载配置文件
|
||||||
|
|
||||||
|
Args:
|
||||||
|
config_path: 配置文件路径,默认从环境变量 CONFIG_PATH 获取或使用 config/config.yaml
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
包含所有配置的字典
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
FileNotFoundError: 配置文件不存在
|
||||||
|
"""
|
||||||
|
if config_path is None:
|
||||||
|
config_path = os.environ.get("CONFIG_PATH", "config/config.yaml")
|
||||||
|
|
||||||
|
if not Path(config_path).exists():
|
||||||
|
raise FileNotFoundError(f"配置文件 {config_path} 不存在")
|
||||||
|
|
||||||
|
with open(config_path, "r", encoding="utf-8") as f:
|
||||||
|
config_data = yaml.safe_load(f)
|
||||||
|
|
||||||
|
print(f"配置文件加载成功: {config_path}")
|
||||||
|
|
||||||
|
# 合并所有配置
|
||||||
|
config = {}
|
||||||
|
|
||||||
|
# 应用配置
|
||||||
|
config.update(_load_app_config(config_data))
|
||||||
|
|
||||||
|
# 爬虫配置
|
||||||
|
config.update(_load_crawler_config(config_data))
|
||||||
|
|
||||||
|
# 报告配置
|
||||||
|
config.update(_load_report_config(config_data))
|
||||||
|
|
||||||
|
# 通知配置
|
||||||
|
config.update(_load_notification_config(config_data))
|
||||||
|
|
||||||
|
# 推送窗口配置
|
||||||
|
config["PUSH_WINDOW"] = _load_push_window_config(config_data)
|
||||||
|
|
||||||
|
# 权重配置
|
||||||
|
config["WEIGHT_CONFIG"] = _load_weight_config(config_data)
|
||||||
|
|
||||||
|
# 平台配置
|
||||||
|
config["PLATFORMS"] = config_data.get("platforms", [])
|
||||||
|
|
||||||
|
# 存储配置
|
||||||
|
config["STORAGE"] = _load_storage_config(config_data)
|
||||||
|
|
||||||
|
# Webhook 配置
|
||||||
|
config.update(_load_webhook_config(config_data))
|
||||||
|
|
||||||
|
# 打印通知渠道配置来源
|
||||||
|
_print_notification_sources(config)
|
||||||
|
|
||||||
|
return config
|
||||||
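As a quick orientation for loader.py: every `_load_*` helper reads one YAML section, and environment variables always win over the file, which in turn wins over hard-coded defaults. A minimal usage sketch (the config path and printed values are illustrative only, and assume a config file exists):

# Minimal sketch: env var > config.yaml > built-in default.
import os

os.environ["REPORT_MODE"] = "incremental"      # overrides report.mode in the YAML

config = load_config("config/config.yaml")     # raises FileNotFoundError if missing
print(config["REPORT_MODE"])                   # -> "incremental"
print(config["STORAGE"]["LOCAL"]["DATA_DIR"])  # -> "output" unless configured
print(config["PUSH_WINDOW"]["TIME_RANGE"])     # -> {"START": "08:00", "END": "22:00"}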
8
trendradar/crawler/__init__.py
Normal file
@ -0,0 +1,8 @@
# coding=utf-8
"""
Crawler module - data fetching.
"""

from trendradar.crawler.fetcher import DataFetcher

__all__ = ["DataFetcher"]
184
trendradar/crawler/fetcher.py
Normal file
@ -0,0 +1,184 @@
# coding=utf-8
"""
Data fetcher module.

Fetches news data from the NewsNow API. Supports:
- fetching a single platform
- batch crawling multiple platforms
- automatic retry
- proxy support
"""

import json
import random
import time
from typing import Dict, List, Tuple, Optional, Union

import requests


class DataFetcher:
    """Data fetcher"""

    # Default API endpoint
    DEFAULT_API_URL = "https://newsnow.busiyi.world/api/s"

    # Default request headers
    DEFAULT_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Connection": "keep-alive",
        "Cache-Control": "no-cache",
    }

    def __init__(
        self,
        proxy_url: Optional[str] = None,
        api_url: Optional[str] = None,
    ):
        """
        Initialize the data fetcher.

        Args:
            proxy_url: proxy server URL (optional)
            api_url: API base URL (optional; defaults to DEFAULT_API_URL)
        """
        self.proxy_url = proxy_url
        self.api_url = api_url or self.DEFAULT_API_URL

    def fetch_data(
        self,
        id_info: Union[str, Tuple[str, str]],
        max_retries: int = 2,
        min_retry_wait: int = 3,
        max_retry_wait: int = 5,
    ) -> Tuple[Optional[str], str, str]:
        """
        Fetch data for the given ID, with retries.

        Args:
            id_info: platform ID, or a (platform ID, alias) tuple
            max_retries: maximum number of retries
            min_retry_wait: minimum retry wait (seconds)
            max_retry_wait: maximum retry wait (seconds)

        Returns:
            A (response text, platform ID, alias) tuple; response text is None on failure
        """
        if isinstance(id_info, tuple):
            id_value, alias = id_info
        else:
            id_value = id_info
            alias = id_value

        url = f"{self.api_url}?id={id_value}&latest"

        proxies = None
        if self.proxy_url:
            proxies = {"http": self.proxy_url, "https": self.proxy_url}

        retries = 0
        while retries <= max_retries:
            try:
                response = requests.get(
                    url,
                    proxies=proxies,
                    headers=self.DEFAULT_HEADERS,
                    timeout=10,
                )
                response.raise_for_status()

                data_text = response.text
                data_json = json.loads(data_text)

                status = data_json.get("status", "未知")
                if status not in ["success", "cache"]:
                    raise ValueError(f"响应状态异常: {status}")

                status_info = "最新数据" if status == "success" else "缓存数据"
                print(f"获取 {id_value} 成功({status_info})")
                return data_text, id_value, alias

            except Exception as e:
                retries += 1
                if retries <= max_retries:
                    base_wait = random.uniform(min_retry_wait, max_retry_wait)
                    additional_wait = (retries - 1) * random.uniform(1, 2)
                    wait_time = base_wait + additional_wait
                    print(f"请求 {id_value} 失败: {e}. {wait_time:.2f}秒后重试...")
                    time.sleep(wait_time)
                else:
                    print(f"请求 {id_value} 失败: {e}")
                    return None, id_value, alias

        return None, id_value, alias

    def crawl_websites(
        self,
        ids_list: List[Union[str, Tuple[str, str]]],
        request_interval: int = 100,
    ) -> Tuple[Dict, Dict, List]:
        """
        Crawl multiple platforms.

        Args:
            ids_list: list of platform IDs; each item is a string or a (platform ID, alias) tuple
            request_interval: interval between requests (milliseconds)

        Returns:
            A (results dict, ID-to-name mapping, list of failed IDs) tuple
        """
        results = {}
        id_to_name = {}
        failed_ids = []

        for i, id_info in enumerate(ids_list):
            if isinstance(id_info, tuple):
                id_value, name = id_info
            else:
                id_value = id_info
                name = id_value

            id_to_name[id_value] = name
            response, _, _ = self.fetch_data(id_info)

            if response:
                try:
                    data = json.loads(response)
                    results[id_value] = {}

                    for index, item in enumerate(data.get("items", []), 1):
                        title = item.get("title")
                        # Skip invalid titles (None, float, empty string)
                        if title is None or isinstance(title, float) or not str(title).strip():
                            continue
                        title = str(title).strip()
                        url = item.get("url", "")
                        mobile_url = item.get("mobileUrl", "")

                        if title in results[id_value]:
                            results[id_value][title]["ranks"].append(index)
                        else:
                            results[id_value][title] = {
                                "ranks": [index],
                                "url": url,
                                "mobileUrl": mobile_url,
                            }
                except json.JSONDecodeError:
                    print(f"解析 {id_value} 响应失败")
                    failed_ids.append(id_value)
                except Exception as e:
                    print(f"处理 {id_value} 数据出错: {e}")
                    failed_ids.append(id_value)
            else:
                failed_ids.append(id_value)

            # Inter-request delay (skipped after the last request)
            if i < len(ids_list) - 1:
                actual_interval = request_interval + random.randint(-10, 20)
                actual_interval = max(50, actual_interval)
                time.sleep(actual_interval / 1000)

        print(f"成功: {list(results.keys())}, 失败: {failed_ids}")
        return results, id_to_name, failed_ids
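A short usage sketch of the fetcher (the platform IDs here are illustrative; valid IDs are whatever the NewsNow API accepts):

# Sketch: crawl two platforms, one with an alias, one without.
from trendradar.crawler import DataFetcher

fetcher = DataFetcher()  # or DataFetcher(proxy_url="http://127.0.0.1:7890")
results, id_to_name, failed_ids = fetcher.crawl_websites(
    [("zhihu", "知乎"), "weibo"],  # str or (id, alias) entries
    request_interval=100,          # ms between requests, jittered by -10..+20 ms
)
for platform_id, titles in results.items():
    # titles maps title -> {"ranks": [...], "url": ..., "mobileUrl": ...}
    print(id_to_name[platform_id], len(titles), "titles")
print("failed:", failed_ids)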
81
trendradar/notification/__init__.py
Normal file
@ -0,0 +1,81 @@
# coding=utf-8
"""
Notification push module.

Multi-channel push support, including:
- Feishu, DingTalk, WeCom
- Telegram, Slack
- Email, ntfy, Bark

Module layout:
- push_manager: push-record management
- formatters: content format conversion
- batch: batching helpers
- renderer: notification content rendering
- splitter: splitting messages into batches
- senders: per-channel send functions
- dispatcher: multi-account notification dispatcher
"""

from trendradar.notification.push_manager import PushRecordManager
from trendradar.notification.formatters import (
    strip_markdown,
    convert_markdown_to_mrkdwn,
)
from trendradar.notification.batch import (
    get_batch_header,
    get_max_batch_header_size,
    truncate_to_bytes,
    add_batch_headers,
)
from trendradar.notification.renderer import (
    render_feishu_content,
    render_dingtalk_content,
)
from trendradar.notification.splitter import (
    split_content_into_batches,
    DEFAULT_BATCH_SIZES,
)
from trendradar.notification.senders import (
    send_to_feishu,
    send_to_dingtalk,
    send_to_wework,
    send_to_telegram,
    send_to_email,
    send_to_ntfy,
    send_to_bark,
    send_to_slack,
    SMTP_CONFIGS,
)
from trendradar.notification.dispatcher import NotificationDispatcher

__all__ = [
    # Push-record management
    "PushRecordManager",
    # Format conversion
    "strip_markdown",
    "convert_markdown_to_mrkdwn",
    # Batching helpers
    "get_batch_header",
    "get_max_batch_header_size",
    "truncate_to_bytes",
    "add_batch_headers",
    # Content rendering
    "render_feishu_content",
    "render_dingtalk_content",
    # Message splitting
    "split_content_into_batches",
    "DEFAULT_BATCH_SIZES",
    # Senders
    "send_to_feishu",
    "send_to_dingtalk",
    "send_to_wework",
    "send_to_telegram",
    "send_to_email",
    "send_to_ntfy",
    "send_to_bark",
    "send_to_slack",
    "SMTP_CONFIGS",
    # Dispatcher
    "NotificationDispatcher",
]
115
trendradar/notification/batch.py
Normal file
@ -0,0 +1,115 @@
# coding=utf-8
"""
Batching helpers module.

Helper functions for sending messages in batches.
"""

from typing import List


def get_batch_header(format_type: str, batch_num: int, total_batches: int) -> str:
    """Build the batch header for the given format_type.

    Args:
        format_type: push type (telegram, slack, wework_text, bark, feishu, dingtalk, ntfy, wework)
        batch_num: current batch number
        total_batches: total number of batches

    Returns:
        The formatted batch header string
    """
    if format_type == "telegram":
        return f"<b>[第 {batch_num}/{total_batches} 批次]</b>\n\n"
    elif format_type == "slack":
        return f"*[第 {batch_num}/{total_batches} 批次]*\n\n"
    elif format_type in ("wework_text", "bark"):
        # WeCom text mode and Bark use plain text
        return f"[第 {batch_num}/{total_batches} 批次]\n\n"
    else:
        # Feishu, DingTalk, ntfy, WeCom markdown mode
        return f"**[第 {batch_num}/{total_batches} 批次]**\n\n"


def get_max_batch_header_size(format_type: str) -> int:
    """Estimate the maximum header size in bytes (assuming at most 99 batches).

    Used to reserve space during splitting, so content never has to be
    truncated after the fact.

    Args:
        format_type: push type

    Returns:
        Maximum header size in bytes
    """
    # Build the worst-case header (batch 99/99)
    max_header = get_batch_header(format_type, 99, 99)
    return len(max_header.encode("utf-8"))


def truncate_to_bytes(text: str, max_bytes: int) -> str:
    """Safely truncate a string to a byte budget without splitting a multi-byte character.

    Args:
        text: text to truncate
        max_bytes: maximum byte count

    Returns:
        The truncated text
    """
    text_bytes = text.encode("utf-8")
    if len(text_bytes) <= max_bytes:
        return text

    # Cut at the byte budget
    truncated = text_bytes[:max_bytes]

    # Back off over any incomplete trailing UTF-8 character
    for i in range(min(4, len(truncated))):
        try:
            return truncated[: len(truncated) - i].decode("utf-8")
        except UnicodeDecodeError:
            continue

    # Degenerate case: return an empty string
    return ""


def add_batch_headers(
    batches: List[str], format_type: str, max_bytes: int
) -> List[str]:
    """Prepend headers to batches, sizing dynamically so the total never exceeds the limit.

    Args:
        batches: original batch list
        format_type: push type (bark, telegram, feishu, ...)
        max_bytes: byte limit for this push type

    Returns:
        Batch list with headers prepended
    """
    if len(batches) <= 1:
        return batches

    total = len(batches)
    result = []

    for i, content in enumerate(batches, 1):
        # Build this batch's header
        header = get_batch_header(format_type, i, total)
        header_size = len(header.encode("utf-8"))

        # Budget left for the content itself
        max_content_size = max_bytes - header_size
        content_size = len(content.encode("utf-8"))

        # Truncate to a safe size if the combination would overflow
        if content_size > max_content_size:
            print(
                f"警告:{format_type} 第 {i}/{total} 批次内容({content_size}字节) + 头部({header_size}字节) 超出限制({max_bytes}字节),截断到 {max_content_size} 字节"
            )
            content = truncate_to_bytes(content, max_content_size)

        result.append(header + content)

    return result
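The byte-safe truncation above is the piece that keeps multi-byte characters intact; a small sketch of both helpers (sizes chosen only to force the edge cases):

# truncate_to_bytes backs off over a split UTF-8 character:
text = "热点" * 5                       # 10 chars, 30 bytes in UTF-8
safe = truncate_to_bytes(text, 10)      # a 10-byte cut would split the 4th char
print(safe, len(safe.encode("utf-8")))  # -> 热点热 9

# add_batch_headers reserves room for the per-batch header:
batches = add_batch_headers(["第一批", "第二批"], "telegram", max_bytes=4000)
print(batches[0])                       # starts with "<b>[第 1/2 批次]</b>"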
420
trendradar/notification/dispatcher.py
Normal file
@ -0,0 +1,420 @@
# coding=utf-8
"""
Notification dispatcher module.

Provides a single dispatch entry point for all notification channels.
Every channel supports multi-account configuration, with accounts
separated by `;`.

Usage:
    dispatcher = NotificationDispatcher(config, get_time_func, split_content_func)
    results = dispatcher.dispatch_all(report_data, report_type, ...)
"""

from typing import Any, Callable, Dict, List, Optional

from trendradar.core.config import (
    get_account_at_index,
    limit_accounts,
    parse_multi_account_config,
    validate_paired_configs,
)

from .senders import (
    send_to_bark,
    send_to_dingtalk,
    send_to_email,
    send_to_feishu,
    send_to_ntfy,
    send_to_slack,
    send_to_telegram,
    send_to_wework,
)


class NotificationDispatcher:
    """
    Unified multi-account notification dispatcher.

    Wraps the multi-account send logic behind a simple dispatch_all
    interface, handling account parsing, count limiting, and pairing
    validation internally.
    """

    def __init__(
        self,
        config: Dict[str, Any],
        get_time_func: Callable,
        split_content_func: Callable,
    ):
        """
        Initialize the dispatcher.

        Args:
            config: full config dict containing every channel's settings
            get_time_func: function returning the current time
            split_content_func: content-splitting function
        """
        self.config = config
        self.get_time_func = get_time_func
        self.split_content_func = split_content_func
        self.max_accounts = config.get("MAX_ACCOUNTS_PER_CHANNEL", 3)

    def dispatch_all(
        self,
        report_data: Dict,
        report_type: str,
        update_info: Optional[Dict] = None,
        proxy_url: Optional[str] = None,
        mode: str = "daily",
        html_file_path: Optional[str] = None,
    ) -> Dict[str, bool]:
        """
        Dispatch the notification to every configured channel.

        Args:
            report_data: report data (built by prepare_report_data)
            report_type: report type (e.g. "当日汇总", "实时增量")
            update_info: version update info (optional)
            proxy_url: proxy URL (optional)
            mode: report mode (daily/current/incremental)
            html_file_path: path to the HTML report (used by email)

        Returns:
            Dict[str, bool]: per-channel result, keyed by channel name
        """
        results = {}

        # Feishu
        if self.config.get("FEISHU_WEBHOOK_URL"):
            results["feishu"] = self._send_feishu(
                report_data, report_type, update_info, proxy_url, mode
            )

        # DingTalk
        if self.config.get("DINGTALK_WEBHOOK_URL"):
            results["dingtalk"] = self._send_dingtalk(
                report_data, report_type, update_info, proxy_url, mode
            )

        # WeCom
        if self.config.get("WEWORK_WEBHOOK_URL"):
            results["wework"] = self._send_wework(
                report_data, report_type, update_info, proxy_url, mode
            )

        # Telegram (needs paired validation)
        if self.config.get("TELEGRAM_BOT_TOKEN") and self.config.get("TELEGRAM_CHAT_ID"):
            results["telegram"] = self._send_telegram(
                report_data, report_type, update_info, proxy_url, mode
            )

        # ntfy (needs paired validation)
        if self.config.get("NTFY_SERVER_URL") and self.config.get("NTFY_TOPIC"):
            results["ntfy"] = self._send_ntfy(
                report_data, report_type, update_info, proxy_url, mode
            )

        # Bark
        if self.config.get("BARK_URL"):
            results["bark"] = self._send_bark(
                report_data, report_type, update_info, proxy_url, mode
            )

        # Slack
        if self.config.get("SLACK_WEBHOOK_URL"):
            results["slack"] = self._send_slack(
                report_data, report_type, update_info, proxy_url, mode
            )

        # Email (original logic; already supports multiple recipients)
        if (
            self.config.get("EMAIL_FROM")
            and self.config.get("EMAIL_PASSWORD")
            and self.config.get("EMAIL_TO")
        ):
            results["email"] = self._send_email(report_type, html_file_path)

        return results

    def _send_to_multi_accounts(
        self,
        channel_name: str,
        config_value: str,
        send_func: Callable[..., bool],
        **kwargs,
    ) -> bool:
        """
        Generic multi-account send loop.

        Args:
            channel_name: channel name (used in logs and account-limit messages)
            config_value: config value (possibly several accounts separated by ;)
            send_func: send function with signature (account, account_label=..., **kwargs) -> bool
            **kwargs: extra arguments passed through to the send function

        Returns:
            bool: True if any account succeeded
        """
        accounts = parse_multi_account_config(config_value)
        if not accounts:
            return False

        accounts = limit_accounts(accounts, self.max_accounts, channel_name)
        results = []

        for i, account in enumerate(accounts):
            if account:
                account_label = f"账号{i+1}" if len(accounts) > 1 else ""
                result = send_func(account, account_label=account_label, **kwargs)
                results.append(result)

        return any(results) if results else False

    def _send_feishu(
        self,
        report_data: Dict,
        report_type: str,
        update_info: Optional[Dict],
        proxy_url: Optional[str],
        mode: str,
    ) -> bool:
        """Send to Feishu (multi-account)"""
        return self._send_to_multi_accounts(
            channel_name="飞书",
            config_value=self.config["FEISHU_WEBHOOK_URL"],
            send_func=lambda url, account_label: send_to_feishu(
                webhook_url=url,
                report_data=report_data,
                report_type=report_type,
                update_info=update_info,
                proxy_url=proxy_url,
                mode=mode,
                account_label=account_label,
                batch_size=self.config.get("FEISHU_BATCH_SIZE", 29000),
                batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
                split_content_func=self.split_content_func,
                get_time_func=self.get_time_func,
            ),
        )

    def _send_dingtalk(
        self,
        report_data: Dict,
        report_type: str,
        update_info: Optional[Dict],
        proxy_url: Optional[str],
        mode: str,
    ) -> bool:
        """Send to DingTalk (multi-account)"""
        return self._send_to_multi_accounts(
            channel_name="钉钉",
            config_value=self.config["DINGTALK_WEBHOOK_URL"],
            send_func=lambda url, account_label: send_to_dingtalk(
                webhook_url=url,
                report_data=report_data,
                report_type=report_type,
                update_info=update_info,
                proxy_url=proxy_url,
                mode=mode,
                account_label=account_label,
                batch_size=self.config.get("DINGTALK_BATCH_SIZE", 20000),
                batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
                split_content_func=self.split_content_func,
            ),
        )

    def _send_wework(
        self,
        report_data: Dict,
        report_type: str,
        update_info: Optional[Dict],
        proxy_url: Optional[str],
        mode: str,
    ) -> bool:
        """Send to WeCom (multi-account)"""
        return self._send_to_multi_accounts(
            channel_name="企业微信",
            config_value=self.config["WEWORK_WEBHOOK_URL"],
            send_func=lambda url, account_label: send_to_wework(
                webhook_url=url,
                report_data=report_data,
                report_type=report_type,
                update_info=update_info,
                proxy_url=proxy_url,
                mode=mode,
                account_label=account_label,
                batch_size=self.config.get("MESSAGE_BATCH_SIZE", 4000),
                batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
                msg_type=self.config.get("WEWORK_MSG_TYPE", "markdown"),
                split_content_func=self.split_content_func,
            ),
        )

    def _send_telegram(
        self,
        report_data: Dict,
        report_type: str,
        update_info: Optional[Dict],
        proxy_url: Optional[str],
        mode: str,
    ) -> bool:
        """Send to Telegram (multi-account; token and chat_id must pair up)"""
        telegram_tokens = parse_multi_account_config(self.config["TELEGRAM_BOT_TOKEN"])
        telegram_chat_ids = parse_multi_account_config(self.config["TELEGRAM_CHAT_ID"])

        if not telegram_tokens or not telegram_chat_ids:
            return False

        # Validate pairing
        valid, count = validate_paired_configs(
            {"bot_token": telegram_tokens, "chat_id": telegram_chat_ids},
            "Telegram",
            required_keys=["bot_token", "chat_id"],
        )
        if not valid or count == 0:
            return False

        # Limit account count
        telegram_tokens = limit_accounts(telegram_tokens, self.max_accounts, "Telegram")
        telegram_chat_ids = telegram_chat_ids[: len(telegram_tokens)]

        results = []
        for i in range(len(telegram_tokens)):
            token = telegram_tokens[i]
            chat_id = telegram_chat_ids[i]
            if token and chat_id:
                account_label = f"账号{i+1}" if len(telegram_tokens) > 1 else ""
                result = send_to_telegram(
                    bot_token=token,
                    chat_id=chat_id,
                    report_data=report_data,
                    report_type=report_type,
                    update_info=update_info,
                    proxy_url=proxy_url,
                    mode=mode,
                    account_label=account_label,
                    batch_size=self.config.get("MESSAGE_BATCH_SIZE", 4000),
                    batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
                    split_content_func=self.split_content_func,
                )
                results.append(result)

        return any(results) if results else False

    def _send_ntfy(
        self,
        report_data: Dict,
        report_type: str,
        update_info: Optional[Dict],
        proxy_url: Optional[str],
        mode: str,
    ) -> bool:
        """Send to ntfy (multi-account; topic and token must pair up)"""
        ntfy_server_url = self.config["NTFY_SERVER_URL"]
        ntfy_topics = parse_multi_account_config(self.config["NTFY_TOPIC"])
        ntfy_tokens = parse_multi_account_config(self.config.get("NTFY_TOKEN", ""))

        if not ntfy_server_url or not ntfy_topics:
            return False

        # If tokens are configured, their count must match the topic count
        if ntfy_tokens and len(ntfy_tokens) != len(ntfy_topics):
            print(
                f"❌ ntfy 配置错误:topic 数量({len(ntfy_topics)})与 token 数量({len(ntfy_tokens)})不一致,跳过 ntfy 推送"
            )
            return False

        # Limit account count
        ntfy_topics = limit_accounts(ntfy_topics, self.max_accounts, "ntfy")
        if ntfy_tokens:
            ntfy_tokens = ntfy_tokens[: len(ntfy_topics)]

        results = []
        for i, topic in enumerate(ntfy_topics):
            if topic:
                token = get_account_at_index(ntfy_tokens, i, "") if ntfy_tokens else ""
                account_label = f"账号{i+1}" if len(ntfy_topics) > 1 else ""
                result = send_to_ntfy(
                    server_url=ntfy_server_url,
                    topic=topic,
                    token=token,
                    report_data=report_data,
                    report_type=report_type,
                    update_info=update_info,
                    proxy_url=proxy_url,
                    mode=mode,
                    account_label=account_label,
                    batch_size=3800,
                    split_content_func=self.split_content_func,
                )
                results.append(result)

        return any(results) if results else False

    def _send_bark(
        self,
        report_data: Dict,
        report_type: str,
        update_info: Optional[Dict],
        proxy_url: Optional[str],
        mode: str,
    ) -> bool:
        """Send to Bark (multi-account)"""
        return self._send_to_multi_accounts(
            channel_name="Bark",
            config_value=self.config["BARK_URL"],
            send_func=lambda url, account_label: send_to_bark(
                bark_url=url,
                report_data=report_data,
                report_type=report_type,
                update_info=update_info,
                proxy_url=proxy_url,
                mode=mode,
                account_label=account_label,
                batch_size=self.config.get("BARK_BATCH_SIZE", 3600),
                batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
                split_content_func=self.split_content_func,
            ),
        )

    def _send_slack(
        self,
        report_data: Dict,
        report_type: str,
        update_info: Optional[Dict],
        proxy_url: Optional[str],
        mode: str,
    ) -> bool:
        """Send to Slack (multi-account)"""
        return self._send_to_multi_accounts(
            channel_name="Slack",
            config_value=self.config["SLACK_WEBHOOK_URL"],
            send_func=lambda url, account_label: send_to_slack(
                webhook_url=url,
                report_data=report_data,
                report_type=report_type,
                update_info=update_info,
                proxy_url=proxy_url,
                mode=mode,
                account_label=account_label,
                batch_size=self.config.get("SLACK_BATCH_SIZE", 4000),
                batch_interval=self.config.get("BATCH_SEND_INTERVAL", 1.0),
                split_content_func=self.split_content_func,
            ),
        )

    def _send_email(
        self,
        report_type: str,
        html_file_path: Optional[str],
    ) -> bool:
        """Send email (original logic; already supports multiple recipients)"""
        return send_to_email(
            from_email=self.config["EMAIL_FROM"],
            password=self.config["EMAIL_PASSWORD"],
            to_email=self.config["EMAIL_TO"],
            report_type=report_type,
            html_file_path=html_file_path,
            custom_smtp_server=self.config.get("EMAIL_SMTP_SERVER", ""),
            custom_smtp_port=self.config.get("EMAIL_SMTP_PORT", ""),
            get_time_func=self.get_time_func,
        )
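How the dispatcher is meant to be wired, as a sketch (the report_data stub below only carries the keys the renderers expect; in the real pipeline it comes from prepare_report_data, which this diff does not show):

# Sketch: wiring the dispatcher from config; report_data is a stub here.
from datetime import datetime

from trendradar.core.loader import load_config
from trendradar.notification import NotificationDispatcher, split_content_into_batches

config = load_config()  # assumes config/config.yaml exists
dispatcher = NotificationDispatcher(
    config,
    get_time_func=datetime.now,
    split_content_func=split_content_into_batches,
)

report_data = {"stats": [], "new_titles": [], "failed_ids": [], "total_new_count": 0}
results = dispatcher.dispatch_all(report_data, "当日汇总", mode="daily")
print(results)  # e.g. {"feishu": True, "telegram": False} for configured channels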
80
trendradar/notification/formatters.py
Normal file
@ -0,0 +1,80 @@
# coding=utf-8
"""
Notification content format conversion module.

Conversions between the formats expected by different push platforms.
"""

import re


def strip_markdown(text: str) -> str:
    """Strip markdown syntax from text, for personal WeChat pushes.

    Args:
        text: text containing markdown formatting

    Returns:
        Plain-text content
    """
    # Remove bold **text** or __text__
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
    text = re.sub(r'__(.+?)__', r'\1', text)

    # Remove italics *text* or _text_
    text = re.sub(r'\*(.+?)\*', r'\1', text)
    text = re.sub(r'_(.+?)_', r'\1', text)

    # Remove strikethrough ~~text~~
    text = re.sub(r'~~(.+?)~~', r'\1', text)

    # Convert links [text](url) -> text url (keep the URL)
    text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1 \2', text)

    # Remove images ![alt](url) -> alt
    text = re.sub(r'!\[(.+?)\]\(.+?\)', r'\1', text)

    # Remove inline code `code`
    text = re.sub(r'`(.+?)`', r'\1', text)

    # Remove blockquote markers >
    text = re.sub(r'^>\s*', '', text, flags=re.MULTILINE)

    # Remove heading markers # ## ### etc.
    text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)

    # Remove horizontal rules --- or ***
    text = re.sub(r'^[\-\*]{3,}\s*$', '', text, flags=re.MULTILINE)

    # Remove HTML tags: <font color='xxx'>text</font> -> text
    text = re.sub(r'<font[^>]*>(.+?)</font>', r'\1', text)
    text = re.sub(r'<[^>]+>', '', text)

    # Collapse runs of blank lines
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()


def convert_markdown_to_mrkdwn(content: str) -> str:
    """
    Convert standard Markdown to Slack's mrkdwn format.

    Rules:
    - **bold** → *bold*
    - [text](url) → <url|text>
    - other formatting (code blocks, lists, ...) is left untouched

    Args:
        content: Markdown content

    Returns:
        Content in Slack mrkdwn format
    """
    # 1. Convert links: [text](url) → <url|text>
    content = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<\2|\1>', content)

    # 2. Convert bold: **text** → *text*
    content = re.sub(r'\*\*([^*]+)\*\*', r'*\1*', content)

    return content
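Both converters in one sketch, showing the plain-text and Slack mrkdwn targets (sample strings are illustrative):

md = "**突发** [详情](https://example.com) `code`"
print(strip_markdown(md))
# -> 突发 详情 https://example.com code

print(convert_markdown_to_mrkdwn("**加粗** [链接](https://example.com)"))
# -> *加粗* <https://example.com|链接>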
109
trendradar/notification/push_manager.py
Normal file
@ -0,0 +1,109 @@
# coding=utf-8
"""
Push-record management module.

Tracks push records, supporting once-per-day pushes and time-window control.
Storage goes through storage_backend, so both local SQLite and remote cloud
storage are supported.
"""

from datetime import datetime
from typing import Callable, Optional, Any

import pytz


class PushRecordManager:
    """
    Push record manager.

    Records are managed uniformly through storage_backend:
    - local runs: LocalStorageBackend, data stored in local SQLite
    - GitHub Actions: RemoteStorageBackend, data stored in the cloud

    This way the once_per_day feature also works on GitHub Actions.
    """

    def __init__(
        self,
        storage_backend: Any,
        get_time_func: Optional[Callable[[], datetime]] = None,
    ):
        """
        Initialize the push record manager.

        Args:
            storage_backend: storage backend instance (LocalStorageBackend or RemoteStorageBackend)
            get_time_func: function returning the current time (should use the configured timezone)
        """
        self.storage_backend = storage_backend
        self.get_time = get_time_func or self._default_get_time

        print(f"[推送记录] 使用 {storage_backend.backend_name} 存储后端")

    def _default_get_time(self) -> datetime:
        """Default time source (UTC+8)"""
        return datetime.now(pytz.timezone("Asia/Shanghai"))

    def has_pushed_today(self) -> bool:
        """
        Check whether a push already happened today.

        Returns:
            Whether a push was recorded today
        """
        return self.storage_backend.has_pushed_today()

    def record_push(self, report_type: str) -> bool:
        """
        Record a push.

        Args:
            report_type: report type

        Returns:
            Whether the record was written
        """
        return self.storage_backend.record_push(report_type)

    def is_in_time_range(self, start_time: str, end_time: str) -> bool:
        """
        Check whether the current time falls inside the given window.

        Args:
            start_time: start time (format: HH:MM)
            end_time: end time (format: HH:MM)

        Returns:
            Whether the current time is inside the window
        """
        now = self.get_time()
        current_time = now.strftime("%H:%M")

        def normalize_time(time_str: str) -> str:
            """Normalize a time string to HH:MM"""
            try:
                parts = time_str.strip().split(":")
                if len(parts) != 2:
                    raise ValueError(f"时间格式错误: {time_str}")

                hour = int(parts[0])
                minute = int(parts[1])

                if not (0 <= hour <= 23 and 0 <= minute <= 59):
                    raise ValueError(f"时间范围错误: {time_str}")

                return f"{hour:02d}:{minute:02d}"
            except Exception as e:
                print(f"时间格式化错误 '{time_str}': {e}")
                return time_str

        normalized_start = normalize_time(start_time)
        normalized_end = normalize_time(end_time)
        normalized_current = normalize_time(current_time)

        result = normalized_start <= normalized_current <= normalized_end

        if not result:
            print(f"时间窗口判断:当前 {normalized_current},窗口 {normalized_start}-{normalized_end}")

        return result
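A sketch of the once-per-day gate as a caller would use it; InMemoryBackend below is a stand-in for LocalStorageBackend/RemoteStorageBackend, which this diff does not show:

class InMemoryBackend:
    """Stand-in backend: the real ones persist to SQLite or the cloud."""
    backend_name = "memory"

    def __init__(self):
        self._pushed = False

    def has_pushed_today(self):
        return self._pushed

    def record_push(self, report_type):
        self._pushed = True
        return True


manager = PushRecordManager(InMemoryBackend())
if manager.is_in_time_range("08:00", "22:00") and not manager.has_pushed_today():
    manager.record_push("当日汇总")  # further runs today are now suppressed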
260
trendradar/notification/renderer.py
Normal file
@ -0,0 +1,260 @@
# coding=utf-8
"""
Notification content rendering module.

Renders formatted push messages for the supported platforms.
"""

from datetime import datetime
from typing import Dict, List, Optional, Callable

from trendradar.report.formatter import format_title_for_platform


def render_feishu_content(
    report_data: Dict,
    update_info: Optional[Dict] = None,
    mode: str = "daily",
    separator: str = "---",
    reverse_content_order: bool = False,
    get_time_func: Optional[Callable[[], datetime]] = None,
) -> str:
    """Render the Feishu notification content.

    Args:
        report_data: report data dict with stats, new_titles, failed_ids, total_new_count
        update_info: version update info (optional)
        mode: report mode ("daily", "incremental", "current")
        separator: content separator
        reverse_content_order: whether to flip the order (new items first)
        get_time_func: function returning the current time (optional; defaults to datetime.now())

    Returns:
        The formatted Feishu message content
    """
    # Hot-keyword statistics section
    stats_content = ""
    if report_data["stats"]:
        stats_content += "📊 **热点词汇统计**\n\n"

        total_count = len(report_data["stats"])

        for i, stat in enumerate(report_data["stats"]):
            word = stat["word"]
            count = stat["count"]

            sequence_display = f"<font color='grey'>[{i + 1}/{total_count}]</font>"

            if count >= 10:
                stats_content += f"🔥 {sequence_display} **{word}** : <font color='red'>{count}</font> 条\n\n"
            elif count >= 5:
                stats_content += f"📈 {sequence_display} **{word}** : <font color='orange'>{count}</font> 条\n\n"
            else:
                stats_content += f"📌 {sequence_display} **{word}** : {count} 条\n\n"

            for j, title_data in enumerate(stat["titles"], 1):
                formatted_title = format_title_for_platform(
                    "feishu", title_data, show_source=True
                )
                stats_content += f"  {j}. {formatted_title}\n"

                if j < len(stat["titles"]):
                    stats_content += "\n"

            if i < len(report_data["stats"]) - 1:
                stats_content += f"\n{separator}\n\n"

    # New-items section
    new_titles_content = ""
    if report_data["new_titles"]:
        new_titles_content += (
            f"🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
        )

        for source_data in report_data["new_titles"]:
            new_titles_content += (
                f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n"
            )

            for j, title_data in enumerate(source_data["titles"], 1):
                title_data_copy = title_data.copy()
                title_data_copy["is_new"] = False
                formatted_title = format_title_for_platform(
                    "feishu", title_data_copy, show_source=False
                )
                new_titles_content += f"  {j}. {formatted_title}\n"

            new_titles_content += "\n"

    # Order the two sections according to configuration
    text_content = ""
    if reverse_content_order:
        # New items first, keyword statistics second
        if new_titles_content:
            text_content += new_titles_content
            if stats_content:
                text_content += f"\n{separator}\n\n"
        if stats_content:
            text_content += stats_content
    else:
        # Default: keyword statistics first, new items second
        if stats_content:
            text_content += stats_content
            if new_titles_content:
                text_content += f"\n{separator}\n\n"
        if new_titles_content:
            text_content += new_titles_content

    if not text_content:
        if mode == "incremental":
            mode_text = "增量模式下暂无新增匹配的热点词汇"
        elif mode == "current":
            mode_text = "当前榜单模式下暂无匹配的热点词汇"
        else:
            mode_text = "暂无匹配的热点词汇"
        text_content = f"📭 {mode_text}\n\n"

    if report_data["failed_ids"]:
        if text_content and "暂无匹配" not in text_content:
            text_content += f"\n{separator}\n\n"

        text_content += "⚠️ **数据获取失败的平台:**\n\n"
        for i, id_value in enumerate(report_data["failed_ids"], 1):
            text_content += f"  • <font color='red'>{id_value}</font>\n"

    # Get the current time
    now = get_time_func() if get_time_func else datetime.now()
    text_content += (
        f"\n\n<font color='grey'>更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}</font>"
    )

    if update_info:
        text_content += f"\n<font color='grey'>TrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}</font>"

    return text_content


def render_dingtalk_content(
    report_data: Dict,
    update_info: Optional[Dict] = None,
    mode: str = "daily",
    reverse_content_order: bool = False,
    get_time_func: Optional[Callable[[], datetime]] = None,
) -> str:
    """Render the DingTalk notification content.

    Args:
        report_data: report data dict with stats, new_titles, failed_ids, total_new_count
        update_info: version update info (optional)
        mode: report mode ("daily", "incremental", "current")
        reverse_content_order: whether to flip the order (new items first)
        get_time_func: function returning the current time (optional; defaults to datetime.now())

    Returns:
        The formatted DingTalk message content
    """
    total_titles = sum(
        len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0
    )
    now = get_time_func() if get_time_func else datetime.now()

    # Header
    header_content = f"**总新闻数:** {total_titles}\n\n"
    header_content += f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
    header_content += "**类型:** 热点分析报告\n\n"
    header_content += "---\n\n"

    # Hot-keyword statistics section
    stats_content = ""
    if report_data["stats"]:
        stats_content += "📊 **热点词汇统计**\n\n"

        total_count = len(report_data["stats"])

        for i, stat in enumerate(report_data["stats"]):
            word = stat["word"]
            count = stat["count"]

            sequence_display = f"[{i + 1}/{total_count}]"

            if count >= 10:
                stats_content += f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
            elif count >= 5:
                stats_content += f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
            else:
                stats_content += f"📌 {sequence_display} **{word}** : {count} 条\n\n"

            for j, title_data in enumerate(stat["titles"], 1):
                formatted_title = format_title_for_platform(
                    "dingtalk", title_data, show_source=True
                )
                stats_content += f"  {j}. {formatted_title}\n"

                if j < len(stat["titles"]):
                    stats_content += "\n"

            if i < len(report_data["stats"]) - 1:
                stats_content += "\n---\n\n"

    # New-items section
    new_titles_content = ""
    if report_data["new_titles"]:
        new_titles_content += (
            f"🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
        )

        for source_data in report_data["new_titles"]:
            new_titles_content += f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"

            for j, title_data in enumerate(source_data["titles"], 1):
                title_data_copy = title_data.copy()
                title_data_copy["is_new"] = False
                formatted_title = format_title_for_platform(
                    "dingtalk", title_data_copy, show_source=False
                )
                new_titles_content += f"  {j}. {formatted_title}\n"

            new_titles_content += "\n"

    # Order the two sections according to configuration
    text_content = header_content
    if reverse_content_order:
        # New items first, keyword statistics second
        if new_titles_content:
            text_content += new_titles_content
            if stats_content:
                text_content += "\n---\n\n"
        if stats_content:
            text_content += stats_content
    else:
        # Default: keyword statistics first, new items second
        if stats_content:
            text_content += stats_content
            if new_titles_content:
                text_content += "\n---\n\n"
        if new_titles_content:
            text_content += new_titles_content

    if not stats_content and not new_titles_content:
        if mode == "incremental":
            mode_text = "增量模式下暂无新增匹配的热点词汇"
        elif mode == "current":
            mode_text = "当前榜单模式下暂无匹配的热点词汇"
        else:
            mode_text = "暂无匹配的热点词汇"
        text_content += f"📭 {mode_text}\n\n"

    if report_data["failed_ids"]:
        if "暂无匹配" not in text_content:
            text_content += "\n---\n\n"

        text_content += "⚠️ **数据获取失败的平台:**\n\n"
        for i, id_value in enumerate(report_data["failed_ids"], 1):
            text_content += f"  • **{id_value}**\n"

    text_content += f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"

    if update_info:
        text_content += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"

    return text_content
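A minimal render call; the stats entry keeps titles empty because the shape of title entries is defined by format_title_for_platform, which this diff does not show:

report_data = {
    "stats": [{"word": "AI", "count": 3, "titles": []}],  # titles omitted here
    "new_titles": [],
    "failed_ids": ["zhihu"],
    "total_new_count": 0,
}
print(render_feishu_content(report_data, mode="daily"))
# -> keyword block, failed-platform block, then the grey timestamp footer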
1033
trendradar/notification/senders.py
Normal file
File diff suppressed because it is too large
580
trendradar/notification/splitter.py
Normal file
@ -0,0 +1,580 @@
# coding=utf-8
"""
Message batching module.

Splits message content into batches so that every message stays within the
size limit of its target platform.
"""

from datetime import datetime
from typing import Dict, List, Optional, Callable

from trendradar.report.formatter import format_title_for_platform


# Default batch sizes (bytes) per platform
DEFAULT_BATCH_SIZES = {
    "dingtalk": 20000,
    "feishu": 29000,
    "ntfy": 3800,
    "default": 4000,
}


def split_content_into_batches(
    report_data: Dict,
    format_type: str,
    update_info: Optional[Dict] = None,
    max_bytes: Optional[int] = None,
    mode: str = "daily",
    batch_sizes: Optional[Dict[str, int]] = None,
    feishu_separator: str = "---",
    reverse_content_order: bool = False,
    get_time_func: Optional[Callable[[], datetime]] = None,
) -> List[str]:
    """Split message content into batches, keeping each word-group header
    together with at least its first news item.

    Args:
        report_data: Report data dict with stats, new_titles, failed_ids, total_new_count
        format_type: Target format (feishu, dingtalk, wework, telegram, ntfy, bark, slack)
        update_info: Version update info (optional)
        max_bytes: Maximum bytes per batch (optional; platform defaults are used otherwise)
        mode: Report mode (daily, incremental, current)
        batch_sizes: Batch size overrides (optional)
        feishu_separator: Separator used in Feishu messages
        reverse_content_order: Whether to put new items before the statistics section
        get_time_func: Function returning the current time (optional)

    Returns:
        List of batched message strings
    """
    # Merge batch size overrides into the defaults
    sizes = {**DEFAULT_BATCH_SIZES, **(batch_sizes or {})}

    if max_bytes is None:
        if format_type == "dingtalk":
            max_bytes = sizes.get("dingtalk", 20000)
        elif format_type == "feishu":
            max_bytes = sizes.get("feishu", 29000)
        elif format_type == "ntfy":
            max_bytes = sizes.get("ntfy", 3800)
        else:
            max_bytes = sizes.get("default", 4000)

    batches = []

    total_titles = sum(
        len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0
    )
    now = get_time_func() if get_time_func else datetime.now()

    base_header = ""
    if format_type in ("wework", "bark"):
        base_header = f"**总新闻数:** {total_titles}\n\n\n\n"
    elif format_type == "telegram":
        base_header = f"总新闻数: {total_titles}\n\n"
    elif format_type == "ntfy":
        base_header = f"**总新闻数:** {total_titles}\n\n"
    elif format_type == "feishu":
        base_header = ""
    elif format_type == "dingtalk":
        base_header = f"**总新闻数:** {total_titles}\n\n"
        base_header += f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
        base_header += "**类型:** 热点分析报告\n\n"
        base_header += "---\n\n"
    elif format_type == "slack":
        base_header = f"*总新闻数:* {total_titles}\n\n"

    base_footer = ""
    if format_type in ("wework", "bark"):
        base_footer = f"\n\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
        if update_info:
            base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
    elif format_type == "telegram":
        base_footer = f"\n\n更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
        if update_info:
            base_footer += f"\nTrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}"
    elif format_type == "ntfy":
        base_footer = f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
        if update_info:
            base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
    elif format_type == "feishu":
        base_footer = f"\n\n<font color='grey'>更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}</font>"
        if update_info:
            base_footer += f"\n<font color='grey'>TrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}</font>"
    elif format_type == "dingtalk":
        base_footer = f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
        if update_info:
            base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
    elif format_type == "slack":
        base_footer = f"\n\n_更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}_"
        if update_info:
            # Close the bold marker before the italic underscore (was unbalanced)
            base_footer += f"\n_TrendRadar 发现新版本 *{update_info['remote_version']}*,当前 *{update_info['current_version']}*_"

    stats_header = ""
    if report_data["stats"]:
        if format_type in ("wework", "bark"):
            stats_header = "📊 **热点词汇统计**\n\n"
        elif format_type == "telegram":
            stats_header = "📊 热点词汇统计\n\n"
        elif format_type == "ntfy":
            stats_header = "📊 **热点词汇统计**\n\n"
        elif format_type == "feishu":
            stats_header = "📊 **热点词汇统计**\n\n"
        elif format_type == "dingtalk":
            stats_header = "📊 **热点词汇统计**\n\n"
        elif format_type == "slack":
            stats_header = "📊 *热点词汇统计*\n\n"

    current_batch = base_header
    current_batch_has_content = False

    if (
        not report_data["stats"]
        and not report_data["new_titles"]
        and not report_data["failed_ids"]
    ):
        if mode == "incremental":
            mode_text = "增量模式下暂无新增匹配的热点词汇"
        elif mode == "current":
            mode_text = "当前榜单模式下暂无匹配的热点词汇"
        else:
            mode_text = "暂无匹配的热点词汇"
        simple_content = f"📭 {mode_text}\n\n"
        final_content = base_header + simple_content + base_footer
        batches.append(final_content)
        return batches

    def process_stats_section(current_batch, current_batch_has_content, batches):
        """Process the hot-word statistics section."""
        if not report_data["stats"]:
            return current_batch, current_batch_has_content, batches

        total_count = len(report_data["stats"])

        # Append the statistics header
        test_content = current_batch + stats_header
        if (
            len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
            < max_bytes
        ):
            current_batch = test_content
            current_batch_has_content = True
        else:
            if current_batch_has_content:
                batches.append(current_batch + base_footer)
            current_batch = base_header + stats_header
            current_batch_has_content = True

        # Process word groups one by one (keep each header atomic with its first news item)
        for i, stat in enumerate(report_data["stats"]):
            word = stat["word"]
            count = stat["count"]
            sequence_display = f"[{i + 1}/{total_count}]"

            # Build the word-group header
            word_header = ""
            if format_type in ("wework", "bark"):
                if count >= 10:
                    word_header = (
                        f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
                    )
                elif count >= 5:
                    word_header = (
                        f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
                    )
                else:
                    word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n"
            elif format_type == "telegram":
                if count >= 10:
                    word_header = f"🔥 {sequence_display} {word} : {count} 条\n\n"
                elif count >= 5:
                    word_header = f"📈 {sequence_display} {word} : {count} 条\n\n"
                else:
                    word_header = f"📌 {sequence_display} {word} : {count} 条\n\n"
            elif format_type == "ntfy":
                if count >= 10:
                    word_header = (
                        f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
                    )
                elif count >= 5:
                    word_header = (
                        f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
                    )
                else:
                    word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n"
            elif format_type == "feishu":
                if count >= 10:
                    word_header = f"🔥 <font color='grey'>{sequence_display}</font> **{word}** : <font color='red'>{count}</font> 条\n\n"
                elif count >= 5:
                    word_header = f"📈 <font color='grey'>{sequence_display}</font> **{word}** : <font color='orange'>{count}</font> 条\n\n"
                else:
                    word_header = f"📌 <font color='grey'>{sequence_display}</font> **{word}** : {count} 条\n\n"
            elif format_type == "dingtalk":
                if count >= 10:
                    word_header = (
                        f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
                    )
                elif count >= 5:
                    word_header = (
                        f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
                    )
                else:
                    word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n"
            elif format_type == "slack":
                if count >= 10:
                    word_header = (
                        f"🔥 {sequence_display} *{word}* : *{count}* 条\n\n"
                    )
                elif count >= 5:
                    word_header = (
                        f"📈 {sequence_display} *{word}* : *{count}* 条\n\n"
                    )
                else:
                    word_header = f"📌 {sequence_display} *{word}* : {count} 条\n\n"

            # Build the first news line
            first_news_line = ""
            if stat["titles"]:
                first_title_data = stat["titles"][0]
                if format_type in ("wework", "bark"):
                    formatted_title = format_title_for_platform(
                        "wework", first_title_data, show_source=True
                    )
                elif format_type == "telegram":
                    formatted_title = format_title_for_platform(
                        "telegram", first_title_data, show_source=True
                    )
                elif format_type == "ntfy":
                    formatted_title = format_title_for_platform(
                        "ntfy", first_title_data, show_source=True
                    )
                elif format_type == "feishu":
                    formatted_title = format_title_for_platform(
                        "feishu", first_title_data, show_source=True
                    )
                elif format_type == "dingtalk":
                    formatted_title = format_title_for_platform(
                        "dingtalk", first_title_data, show_source=True
                    )
                elif format_type == "slack":
                    formatted_title = format_title_for_platform(
                        "slack", first_title_data, show_source=True
                    )
                else:
                    formatted_title = f"{first_title_data['title']}"

                first_news_line = f" 1. {formatted_title}\n"
                if len(stat["titles"]) > 1:
                    first_news_line += "\n"

            # Atomicity check: the word-group header and its first news item go together
            word_with_first_news = word_header + first_news_line
            test_content = current_batch + word_with_first_news

            if (
                len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
                >= max_bytes
            ):
                # The current batch cannot hold it; start a new batch
                if current_batch_has_content:
                    batches.append(current_batch + base_footer)
                current_batch = base_header + stats_header + word_with_first_news
                current_batch_has_content = True
                start_index = 1
            else:
                current_batch = test_content
                current_batch_has_content = True
                start_index = 1

            # Process the remaining news items
            for j in range(start_index, len(stat["titles"])):
                title_data = stat["titles"][j]
                if format_type in ("wework", "bark"):
                    formatted_title = format_title_for_platform(
                        "wework", title_data, show_source=True
                    )
                elif format_type == "telegram":
                    formatted_title = format_title_for_platform(
                        "telegram", title_data, show_source=True
                    )
                elif format_type == "ntfy":
                    formatted_title = format_title_for_platform(
                        "ntfy", title_data, show_source=True
                    )
                elif format_type == "feishu":
                    formatted_title = format_title_for_platform(
                        "feishu", title_data, show_source=True
                    )
                elif format_type == "dingtalk":
                    formatted_title = format_title_for_platform(
                        "dingtalk", title_data, show_source=True
                    )
                elif format_type == "slack":
                    formatted_title = format_title_for_platform(
                        "slack", title_data, show_source=True
                    )
                else:
                    formatted_title = f"{title_data['title']}"

                news_line = f" {j + 1}. {formatted_title}\n"
                if j < len(stat["titles"]) - 1:
                    news_line += "\n"

                test_content = current_batch + news_line
                if (
                    len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
                    >= max_bytes
                ):
                    if current_batch_has_content:
                        batches.append(current_batch + base_footer)
                    current_batch = base_header + stats_header + word_header + news_line
                    current_batch_has_content = True
                else:
                    current_batch = test_content
                    current_batch_has_content = True

            # Separator between word groups
            if i < len(report_data["stats"]) - 1:
                separator = ""
                if format_type in ("wework", "bark"):
                    separator = "\n\n\n\n"
                elif format_type == "telegram":
                    separator = "\n\n"
                elif format_type == "ntfy":
                    separator = "\n\n"
                elif format_type == "feishu":
                    separator = f"\n{feishu_separator}\n\n"
                elif format_type == "dingtalk":
                    separator = "\n---\n\n"
                elif format_type == "slack":
                    separator = "\n\n"

                test_content = current_batch + separator
                if (
                    len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
                    < max_bytes
                ):
                    current_batch = test_content

        return current_batch, current_batch_has_content, batches

    def process_new_titles_section(current_batch, current_batch_has_content, batches):
        """Process the newly added news section."""
        if not report_data["new_titles"]:
            return current_batch, current_batch_has_content, batches

        new_header = ""
        if format_type in ("wework", "bark"):
            new_header = f"\n\n\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
        elif format_type == "telegram":
            new_header = (
                f"\n\n🆕 本次新增热点新闻 (共 {report_data['total_new_count']} 条)\n\n"
            )
        elif format_type == "ntfy":
            new_header = f"\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
        elif format_type == "feishu":
            new_header = f"\n{feishu_separator}\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
        elif format_type == "dingtalk":
            new_header = f"\n---\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
        elif format_type == "slack":
            new_header = f"\n\n🆕 *本次新增热点新闻* (共 {report_data['total_new_count']} 条)\n\n"

        test_content = current_batch + new_header
        if (
            len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
            >= max_bytes
        ):
            if current_batch_has_content:
                batches.append(current_batch + base_footer)
            current_batch = base_header + new_header
            current_batch_has_content = True
        else:
            current_batch = test_content
            current_batch_has_content = True

        # Process each source of new items
        for source_data in report_data["new_titles"]:
            source_header = ""
            if format_type in ("wework", "bark"):
                source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
            elif format_type == "telegram":
                source_header = f"{source_data['source_name']} ({len(source_data['titles'])} 条):\n\n"
            elif format_type == "ntfy":
                source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
            elif format_type == "feishu":
                source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
            elif format_type == "dingtalk":
                source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
            elif format_type == "slack":
                source_header = f"*{source_data['source_name']}* ({len(source_data['titles'])} 条):\n\n"

            # Build the first new item
            first_news_line = ""
            if source_data["titles"]:
                first_title_data = source_data["titles"][0]
                title_data_copy = first_title_data.copy()
                title_data_copy["is_new"] = False

                if format_type in ("wework", "bark"):
                    formatted_title = format_title_for_platform(
                        "wework", title_data_copy, show_source=False
                    )
                elif format_type == "telegram":
                    formatted_title = format_title_for_platform(
                        "telegram", title_data_copy, show_source=False
                    )
                elif format_type == "feishu":
                    formatted_title = format_title_for_platform(
                        "feishu", title_data_copy, show_source=False
                    )
                elif format_type == "dingtalk":
                    formatted_title = format_title_for_platform(
                        "dingtalk", title_data_copy, show_source=False
                    )
                elif format_type == "slack":
                    formatted_title = format_title_for_platform(
                        "slack", title_data_copy, show_source=False
                    )
                else:
                    # ntfy (and any other format) falls through to the plain title
                    formatted_title = f"{title_data_copy['title']}"

                first_news_line = f" 1. {formatted_title}\n"

            # Atomicity check: source header plus its first item
            source_with_first_news = source_header + first_news_line
            test_content = current_batch + source_with_first_news

            if (
                len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
                >= max_bytes
            ):
                if current_batch_has_content:
                    batches.append(current_batch + base_footer)
                current_batch = base_header + new_header + source_with_first_news
                current_batch_has_content = True
                start_index = 1
            else:
                current_batch = test_content
                current_batch_has_content = True
                start_index = 1

            # Process the remaining new items
            for j in range(start_index, len(source_data["titles"])):
                title_data = source_data["titles"][j]
                title_data_copy = title_data.copy()
                title_data_copy["is_new"] = False

                if format_type in ("wework", "bark"):
                    # bark uses the same markdown as wework (matches the first-item branch above)
                    formatted_title = format_title_for_platform(
                        "wework", title_data_copy, show_source=False
                    )
                elif format_type == "telegram":
                    formatted_title = format_title_for_platform(
                        "telegram", title_data_copy, show_source=False
                    )
                elif format_type == "feishu":
                    formatted_title = format_title_for_platform(
                        "feishu", title_data_copy, show_source=False
                    )
                elif format_type == "dingtalk":
                    formatted_title = format_title_for_platform(
                        "dingtalk", title_data_copy, show_source=False
                    )
                elif format_type == "slack":
                    formatted_title = format_title_for_platform(
                        "slack", title_data_copy, show_source=False
                    )
                else:
                    formatted_title = f"{title_data_copy['title']}"

                news_line = f" {j + 1}. {formatted_title}\n"

                test_content = current_batch + news_line
                if (
                    len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
                    >= max_bytes
                ):
                    if current_batch_has_content:
                        batches.append(current_batch + base_footer)
                    current_batch = base_header + new_header + source_header + news_line
                    current_batch_has_content = True
                else:
                    current_batch = test_content
                    current_batch_has_content = True

            current_batch += "\n"

        return current_batch, current_batch_has_content, batches

    # Section order depends on configuration
    if reverse_content_order:
        # New items first, statistics second
        current_batch, current_batch_has_content, batches = process_new_titles_section(
            current_batch, current_batch_has_content, batches
        )
        current_batch, current_batch_has_content, batches = process_stats_section(
            current_batch, current_batch_has_content, batches
        )
    else:
        # Default: statistics first, new items second
        current_batch, current_batch_has_content, batches = process_stats_section(
            current_batch, current_batch_has_content, batches
        )
        current_batch, current_batch_has_content, batches = process_new_titles_section(
            current_batch, current_batch_has_content, batches
        )

    if report_data["failed_ids"]:
        failed_header = ""
        if format_type == "wework":
            failed_header = "\n\n\n\n⚠️ **数据获取失败的平台:**\n\n"
        elif format_type == "telegram":
            failed_header = "\n\n⚠️ 数据获取失败的平台:\n\n"
        elif format_type == "ntfy":
            failed_header = "\n\n⚠️ **数据获取失败的平台:**\n\n"
        elif format_type == "feishu":
            failed_header = f"\n{feishu_separator}\n\n⚠️ **数据获取失败的平台:**\n\n"
        elif format_type == "dingtalk":
            failed_header = "\n---\n\n⚠️ **数据获取失败的平台:**\n\n"

        test_content = current_batch + failed_header
        if (
            len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
            >= max_bytes
        ):
            if current_batch_has_content:
                batches.append(current_batch + base_footer)
            current_batch = base_header + failed_header
            current_batch_has_content = True
        else:
            current_batch = test_content
            current_batch_has_content = True

        for i, id_value in enumerate(report_data["failed_ids"], 1):
            if format_type == "feishu":
                failed_line = f" • <font color='red'>{id_value}</font>\n"
            elif format_type == "dingtalk":
                failed_line = f" • **{id_value}**\n"
            else:
                failed_line = f" • {id_value}\n"

            test_content = current_batch + failed_line
            if (
                len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
                >= max_bytes
            ):
                if current_batch_has_content:
                    batches.append(current_batch + base_footer)
                current_batch = base_header + failed_header + failed_line
                current_batch_has_content = True
            else:
                current_batch = test_content
                current_batch_has_content = True

    # Finalize the last batch
    if current_batch_has_content:
        batches.append(current_batch + base_footer)

    return batches
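
The batching entry point above is self-contained, so it can be exercised outside the notification pipeline. A minimal sketch (not part of this commit; every field value below is invented for illustration) that drives it with a hand-built report_data dict and a deliberately small max_bytes to force a split:

# Sketch: exercising split_content_into_batches directly (illustrative values only)
from trendradar.notification.splitter import split_content_into_batches

title = {
    "title": "Example headline",
    "source_name": "demo-source",
    "time_display": "08:00",
    "count": 1,
    "ranks": [1],
    "rank_threshold": 3,
    "url": "https://example.com/a",
    "mobile_url": "",
}
report_data = {
    "stats": [{"word": "AI", "count": 2, "titles": [title, {**title, "ranks": [2]}]}],
    "new_titles": [],
    "failed_ids": [],
    "total_new_count": 0,
}

batches = split_content_into_batches(report_data, "telegram", max_bytes=600)
for i, batch in enumerate(batches, 1):
    print(f"--- batch {i}: {len(batch.encode('utf-8'))} bytes ---")
    print(batch)

Each batch repeats the base header and footer, which is the invariant the atomicity checks above are protecting.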
40
trendradar/report/__init__.py
Normal file
@ -0,0 +1,40 @@
# coding=utf-8
"""
Report generation package.

Provides report generation and formatting utilities, including:
- HTML report generation
- Title formatting helpers

Module layout:
- helpers: report helper functions (cleaning, escaping, formatting)
- formatter: per-platform title formatting
- html: HTML report rendering
- generator: report generators
"""

from trendradar.report.helpers import (
    clean_title,
    html_escape,
    format_rank_display,
)
from trendradar.report.formatter import format_title_for_platform
from trendradar.report.html import render_html_content
from trendradar.report.generator import (
    prepare_report_data,
    generate_html_report,
)

__all__ = [
    # Helper functions
    "clean_title",
    "html_escape",
    "format_rank_display",
    # Formatting
    "format_title_for_platform",
    # HTML rendering
    "render_html_content",
    # Report generators
    "prepare_report_data",
    "generate_html_report",
]
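
Because the package re-exports its public API, callers can import from trendradar.report directly rather than from the submodules; a quick sketch of the intended import surface:

# Sketch: importing through the package rather than the submodules
from trendradar.report import clean_title, format_title_for_platform, prepare_report_data

print(clean_title(" hello\nworld "))  # "hello world"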
223
trendradar/report/formatter.py
Normal file
@ -0,0 +1,223 @@
# coding=utf-8
"""
Per-platform title formatting module.

Formats news titles for each supported notification platform.
"""

from typing import Dict

from trendradar.report.helpers import clean_title, html_escape, format_rank_display


def format_title_for_platform(
    platform: str, title_data: Dict, show_source: bool = True
) -> str:
    """Unified title formatting.

    Produces the title string in the format each platform expects.

    Args:
        platform: Target platform; one of:
            - "feishu": Feishu
            - "dingtalk": DingTalk
            - "wework": WeCom (WeChat Work)
            - "bark": Bark
            - "telegram": Telegram
            - "ntfy": ntfy
            - "slack": Slack
            - "html": HTML report
        title_data: Title data dict with the fields:
            - title: title text
            - source_name: source name
            - time_display: display time
            - count: occurrence count
            - ranks: list of ranks
            - rank_threshold: highlight threshold
            - url: desktop link
            - mobile_url: mobile link (preferred)
            - is_new: whether this is a newly added title (optional)
        show_source: Whether to show the source name

    Returns:
        The formatted title string
    """
    rank_display = format_rank_display(
        title_data["ranks"], title_data["rank_threshold"], platform
    )

    link_url = title_data["mobile_url"] or title_data["url"]
    cleaned_title = clean_title(title_data["title"])

    if platform == "feishu":
        if link_url:
            formatted_title = f"[{cleaned_title}]({link_url})"
        else:
            formatted_title = cleaned_title

        title_prefix = "🆕 " if title_data.get("is_new") else ""

        if show_source:
            result = f"<font color='grey'>[{title_data['source_name']}]</font> {title_prefix}{formatted_title}"
        else:
            result = f"{title_prefix}{formatted_title}"

        if rank_display:
            result += f" {rank_display}"
        if title_data["time_display"]:
            result += f" <font color='grey'>- {title_data['time_display']}</font>"
        if title_data["count"] > 1:
            result += f" <font color='green'>({title_data['count']}次)</font>"

        return result

    elif platform == "dingtalk":
        if link_url:
            formatted_title = f"[{cleaned_title}]({link_url})"
        else:
            formatted_title = cleaned_title

        title_prefix = "🆕 " if title_data.get("is_new") else ""

        if show_source:
            result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
        else:
            result = f"{title_prefix}{formatted_title}"

        if rank_display:
            result += f" {rank_display}"
        if title_data["time_display"]:
            result += f" - {title_data['time_display']}"
        if title_data["count"] > 1:
            result += f" ({title_data['count']}次)"

        return result

    elif platform in ("wework", "bark"):
        # WeCom and Bark both use markdown
        if link_url:
            formatted_title = f"[{cleaned_title}]({link_url})"
        else:
            formatted_title = cleaned_title

        title_prefix = "🆕 " if title_data.get("is_new") else ""

        if show_source:
            result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
        else:
            result = f"{title_prefix}{formatted_title}"

        if rank_display:
            result += f" {rank_display}"
        if title_data["time_display"]:
            result += f" - {title_data['time_display']}"
        if title_data["count"] > 1:
            result += f" ({title_data['count']}次)"

        return result

    elif platform == "telegram":
        if link_url:
            formatted_title = f'<a href="{link_url}">{html_escape(cleaned_title)}</a>'
        else:
            # Escape even without a link, since Telegram parses the HTML
            formatted_title = html_escape(cleaned_title)

        title_prefix = "🆕 " if title_data.get("is_new") else ""

        if show_source:
            result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
        else:
            result = f"{title_prefix}{formatted_title}"

        if rank_display:
            result += f" {rank_display}"
        if title_data["time_display"]:
            result += f" <code>- {title_data['time_display']}</code>"
        if title_data["count"] > 1:
            result += f" <code>({title_data['count']}次)</code>"

        return result

    elif platform == "ntfy":
        if link_url:
            formatted_title = f"[{cleaned_title}]({link_url})"
        else:
            formatted_title = cleaned_title

        title_prefix = "🆕 " if title_data.get("is_new") else ""

        if show_source:
            result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
        else:
            result = f"{title_prefix}{formatted_title}"

        if rank_display:
            result += f" {rank_display}"
        if title_data["time_display"]:
            result += f" `- {title_data['time_display']}`"
        if title_data["count"] > 1:
            result += f" `({title_data['count']}次)`"

        return result

    elif platform == "slack":
        # Slack uses mrkdwn
        if link_url:
            # Slack link format: <url|text>
            formatted_title = f"<{link_url}|{cleaned_title}>"
        else:
            formatted_title = cleaned_title

        title_prefix = "🆕 " if title_data.get("is_new") else ""

        if show_source:
            result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
        else:
            result = f"{title_prefix}{formatted_title}"

        # Rank (bold via *); rank_display was already computed for "slack" above
        if rank_display:
            result += f" {rank_display}"
        if title_data["time_display"]:
            result += f" `- {title_data['time_display']}`"
        if title_data["count"] > 1:
            result += f" `({title_data['count']}次)`"

        return result

    elif platform == "html":
        escaped_title = html_escape(cleaned_title)
        escaped_source_name = html_escape(title_data["source_name"])

        if link_url:
            escaped_url = html_escape(link_url)
            formatted_title = f'[{escaped_source_name}] <a href="{escaped_url}" target="_blank" class="news-link">{escaped_title}</a>'
        else:
            formatted_title = (
                f'[{escaped_source_name}] <span class="no-link">{escaped_title}</span>'
            )

        if rank_display:
            formatted_title += f" {rank_display}"
        if title_data["time_display"]:
            escaped_time = html_escape(title_data["time_display"])
            formatted_title += f" <font color='grey'>- {escaped_time}</font>"
        if title_data["count"] > 1:
            formatted_title += f" <font color='green'>({title_data['count']}次)</font>"

        if title_data.get("is_new"):
            formatted_title = f"<div class='new-title'>🆕 {formatted_title}</div>"

        return formatted_title

    else:
        return cleaned_title
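
A small sketch of the formatter in action (not part of the diff; the title_data values are made up). It shows how the same record renders for two platforms:

# Sketch: rendering one record for Feishu and Slack (illustrative values only)
from trendradar.report.formatter import format_title_for_platform

title_data = {
    "title": "Example\n  headline",
    "source_name": "demo-source",
    "time_display": "08:00",
    "count": 3,
    "ranks": [2, 5],
    "rank_threshold": 3,
    "url": "https://example.com/a",
    "mobile_url": "https://m.example.com/a",
    "is_new": True,
}

print(format_title_for_platform("feishu", title_data))
# Roughly: <font color='grey'>[demo-source]</font> 🆕 [Example headline](https://m.example.com/a)
#          <font color='red'>**[2 - 5]**</font> <font color='grey'>- 08:00</font> <font color='green'>(3次)</font>
print(format_title_for_platform("slack", title_data, show_source=False))
# Roughly: 🆕 <https://m.example.com/a|Example headline> *[2 - 5]* `- 08:00` `(3次)`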
235
trendradar/report/generator.py
Normal file
@ -0,0 +1,235 @@
# coding=utf-8
"""
Report generation module.

Prepares report data and generates HTML reports:
- prepare_report_data: assemble the report data
- generate_html_report: write an HTML report
"""

from pathlib import Path
from typing import Dict, List, Optional, Callable


def prepare_report_data(
    stats: List[Dict],
    failed_ids: Optional[List] = None,
    new_titles: Optional[Dict] = None,
    id_to_name: Optional[Dict] = None,
    mode: str = "daily",
    rank_threshold: int = 3,
    matches_word_groups_func: Optional[Callable] = None,
    load_frequency_words_func: Optional[Callable] = None,
) -> Dict:
    """
    Prepare report data.

    Args:
        stats: statistics results
        failed_ids: failed source IDs
        new_titles: newly added titles
        id_to_name: mapping from source ID to name
        mode: report mode (daily/incremental/current)
        rank_threshold: rank threshold
        matches_word_groups_func: word-group matcher function
        load_frequency_words_func: frequency-word loader function

    Returns:
        Dict: the prepared report data
    """
    processed_new_titles = []

    # Hide the new-items section in incremental mode
    hide_new_section = mode == "incremental"

    # Only process the new-items section when it is not hidden
    if not hide_new_section:
        filtered_new_titles = {}
        if new_titles and id_to_name:
            # Filter with the matcher when one is provided
            if matches_word_groups_func and load_frequency_words_func:
                word_groups, filter_words, global_filters = load_frequency_words_func()
                for source_id, titles_data in new_titles.items():
                    filtered_titles = {}
                    for title, title_data in titles_data.items():
                        if matches_word_groups_func(title, word_groups, filter_words, global_filters):
                            filtered_titles[title] = title_data
                    if filtered_titles:
                        filtered_new_titles[source_id] = filtered_titles
            else:
                # No matcher provided: keep everything
                filtered_new_titles = new_titles

        # Log the filtered new-item count (matches what the push shows)
        original_new_count = sum(len(titles) for titles in new_titles.values()) if new_titles else 0
        filtered_new_count = sum(len(titles) for titles in filtered_new_titles.values()) if filtered_new_titles else 0
        if original_new_count > 0:
            print(f"频率词过滤后:{filtered_new_count} 条新增热点匹配(原始 {original_new_count} 条)")

        if filtered_new_titles and id_to_name:
            for source_id, titles_data in filtered_new_titles.items():
                source_name = id_to_name.get(source_id, source_id)
                source_titles = []

                for title, title_data in titles_data.items():
                    url = title_data.get("url", "")
                    mobile_url = title_data.get("mobileUrl", "")
                    ranks = title_data.get("ranks", [])

                    processed_title = {
                        "title": title,
                        "source_name": source_name,
                        "time_display": "",
                        "count": 1,
                        "ranks": ranks,
                        "rank_threshold": rank_threshold,
                        "url": url,
                        "mobile_url": mobile_url,
                        "is_new": True,
                    }
                    source_titles.append(processed_title)

                if source_titles:
                    processed_new_titles.append(
                        {
                            "source_id": source_id,
                            "source_name": source_name,
                            "titles": source_titles,
                        }
                    )

    processed_stats = []
    for stat in stats:
        if stat["count"] <= 0:
            continue

        processed_titles = []
        for title_data in stat["titles"]:
            processed_title = {
                "title": title_data["title"],
                "source_name": title_data["source_name"],
                "time_display": title_data["time_display"],
                "count": title_data["count"],
                "ranks": title_data["ranks"],
                "rank_threshold": title_data["rank_threshold"],
                "url": title_data.get("url", ""),
                "mobile_url": title_data.get("mobileUrl", ""),
                "is_new": title_data.get("is_new", False),
            }
            processed_titles.append(processed_title)

        processed_stats.append(
            {
                "word": stat["word"],
                "count": stat["count"],
                "percentage": stat.get("percentage", 0),
                "titles": processed_titles,
            }
        )

    return {
        "stats": processed_stats,
        "new_titles": processed_new_titles,
        "failed_ids": failed_ids or [],
        "total_new_count": sum(
            len(source["titles"]) for source in processed_new_titles
        ),
    }


def generate_html_report(
    stats: List[Dict],
    total_titles: int,
    failed_ids: Optional[List] = None,
    new_titles: Optional[Dict] = None,
    id_to_name: Optional[Dict] = None,
    mode: str = "daily",
    is_daily_summary: bool = False,
    update_info: Optional[Dict] = None,
    rank_threshold: int = 3,
    output_dir: str = "output",
    date_folder: str = "",
    time_filename: str = "",
    render_html_func: Optional[Callable] = None,
    matches_word_groups_func: Optional[Callable] = None,
    load_frequency_words_func: Optional[Callable] = None,
    enable_index_copy: bool = True,
) -> str:
    """
    Generate an HTML report.

    Args:
        stats: statistics results
        total_titles: total number of titles
        failed_ids: failed source IDs
        new_titles: newly added titles
        id_to_name: mapping from source ID to name
        mode: report mode (daily/incremental/current)
        is_daily_summary: whether this is the daily summary
        update_info: version update info
        rank_threshold: rank threshold
        output_dir: output directory
        date_folder: date folder name
        time_filename: time-based file name
        render_html_func: HTML rendering function
        matches_word_groups_func: word-group matcher function
        load_frequency_words_func: frequency-word loader function
        enable_index_copy: whether to copy the report to index.html

    Returns:
        str: path of the generated HTML file
    """
    if is_daily_summary:
        if mode == "current":
            filename = "当前榜单汇总.html"
        elif mode == "incremental":
            filename = "当日增量.html"
        else:
            filename = "当日汇总.html"
    else:
        filename = f"{time_filename}.html"

    # Build the output path
    output_path = Path(output_dir) / date_folder / "html"
    output_path.mkdir(parents=True, exist_ok=True)
    file_path = str(output_path / filename)

    # Prepare the report data
    report_data = prepare_report_data(
        stats,
        failed_ids,
        new_titles,
        id_to_name,
        mode,
        rank_threshold,
        matches_word_groups_func,
        load_frequency_words_func,
    )

    # Render the HTML content
    if render_html_func:
        html_content = render_html_func(
            report_data, total_titles, is_daily_summary, mode, update_info
        )
    else:
        # Fallback: a minimal HTML page
        html_content = f"<html><body><h1>Report</h1><pre>{report_data}</pre></body></html>"

    # Write the file
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(html_content)

    # For daily summaries, optionally copy to index.html
    if is_daily_summary and enable_index_copy:
        # Root copy (served via GitHub Pages)
        root_index_path = Path("index.html")
        with open(root_index_path, "w", encoding="utf-8") as f:
            f.write(html_content)

        # Also copy into the output directory (for Docker volume mounts)
        output_index_path = Path(output_dir) / "index.html"
        Path(output_dir).mkdir(parents=True, exist_ok=True)
        with open(output_index_path, "w", encoding="utf-8") as f:
            f.write(html_content)

    return file_path
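
Since render_html_func is optional, the generator can be smoke-tested with its built-in fallback page. A sketch (all values invented; note it writes a real file under output/):

# Sketch: generating a report with the fallback HTML (no render_html_func)
from trendradar.report.generator import generate_html_report

stats = [
    {
        "word": "AI",
        "count": 1,
        "titles": [
            {
                "title": "Example headline",
                "source_name": "demo-source",
                "time_display": "08:00",
                "count": 1,
                "ranks": [1],
                "rank_threshold": 3,
                # url/mobileUrl are read with .get(), so they may be omitted
            }
        ],
    }
]

path = generate_html_report(
    stats,
    total_titles=1,
    date_folder="2025-01-01",
    time_filename="08时00分",
)
print(path)  # output/2025-01-01/html/08时00分.html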
125
trendradar/report/helpers.py
Normal file
@ -0,0 +1,125 @@
# coding=utf-8
"""
Report helper functions.

Shared helpers used by report generation.
"""

import re
from typing import List


def clean_title(title: str) -> str:
    """Clean special characters from a title.

    Rules:
    - Replace newline and carriage-return characters with spaces
    - Collapse consecutive whitespace into a single space
    - Strip leading/trailing whitespace

    Args:
        title: the raw title string

    Returns:
        The cleaned title string
    """
    if not isinstance(title, str):
        title = str(title)
    cleaned_title = title.replace("\n", " ").replace("\r", " ")
    cleaned_title = re.sub(r"\s+", " ", cleaned_title)
    cleaned_title = cleaned_title.strip()
    return cleaned_title


def html_escape(text: str) -> str:
    """Escape HTML special characters.

    Escapes, in order:
    - & → &amp;
    - < → &lt;
    - > → &gt;
    - " → &quot;
    - ' → &#x27;

    Args:
        text: the raw text

    Returns:
        The escaped text
    """
    if not isinstance(text, str):
        text = str(text)

    return (
        text.replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
        .replace('"', "&quot;")
        .replace("'", "&#x27;")
    )


def format_rank_display(ranks: List[int], rank_threshold: int, format_type: str) -> str:
    """Format the rank display.

    Produces a platform-specific rank string; when the minimum rank is at or
    below the threshold, the highlighted form is used.

    Args:
        ranks: list of ranks (may contain duplicates)
        rank_threshold: highlight threshold; ranks at or below it are highlighted
        format_type: platform type, one of:
            - "html": HTML format
            - "feishu": Feishu format
            - "dingtalk": DingTalk format
            - "wework": WeCom format
            - "telegram": Telegram format
            - "slack": Slack format
            - anything else: default markdown format

    Returns:
        The formatted rank string, e.g. "[1]" or "[1 - 5]";
        an empty string when the rank list is empty.
    """
    if not ranks:
        return ""

    unique_ranks = sorted(set(ranks))
    min_rank = unique_ranks[0]
    max_rank = unique_ranks[-1]

    # Pick the highlight markers for the platform
    if format_type == "html":
        highlight_start = "<font color='red'><strong>"
        highlight_end = "</strong></font>"
    elif format_type == "feishu":
        highlight_start = "<font color='red'>**"
        highlight_end = "**</font>"
    elif format_type == "dingtalk":
        highlight_start = "**"
        highlight_end = "**"
    elif format_type == "wework":
        highlight_start = "**"
        highlight_end = "**"
    elif format_type == "telegram":
        highlight_start = "<b>"
        highlight_end = "</b>"
    elif format_type == "slack":
        highlight_start = "*"
        highlight_end = "*"
    else:
        # Default markdown
        highlight_start = "**"
        highlight_end = "**"

    # Build the rank display
    if min_rank <= rank_threshold:
        if min_rank == max_rank:
            return f"{highlight_start}[{min_rank}]{highlight_end}"
        else:
            return f"{highlight_start}[{min_rank} - {max_rank}]{highlight_end}"
    else:
        if min_rank == max_rank:
            return f"[{min_rank}]"
        else:
            return f"[{min_rank} - {max_rank}]"
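
The helpers are pure functions, so their behavior is easiest to pin down with a few concrete calls (a sketch, not part of the diff; outputs traced against the logic above):

# Sketch: quick checks of the report helpers
from trendradar.report.helpers import clean_title, html_escape, format_rank_display

print(clean_title(" Hello\n  world\r "))
# Hello world
print(html_escape('<a href="x">Tom & Jerry</a>'))
# &lt;a href=&quot;x&quot;&gt;Tom &amp; Jerry&lt;/a&gt;
print(format_rank_display([3, 1, 3], rank_threshold=3, format_type="telegram"))
# <b>[1 - 3]</b>
print(format_rank_display([7], rank_threshold=3, format_type="telegram"))
# [7]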
1050
trendradar/report/html.py
Normal file
File diff suppressed because it is too large
44
trendradar/storage/__init__.py
Normal file
@ -0,0 +1,44 @@
# coding=utf-8
"""
Storage module - supports multiple storage backends.

Supported backends:
- local: local SQLite + TXT/HTML files
- remote: remote cloud storage (S3-compatible protocols: R2/OSS/COS/S3, etc.)
- auto: pick automatically (remote on GitHub Actions, local otherwise)
"""

from trendradar.storage.base import (
    StorageBackend,
    NewsItem,
    NewsData,
    convert_crawl_results_to_news_data,
    convert_news_data_to_results,
)
from trendradar.storage.local import LocalStorageBackend
from trendradar.storage.manager import StorageManager, get_storage_manager

# The remote backend is optional (requires boto3)
try:
    from trendradar.storage.remote import RemoteStorageBackend
    HAS_REMOTE = True
except ImportError:
    RemoteStorageBackend = None
    HAS_REMOTE = False

__all__ = [
    # Base classes
    "StorageBackend",
    "NewsItem",
    "NewsData",
    # Conversion helpers
    "convert_crawl_results_to_news_data",
    "convert_news_data_to_results",
    # Backend implementations
    "LocalStorageBackend",
    "RemoteStorageBackend",
    "HAS_REMOTE",
    # Manager
    "StorageManager",
    "get_storage_manager",
]
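
The try/except import above turns the remote backend into an optional feature; a sketch of how a caller might feature-detect it (illustrative only):

# Sketch: feature-detecting the optional S3-compatible backend
from trendradar.storage import HAS_REMOTE, LocalStorageBackend

if HAS_REMOTE:
    # boto3 is installed; the S3-compatible backend can be constructed
    from trendradar.storage import RemoteStorageBackend  # noqa: F401
else:
    backend = LocalStorageBackend()  # falls back to local SQLite + files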
457
trendradar/storage/base.py
Normal file
@ -0,0 +1,457 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
"""
|
||||||
|
存储后端抽象基类和数据模型
|
||||||
|
|
||||||
|
定义统一的存储接口,所有存储后端都需要实现这些方法
|
||||||
|
"""
|
||||||
|
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Dict, List, Optional, Any
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class NewsItem:
|
||||||
|
"""新闻条目数据模型"""
|
||||||
|
|
||||||
|
title: str # 新闻标题
|
||||||
|
source_id: str # 来源平台ID(如 toutiao, baidu)
|
||||||
|
source_name: str = "" # 来源平台名称(运行时使用,数据库不存储)
|
||||||
|
rank: int = 0 # 排名
|
||||||
|
url: str = "" # 链接 URL
|
||||||
|
mobile_url: str = "" # 移动端 URL
|
||||||
|
crawl_time: str = "" # 抓取时间(HH:MM 格式)
|
||||||
|
|
||||||
|
# 统计信息(用于分析)
|
||||||
|
ranks: List[int] = field(default_factory=list) # 历史排名列表
|
||||||
|
first_time: str = "" # 首次出现时间
|
||||||
|
last_time: str = "" # 最后出现时间
|
||||||
|
count: int = 1 # 出现次数
|
||||||
|
|
||||||
|
def to_dict(self) -> Dict[str, Any]:
|
||||||
|
"""转换为字典"""
|
||||||
|
return {
|
||||||
|
"title": self.title,
|
||||||
|
"source_id": self.source_id,
|
||||||
|
"source_name": self.source_name,
|
||||||
|
"rank": self.rank,
|
||||||
|
"url": self.url,
|
||||||
|
"mobile_url": self.mobile_url,
|
||||||
|
"crawl_time": self.crawl_time,
|
||||||
|
"ranks": self.ranks,
|
||||||
|
"first_time": self.first_time,
|
||||||
|
"last_time": self.last_time,
|
||||||
|
"count": self.count,
|
||||||
|
}
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_dict(cls, data: Dict[str, Any]) -> "NewsItem":
|
||||||
|
"""从字典创建"""
|
||||||
|
return cls(
|
||||||
|
title=data.get("title", ""),
|
||||||
|
source_id=data.get("source_id", ""),
|
||||||
|
source_name=data.get("source_name", ""),
|
||||||
|
rank=data.get("rank", 0),
|
||||||
|
url=data.get("url", ""),
|
||||||
|
mobile_url=data.get("mobile_url", ""),
|
||||||
|
crawl_time=data.get("crawl_time", ""),
|
||||||
|
ranks=data.get("ranks", []),
|
||||||
|
first_time=data.get("first_time", ""),
|
||||||
|
last_time=data.get("last_time", ""),
|
||||||
|
count=data.get("count", 1),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class NewsData:
|
||||||
|
"""
|
||||||
|
新闻数据集合
|
||||||
|
|
||||||
|
结构:
|
||||||
|
- date: 日期(YYYY-MM-DD)
|
||||||
|
- crawl_time: 抓取时间(HH时MM分)
|
||||||
|
- items: 按来源ID分组的新闻条目
|
||||||
|
- id_to_name: 来源ID到名称的映射
|
||||||
|
- failed_ids: 失败的来源ID列表
|
||||||
|
"""
|
||||||
|
|
||||||
|
date: str # 日期
|
||||||
|
crawl_time: str # 抓取时间
|
||||||
|
items: Dict[str, List[NewsItem]] # 按来源分组的新闻
|
||||||
|
id_to_name: Dict[str, str] = field(default_factory=dict) # ID到名称映射
|
||||||
|
failed_ids: List[str] = field(default_factory=list) # 失败的ID
|
||||||
|
|
||||||
|
def to_dict(self) -> Dict[str, Any]:
|
||||||
|
"""转换为字典"""
|
||||||
|
items_dict = {}
|
||||||
|
for source_id, news_list in self.items.items():
|
||||||
|
items_dict[source_id] = [item.to_dict() for item in news_list]
|
||||||
|
|
||||||
|
return {
|
||||||
|
"date": self.date,
|
||||||
|
"crawl_time": self.crawl_time,
|
||||||
|
"items": items_dict,
|
||||||
|
"id_to_name": self.id_to_name,
|
||||||
|
"failed_ids": self.failed_ids,
|
||||||
|
}
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_dict(cls, data: Dict[str, Any]) -> "NewsData":
|
||||||
|
"""从字典创建"""
|
||||||
|
items = {}
|
||||||
|
items_data = data.get("items", {})
|
||||||
|
for source_id, news_list in items_data.items():
|
||||||
|
items[source_id] = [NewsItem.from_dict(item) for item in news_list]
|
||||||
|
|
||||||
|
return cls(
|
||||||
|
date=data.get("date", ""),
|
||||||
|
crawl_time=data.get("crawl_time", ""),
|
||||||
|
items=items,
|
||||||
|
id_to_name=data.get("id_to_name", {}),
|
||||||
|
failed_ids=data.get("failed_ids", []),
|
||||||
|
)
|
||||||
|
|
||||||
|
def get_total_count(self) -> int:
|
||||||
|
"""获取新闻总数"""
|
||||||
|
return sum(len(news_list) for news_list in self.items.values())
|
||||||
|
|
||||||
|
def merge_with(self, other: "NewsData") -> "NewsData":
|
||||||
|
"""
|
||||||
|
合并另一个 NewsData 到当前数据
|
||||||
|
|
||||||
|
合并规则:
|
||||||
|
- 相同 source_id + title 的新闻合并排名历史
|
||||||
|
- 更新 last_time 和 count
|
||||||
|
- 保留较早的 first_time
|
||||||
|
"""
|
||||||
|
merged_items = {}
|
||||||
|
|
||||||
|
# 复制当前数据
|
||||||
|
for source_id, news_list in self.items.items():
|
||||||
|
merged_items[source_id] = {item.title: item for item in news_list}
|
||||||
|
|
||||||
|
# 合并其他数据
|
||||||
|
for source_id, news_list in other.items.items():
|
||||||
|
if source_id not in merged_items:
|
||||||
|
merged_items[source_id] = {}
|
||||||
|
|
||||||
|
for item in news_list:
|
||||||
|
if item.title in merged_items[source_id]:
|
||||||
|
# 合并已存在的新闻
|
||||||
|
existing = merged_items[source_id][item.title]
|
||||||
|
|
||||||
|
# 合并排名
|
||||||
|
existing_ranks = set(existing.ranks) if existing.ranks else set()
|
||||||
|
new_ranks = set(item.ranks) if item.ranks else set()
|
||||||
|
merged_ranks = sorted(existing_ranks | new_ranks)
|
||||||
|
existing.ranks = merged_ranks
|
||||||
|
|
||||||
|
# 更新时间
|
||||||
|
if item.first_time and (not existing.first_time or item.first_time < existing.first_time):
|
||||||
|
existing.first_time = item.first_time
|
||||||
|
if item.last_time and (not existing.last_time or item.last_time > existing.last_time):
|
||||||
|
existing.last_time = item.last_time
|
||||||
|
|
||||||
|
# 更新计数
|
||||||
|
existing.count += 1
|
||||||
|
|
||||||
|
# 保留URL(如果原来没有)
|
||||||
|
if not existing.url and item.url:
|
||||||
|
existing.url = item.url
|
||||||
|
if not existing.mobile_url and item.mobile_url:
|
||||||
|
existing.mobile_url = item.mobile_url
|
||||||
|
else:
|
||||||
|
# 添加新新闻
|
||||||
|
merged_items[source_id][item.title] = item
|
||||||
|
|
||||||
|
# 转换回列表格式
|
||||||
|
final_items = {}
|
||||||
|
for source_id, items_dict in merged_items.items():
|
||||||
|
final_items[source_id] = list(items_dict.values())
|
||||||
|
|
||||||
|
# 合并 id_to_name
|
||||||
|
merged_id_to_name = {**self.id_to_name, **other.id_to_name}
|
||||||
|
|
||||||
|
# 合并 failed_ids(去重)
|
||||||
|
merged_failed_ids = list(set(self.failed_ids + other.failed_ids))
|
||||||
|
|
||||||
|
return NewsData(
|
||||||
|
date=self.date or other.date,
|
||||||
|
crawl_time=other.crawl_time, # 使用较新的抓取时间
|
||||||
|
items=final_items,
|
||||||
|
id_to_name=merged_id_to_name,
|
||||||
|
failed_ids=merged_failed_ids,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class StorageBackend(ABC):
|
||||||
|
"""
|
||||||
|
存储后端抽象基类
|
||||||
|
|
||||||
|
所有存储后端都需要实现这些方法,以支持:
|
||||||
|
- 保存新闻数据
|
||||||
|
- 读取当天所有数据
|
||||||
|
- 检测新增新闻
|
||||||
|
- 生成报告文件(TXT/HTML)
|
||||||
|
"""
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def save_news_data(self, data: NewsData) -> bool:
|
||||||
|
"""
|
||||||
|
保存新闻数据
|
||||||
|
|
||||||
|
Args:
|
||||||
|
data: 新闻数据
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
是否保存成功
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
|
||||||
|
"""
|
||||||
|
获取指定日期的所有新闻数据
|
||||||
|
|
||||||
|
Args:
|
||||||
|
date: 日期字符串(YYYY-MM-DD),默认为今天
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
合并后的新闻数据,如果没有数据返回 None
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
|
||||||
|
"""
|
||||||
|
获取最新一次抓取的数据
|
||||||
|
|
||||||
|
Args:
|
||||||
|
date: 日期字符串,默认为今天
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
最新抓取的新闻数据
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def detect_new_titles(self, current_data: NewsData) -> Dict[str, Dict]:
|
||||||
|
"""
|
||||||
|
检测新增的标题
|
||||||
|
|
||||||
|
Args:
|
||||||
|
current_data: 当前抓取的数据
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
新增的标题数据,格式: {source_id: {title: title_data}}
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
保存 TXT 快照(可选功能,本地环境可用)
|
||||||
|
|
||||||
|
Args:
|
||||||
|
data: 新闻数据
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
保存的文件路径,如果不支持返回 None
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
    def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
        """
        Save an HTML report.

        Args:
            html_content: HTML content
            filename: File name
            is_summary: Whether this is a summary report

        Returns:
            Path of the saved file
        """
        pass

    @abstractmethod
    def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
        """
        Check whether this is the first crawl of the day.

        Args:
            date: Date string; defaults to today

        Returns:
            Whether this is the first crawl of the day
        """
        pass

    @abstractmethod
    def cleanup(self) -> None:
        """
        Release resources (e.g. temporary files, database connections).
        """
        pass

    @abstractmethod
    def cleanup_old_data(self, retention_days: int) -> int:
        """
        Remove expired data.

        Args:
            retention_days: Number of days to keep (0 means no cleanup)

        Returns:
            Number of date directories deleted
        """
        pass

    @property
    @abstractmethod
    def backend_name(self) -> str:
        """
        Name of the storage backend.
        """
        pass

    @property
    @abstractmethod
    def supports_txt(self) -> bool:
        """
        Whether generating TXT snapshots is supported.
        """
        pass

    # === Push record methods ===

    @abstractmethod
    def has_pushed_today(self, date: Optional[str] = None) -> bool:
        """
        Check whether a push already happened on the given date.

        Args:
            date: Date string (YYYY-MM-DD); defaults to today

        Returns:
            Whether a push has been recorded
        """
        pass

    @abstractmethod
    def record_push(self, report_type: str, date: Optional[str] = None) -> bool:
        """
        Record a push.

        Args:
            report_type: Report type
            date: Date string (YYYY-MM-DD); defaults to today

        Returns:
            Whether the record was saved successfully
        """
        pass


def convert_crawl_results_to_news_data(
    results: Dict[str, Dict],
    id_to_name: Dict[str, str],
    failed_ids: List[str],
    crawl_time: str,
    crawl_date: str,
) -> NewsData:
    """
    Convert raw crawler results into NewsData.

    Args:
        results: Crawler output {source_id: {title: {ranks: [], url: "", mobileUrl: ""}}}
        id_to_name: Mapping from source ID to display name
        failed_ids: IDs of sources that failed
        crawl_time: Crawl time (HH:MM)
        crawl_date: Crawl date (YYYY-MM-DD)

    Returns:
        A NewsData object
    """
    items = {}

    for source_id, titles_data in results.items():
        source_name = id_to_name.get(source_id, source_id)
        news_list = []

        for title, data in titles_data.items():
            if isinstance(data, dict):
                ranks = data.get("ranks", [])
                url = data.get("url", "")
                mobile_url = data.get("mobileUrl", "")
            else:
                # Backward compatibility with the legacy list format
                ranks = data if isinstance(data, list) else []
                url = ""
                mobile_url = ""

            rank = ranks[0] if ranks else 99

            news_item = NewsItem(
                title=title,
                source_id=source_id,
                source_name=source_name,
                rank=rank,
                url=url,
                mobile_url=mobile_url,
                crawl_time=crawl_time,
                ranks=ranks,
                first_time=crawl_time,
                last_time=crawl_time,
                count=1,
            )
            news_list.append(news_item)

        items[source_id] = news_list

    return NewsData(
        date=crawl_date,
        crawl_time=crawl_time,
        items=items,
        id_to_name=id_to_name,
        failed_ids=failed_ids,
    )


def convert_news_data_to_results(data: NewsData) -> tuple:
    """
    Convert NewsData back into the legacy results format (for compatibility with existing code).

    Args:
        data: NewsData object

    Returns:
        (results, id_to_name, title_info) tuple
    """
    results = {}
    title_info = {}

    for source_id, news_list in data.items.items():
        results[source_id] = {}
        title_info[source_id] = {}

        for item in news_list:
            results[source_id][item.title] = {
                "ranks": item.ranks,
                "url": item.url,
                "mobileUrl": item.mobile_url,
            }

            title_info[source_id][item.title] = {
                "first_time": item.first_time,
                "last_time": item.last_time,
                "count": item.count,
                "ranks": item.ranks,
                "url": item.url,
                "mobileUrl": item.mobile_url,
            }

    return results, data.id_to_name, title_info
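A minimal round-trip sketch for the two converters above (not part of the commit; the source ID, name, and headline are made-up sample values; "15-30" is the filesystem-safe time form the local backend uses for file names):

raw_results = {
    "baidu": {  # hypothetical source ID
        "Sample headline": {"ranks": [3], "url": "https://example.com/a", "mobileUrl": ""},
    }
}
news_data = convert_crawl_results_to_news_data(
    results=raw_results,
    id_to_name={"baidu": "Baidu Hot Search"},
    failed_ids=[],
    crawl_time="15-30",
    crawl_date="2025-12-09",
)
results, id_to_name, title_info = convert_news_data_to_results(news_data)
assert results == raw_results  # the legacy format survives the round trip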
trendradar/storage/local.py  (new file, 869 lines)
@@ -0,0 +1,869 @@
# coding=utf-8
"""
Local storage backend - SQLite + TXT/HTML

Uses SQLite as the primary store, with optional TXT snapshots and HTML reports.
"""

import sqlite3
import os
import shutil
import pytz
import re
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Any

from trendradar.storage.base import StorageBackend, NewsItem, NewsData
from trendradar.utils.time import (
    get_configured_time,
    format_date_folder,
    format_time_filename,
)


class LocalStorageBackend(StorageBackend):
    """
    Local storage backend.

    Stores news data in a SQLite database and supports:
    - SQLite database files organized by date
    - Optional TXT snapshots (for debugging)
    - HTML report generation
    """

    def __init__(
        self,
        data_dir: str = "output",
        enable_txt: bool = True,
        enable_html: bool = True,
        timezone: str = "Asia/Shanghai",
    ):
        """
        Initialize the local storage backend.

        Args:
            data_dir: Data directory path
            enable_txt: Whether to enable TXT snapshots
            enable_html: Whether to enable HTML reports
            timezone: Timezone setting (defaults to Asia/Shanghai)
        """
        self.data_dir = Path(data_dir)
        self.enable_txt = enable_txt
        self.enable_html = enable_html
        self.timezone = timezone
        self._db_connections: Dict[str, sqlite3.Connection] = {}

    @property
    def backend_name(self) -> str:
        return "local"

    @property
    def supports_txt(self) -> bool:
        return self.enable_txt

    def _get_configured_time(self) -> datetime:
        """Get the current time in the configured timezone."""
        return get_configured_time(self.timezone)

    def _format_date_folder(self, date: Optional[str] = None) -> str:
        """Format the date folder name (ISO format: YYYY-MM-DD)."""
        return format_date_folder(date, self.timezone)

    def _format_time_filename(self) -> str:
        """Format the time-based filename (format: HH-MM)."""
        return format_time_filename(self.timezone)

    def _get_db_path(self, date: Optional[str] = None) -> Path:
        """Get the SQLite database path."""
        date_folder = self._format_date_folder(date)
        db_dir = self.data_dir / date_folder
        db_dir.mkdir(parents=True, exist_ok=True)
        return db_dir / "news.db"

    def _get_connection(self, date: Optional[str] = None) -> sqlite3.Connection:
        """Get a database connection (cached)."""
        db_path = str(self._get_db_path(date))

        if db_path not in self._db_connections:
            conn = sqlite3.connect(db_path)
            conn.row_factory = sqlite3.Row
            self._init_tables(conn)
            self._db_connections[db_path] = conn

        return self._db_connections[db_path]

    def _get_schema_path(self) -> Path:
        """Get the path of schema.sql."""
        return Path(__file__).parent / "schema.sql"

    def _init_tables(self, conn: sqlite3.Connection) -> None:
        """Initialize the database schema from schema.sql."""
        schema_path = self._get_schema_path()

        if schema_path.exists():
            with open(schema_path, "r", encoding="utf-8") as f:
                schema_sql = f.read()
            conn.executescript(schema_sql)
        else:
            raise FileNotFoundError(f"Schema file not found: {schema_path}")

        conn.commit()

    def save_news_data(self, data: NewsData) -> bool:
        """
        Save news data to SQLite (keyed by URL, with title-change detection).

        Args:
            data: News data

        Returns:
            Whether the save succeeded
        """
        try:
            conn = self._get_connection(data.date)
            cursor = conn.cursor()

            # Current time in the configured timezone
            now_str = self._get_configured_time().strftime("%Y-%m-%d %H:%M:%S")

            # First sync platform info into the platforms table
            for source_id, source_name in data.id_to_name.items():
                cursor.execute("""
                    INSERT INTO platforms (id, name, updated_at)
                    VALUES (?, ?, ?)
                    ON CONFLICT(id) DO UPDATE SET
                        name = excluded.name,
                        updated_at = excluded.updated_at
                """, (source_id, source_name, now_str))

            # Counters
            new_count = 0
            updated_count = 0
            title_changed_count = 0
            success_sources = []

            for source_id, news_list in data.items.items():
                success_sources.append(source_id)

                for item in news_list:
                    try:
                        # Check for an existing record (by URL + platform_id)
                        if item.url:
                            cursor.execute("""
                                SELECT id, title FROM news_items
                                WHERE url = ? AND platform_id = ?
                            """, (item.url, source_id))
                            existing = cursor.fetchone()

                            if existing:
                                # Already exists: update the record
                                existing_id, existing_title = existing

                                # Check whether the title changed
                                if existing_title != item.title:
                                    # Log the title change
                                    cursor.execute("""
                                        INSERT INTO title_changes
                                        (news_item_id, old_title, new_title, changed_at)
                                        VALUES (?, ?, ?, ?)
                                    """, (existing_id, existing_title, item.title, now_str))
                                    title_changed_count += 1

                                # Record rank history
                                cursor.execute("""
                                    INSERT INTO rank_history
                                    (news_item_id, rank, crawl_time, created_at)
                                    VALUES (?, ?, ?, ?)
                                """, (existing_id, item.rank, data.crawl_time, now_str))

                                # Update the existing record
                                cursor.execute("""
                                    UPDATE news_items SET
                                        title = ?,
                                        rank = ?,
                                        mobile_url = ?,
                                        last_crawl_time = ?,
                                        crawl_count = crawl_count + 1,
                                        updated_at = ?
                                    WHERE id = ?
                                """, (item.title, item.rank, item.mobile_url,
                                      data.crawl_time, now_str, existing_id))
                                updated_count += 1
                            else:
                                # Not found: insert a new record
                                cursor.execute("""
                                    INSERT INTO news_items
                                    (title, platform_id, rank, url, mobile_url,
                                     first_crawl_time, last_crawl_time, crawl_count,
                                     created_at, updated_at)
                                    VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?)
                                """, (item.title, source_id, item.rank, item.url,
                                      item.mobile_url, data.crawl_time, data.crawl_time,
                                      now_str, now_str))
                                new_id = cursor.lastrowid
                                # Record the initial rank
                                cursor.execute("""
                                    INSERT INTO rank_history
                                    (news_item_id, rank, crawl_time, created_at)
                                    VALUES (?, ?, ?, ?)
                                """, (new_id, item.rank, data.crawl_time, now_str))
                                new_count += 1
                        else:
                            # Empty URL: insert directly (no deduplication)
                            cursor.execute("""
                                INSERT INTO news_items
                                (title, platform_id, rank, url, mobile_url,
                                 first_crawl_time, last_crawl_time, crawl_count,
                                 created_at, updated_at)
                                VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?)
                            """, (item.title, source_id, item.rank, item.url,
                                  item.mobile_url, data.crawl_time, data.crawl_time,
                                  now_str, now_str))
                            new_id = cursor.lastrowid
                            # Record the initial rank
                            cursor.execute("""
                                INSERT INTO rank_history
                                (news_item_id, rank, crawl_time, created_at)
                                VALUES (?, ?, ?, ?)
                            """, (new_id, item.rank, data.crawl_time, now_str))
                            new_count += 1

                    except sqlite3.Error as e:
                        print(f"Failed to save news item [{item.title[:30]}...]: {e}")

            total_items = new_count + updated_count

            # Record crawl info
            cursor.execute("""
                INSERT OR REPLACE INTO crawl_records
                (crawl_time, total_items, created_at)
                VALUES (?, ?, ?)
            """, (data.crawl_time, total_items, now_str))

            # Fetch the ID of the crawl_record just inserted
            cursor.execute("""
                SELECT id FROM crawl_records WHERE crawl_time = ?
            """, (data.crawl_time,))
            record_row = cursor.fetchone()
            if record_row:
                crawl_record_id = record_row[0]

                # Record successful sources
                for source_id in success_sources:
                    cursor.execute("""
                        INSERT OR REPLACE INTO crawl_source_status
                        (crawl_record_id, platform_id, status)
                        VALUES (?, ?, 'success')
                    """, (crawl_record_id, source_id))

                # Record failed sources
                for failed_id in data.failed_ids:
                    # Make sure failed platforms are also present in the platforms table
                    cursor.execute("""
                        INSERT OR IGNORE INTO platforms (id, name, updated_at)
                        VALUES (?, ?, ?)
                    """, (failed_id, failed_id, now_str))

                    cursor.execute("""
                        INSERT OR REPLACE INTO crawl_source_status
                        (crawl_record_id, platform_id, status)
                        VALUES (?, ?, 'failed')
                    """, (crawl_record_id, failed_id))

            conn.commit()

            # Print detailed storage statistics
            log_parts = [f"[Local Storage] Processing complete: {new_count} new"]
            if updated_count > 0:
                log_parts.append(f"{updated_count} updated")
            if title_changed_count > 0:
                log_parts.append(f"{title_changed_count} titles changed")
            print(", ".join(log_parts))

            return True

        except Exception as e:
            print(f"[Local Storage] Save failed: {e}")
            return False

    def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
        """
        Get all news data for the given date (merged).

        Args:
            date: Date string; defaults to today

        Returns:
            Merged news data
        """
        try:
            db_path = self._get_db_path(date)
            if not db_path.exists():
                return None

            conn = self._get_connection(date)
            cursor = conn.cursor()

            # Fetch all news rows (including id, used to look up rank history)
            cursor.execute("""
                SELECT n.id, n.title, n.platform_id, p.name as platform_name,
                       n.rank, n.url, n.mobile_url,
                       n.first_crawl_time, n.last_crawl_time, n.crawl_count
                FROM news_items n
                LEFT JOIN platforms p ON n.platform_id = p.id
                ORDER BY n.platform_id, n.last_crawl_time
            """)

            rows = cursor.fetchall()
            if not rows:
                return None

            # Collect all news_item_ids
            news_ids = [row[0] for row in rows]

            # Batch-query rank history
            rank_history_map: Dict[int, List[int]] = {}
            if news_ids:
                placeholders = ",".join("?" * len(news_ids))
                cursor.execute(f"""
                    SELECT news_item_id, rank FROM rank_history
                    WHERE news_item_id IN ({placeholders})
                    ORDER BY news_item_id, crawl_time
                """, news_ids)
                for rh_row in cursor.fetchall():
                    news_id, rank = rh_row[0], rh_row[1]
                    if news_id not in rank_history_map:
                        rank_history_map[news_id] = []
                    if rank not in rank_history_map[news_id]:
                        rank_history_map[news_id].append(rank)

            # Group by platform_id
            items: Dict[str, List[NewsItem]] = {}
            id_to_name: Dict[str, str] = {}
            crawl_date = self._format_date_folder(date)

            for row in rows:
                news_id = row[0]
                platform_id = row[2]
                title = row[1]
                platform_name = row[3] or platform_id

                id_to_name[platform_id] = platform_name

                if platform_id not in items:
                    items[platform_id] = []

                # Use rank history if available, otherwise fall back to the current rank
                ranks = rank_history_map.get(news_id, [row[4]])

                items[platform_id].append(NewsItem(
                    title=title,
                    source_id=platform_id,
                    source_name=platform_name,
                    rank=row[4],
                    url=row[5] or "",
                    mobile_url=row[6] or "",
                    crawl_time=row[8],  # last_crawl_time
                    ranks=ranks,
                    first_time=row[7],  # first_crawl_time
                    last_time=row[8],  # last_crawl_time
                    count=row[9],  # crawl_count
                ))

            final_items = items

            # Fetch failed sources
            cursor.execute("""
                SELECT DISTINCT css.platform_id
                FROM crawl_source_status css
                JOIN crawl_records cr ON css.crawl_record_id = cr.id
                WHERE css.status = 'failed'
            """)
            failed_ids = [row[0] for row in cursor.fetchall()]

            # Fetch the latest crawl time
            cursor.execute("""
                SELECT crawl_time FROM crawl_records
                ORDER BY crawl_time DESC
                LIMIT 1
            """)

            time_row = cursor.fetchone()
            crawl_time = time_row[0] if time_row else self._format_time_filename()

            return NewsData(
                date=crawl_date,
                crawl_time=crawl_time,
                items=final_items,
                id_to_name=id_to_name,
                failed_ids=failed_ids,
            )

        except Exception as e:
            print(f"[Local Storage] Read failed: {e}")
            return None

    def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
        """
        Get the data of the most recent crawl.

        Args:
            date: Date string; defaults to today

        Returns:
            News data of the latest crawl
        """
        try:
            db_path = self._get_db_path(date)
            if not db_path.exists():
                return None

            conn = self._get_connection(date)
            cursor = conn.cursor()

            # Fetch the latest crawl time
            cursor.execute("""
                SELECT crawl_time FROM crawl_records
                ORDER BY crawl_time DESC
                LIMIT 1
            """)

            time_row = cursor.fetchone()
            if not time_row:
                return None

            latest_time = time_row[0]

            # Fetch news rows for that time (including id, used to look up rank history)
            cursor.execute("""
                SELECT n.id, n.title, n.platform_id, p.name as platform_name,
                       n.rank, n.url, n.mobile_url,
                       n.first_crawl_time, n.last_crawl_time, n.crawl_count
                FROM news_items n
                LEFT JOIN platforms p ON n.platform_id = p.id
                WHERE n.last_crawl_time = ?
            """, (latest_time,))

            rows = cursor.fetchall()
            if not rows:
                return None

            # Collect all news_item_ids
            news_ids = [row[0] for row in rows]

            # Batch-query rank history
            rank_history_map: Dict[int, List[int]] = {}
            if news_ids:
                placeholders = ",".join("?" * len(news_ids))
                cursor.execute(f"""
                    SELECT news_item_id, rank FROM rank_history
                    WHERE news_item_id IN ({placeholders})
                    ORDER BY news_item_id, crawl_time
                """, news_ids)
                for rh_row in cursor.fetchall():
                    news_id, rank = rh_row[0], rh_row[1]
                    if news_id not in rank_history_map:
                        rank_history_map[news_id] = []
                    if rank not in rank_history_map[news_id]:
                        rank_history_map[news_id].append(rank)

            items: Dict[str, List[NewsItem]] = {}
            id_to_name: Dict[str, str] = {}
            crawl_date = self._format_date_folder(date)

            for row in rows:
                news_id = row[0]
                platform_id = row[2]
                platform_name = row[3] or platform_id
                id_to_name[platform_id] = platform_name

                if platform_id not in items:
                    items[platform_id] = []

                # Use rank history if available, otherwise fall back to the current rank
                ranks = rank_history_map.get(news_id, [row[4]])

                items[platform_id].append(NewsItem(
                    title=row[1],
                    source_id=platform_id,
                    source_name=platform_name,
                    rank=row[4],
                    url=row[5] or "",
                    mobile_url=row[6] or "",
                    crawl_time=row[8],  # last_crawl_time
                    ranks=ranks,
                    first_time=row[7],  # first_crawl_time
                    last_time=row[8],  # last_crawl_time
                    count=row[9],  # crawl_count
                ))

            # Fetch failed sources (for the latest crawl only)
            cursor.execute("""
                SELECT css.platform_id
                FROM crawl_source_status css
                JOIN crawl_records cr ON css.crawl_record_id = cr.id
                WHERE cr.crawl_time = ? AND css.status = 'failed'
            """, (latest_time,))

            failed_ids = [row[0] for row in cursor.fetchall()]

            return NewsData(
                date=crawl_date,
                crawl_time=latest_time,
                items=items,
                id_to_name=id_to_name,
                failed_ids=failed_ids,
            )

        except Exception as e:
            print(f"[Local Storage] Failed to get latest data: {e}")
            return None

    def detect_new_titles(self, current_data: NewsData) -> Dict[str, Dict]:
        """
        Detect newly appeared titles.

        Args:
            current_data: Data from the current crawl

        Returns:
            New titles {source_id: {title: NewsItem}}
        """
        try:
            # Fetch historical data
            historical_data = self.get_today_all_data(current_data.date)

            if not historical_data:
                # No history: everything is new
                new_titles = {}
                for source_id, news_list in current_data.items.items():
                    new_titles[source_id] = {item.title: item for item in news_list}
                return new_titles

            # Collect historical titles
            historical_titles: Dict[str, set] = {}
            for source_id, news_list in historical_data.items.items():
                historical_titles[source_id] = {item.title for item in news_list}

            # Detect additions
            new_titles = {}
            for source_id, news_list in current_data.items.items():
                hist_set = historical_titles.get(source_id, set())
                for item in news_list:
                    if item.title not in hist_set:
                        if source_id not in new_titles:
                            new_titles[source_id] = {}
                        new_titles[source_id][item.title] = item

            return new_titles

        except Exception as e:
            print(f"[Local Storage] New-title detection failed: {e}")
            return {}

    def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
        """
        Save a TXT snapshot.

        Args:
            data: News data

        Returns:
            Path of the saved file
        """
        if not self.enable_txt:
            return None

        try:
            date_folder = self._format_date_folder(data.date)
            txt_dir = self.data_dir / date_folder / "txt"
            txt_dir.mkdir(parents=True, exist_ok=True)

            file_path = txt_dir / f"{data.crawl_time}.txt"

            with open(file_path, "w", encoding="utf-8") as f:
                for source_id, news_list in data.items.items():
                    source_name = data.id_to_name.get(source_id, source_id)

                    # Write the source header
                    if source_name and source_name != source_id:
                        f.write(f"{source_id} | {source_name}\n")
                    else:
                        f.write(f"{source_id}\n")

                    # Sort by rank
                    sorted_news = sorted(news_list, key=lambda x: x.rank)

                    for item in sorted_news:
                        line = f"{item.rank}. {item.title}"
                        if item.url:
                            line += f" [URL:{item.url}]"
                        if item.mobile_url:
                            line += f" [MOBILE:{item.mobile_url}]"
                        f.write(line + "\n")

                    f.write("\n")

                # Write failed sources
                if data.failed_ids:
                    f.write("==== The following IDs failed to fetch ====\n")
                    for failed_id in data.failed_ids:
                        f.write(f"{failed_id}\n")

            print(f"[Local Storage] TXT snapshot saved: {file_path}")
            return str(file_path)

        except Exception as e:
            print(f"[Local Storage] Failed to save TXT snapshot: {e}")
            return None

    def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
        """
        Save an HTML report.

        Args:
            html_content: HTML content
            filename: File name
            is_summary: Whether this is a summary report

        Returns:
            Path of the saved file
        """
        if not self.enable_html:
            return None

        try:
            date_folder = self._format_date_folder()
            html_dir = self.data_dir / date_folder / "html"
            html_dir.mkdir(parents=True, exist_ok=True)

            file_path = html_dir / filename

            with open(file_path, "w", encoding="utf-8") as f:
                f.write(html_content)

            print(f"[Local Storage] HTML report saved: {file_path}")
            return str(file_path)

        except Exception as e:
            print(f"[Local Storage] Failed to save HTML report: {e}")
            return None

    def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
        """
        Check whether this is the first crawl of the day.

        Args:
            date: Date string; defaults to today

        Returns:
            Whether this is the first crawl of the day
        """
        try:
            db_path = self._get_db_path(date)
            if not db_path.exists():
                return True

            conn = self._get_connection(date)
            cursor = conn.cursor()

            cursor.execute("""
                SELECT COUNT(*) as count FROM crawl_records
            """)

            row = cursor.fetchone()
            count = row[0] if row else 0

            # With one record or none, treat this as the first crawl
            return count <= 1

        except Exception as e:
            print(f"[Local Storage] First-crawl check failed: {e}")
            return True

    def get_crawl_times(self, date: Optional[str] = None) -> List[str]:
        """
        Get all crawl times for the given date.

        Args:
            date: Date string; defaults to today

        Returns:
            List of crawl times (sorted chronologically)
        """
        try:
            db_path = self._get_db_path(date)
            if not db_path.exists():
                return []

            conn = self._get_connection(date)
            cursor = conn.cursor()

            cursor.execute("""
                SELECT crawl_time FROM crawl_records
                ORDER BY crawl_time
            """)

            rows = cursor.fetchall()
            return [row[0] for row in rows]

        except Exception as e:
            print(f"[Local Storage] Failed to list crawl times: {e}")
            return []

    def cleanup(self) -> None:
        """Release resources (close database connections)."""
        for db_path, conn in self._db_connections.items():
            try:
                conn.close()
                print(f"[Local Storage] Closed database connection: {db_path}")
            except Exception as e:
                print(f"[Local Storage] Failed to close connection {db_path}: {e}")

        self._db_connections.clear()

    def cleanup_old_data(self, retention_days: int) -> int:
        """
        Remove expired data.

        Args:
            retention_days: Number of days to keep (0 means no cleanup)

        Returns:
            Number of date directories deleted
        """
        if retention_days <= 0:
            return 0

        deleted_count = 0
        cutoff_date = self._get_configured_time() - timedelta(days=retention_days)

        try:
            if not self.data_dir.exists():
                return 0

            for date_folder in self.data_dir.iterdir():
                if not date_folder.is_dir() or date_folder.name.startswith('.'):
                    continue

                # Parse the date folder name (two formats supported)
                folder_date = None
                try:
                    # ISO format: YYYY-MM-DD
                    date_match = re.match(r'(\d{4})-(\d{2})-(\d{2})', date_folder.name)
                    if date_match:
                        folder_date = datetime(
                            int(date_match.group(1)),
                            int(date_match.group(2)),
                            int(date_match.group(3)),
                            tzinfo=pytz.timezone("Asia/Shanghai")
                        )
                    else:
                        # Legacy Chinese format: YYYY年MM月DD日
                        date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_folder.name)
                        if date_match:
                            folder_date = datetime(
                                int(date_match.group(1)),
                                int(date_match.group(2)),
                                int(date_match.group(3)),
                                tzinfo=pytz.timezone("Asia/Shanghai")
                            )
                except Exception:
                    continue

                if folder_date and folder_date < cutoff_date:
                    # Close this date's database connection first
                    db_path = str(self._get_db_path(date_folder.name))
                    if db_path in self._db_connections:
                        try:
                            self._db_connections[db_path].close()
                            del self._db_connections[db_path]
                        except Exception:
                            pass

                    # Remove the whole date directory
                    try:
                        shutil.rmtree(date_folder)
                        deleted_count += 1
                        print(f"[Local Storage] Removed expired data: {date_folder.name}")
                    except Exception as e:
                        print(f"[Local Storage] Failed to remove directory {date_folder.name}: {e}")

            if deleted_count > 0:
                print(f"[Local Storage] Cleaned up {deleted_count} expired date directories")

            return deleted_count

        except Exception as e:
            print(f"[Local Storage] Cleanup of expired data failed: {e}")
            return deleted_count

    def has_pushed_today(self, date: Optional[str] = None) -> bool:
        """
        Check whether a push already happened on the given date.

        Args:
            date: Date string (YYYY-MM-DD); defaults to today

        Returns:
            Whether a push has been recorded
        """
        try:
            conn = self._get_connection(date)
            cursor = conn.cursor()

            target_date = self._format_date_folder(date)

            cursor.execute("""
                SELECT pushed FROM push_records WHERE date = ?
            """, (target_date,))

            row = cursor.fetchone()
            if row:
                return bool(row[0])
            return False

        except Exception as e:
            print(f"[Local Storage] Push-record check failed: {e}")
            return False

    def record_push(self, report_type: str, date: Optional[str] = None) -> bool:
        """
        Record a push.

        Args:
            report_type: Report type
            date: Date string (YYYY-MM-DD); defaults to today

        Returns:
            Whether the record was saved successfully
        """
        try:
            conn = self._get_connection(date)
            cursor = conn.cursor()

            target_date = self._format_date_folder(date)
            now_str = self._get_configured_time().strftime("%Y-%m-%d %H:%M:%S")

            cursor.execute("""
                INSERT INTO push_records (date, pushed, push_time, report_type, created_at)
                VALUES (?, 1, ?, ?, ?)
                ON CONFLICT(date) DO UPDATE SET
                    pushed = 1,
                    push_time = excluded.push_time,
                    report_type = excluded.report_type
            """, (target_date, now_str, report_type, now_str))

            conn.commit()

            print(f"[Local Storage] Push record saved: {report_type} at {now_str}")
            return True

        except Exception as e:
            print(f"[Local Storage] Failed to record push: {e}")
            return False

    def __del__(self):
        """Destructor: make sure connections are closed."""
        self.cleanup()
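A usage sketch for LocalStorageBackend (not part of the commit; it assumes the news_data object built in the converter sketch earlier):

backend = LocalStorageBackend(data_dir="output", enable_txt=True, enable_html=True)

backend.save_news_data(news_data)      # upsert keyed by URL + platform_id
print(backend.is_first_crawl_today())  # True while crawl_records holds at most one record
backend.save_txt_snapshot(news_data)   # writes output/<YYYY-MM-DD>/txt/<HH-MM>.txt
print(backend.get_crawl_times())       # e.g. ['15-30']
backend.cleanup()                      # close the cached SQLite connections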
trendradar/storage/manager.py  (new file, 316 lines)
@@ -0,0 +1,316 @@
# coding=utf-8
"""
Storage manager - unified management of storage backends

Automatically selects the appropriate backend based on environment and configuration.
"""

import os
from typing import Optional

from trendradar.storage.base import StorageBackend, NewsData


# Storage manager singleton
_storage_manager: Optional["StorageManager"] = None


class StorageManager:
    """
    Storage manager.

    Features:
    - Auto-detects the runtime environment (GitHub Actions / Docker / local)
    - Selects the storage backend from configuration (local / remote / auto)
    - Provides a unified storage interface
    - Supports pulling data from remote to local
    """

    def __init__(
        self,
        backend_type: str = "auto",
        data_dir: str = "output",
        enable_txt: bool = True,
        enable_html: bool = True,
        remote_config: Optional[dict] = None,
        local_retention_days: int = 0,
        remote_retention_days: int = 0,
        pull_enabled: bool = False,
        pull_days: int = 0,
        timezone: str = "Asia/Shanghai",
    ):
        """
        Initialize the storage manager.

        Args:
            backend_type: Storage backend type (local / remote / auto)
            data_dir: Local data directory
            enable_txt: Whether to enable TXT snapshots
            enable_html: Whether to enable HTML reports
            remote_config: Remote storage config (endpoint_url, bucket_name, access_key_id, etc.)
            local_retention_days: Local data retention in days (0 = unlimited)
            remote_retention_days: Remote data retention in days (0 = unlimited)
            pull_enabled: Whether to auto-pull on startup
            pull_days: Pull the most recent N days of data
            timezone: Timezone setting (defaults to Asia/Shanghai)
        """
        self.backend_type = backend_type
        self.data_dir = data_dir
        self.enable_txt = enable_txt
        self.enable_html = enable_html
        self.remote_config = remote_config or {}
        self.local_retention_days = local_retention_days
        self.remote_retention_days = remote_retention_days
        self.pull_enabled = pull_enabled
        self.pull_days = pull_days
        self.timezone = timezone

        self._backend: Optional[StorageBackend] = None
        self._remote_backend: Optional[StorageBackend] = None

    @staticmethod
    def is_github_actions() -> bool:
        """Detect whether we are running in GitHub Actions."""
        return os.environ.get("GITHUB_ACTIONS") == "true"

    @staticmethod
    def is_docker() -> bool:
        """Detect whether we are running inside a Docker container."""
        # Method 1: check for /.dockerenv
        if os.path.exists("/.dockerenv"):
            return True

        # Method 2: check cgroup (Linux)
        try:
            with open("/proc/1/cgroup", "r") as f:
                return "docker" in f.read()
        except (FileNotFoundError, PermissionError):
            pass

        # Method 3: check environment variable
        return os.environ.get("DOCKER_CONTAINER") == "true"

    def _resolve_backend_type(self) -> str:
        """Resolve the backend type actually in use."""
        if self.backend_type == "auto":
            if self.is_github_actions():
                # GitHub Actions environment: check whether remote storage is configured
                if self._has_remote_config():
                    return "remote"
                else:
                    print("[Storage Manager] Running in GitHub Actions but no remote storage configured; using local storage")
                    return "local"
            else:
                return "local"
        return self.backend_type

    def _has_remote_config(self) -> bool:
        """Check whether a valid remote storage configuration exists."""
        # Check config values or environment variables
        bucket_name = self.remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME")
        access_key = self.remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID")
        secret_key = self.remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY")
        endpoint = self.remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL")

        # Debug logging
        has_config = bool(bucket_name and access_key and secret_key and endpoint)
        if not has_config:
            print("[Storage Manager] Remote storage config check failed:")
            print(f"  - bucket_name: {'set' if bucket_name else 'missing'}")
            print(f"  - access_key_id: {'set' if access_key else 'missing'}")
            print(f"  - secret_access_key: {'set' if secret_key else 'missing'}")
            print(f"  - endpoint_url: {'set' if endpoint else 'missing'}")

        return has_config

    def _create_remote_backend(self) -> Optional[StorageBackend]:
        """Create the remote storage backend."""
        try:
            from trendradar.storage.remote import RemoteStorageBackend

            return RemoteStorageBackend(
                bucket_name=self.remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME", ""),
                access_key_id=self.remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID", ""),
                secret_access_key=self.remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY", ""),
                endpoint_url=self.remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL", ""),
                region=self.remote_config.get("region") or os.environ.get("S3_REGION", ""),
                enable_txt=self.enable_txt,
                enable_html=self.enable_html,
                timezone=self.timezone,
            )
        except ImportError as e:
            print(f"[Storage Manager] Failed to import remote backend: {e}")
            print("[Storage Manager] Make sure boto3 is installed: pip install boto3")
            return None
        except Exception as e:
            print(f"[Storage Manager] Failed to initialize remote backend: {e}")
            return None

    def get_backend(self) -> StorageBackend:
        """Get the storage backend instance."""
        if self._backend is None:
            resolved_type = self._resolve_backend_type()

            if resolved_type == "remote":
                self._backend = self._create_remote_backend()
                if self._backend:
                    print("[Storage Manager] Using remote storage backend")
                else:
                    print("[Storage Manager] Falling back to local storage")
                    resolved_type = "local"

            if resolved_type == "local" or self._backend is None:
                from trendradar.storage.local import LocalStorageBackend

                self._backend = LocalStorageBackend(
                    data_dir=self.data_dir,
                    enable_txt=self.enable_txt,
                    enable_html=self.enable_html,
                    timezone=self.timezone,
                )
                print(f"[Storage Manager] Using local storage backend (data dir: {self.data_dir})")

        return self._backend

    def pull_from_remote(self) -> int:
        """
        Pull data from remote to local.

        Returns:
            Number of files pulled successfully
        """
        if not self.pull_enabled or self.pull_days <= 0:
            return 0

        if not self._has_remote_config():
            print("[Storage Manager] No remote storage configured; cannot pull")
            return 0

        # Create the remote backend (if not created yet)
        if self._remote_backend is None:
            self._remote_backend = self._create_remote_backend()

        if self._remote_backend is None:
            print("[Storage Manager] Could not create remote backend; pull failed")
            return 0

        # Invoke the pull
        return self._remote_backend.pull_recent_days(self.pull_days, self.data_dir)

    def save_news_data(self, data: NewsData) -> bool:
        """Save news data."""
        return self.get_backend().save_news_data(data)

    def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
        """Get all data for today."""
        return self.get_backend().get_today_all_data(date)

    def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
        """Get the latest crawl data."""
        return self.get_backend().get_latest_crawl_data(date)

    def detect_new_titles(self, current_data: NewsData) -> dict:
        """Detect newly appeared titles."""
        return self.get_backend().detect_new_titles(current_data)

    def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
        """Save a TXT snapshot."""
        return self.get_backend().save_txt_snapshot(data)

    def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
        """Save an HTML report."""
        return self.get_backend().save_html_report(html_content, filename, is_summary)

    def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
        """Check whether this is the first crawl of the day."""
        return self.get_backend().is_first_crawl_today(date)

    def cleanup(self) -> None:
        """Release resources."""
        if self._backend:
            self._backend.cleanup()
        if self._remote_backend:
            self._remote_backend.cleanup()

    def cleanup_old_data(self) -> int:
        """
        Remove expired data.

        Returns:
            Number of date directories deleted
        """
        total_deleted = 0

        # Clean local data
        if self.local_retention_days > 0:
            total_deleted += self.get_backend().cleanup_old_data(self.local_retention_days)

        # Clean remote data (if configured)
        if self.remote_retention_days > 0 and self._has_remote_config():
            if self._remote_backend is None:
                self._remote_backend = self._create_remote_backend()
            if self._remote_backend:
                total_deleted += self._remote_backend.cleanup_old_data(self.remote_retention_days)

        return total_deleted

    @property
    def backend_name(self) -> str:
        """Name of the current backend."""
        return self.get_backend().backend_name

    @property
    def supports_txt(self) -> bool:
        """Whether TXT snapshots are supported."""
        return self.get_backend().supports_txt


def get_storage_manager(
    backend_type: str = "auto",
    data_dir: str = "output",
    enable_txt: bool = True,
    enable_html: bool = True,
    remote_config: Optional[dict] = None,
    local_retention_days: int = 0,
    remote_retention_days: int = 0,
    pull_enabled: bool = False,
    pull_days: int = 0,
    timezone: str = "Asia/Shanghai",
    force_new: bool = False,
) -> StorageManager:
    """
    Get the storage manager singleton.

    Args:
        backend_type: Storage backend type
        data_dir: Local data directory
        enable_txt: Whether to enable TXT snapshots
        enable_html: Whether to enable HTML reports
        remote_config: Remote storage config
        local_retention_days: Local data retention in days (0 = unlimited)
        remote_retention_days: Remote data retention in days (0 = unlimited)
        pull_enabled: Whether to auto-pull on startup
        pull_days: Pull the most recent N days of data
        timezone: Timezone setting (defaults to Asia/Shanghai)
        force_new: Whether to force a new instance

    Returns:
        A StorageManager instance
    """
    global _storage_manager

    if _storage_manager is None or force_new:
        _storage_manager = StorageManager(
            backend_type=backend_type,
            data_dir=data_dir,
            enable_txt=enable_txt,
            enable_html=enable_html,
            remote_config=remote_config,
            local_retention_days=local_retention_days,
            remote_retention_days=remote_retention_days,
            pull_enabled=pull_enabled,
            pull_days=pull_days,
            timezone=timezone,
        )

    return _storage_manager
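A usage sketch for the singleton accessor (not part of the commit; retention and pull values are illustrative, and news_data is the object from the earlier converter sketch):

manager = get_storage_manager(
    backend_type="auto",       # picks remote on GitHub Actions when the S3_* vars are set
    data_dir="output",
    local_retention_days=30,   # illustrative retention window
)
manager.pull_from_remote()     # returns 0 unless pull_enabled and pull_days are configured
manager.save_news_data(news_data)
print(manager.backend_name)    # "local" unless the remote backend was selected
manager.cleanup()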
trendradar/storage/remote.py  (new file, 1071 lines)
File diff suppressed because it is too large
trendradar/storage/schema.sql  (new file, 117 lines)
@@ -0,0 +1,117 @@
-- TrendRadar database schema

-- ============================================
-- Platform table
-- Key point: id is immutable, name may change
-- ============================================
CREATE TABLE IF NOT EXISTS platforms (
    id TEXT PRIMARY KEY,
    name TEXT NOT NULL,
    is_active INTEGER DEFAULT 1,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- ============================================
-- News item table
-- Uniquely identified by URL + platform_id for deduplicated storage
-- ============================================
CREATE TABLE IF NOT EXISTS news_items (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    title TEXT NOT NULL,
    platform_id TEXT NOT NULL,
    rank INTEGER NOT NULL,
    url TEXT DEFAULT '',
    mobile_url TEXT DEFAULT '',
    first_crawl_time TEXT NOT NULL,   -- time of first crawl
    last_crawl_time TEXT NOT NULL,    -- time of most recent crawl
    crawl_count INTEGER DEFAULT 1,    -- number of crawls
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (platform_id) REFERENCES platforms(id)
);

-- ============================================
-- Title change history table
-- Tracks title changes for the same URL
-- ============================================
CREATE TABLE IF NOT EXISTS title_changes (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    news_item_id INTEGER NOT NULL,
    old_title TEXT NOT NULL,
    new_title TEXT NOT NULL,
    changed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (news_item_id) REFERENCES news_items(id)
);

-- ============================================
-- Rank history table
-- Records rank changes at each crawl
-- ============================================
CREATE TABLE IF NOT EXISTS rank_history (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    news_item_id INTEGER NOT NULL,
    rank INTEGER NOT NULL,
    crawl_time TEXT NOT NULL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (news_item_id) REFERENCES news_items(id)
);

-- ============================================
-- Crawl record table
-- Records the time and item count of each crawl
-- ============================================
CREATE TABLE IF NOT EXISTS crawl_records (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    crawl_time TEXT NOT NULL UNIQUE,
    total_items INTEGER DEFAULT 0,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- ============================================
-- Crawl source status table
-- Records per-platform success/failure for each crawl
-- ============================================
CREATE TABLE IF NOT EXISTS crawl_source_status (
    crawl_record_id INTEGER NOT NULL,
    platform_id TEXT NOT NULL,
    status TEXT NOT NULL CHECK(status IN ('success', 'failed')),
    PRIMARY KEY (crawl_record_id, platform_id),
    FOREIGN KEY (crawl_record_id) REFERENCES crawl_records(id),
    FOREIGN KEY (platform_id) REFERENCES platforms(id)
);

-- ============================================
-- Push record table
-- Backs the push_window once_per_day feature
-- ============================================
CREATE TABLE IF NOT EXISTS push_records (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    date TEXT NOT NULL UNIQUE,
    pushed INTEGER DEFAULT 0,
    push_time TEXT,
    report_type TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- ============================================
-- Indexes
-- ============================================

-- Platform index
CREATE INDEX IF NOT EXISTS idx_news_platform ON news_items(platform_id);

-- Time index (for querying the latest data)
CREATE INDEX IF NOT EXISTS idx_news_crawl_time ON news_items(last_crawl_time);

-- Title index (for title search)
CREATE INDEX IF NOT EXISTS idx_news_title ON news_items(title);

-- Unique index on URL + platform_id (non-empty URLs only; implements deduplication)
CREATE UNIQUE INDEX IF NOT EXISTS idx_news_url_platform
ON news_items(url, platform_id) WHERE url != '';

-- Crawl status index
CREATE INDEX IF NOT EXISTS idx_crawl_status_record ON crawl_source_status(crawl_record_id);

-- Rank history index
CREATE INDEX IF NOT EXISTS idx_rank_history_news ON rank_history(news_item_id);
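A read-back sketch against this schema (not part of the commit; the database path is hypothetical), printing each story's rank trajectory across the day's crawls:

import sqlite3

conn = sqlite3.connect("output/2025-12-09/news.db")  # hypothetical path
rows = conn.execute(
    """
    SELECT n.title, rh.crawl_time, rh.rank
    FROM news_items n
    JOIN rank_history rh ON rh.news_item_id = n.id
    ORDER BY n.id, rh.crawl_time
    """
).fetchall()
for title, crawl_time, rank in rows:
    print(f"{crawl_time}  #{rank}  {title}")
conn.close()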
trendradar/utils/__init__.py  (new file, 20 lines)
@@ -0,0 +1,20 @@
# coding=utf-8
"""
Utility module - shared helper functions
"""

from trendradar.utils.time import (
    get_configured_time,
    format_date_folder,
    format_time_filename,
    get_current_time_display,
    convert_time_for_display,
)

__all__ = [
    "get_configured_time",
    "format_date_folder",
    "format_time_filename",
    "get_current_time_display",
    "convert_time_for_display",
]
trendradar/utils/time.py  (new file, 91 lines)
@@ -0,0 +1,91 @@
# coding=utf-8
"""
Time utilities - unified time-handling helpers
"""

from datetime import datetime
from typing import Optional

import pytz

# Default timezone
DEFAULT_TIMEZONE = "Asia/Shanghai"


def get_configured_time(timezone: str = DEFAULT_TIMEZONE) -> datetime:
    """
    Get the current time in the configured timezone.

    Args:
        timezone: Timezone name, e.g. 'Asia/Shanghai', 'America/Los_Angeles'

    Returns:
        Timezone-aware current time
    """
    try:
        tz = pytz.timezone(timezone)
    except pytz.UnknownTimeZoneError:
        print(f"[Warning] Unknown timezone '{timezone}'; falling back to {DEFAULT_TIMEZONE}")
        tz = pytz.timezone(DEFAULT_TIMEZONE)
    return datetime.now(tz)


def format_date_folder(
    date: Optional[str] = None, timezone: str = DEFAULT_TIMEZONE
) -> str:
    """
    Format the date folder name (ISO format: YYYY-MM-DD).

    Args:
        date: Explicit date string; None means use the current date
        timezone: Timezone name

    Returns:
        Formatted date string, e.g. '2025-12-09'
    """
    if date:
        return date
    return get_configured_time(timezone).strftime("%Y-%m-%d")


def format_time_filename(timezone: str = DEFAULT_TIMEZONE) -> str:
    """
    Format a time-based filename (format: HH-MM, for use in file names).

    Windows does not allow colons in file names, so a hyphen is used instead.

    Args:
        timezone: Timezone name

    Returns:
        Formatted time string, e.g. '15-30'
    """
    return get_configured_time(timezone).strftime("%H-%M")


def get_current_time_display(timezone: str = DEFAULT_TIMEZONE) -> str:
    """
    Get the current time for display (format: HH:MM).

    Args:
        timezone: Timezone name

    Returns:
        Formatted time string, e.g. '15:30'
    """
    return get_configured_time(timezone).strftime("%H:%M")


def convert_time_for_display(time_str: str) -> str:
    """
    Convert HH-MM to HH:MM for display.

    Args:
        time_str: Input time string, e.g. '15-30'

    Returns:
        Converted time string, e.g. '15:30'
    """
    if time_str and "-" in time_str and len(time_str) == 5:
        return time_str.replace("-", ":")
    return time_str
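A quick sketch of the helpers above (not part of the commit; the printed values depend on the current time):

from trendradar.utils import (
    format_date_folder,
    format_time_filename,
    convert_time_for_display,
)

print(format_date_folder())               # e.g. '2025-12-09'
print(format_date_folder("2025-01-01"))   # explicit dates pass through unchanged
print(format_time_filename())             # e.g. '15-30' (filesystem-safe)
print(convert_time_for_display("15-30"))  # '15:30'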