v4.0.0 大大大更新

This commit is contained in:
sansan
2025-12-13 13:44:35 +08:00
parent 97c05aa33c
commit c7bacdfff7
61 changed files with 12407 additions and 5889 deletions
+44
View File
@@ -0,0 +1,44 @@
# coding=utf-8
"""
Storage module - supports multiple storage backends.

Supported backends:
- local: local SQLite + TXT/HTML files
- remote: remote cloud storage (S3-compatible protocols: R2/OSS/COS/S3, etc.)
- auto: pick automatically from the environment (remote on GitHub Actions, local otherwise)
"""
from trendradar.storage.base import (
    StorageBackend,
    NewsItem,
    NewsData,
    convert_crawl_results_to_news_data,
    convert_news_data_to_results,
)
from trendradar.storage.local import LocalStorageBackend
from trendradar.storage.manager import StorageManager, get_storage_manager

# The remote backend is optional (it requires boto3); fall back gracefully
# so the package still imports when boto3 is not installed.
try:
    from trendradar.storage.remote import RemoteStorageBackend
    HAS_REMOTE = True
except ImportError:
    RemoteStorageBackend = None
    HAS_REMOTE = False

__all__ = [
    # Base classes
    "StorageBackend",
    "NewsItem",
    "NewsData",
    # Conversion helpers
    "convert_crawl_results_to_news_data",
    "convert_news_data_to_results",
    # Backend implementations
    "LocalStorageBackend",
    "RemoteStorageBackend",
    "HAS_REMOTE",
    # Manager
    "StorageManager",
    "get_storage_manager",
]
+457
View File
@@ -0,0 +1,457 @@
# coding=utf-8
"""
存储后端抽象基类和数据模型
定义统一的存储接口,所有存储后端都需要实现这些方法
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from typing import Dict, List, Optional, Any
import json
@dataclass
class NewsItem:
    """A single news entry crawled from one source platform."""

    title: str                  # headline text
    source_id: str              # platform identifier (e.g. toutiao, baidu)
    source_name: str = ""       # platform display name (runtime only, not persisted)
    rank: int = 0               # current rank on the platform
    url: str = ""               # canonical link URL
    mobile_url: str = ""        # mobile link URL
    crawl_time: str = ""        # crawl timestamp (HH:MM)
    # Aggregated statistics (used by analysis code).
    ranks: List[int] = field(default_factory=list)  # historical ranks
    first_time: str = ""        # first time this title was seen
    last_time: str = ""         # last time this title was seen
    count: int = 1              # number of appearances

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict (field values are shared, not copied)."""
        keys = (
            "title", "source_id", "source_name", "rank", "url",
            "mobile_url", "crawl_time", "ranks", "first_time",
            "last_time", "count",
        )
        return {key: getattr(self, key) for key in keys}

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "NewsItem":
        """Build an instance from a dict, tolerating missing keys."""
        scalar_defaults = {
            "title": "", "source_id": "", "source_name": "", "rank": 0,
            "url": "", "mobile_url": "", "crawl_time": "",
            "first_time": "", "last_time": "", "count": 1,
        }
        kwargs = {key: data.get(key, fallback) for key, fallback in scalar_defaults.items()}
        kwargs["ranks"] = data.get("ranks", [])
        return cls(**kwargs)
@dataclass
class NewsData:
    """
    A collection of news items from one crawl (or several merged crawls).

    Structure:
    - date: date string (YYYY-MM-DD)
    - crawl_time: crawl timestamp
    - items: news entries grouped by source id
    - id_to_name: source-id -> display-name mapping
    - failed_ids: source ids that failed to crawl
    """

    date: str
    crawl_time: str
    items: Dict[str, List[NewsItem]]
    id_to_name: Dict[str, str] = field(default_factory=dict)
    failed_ids: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict; nested items are serialized recursively."""
        return {
            "date": self.date,
            "crawl_time": self.crawl_time,
            "items": {
                sid: [entry.to_dict() for entry in entries]
                for sid, entries in self.items.items()
            },
            "id_to_name": self.id_to_name,
            "failed_ids": self.failed_ids,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "NewsData":
        """Build an instance from a dict produced by ``to_dict``."""
        grouped = {
            sid: [NewsItem.from_dict(raw) for raw in raw_list]
            for sid, raw_list in data.get("items", {}).items()
        }
        return cls(
            date=data.get("date", ""),
            crawl_time=data.get("crawl_time", ""),
            items=grouped,
            id_to_name=data.get("id_to_name", {}),
            failed_ids=data.get("failed_ids", []),
        )

    def get_total_count(self) -> int:
        """Return the total number of news entries across all sources."""
        return sum(map(len, self.items.values()))

    def merge_with(self, other: "NewsData") -> "NewsData":
        """
        Merge *other* into this dataset and return the combined result.

        Merge rules:
        - entries with the same source_id + title get their rank histories
          merged (deduplicated, sorted);
        - first_time keeps the earlier value, last_time the later one;
        - count is incremented once per merged duplicate;
        - URLs already present are kept; missing ones are backfilled.

        Note: items owned by ``self`` are mutated in place during the merge.
        """
        # Index current entries by source then by title.
        buckets: Dict[str, Dict[str, NewsItem]] = {
            sid: {entry.title: entry for entry in entries}
            for sid, entries in self.items.items()
        }
        # Fold the other dataset in.
        for sid, entries in other.items.items():
            bucket = buckets.setdefault(sid, {})
            for incoming in entries:
                kept = bucket.get(incoming.title)
                if kept is None:
                    # Brand-new title for this source.
                    bucket[incoming.title] = incoming
                    continue
                # Union the rank histories, deduplicated and sorted.
                kept.ranks = sorted(set(kept.ranks or []) | set(incoming.ranks or []))
                # Keep the earlier first_time / later last_time.
                if incoming.first_time and (
                    not kept.first_time or incoming.first_time < kept.first_time
                ):
                    kept.first_time = incoming.first_time
                if incoming.last_time and (
                    not kept.last_time or incoming.last_time > kept.last_time
                ):
                    kept.last_time = incoming.last_time
                kept.count += 1
                # Backfill URLs only when missing.
                if not kept.url and incoming.url:
                    kept.url = incoming.url
                if not kept.mobile_url and incoming.mobile_url:
                    kept.mobile_url = incoming.mobile_url
        return NewsData(
            date=self.date or other.date,
            crawl_time=other.crawl_time,  # use the newer crawl time
            items={sid: list(bucket.values()) for sid, bucket in buckets.items()},
            id_to_name={**self.id_to_name, **other.id_to_name},
            failed_ids=list(set(self.failed_ids + other.failed_ids)),
        )
class StorageBackend(ABC):
    """
    Abstract base class for storage backends.

    Every backend must implement this interface so the application can:
    - persist crawled news data,
    - read back everything stored for a day,
    - detect newly appeared titles,
    - produce report files (TXT/HTML).
    """

    @abstractmethod
    def save_news_data(self, data: NewsData) -> bool:
        """Persist one crawl's news data; return True on success."""

    @abstractmethod
    def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
        """Return the merged news data for *date* (YYYY-MM-DD, default today), or None when absent."""

    @abstractmethod
    def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
        """Return the most recent crawl's data for *date* (default today)."""

    @abstractmethod
    def detect_new_titles(self, current_data: NewsData) -> Dict[str, Dict]:
        """Return titles not seen before, shaped as {source_id: {title: title_data}}."""

    @abstractmethod
    def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
        """Write a TXT snapshot (optional feature); return the file path, or None when unsupported."""

    @abstractmethod
    def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
        """Write an HTML report named *filename*; return the saved file path."""

    @abstractmethod
    def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
        """Return True when this is the first crawl for *date* (default today)."""

    @abstractmethod
    def cleanup(self) -> None:
        """Release held resources (temporary files, database connections, ...)."""

    @abstractmethod
    def cleanup_old_data(self, retention_days: int) -> int:
        """Delete data older than *retention_days* (0 disables cleanup); return the number of date folders removed."""

    @property
    @abstractmethod
    def backend_name(self) -> str:
        """Short identifier of this backend."""

    @property
    @abstractmethod
    def supports_txt(self) -> bool:
        """Whether this backend can produce TXT snapshots."""

    # === push-record bookkeeping ===

    @abstractmethod
    def has_pushed_today(self, date: Optional[str] = None) -> bool:
        """Return True when a push was already recorded for *date* (YYYY-MM-DD, default today)."""

    @abstractmethod
    def record_push(self, report_type: str, date: Optional[str] = None) -> bool:
        """Record that a push of *report_type* happened on *date*; return True on success."""
def convert_crawl_results_to_news_data(
    results: Dict[str, Dict],
    id_to_name: Dict[str, str],
    failed_ids: List[str],
    crawl_time: str,
    crawl_date: str,
) -> NewsData:
    """
    Convert raw crawler output into a NewsData object.

    Args:
        results: crawler output, {source_id: {title: {ranks: [], url: "", mobileUrl: ""}}}
        id_to_name: source-id -> display-name mapping
        failed_ids: source ids that failed to crawl
        crawl_time: crawl timestamp (HH:MM)
        crawl_date: crawl date (YYYY-MM-DD)

    Returns:
        The populated NewsData object.
    """
    grouped: Dict[str, List[NewsItem]] = {}
    for source_id, titles_data in results.items():
        display_name = id_to_name.get(source_id, source_id)
        entries: List[NewsItem] = []
        for title, payload in titles_data.items():
            if isinstance(payload, dict):
                rank_list = payload.get("ranks", [])
                link = payload.get("url", "")
                mobile_link = payload.get("mobileUrl", "")
            else:
                # Legacy format: the payload is just a list of ranks.
                rank_list = payload if isinstance(payload, list) else []
                link = ""
                mobile_link = ""
            entries.append(
                NewsItem(
                    title=title,
                    source_id=source_id,
                    source_name=display_name,
                    rank=rank_list[0] if rank_list else 99,  # 99 = "unranked" sentinel
                    url=link,
                    mobile_url=mobile_link,
                    crawl_time=crawl_time,
                    ranks=rank_list,
                    first_time=crawl_time,
                    last_time=crawl_time,
                    count=1,
                )
            )
        grouped[source_id] = entries
    return NewsData(
        date=crawl_date,
        crawl_time=crawl_time,
        items=grouped,
        id_to_name=id_to_name,
        failed_ids=failed_ids,
    )
def convert_news_data_to_results(data: NewsData) -> tuple:
    """
    Convert a NewsData object back to the legacy ``results`` format
    (kept for compatibility with existing callers).

    Args:
        data: NewsData object.

    Returns:
        A ``(results, id_to_name, title_info)`` tuple, where ``results`` maps
        {source_id: {title: {ranks, url, mobileUrl}}} and ``title_info`` adds
        first/last seen times and appearance counts per title.
    """
    results: Dict[str, Dict] = {}
    title_info: Dict[str, Dict] = {}
    for source_id, entries in data.items.items():
        per_source: Dict[str, Dict] = {}
        per_source_info: Dict[str, Dict] = {}
        for entry in entries:
            per_source[entry.title] = {
                "ranks": entry.ranks,
                "url": entry.url,
                "mobileUrl": entry.mobile_url,
            }
            per_source_info[entry.title] = {
                "first_time": entry.first_time,
                "last_time": entry.last_time,
                "count": entry.count,
                "ranks": entry.ranks,
                "url": entry.url,
                "mobileUrl": entry.mobile_url,
            }
        results[source_id] = per_source
        title_info[source_id] = per_source_info
    return results, data.id_to_name, title_info
+869
View File
@@ -0,0 +1,869 @@
# coding=utf-8
"""
本地存储后端 - SQLite + TXT/HTML
使用 SQLite 作为主存储,支持可选的 TXT 快照和 HTML 报告
"""
import sqlite3
import os
import shutil
import pytz
import re
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Any
from trendradar.storage.base import StorageBackend, NewsItem, NewsData
from trendradar.utils.time import (
get_configured_time,
format_date_folder,
format_time_filename,
)
class LocalStorageBackend(StorageBackend):
    """
    Local storage backend (SQLite + TXT/HTML).

    Uses SQLite as the primary store, supporting:
    - one SQLite database file per date folder
    - optional TXT snapshots (for debugging)
    - HTML report generation
    """

    def __init__(
        self,
        data_dir: str = "output",
        enable_txt: bool = True,
        enable_html: bool = True,
        timezone: str = "Asia/Shanghai",
    ):
        """
        Initialize the local storage backend.

        Args:
            data_dir: data directory path
            enable_txt: whether TXT snapshots are enabled
            enable_html: whether HTML reports are enabled
            timezone: timezone name (default Asia/Shanghai)
        """
        self.data_dir = Path(data_dir)
        self.enable_txt = enable_txt
        self.enable_html = enable_html
        self.timezone = timezone
        # Cache of open sqlite3 connections, keyed by database file path.
        self._db_connections: Dict[str, sqlite3.Connection] = {}

    @property
    def backend_name(self) -> str:
        # Identifier reported to the storage manager / logs.
        return "local"

    @property
    def supports_txt(self) -> bool:
        # TXT snapshots are available only when enabled in the config.
        return self.enable_txt

    def _get_configured_time(self) -> datetime:
        """Return the current time in the configured timezone."""
        return get_configured_time(self.timezone)

    def _format_date_folder(self, date: Optional[str] = None) -> str:
        """Format the date folder name (ISO format: YYYY-MM-DD)."""
        return format_date_folder(date, self.timezone)

    def _format_time_filename(self) -> str:
        """Format the time-based file name (format: HH-MM)."""
        return format_time_filename(self.timezone)

    def _get_db_path(self, date: Optional[str] = None) -> Path:
        """Return the SQLite database path for *date*, creating the date folder if needed."""
        date_folder = self._format_date_folder(date)
        db_dir = self.data_dir / date_folder
        db_dir.mkdir(parents=True, exist_ok=True)
        return db_dir / "news.db"

    def _get_connection(self, date: Optional[str] = None) -> sqlite3.Connection:
        """Return a cached database connection for *date* (tables initialized on first open)."""
        db_path = str(self._get_db_path(date))
        if db_path not in self._db_connections:
            conn = sqlite3.connect(db_path)
            conn.row_factory = sqlite3.Row
            self._init_tables(conn)
            self._db_connections[db_path] = conn
        return self._db_connections[db_path]

    def _get_schema_path(self) -> Path:
        """Return the path of the schema.sql file shipped next to this module."""
        return Path(__file__).parent / "schema.sql"

    def _init_tables(self, conn: sqlite3.Connection) -> None:
        """Initialize the database table structure from schema.sql.

        Raises:
            FileNotFoundError: when schema.sql is missing next to this module.
        """
        schema_path = self._get_schema_path()
        if schema_path.exists():
            with open(schema_path, "r", encoding="utf-8") as f:
                schema_sql = f.read()
            conn.executescript(schema_sql)
        else:
            raise FileNotFoundError(f"Schema file not found: {schema_path}")
        conn.commit()

    def save_news_data(self, data: NewsData) -> bool:
        """
        Save news data into SQLite (URL is the dedup key, so title changes can be detected).

        Args:
            data: news data for one crawl

        Returns:
            True when saving succeeded.
        """
        try:
            conn = self._get_connection(data.date)
            cursor = conn.cursor()
            # Current time in the configured timezone.
            now_str = self._get_configured_time().strftime("%Y-%m-%d %H:%M:%S")
            # First, sync platform info into the platforms table.
            for source_id, source_name in data.id_to_name.items():
                cursor.execute("""
                    INSERT INTO platforms (id, name, updated_at)
                    VALUES (?, ?, ?)
                    ON CONFLICT(id) DO UPDATE SET
                        name = excluded.name,
                        updated_at = excluded.updated_at
                """, (source_id, source_name, now_str))
            # Statistics counters.
            new_count = 0
            updated_count = 0
            title_changed_count = 0
            success_sources = []
            for source_id, news_list in data.items.items():
                success_sources.append(source_id)
                for item in news_list:
                    try:
                        # Look for an existing row (by URL + platform_id).
                        if item.url:
                            cursor.execute("""
                                SELECT id, title FROM news_items
                                WHERE url = ? AND platform_id = ?
                            """, (item.url, source_id))
                            existing = cursor.fetchone()
                            if existing:
                                # Row already exists: update it.
                                existing_id, existing_title = existing
                                # Did the title change?
                                if existing_title != item.title:
                                    # Record the title change.
                                    cursor.execute("""
                                        INSERT INTO title_changes
                                        (news_item_id, old_title, new_title, changed_at)
                                        VALUES (?, ?, ?, ?)
                                    """, (existing_id, existing_title, item.title, now_str))
                                    title_changed_count += 1
                                # Append to the rank history.
                                cursor.execute("""
                                    INSERT INTO rank_history
                                    (news_item_id, rank, crawl_time, created_at)
                                    VALUES (?, ?, ?, ?)
                                """, (existing_id, item.rank, data.crawl_time, now_str))
                                # Update the existing row in place.
                                cursor.execute("""
                                    UPDATE news_items SET
                                        title = ?,
                                        rank = ?,
                                        mobile_url = ?,
                                        last_crawl_time = ?,
                                        crawl_count = crawl_count + 1,
                                        updated_at = ?
                                    WHERE id = ?
                                """, (item.title, item.rank, item.mobile_url,
                                      data.crawl_time, now_str, existing_id))
                                updated_count += 1
                            else:
                                # No row yet: insert a fresh one.
                                cursor.execute("""
                                    INSERT INTO news_items
                                    (title, platform_id, rank, url, mobile_url,
                                     first_crawl_time, last_crawl_time, crawl_count,
                                     created_at, updated_at)
                                    VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?)
                                """, (item.title, source_id, item.rank, item.url,
                                      item.mobile_url, data.crawl_time, data.crawl_time,
                                      now_str, now_str))
                                new_id = cursor.lastrowid
                                # Record the initial rank.
                                cursor.execute("""
                                    INSERT INTO rank_history
                                    (news_item_id, rank, crawl_time, created_at)
                                    VALUES (?, ?, ?, ?)
                                """, (new_id, item.rank, data.crawl_time, now_str))
                                new_count += 1
                        else:
                            # Empty URL: insert directly (no dedup possible).
                            cursor.execute("""
                                INSERT INTO news_items
                                (title, platform_id, rank, url, mobile_url,
                                 first_crawl_time, last_crawl_time, crawl_count,
                                 created_at, updated_at)
                                VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?)
                            """, (item.title, source_id, item.rank, item.url,
                                  item.mobile_url, data.crawl_time, data.crawl_time,
                                  now_str, now_str))
                            new_id = cursor.lastrowid
                            # Record the initial rank.
                            cursor.execute("""
                                INSERT INTO rank_history
                                (news_item_id, rank, crawl_time, created_at)
                                VALUES (?, ?, ?, ?)
                            """, (new_id, item.rank, data.crawl_time, now_str))
                            new_count += 1
                    except sqlite3.Error as e:
                        # Per-item failure is logged and skipped so one bad row
                        # does not abort the whole batch.
                        print(f"保存新闻条目失败 [{item.title[:30]}...]: {e}")
            total_items = new_count + updated_count
            # Record this crawl (keyed by crawl_time, replaced on re-run).
            cursor.execute("""
                INSERT OR REPLACE INTO crawl_records
                (crawl_time, total_items, created_at)
                VALUES (?, ?, ?)
            """, (data.crawl_time, total_items, now_str))
            # Fetch the id of the crawl record just written.
            cursor.execute("""
                SELECT id FROM crawl_records WHERE crawl_time = ?
            """, (data.crawl_time,))
            record_row = cursor.fetchone()
            if record_row:
                crawl_record_id = record_row[0]
                # Record sources that succeeded.
                for source_id in success_sources:
                    cursor.execute("""
                        INSERT OR REPLACE INTO crawl_source_status
                        (crawl_record_id, platform_id, status)
                        VALUES (?, ?, 'success')
                    """, (crawl_record_id, source_id))
                # Record sources that failed.
                for failed_id in data.failed_ids:
                    # Make sure failed platforms also exist in the platforms table.
                    cursor.execute("""
                        INSERT OR IGNORE INTO platforms (id, name, updated_at)
                        VALUES (?, ?, ?)
                    """, (failed_id, failed_id, now_str))
                    cursor.execute("""
                        INSERT OR REPLACE INTO crawl_source_status
                        (crawl_record_id, platform_id, status)
                        VALUES (?, ?, 'failed')
                    """, (crawl_record_id, failed_id))
            conn.commit()
            # Emit a detailed storage statistics log line.
            log_parts = [f"[本地存储] 处理完成:新增 {new_count}"]
            if updated_count > 0:
                log_parts.append(f"更新 {updated_count}")
            if title_changed_count > 0:
                log_parts.append(f"标题变更 {title_changed_count}")
            # NOTE(review): parts are joined with an empty separator — possibly a
            # dropped delimiter (e.g. "、"); confirm the intended log format.
            print("".join(log_parts))
            return True
        except Exception as e:
            print(f"[本地存储] 保存失败: {e}")
            return False

    def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
        """
        Return all news data for *date*, merged across crawls.

        Args:
            date: date string, default today

        Returns:
            The merged news data, or None when the database is absent/empty.
        """
        try:
            db_path = self._get_db_path(date)
            if not db_path.exists():
                return None
            conn = self._get_connection(date)
            cursor = conn.cursor()
            # Fetch all news rows (id included so rank history can be joined in).
            cursor.execute("""
                SELECT n.id, n.title, n.platform_id, p.name as platform_name,
                       n.rank, n.url, n.mobile_url,
                       n.first_crawl_time, n.last_crawl_time, n.crawl_count
                FROM news_items n
                LEFT JOIN platforms p ON n.platform_id = p.id
                ORDER BY n.platform_id, n.last_crawl_time
            """)
            rows = cursor.fetchall()
            if not rows:
                return None
            # Collect all news_item ids.
            news_ids = [row[0] for row in rows]
            # Batch-load the rank history for every row.
            rank_history_map: Dict[int, List[int]] = {}
            if news_ids:
                placeholders = ",".join("?" * len(news_ids))
                cursor.execute(f"""
                    SELECT news_item_id, rank FROM rank_history
                    WHERE news_item_id IN ({placeholders})
                    ORDER BY news_item_id, crawl_time
                """, news_ids)
                for rh_row in cursor.fetchall():
                    news_id, rank = rh_row[0], rh_row[1]
                    if news_id not in rank_history_map:
                        rank_history_map[news_id] = []
                    # Deduplicate ranks while preserving first-seen order.
                    if rank not in rank_history_map[news_id]:
                        rank_history_map[news_id].append(rank)
            # Group rows by platform_id.
            items: Dict[str, List[NewsItem]] = {}
            id_to_name: Dict[str, str] = {}
            crawl_date = self._format_date_folder(date)
            for row in rows:
                news_id = row[0]
                platform_id = row[2]
                title = row[1]
                platform_name = row[3] or platform_id
                id_to_name[platform_id] = platform_name
                if platform_id not in items:
                    items[platform_id] = []
                # Use the rank history when present; fall back to the current rank.
                ranks = rank_history_map.get(news_id, [row[4]])
                items[platform_id].append(NewsItem(
                    title=title,
                    source_id=platform_id,
                    source_name=platform_name,
                    rank=row[4],
                    url=row[5] or "",
                    mobile_url=row[6] or "",
                    crawl_time=row[8],  # last_crawl_time
                    ranks=ranks,
                    first_time=row[7],  # first_crawl_time
                    last_time=row[8],  # last_crawl_time
                    count=row[9],  # crawl_count
                ))
            final_items = items  # no extra filtering applied
            # Collect sources that failed at any point during the day.
            cursor.execute("""
                SELECT DISTINCT css.platform_id
                FROM crawl_source_status css
                JOIN crawl_records cr ON css.crawl_record_id = cr.id
                WHERE css.status = 'failed'
            """)
            failed_ids = [row[0] for row in cursor.fetchall()]
            # Use the latest crawl time on record.
            cursor.execute("""
                SELECT crawl_time FROM crawl_records
                ORDER BY crawl_time DESC
                LIMIT 1
            """)
            time_row = cursor.fetchone()
            crawl_time = time_row[0] if time_row else self._format_time_filename()
            return NewsData(
                date=crawl_date,
                crawl_time=crawl_time,
                items=final_items,
                id_to_name=id_to_name,
                failed_ids=failed_ids,
            )
        except Exception as e:
            print(f"[本地存储] 读取数据失败: {e}")
            return None

    def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
        """
        Return the data from the most recent crawl only.

        Args:
            date: date string, default today

        Returns:
            The latest crawl's news data, or None when nothing was stored.
        """
        try:
            db_path = self._get_db_path(date)
            if not db_path.exists():
                return None
            conn = self._get_connection(date)
            cursor = conn.cursor()
            # Find the most recent crawl time.
            cursor.execute("""
                SELECT crawl_time FROM crawl_records
                ORDER BY crawl_time DESC
                LIMIT 1
            """)
            time_row = cursor.fetchone()
            if not time_row:
                return None
            latest_time = time_row[0]
            # Fetch the news rows touched by that crawl (id included for rank history).
            cursor.execute("""
                SELECT n.id, n.title, n.platform_id, p.name as platform_name,
                       n.rank, n.url, n.mobile_url,
                       n.first_crawl_time, n.last_crawl_time, n.crawl_count
                FROM news_items n
                LEFT JOIN platforms p ON n.platform_id = p.id
                WHERE n.last_crawl_time = ?
            """, (latest_time,))
            rows = cursor.fetchall()
            if not rows:
                return None
            # Collect all news_item ids.
            news_ids = [row[0] for row in rows]
            # Batch-load the rank history for every row.
            rank_history_map: Dict[int, List[int]] = {}
            if news_ids:
                placeholders = ",".join("?" * len(news_ids))
                cursor.execute(f"""
                    SELECT news_item_id, rank FROM rank_history
                    WHERE news_item_id IN ({placeholders})
                    ORDER BY news_item_id, crawl_time
                """, news_ids)
                for rh_row in cursor.fetchall():
                    news_id, rank = rh_row[0], rh_row[1]
                    if news_id not in rank_history_map:
                        rank_history_map[news_id] = []
                    # Deduplicate ranks while preserving first-seen order.
                    if rank not in rank_history_map[news_id]:
                        rank_history_map[news_id].append(rank)
            items: Dict[str, List[NewsItem]] = {}
            id_to_name: Dict[str, str] = {}
            crawl_date = self._format_date_folder(date)
            for row in rows:
                news_id = row[0]
                platform_id = row[2]
                platform_name = row[3] or platform_id
                id_to_name[platform_id] = platform_name
                if platform_id not in items:
                    items[platform_id] = []
                # Use the rank history when present; fall back to the current rank.
                ranks = rank_history_map.get(news_id, [row[4]])
                items[platform_id].append(NewsItem(
                    title=row[1],
                    source_id=platform_id,
                    source_name=platform_name,
                    rank=row[4],
                    url=row[5] or "",
                    mobile_url=row[6] or "",
                    crawl_time=row[8],  # last_crawl_time
                    ranks=ranks,
                    first_time=row[7],  # first_crawl_time
                    last_time=row[8],  # last_crawl_time
                    count=row[9],  # crawl_count
                ))
            # Sources that failed during that specific crawl.
            cursor.execute("""
                SELECT css.platform_id
                FROM crawl_source_status css
                JOIN crawl_records cr ON css.crawl_record_id = cr.id
                WHERE cr.crawl_time = ? AND css.status = 'failed'
            """, (latest_time,))
            failed_ids = [row[0] for row in cursor.fetchall()]
            return NewsData(
                date=crawl_date,
                crawl_time=latest_time,
                items=items,
                id_to_name=id_to_name,
                failed_ids=failed_ids,
            )
        except Exception as e:
            print(f"[本地存储] 获取最新数据失败: {e}")
            return None

    def detect_new_titles(self, current_data: NewsData) -> Dict[str, Dict]:
        """
        Detect titles that appear in *current_data* but not in today's stored data.

        Args:
            current_data: the data from the current crawl

        Returns:
            New titles, shaped as {source_id: {title: NewsItem}}.
        """
        try:
            # Load what was already stored today.
            historical_data = self.get_today_all_data(current_data.date)
            if not historical_data:
                # No history yet: everything counts as new.
                new_titles = {}
                for source_id, news_list in current_data.items.items():
                    new_titles[source_id] = {item.title: item for item in news_list}
                return new_titles
            # Collect the historical title sets per source.
            historical_titles: Dict[str, set] = {}
            for source_id, news_list in historical_data.items.items():
                historical_titles[source_id] = {item.title for item in news_list}
            # Keep only titles absent from history.
            new_titles = {}
            for source_id, news_list in current_data.items.items():
                hist_set = historical_titles.get(source_id, set())
                for item in news_list:
                    if item.title not in hist_set:
                        if source_id not in new_titles:
                            new_titles[source_id] = {}
                        new_titles[source_id][item.title] = item
            return new_titles
        except Exception as e:
            print(f"[本地存储] 检测新标题失败: {e}")
            return {}

    def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
        """
        Write a plain-text snapshot of one crawl.

        Args:
            data: news data

        Returns:
            The saved file path, or None when disabled or on failure.
        """
        if not self.enable_txt:
            return None
        try:
            date_folder = self._format_date_folder(data.date)
            txt_dir = self.data_dir / date_folder / "txt"
            txt_dir.mkdir(parents=True, exist_ok=True)
            file_path = txt_dir / f"{data.crawl_time}.txt"
            with open(file_path, "w", encoding="utf-8") as f:
                for source_id, news_list in data.items.items():
                    source_name = data.id_to_name.get(source_id, source_id)
                    # Write the source header line.
                    if source_name and source_name != source_id:
                        f.write(f"{source_id} | {source_name}\n")
                    else:
                        f.write(f"{source_id}\n")
                    # Entries sorted by rank.
                    sorted_news = sorted(news_list, key=lambda x: x.rank)
                    for item in sorted_news:
                        line = f"{item.rank}. {item.title}"
                        if item.url:
                            line += f" [URL:{item.url}]"
                        if item.mobile_url:
                            line += f" [MOBILE:{item.mobile_url}]"
                        f.write(line + "\n")
                    f.write("\n")
                # Append the sources that failed.
                if data.failed_ids:
                    f.write("==== 以下ID请求失败 ====\n")
                    for failed_id in data.failed_ids:
                        f.write(f"{failed_id}\n")
            print(f"[本地存储] TXT 快照已保存: {file_path}")
            return str(file_path)
        except Exception as e:
            print(f"[本地存储] 保存 TXT 快照失败: {e}")
            return None

    def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
        """
        Write an HTML report under today's date folder.

        Args:
            html_content: HTML body to write
            filename: target file name
            is_summary: whether this is a summary report
                (currently unused by the local backend)

        Returns:
            The saved file path, or None when disabled or on failure.
        """
        if not self.enable_html:
            return None
        try:
            # NOTE(review): uses today's date folder, not a date carried by the
            # report — confirm reports are never written for past dates.
            date_folder = self._format_date_folder()
            html_dir = self.data_dir / date_folder / "html"
            html_dir.mkdir(parents=True, exist_ok=True)
            file_path = html_dir / filename
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(html_content)
            print(f"[本地存储] HTML 报告已保存: {file_path}")
            return str(file_path)
        except Exception as e:
            print(f"[本地存储] 保存 HTML 报告失败: {e}")
            return None

    def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
        """
        Return whether this is the first crawl of the day.

        Args:
            date: date string, default today

        Returns:
            True when at most one crawl record exists (or on error).
        """
        try:
            db_path = self._get_db_path(date)
            if not db_path.exists():
                return True
            conn = self._get_connection(date)
            cursor = conn.cursor()
            cursor.execute("""
                SELECT COUNT(*) as count FROM crawl_records
            """)
            row = cursor.fetchone()
            count = row[0] if row else 0
            # Zero or one record counts as the first crawl — presumably because
            # the current crawl may already have been recorded; confirm with callers.
            return count <= 1
        except Exception as e:
            print(f"[本地存储] 检查首次抓取失败: {e}")
            return True

    def get_crawl_times(self, date: Optional[str] = None) -> List[str]:
        """
        Return every crawl time recorded for *date*.

        Args:
            date: date string, default today

        Returns:
            Crawl times sorted ascending (empty on missing database or error).
        """
        try:
            db_path = self._get_db_path(date)
            if not db_path.exists():
                return []
            conn = self._get_connection(date)
            cursor = conn.cursor()
            cursor.execute("""
                SELECT crawl_time FROM crawl_records
                ORDER BY crawl_time
            """)
            rows = cursor.fetchall()
            return [row[0] for row in rows]
        except Exception as e:
            print(f"[本地存储] 获取抓取时间列表失败: {e}")
            return []

    def cleanup(self) -> None:
        """Release resources: close every cached database connection."""
        for db_path, conn in self._db_connections.items():
            try:
                conn.close()
                print(f"[本地存储] 关闭数据库连接: {db_path}")
            except Exception as e:
                print(f"[本地存储] 关闭连接失败 {db_path}: {e}")
        self._db_connections.clear()

    def cleanup_old_data(self, retention_days: int) -> int:
        """
        Delete date folders older than the retention window.

        Args:
            retention_days: days to keep (0 disables cleanup)

        Returns:
            Number of date folders deleted.
        """
        if retention_days <= 0:
            return 0
        deleted_count = 0
        cutoff_date = self._get_configured_time() - timedelta(days=retention_days)
        try:
            if not self.data_dir.exists():
                return 0
            for date_folder in self.data_dir.iterdir():
                if not date_folder.is_dir() or date_folder.name.startswith('.'):
                    continue
                # Parse the folder name (two supported formats).
                folder_date = None
                try:
                    # ISO format: YYYY-MM-DD
                    # NOTE(review): the timezone here is hard-coded to Asia/Shanghai
                    # while the rest of the class uses self.timezone; also, passing a
                    # pytz zone via tzinfo= yields the zone's LMT offset (pytz expects
                    # localize()), so the cutoff comparison is approximate — confirm
                    # this is acceptable for day-granularity cleanup.
                    date_match = re.match(r'(\d{4})-(\d{2})-(\d{2})', date_folder.name)
                    if date_match:
                        folder_date = datetime(
                            int(date_match.group(1)),
                            int(date_match.group(2)),
                            int(date_match.group(3)),
                            tzinfo=pytz.timezone("Asia/Shanghai")
                        )
                    else:
                        # Legacy Chinese format: YYYY年MM月DD日
                        date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_folder.name)
                        if date_match:
                            folder_date = datetime(
                                int(date_match.group(1)),
                                int(date_match.group(2)),
                                int(date_match.group(3)),
                                tzinfo=pytz.timezone("Asia/Shanghai")
                            )
                except Exception:
                    continue
                if folder_date and folder_date < cutoff_date:
                    # Close any cached connection for this date first.
                    db_path = str(self._get_db_path(date_folder.name))
                    if db_path in self._db_connections:
                        try:
                            self._db_connections[db_path].close()
                            del self._db_connections[db_path]
                        except Exception:
                            pass
                    # Remove the whole date directory.
                    try:
                        shutil.rmtree(date_folder)
                        deleted_count += 1
                        print(f"[本地存储] 清理过期数据: {date_folder.name}")
                    except Exception as e:
                        print(f"[本地存储] 删除目录失败 {date_folder.name}: {e}")
            if deleted_count > 0:
                print(f"[本地存储] 共清理 {deleted_count} 个过期日期目录")
            return deleted_count
        except Exception as e:
            print(f"[本地存储] 清理过期数据失败: {e}")
            return deleted_count

    def has_pushed_today(self, date: Optional[str] = None) -> bool:
        """
        Return whether a push was already recorded for *date*.

        Args:
            date: date string (YYYY-MM-DD), default today

        Returns:
            True when the push_records table marks the date as pushed.
        """
        try:
            conn = self._get_connection(date)
            cursor = conn.cursor()
            target_date = self._format_date_folder(date)
            cursor.execute("""
                SELECT pushed FROM push_records WHERE date = ?
            """, (target_date,))
            row = cursor.fetchone()
            if row:
                return bool(row[0])
            return False
        except Exception as e:
            print(f"[本地存储] 检查推送记录失败: {e}")
            return False

    def record_push(self, report_type: str, date: Optional[str] = None) -> bool:
        """
        Record a push for *date* (upsert keyed by date).

        Args:
            report_type: report type that was pushed
            date: date string (YYYY-MM-DD), default today

        Returns:
            True when the record was written.
        """
        try:
            conn = self._get_connection(date)
            cursor = conn.cursor()
            target_date = self._format_date_folder(date)
            now_str = self._get_configured_time().strftime("%Y-%m-%d %H:%M:%S")
            cursor.execute("""
                INSERT INTO push_records (date, pushed, push_time, report_type, created_at)
                VALUES (?, 1, ?, ?, ?)
                ON CONFLICT(date) DO UPDATE SET
                    pushed = 1,
                    push_time = excluded.push_time,
                    report_type = excluded.report_type
            """, (target_date, now_str, report_type, now_str))
            conn.commit()
            print(f"[本地存储] 推送记录已保存: {report_type} at {now_str}")
            return True
        except Exception as e:
            print(f"[本地存储] 记录推送失败: {e}")
            return False

    def __del__(self):
        """Destructor: make sure all connections are closed."""
        self.cleanup()
+316
View File
@@ -0,0 +1,316 @@
# coding=utf-8
"""
存储管理器 - 统一管理存储后端
根据环境和配置自动选择合适的存储后端
"""
import os
from typing import Optional
from trendradar.storage.base import StorageBackend, NewsData
# Module-level singleton holder for the StorageManager
# (presumably populated by a get_storage_manager() factory — not visible in this chunk).
_storage_manager: Optional["StorageManager"] = None
class StorageManager:
"""
存储管理器
功能:
- 自动检测运行环境(GitHub Actions / Docker / 本地)
- 根据配置选择存储后端(local / remote / auto
- 提供统一的存储接口
- 支持从远程拉取数据到本地
"""
def __init__(
self,
backend_type: str = "auto",
data_dir: str = "output",
enable_txt: bool = True,
enable_html: bool = True,
remote_config: Optional[dict] = None,
local_retention_days: int = 0,
remote_retention_days: int = 0,
pull_enabled: bool = False,
pull_days: int = 0,
timezone: str = "Asia/Shanghai",
):
"""
初始化存储管理器
Args:
backend_type: 存储后端类型 (local / remote / auto)
data_dir: 本地数据目录
enable_txt: 是否启用 TXT 快照
enable_html: 是否启用 HTML 报告
remote_config: 远程存储配置(endpoint_url, bucket_name, access_key_id 等)
local_retention_days: 本地数据保留天数(0 = 无限制)
remote_retention_days: 远程数据保留天数(0 = 无限制)
pull_enabled: 是否启用启动时自动拉取
pull_days: 拉取最近 N 天的数据
timezone: 时区配置(默认 Asia/Shanghai
"""
self.backend_type = backend_type
self.data_dir = data_dir
self.enable_txt = enable_txt
self.enable_html = enable_html
self.remote_config = remote_config or {}
self.local_retention_days = local_retention_days
self.remote_retention_days = remote_retention_days
self.pull_enabled = pull_enabled
self.pull_days = pull_days
self.timezone = timezone
self._backend: Optional[StorageBackend] = None
self._remote_backend: Optional[StorageBackend] = None
@staticmethod
def is_github_actions() -> bool:
"""检测是否在 GitHub Actions 环境中运行"""
return os.environ.get("GITHUB_ACTIONS") == "true"
@staticmethod
def is_docker() -> bool:
"""检测是否在 Docker 容器中运行"""
# 方法1: 检查 /.dockerenv 文件
if os.path.exists("/.dockerenv"):
return True
# 方法2: 检查 cgroupLinux
try:
with open("/proc/1/cgroup", "r") as f:
return "docker" in f.read()
except (FileNotFoundError, PermissionError):
pass
# 方法3: 检查环境变量
return os.environ.get("DOCKER_CONTAINER") == "true"
def _resolve_backend_type(self) -> str:
"""解析实际使用的后端类型"""
if self.backend_type == "auto":
if self.is_github_actions():
# GitHub Actions 环境,检查是否配置了远程存储
if self._has_remote_config():
return "remote"
else:
print("[存储管理器] GitHub Actions 环境但未配置远程存储,使用本地存储")
return "local"
else:
return "local"
return self.backend_type
def _has_remote_config(self) -> bool:
"""检查是否有有效的远程存储配置"""
# 检查配置或环境变量
bucket_name = self.remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME")
access_key = self.remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID")
secret_key = self.remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY")
endpoint = self.remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL")
# 调试日志
has_config = bool(bucket_name and access_key and secret_key and endpoint)
if not has_config:
print(f"[存储管理器] 远程存储配置检查失败:")
print(f" - bucket_name: {'已配置' if bucket_name else '未配置'}")
print(f" - access_key_id: {'已配置' if access_key else '未配置'}")
print(f" - secret_access_key: {'已配置' if secret_key else '未配置'}")
print(f" - endpoint_url: {'已配置' if endpoint else '未配置'}")
return has_config
def _create_remote_backend(self) -> Optional[StorageBackend]:
"""创建远程存储后端"""
try:
from trendradar.storage.remote import RemoteStorageBackend
return RemoteStorageBackend(
bucket_name=self.remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME", ""),
access_key_id=self.remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID", ""),
secret_access_key=self.remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY", ""),
endpoint_url=self.remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL", ""),
region=self.remote_config.get("region") or os.environ.get("S3_REGION", ""),
enable_txt=self.enable_txt,
enable_html=self.enable_html,
timezone=self.timezone,
)
except ImportError as e:
print(f"[存储管理器] 远程后端导入失败: {e}")
print("[存储管理器] 请确保已安装 boto3: pip install boto3")
return None
except Exception as e:
print(f"[存储管理器] 远程后端初始化失败: {e}")
return None
def get_backend(self) -> StorageBackend:
    """Return the active storage backend, creating it lazily on first call.

    When the resolved type is "remote", the remote backend is tried first;
    if it cannot be created, we fall back to the local backend. The result
    is cached on ``self._backend`` so subsequent calls are cheap.
    """
    if self._backend is not None:
        return self._backend

    resolved = self._resolve_backend_type()
    if resolved == "remote":
        self._backend = self._create_remote_backend()
        if self._backend:
            print(f"[存储管理器] 使用远程存储后端")
        else:
            # Remote creation failed — degrade gracefully to local storage.
            print("[存储管理器] 回退到本地存储")
            resolved = "local"
    if resolved == "local" or self._backend is None:
        from trendradar.storage.local import LocalStorageBackend

        self._backend = LocalStorageBackend(
            data_dir=self.data_dir,
            enable_txt=self.enable_txt,
            enable_html=self.enable_html,
            timezone=self.timezone,
        )
        print(f"[存储管理器] 使用本地存储后端 (数据目录: {self.data_dir})")
    return self._backend
def pull_from_remote(self) -> int:
    """Pull recent data files from remote storage into the local data dir.

    Returns:
        Number of files successfully pulled; 0 when pulling is disabled,
        remote storage is not configured, or the remote backend cannot
        be created.
    """
    # Pulling must be enabled AND request a positive day window.
    if not (self.pull_enabled and self.pull_days > 0):
        return 0

    if not self._has_remote_config():
        print("[存储管理器] 未配置远程存储,无法拉取")
        return 0

    # Build the remote backend lazily and cache it for reuse.
    if self._remote_backend is None:
        self._remote_backend = self._create_remote_backend()
        if self._remote_backend is None:
            print("[存储管理器] 无法创建远程后端,拉取失败")
            return 0

    return self._remote_backend.pull_recent_days(self.pull_days, self.data_dir)
def save_news_data(self, data: NewsData) -> bool:
    """Persist one crawl's news data via the active backend; True on success."""
    return self.get_backend().save_news_data(data)
def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
    """Return all data for *date* (defaults to today), or None; delegates to the backend."""
    return self.get_backend().get_today_all_data(date)
def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
    """Return the most recent crawl's data for *date*, or None; delegates to the backend."""
    return self.get_backend().get_latest_crawl_data(date)
def detect_new_titles(self, current_data: NewsData) -> dict:
    """Detect titles that are new in *current_data*; dict layout is defined by the backend."""
    return self.get_backend().detect_new_titles(current_data)
def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
    """Write a TXT snapshot of *data*; returns the saved path or None."""
    return self.get_backend().save_txt_snapshot(data)
def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
    """Save an HTML report as *filename*; returns the saved path or None."""
    return self.get_backend().save_html_report(html_content, filename, is_summary)
def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
    """Return True if this is the first recorded crawl for *date* (today by default)."""
    return self.get_backend().is_first_crawl_today(date)
def cleanup(self) -> None:
    """Release resources held by any backend instances that were created."""
    # Primary backend first, then the standalone remote backend (if any).
    for backend in (self._backend, self._remote_backend):
        if backend:
            backend.cleanup()
def cleanup_old_data(self) -> int:
    """Delete data older than the configured retention windows.

    Returns:
        Total number of date directories removed (local plus remote).
    """
    deleted = 0

    # NOTE(review): "local" cleanup goes through get_backend(), which may
    # resolve to the remote backend when backend_type is "remote" — confirm
    # local_retention_days is meant to apply to whichever backend is active.
    if self.local_retention_days > 0:
        deleted += self.get_backend().cleanup_old_data(self.local_retention_days)

    # Remote cleanup runs only when a retention window and credentials exist.
    if self.remote_retention_days > 0 and self._has_remote_config():
        if self._remote_backend is None:
            self._remote_backend = self._create_remote_backend()
        if self._remote_backend:
            deleted += self._remote_backend.cleanup_old_data(self.remote_retention_days)

    return deleted
@property
def backend_name(self) -> str:
    """Name of the currently active storage backend (delegates to the backend)."""
    return self.get_backend().backend_name
@property
def supports_txt(self) -> bool:
    """Whether the active backend supports writing TXT snapshots."""
    return self.get_backend().supports_txt
def get_storage_manager(
    backend_type: str = "auto",
    data_dir: str = "output",
    enable_txt: bool = True,
    enable_html: bool = True,
    remote_config: Optional[dict] = None,
    local_retention_days: int = 0,
    remote_retention_days: int = 0,
    pull_enabled: bool = False,
    pull_days: int = 0,
    timezone: str = "Asia/Shanghai",
    force_new: bool = False,
) -> StorageManager:
    """Return the process-wide StorageManager singleton.

    A new instance is built on the first call or when *force_new* is True;
    otherwise the cached instance is returned and the other arguments are
    ignored.

    Args:
        backend_type: Storage backend type ("auto" / "local" / "remote").
        data_dir: Local data directory.
        enable_txt: Whether TXT snapshots are enabled.
        enable_html: Whether HTML reports are enabled.
        remote_config: Remote storage configuration mapping.
        local_retention_days: Days to keep local data (0 = unlimited).
        remote_retention_days: Days to keep remote data (0 = unlimited).
        pull_enabled: Whether to auto-pull from remote at startup.
        pull_days: Pull the most recent N days of data.
        timezone: Timezone name (default "Asia/Shanghai").
        force_new: Force creation of a fresh instance.

    Returns:
        The shared StorageManager instance.
    """
    global _storage_manager

    if force_new or _storage_manager is None:
        _storage_manager = StorageManager(
            backend_type=backend_type,
            data_dir=data_dir,
            enable_txt=enable_txt,
            enable_html=enable_html,
            remote_config=remote_config,
            local_retention_days=local_retention_days,
            remote_retention_days=remote_retention_days,
            pull_enabled=pull_enabled,
            pull_days=pull_days,
            timezone=timezone,
        )
    return _storage_manager
File diff suppressed because it is too large Load Diff
+117
View File
@@ -0,0 +1,117 @@
-- TrendRadar database schema
-- ============================================
-- Platforms table
-- Invariant: id never changes; name may change
-- ============================================
CREATE TABLE IF NOT EXISTS platforms (
    id TEXT PRIMARY KEY,
    name TEXT NOT NULL,
    is_active INTEGER DEFAULT 1,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- ============================================
-- News items table
-- Uniquely identified by URL + platform_id, enabling deduplicated storage
-- ============================================
CREATE TABLE IF NOT EXISTS news_items (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    title TEXT NOT NULL,
    platform_id TEXT NOT NULL,
    rank INTEGER NOT NULL,
    url TEXT DEFAULT '',
    mobile_url TEXT DEFAULT '',
    first_crawl_time TEXT NOT NULL, -- time of first crawl
    last_crawl_time TEXT NOT NULL, -- time of most recent crawl
    crawl_count INTEGER DEFAULT 1, -- number of crawls that saw this item
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (platform_id) REFERENCES platforms(id)
);

-- ============================================
-- Title change history
-- Records title changes observed for the same URL
-- ============================================
CREATE TABLE IF NOT EXISTS title_changes (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    news_item_id INTEGER NOT NULL,
    old_title TEXT NOT NULL,
    new_title TEXT NOT NULL,
    changed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (news_item_id) REFERENCES news_items(id)
);

-- ============================================
-- Rank history
-- Records the rank observed at each crawl
-- ============================================
CREATE TABLE IF NOT EXISTS rank_history (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    news_item_id INTEGER NOT NULL,
    rank INTEGER NOT NULL,
    crawl_time TEXT NOT NULL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (news_item_id) REFERENCES news_items(id)
);

-- ============================================
-- Crawl records
-- One row per crawl: its time and total item count
-- ============================================
CREATE TABLE IF NOT EXISTS crawl_records (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    crawl_time TEXT NOT NULL UNIQUE,
    total_items INTEGER DEFAULT 0,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- ============================================
-- Per-source crawl status
-- Records each platform's success/failure for every crawl
-- ============================================
CREATE TABLE IF NOT EXISTS crawl_source_status (
    crawl_record_id INTEGER NOT NULL,
    platform_id TEXT NOT NULL,
    status TEXT NOT NULL CHECK(status IN ('success', 'failed')),
    PRIMARY KEY (crawl_record_id, platform_id),
    FOREIGN KEY (crawl_record_id) REFERENCES crawl_records(id),
    FOREIGN KEY (platform_id) REFERENCES platforms(id)
);

-- ============================================
-- Push records
-- Supports the push_window "once_per_day" feature
-- ============================================
CREATE TABLE IF NOT EXISTS push_records (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    date TEXT NOT NULL UNIQUE,
    pushed INTEGER DEFAULT 0,
    push_time TEXT,
    report_type TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- ============================================
-- Index definitions
-- ============================================
-- Platform index
CREATE INDEX IF NOT EXISTS idx_news_platform ON news_items(platform_id);

-- Time index (for querying the latest data)
CREATE INDEX IF NOT EXISTS idx_news_crawl_time ON news_items(last_crawl_time);

-- Title index (for title search)
CREATE INDEX IF NOT EXISTS idx_news_title ON news_items(title);

-- Unique URL + platform_id index (non-empty URLs only; implements dedup)
CREATE UNIQUE INDEX IF NOT EXISTS idx_news_url_platform
    ON news_items(url, platform_id) WHERE url != '';

-- Crawl status index
CREATE INDEX IF NOT EXISTS idx_crawl_status_record ON crawl_source_status(crawl_record_id);

-- Rank history index
CREATE INDEX IF NOT EXISTS idx_rank_history_news ON rank_history(news_item_id);