mirror of
https://gitee.com/houhuan/TrendRadar.git
synced 2026-05-01 01:22:42 +08:00
v4.0.0 大大大更新
This commit is contained in:
@@ -0,0 +1,44 @@
|
||||
# coding=utf-8
|
||||
"""
|
||||
存储模块 - 支持多种存储后端
|
||||
|
||||
支持的存储后端:
|
||||
- local: 本地 SQLite + TXT/HTML 文件
|
||||
- remote: 远程云存储(S3 兼容协议:R2/OSS/COS/S3 等)
|
||||
- auto: 根据环境自动选择(GitHub Actions 用 remote,其他用 local)
|
||||
"""
|
||||
|
||||
from trendradar.storage.base import (
|
||||
StorageBackend,
|
||||
NewsItem,
|
||||
NewsData,
|
||||
convert_crawl_results_to_news_data,
|
||||
convert_news_data_to_results,
|
||||
)
|
||||
from trendradar.storage.local import LocalStorageBackend
|
||||
from trendradar.storage.manager import StorageManager, get_storage_manager
|
||||
|
||||
# 远程后端可选导入(需要 boto3)
|
||||
# Optional remote backend import (requires boto3); degrade gracefully so
# local-only installations work without the cloud dependencies.
try:
    from trendradar.storage.remote import RemoteStorageBackend
    HAS_REMOTE = True
except ImportError:
    # boto3 (or another remote dependency) is missing; keep the name
    # importable as None so `from ... import RemoteStorageBackend` still works.
    RemoteStorageBackend = None
    HAS_REMOTE = False

__all__ = [
    # Base classes
    "StorageBackend",
    "NewsItem",
    "NewsData",
    # Conversion helpers
    "convert_crawl_results_to_news_data",
    "convert_news_data_to_results",
    # Backend implementations
    "LocalStorageBackend",
    "RemoteStorageBackend",
    "HAS_REMOTE",
    # Manager
    "StorageManager",
    "get_storage_manager",
]
|
||||
@@ -0,0 +1,457 @@
|
||||
# coding=utf-8
|
||||
"""
|
||||
存储后端抽象基类和数据模型
|
||||
|
||||
定义统一的存储接口,所有存储后端都需要实现这些方法
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Any
|
||||
import json
|
||||
|
||||
|
||||
@dataclass
class NewsItem:
    """A single news entry.

    Holds the crawl-time snapshot of one headline (title, source, rank,
    URLs) plus the aggregate statistics used by the analysis layer
    (rank history, first/last seen time, occurrence count).
    """

    title: str                 # headline text
    source_id: str             # platform id (e.g. toutiao, baidu)
    source_name: str = ""      # platform display name (runtime only, not persisted)
    rank: int = 0              # current rank
    url: str = ""              # canonical link URL
    mobile_url: str = ""       # mobile link URL
    crawl_time: str = ""       # crawl time (HH:MM format)

    # Aggregate statistics (used for analysis)
    ranks: List[int] = field(default_factory=list)  # historical rank list
    first_time: str = ""       # first time this title was seen
    last_time: str = ""        # last time this title was seen
    count: int = 1             # number of sightings

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this item into a plain dict containing every field."""
        keys = (
            "title", "source_id", "source_name", "rank", "url",
            "mobile_url", "crawl_time", "ranks", "first_time",
            "last_time", "count",
        )
        return {key: getattr(self, key) for key in keys}

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "NewsItem":
        """Build an item from a dict, tolerating missing keys."""
        scalar_defaults = {
            "title": "", "source_id": "", "source_name": "",
            "rank": 0, "url": "", "mobile_url": "", "crawl_time": "",
            "first_time": "", "last_time": "", "count": 1,
        }
        kwargs = {key: data.get(key, default) for key, default in scalar_defaults.items()}
        kwargs["ranks"] = data.get("ranks", [])
        return cls(**kwargs)
|
||||
|
||||
|
||||
@dataclass
class NewsData:
    """
    Collection of news items for one crawl/date.

    Structure:
    - date: date (YYYY-MM-DD)
    - crawl_time: crawl time
    - items: news items grouped by source id
    - id_to_name: mapping from source id to display name
    - failed_ids: list of source ids whose crawl failed
    """

    date: str  # date
    crawl_time: str  # crawl time
    items: Dict[str, List[NewsItem]]  # news grouped by source
    id_to_name: Dict[str, str] = field(default_factory=dict)  # id -> name mapping
    failed_ids: List[str] = field(default_factory=list)  # failed source ids

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict; items become lists of item dicts."""
        items_dict = {}
        for source_id, news_list in self.items.items():
            items_dict[source_id] = [item.to_dict() for item in news_list]

        return {
            "date": self.date,
            "crawl_time": self.crawl_time,
            "items": items_dict,
            "id_to_name": self.id_to_name,
            "failed_ids": self.failed_ids,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "NewsData":
        """Rebuild a NewsData from a dict produced by to_dict()."""
        items = {}
        items_data = data.get("items", {})
        for source_id, news_list in items_data.items():
            items[source_id] = [NewsItem.from_dict(item) for item in news_list]

        return cls(
            date=data.get("date", ""),
            crawl_time=data.get("crawl_time", ""),
            items=items,
            id_to_name=data.get("id_to_name", {}),
            failed_ids=data.get("failed_ids", []),
        )

    def get_total_count(self) -> int:
        """Total number of news items across all sources."""
        return sum(len(news_list) for news_list in self.items.values())

    def merge_with(self, other: "NewsData") -> "NewsData":
        """
        Merge another NewsData into the current data and return the result.

        Merge rules:
        - items with the same source_id + title merge their rank history
        - last_time and count are updated
        - the earlier first_time is kept

        NOTE(review): the merged result reuses (and mutates) the NewsItem
        objects held by ``self`` — this is not a pure function; callers
        should not rely on ``self``'s items being unchanged afterwards.
        """
        merged_items = {}

        # Index current items by title (NewsItem objects are shared, not copied)
        for source_id, news_list in self.items.items():
            merged_items[source_id] = {item.title: item for item in news_list}

        # Fold in the other dataset
        for source_id, news_list in other.items.items():
            if source_id not in merged_items:
                merged_items[source_id] = {}

            for item in news_list:
                if item.title in merged_items[source_id]:
                    # Merge into the already-known entry
                    existing = merged_items[source_id][item.title]

                    # Union of rank histories, deduplicated and sorted ascending
                    existing_ranks = set(existing.ranks) if existing.ranks else set()
                    new_ranks = set(item.ranks) if item.ranks else set()
                    merged_ranks = sorted(existing_ranks | new_ranks)
                    existing.ranks = merged_ranks

                    # Keep earliest first_time / latest last_time (string compare
                    # on HH:MM-style timestamps)
                    if item.first_time and (not existing.first_time or item.first_time < existing.first_time):
                        existing.first_time = item.first_time
                    if item.last_time and (not existing.last_time or item.last_time > existing.last_time):
                        existing.last_time = item.last_time

                    # One more sighting (increments by 1, not by item.count)
                    existing.count += 1

                    # Backfill URLs only when the existing entry lacked them
                    if not existing.url and item.url:
                        existing.url = item.url
                    if not existing.mobile_url and item.mobile_url:
                        existing.mobile_url = item.mobile_url
                else:
                    # Brand-new title for this source
                    merged_items[source_id][item.title] = item

        # Flatten back to {source_id: [NewsItem, ...]}
        final_items = {}
        for source_id, items_dict in merged_items.items():
            final_items[source_id] = list(items_dict.values())

        # other's names win on conflict
        merged_id_to_name = {**self.id_to_name, **other.id_to_name}

        # Deduplicate failed ids (order is unspecified because of set())
        merged_failed_ids = list(set(self.failed_ids + other.failed_ids))

        return NewsData(
            date=self.date or other.date,
            crawl_time=other.crawl_time,  # use the newer crawl time
            items=final_items,
            id_to_name=merged_id_to_name,
            failed_ids=merged_failed_ids,
        )
|
||||
|
||||
|
||||
class StorageBackend(ABC):
    """
    Abstract base class for storage backends.

    Every backend must implement these methods to support:
    - saving news data
    - reading all of today's data
    - detecting newly appeared news
    - generating report files (TXT/HTML)
    """

    @abstractmethod
    def save_news_data(self, data: NewsData) -> bool:
        """
        Persist news data.

        Args:
            data: news data

        Returns:
            Whether the save succeeded.
        """
        pass

    @abstractmethod
    def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
        """
        Load all news data for the given date.

        Args:
            date: date string (YYYY-MM-DD), defaults to today

        Returns:
            Merged news data, or None when there is no data.
        """
        pass

    @abstractmethod
    def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
        """
        Load the data of the most recent crawl.

        Args:
            date: date string, defaults to today

        Returns:
            News data of the latest crawl.
        """
        pass

    @abstractmethod
    def detect_new_titles(self, current_data: NewsData) -> Dict[str, Dict]:
        """
        Detect newly appeared titles.

        Args:
            current_data: data of the current crawl

        Returns:
            New title data, shaped as {source_id: {title: title_data}}.
        """
        pass

    @abstractmethod
    def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
        """
        Save a TXT snapshot (optional feature, available in local environments).

        Args:
            data: news data

        Returns:
            Path of the saved file, or None when unsupported.
        """
        pass

    @abstractmethod
    def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
        """
        Save an HTML report.

        Args:
            html_content: HTML content
            filename: file name
            is_summary: whether this is a summary report

        Returns:
            Path of the saved file.
        """
        pass

    @abstractmethod
    def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
        """
        Check whether this is the first crawl of the day.

        Args:
            date: date string, defaults to today

        Returns:
            True when this is the first crawl.
        """
        pass

    @abstractmethod
    def cleanup(self) -> None:
        """
        Release resources (temporary files, database connections, ...).
        """
        pass

    @abstractmethod
    def cleanup_old_data(self, retention_days: int) -> int:
        """
        Delete expired data.

        Args:
            retention_days: days to keep (0 disables cleanup)

        Returns:
            Number of date directories removed.
        """
        pass

    @property
    @abstractmethod
    def backend_name(self) -> str:
        """
        Name of the storage backend.
        """
        pass

    @property
    @abstractmethod
    def supports_txt(self) -> bool:
        """
        Whether TXT snapshot generation is supported.
        """
        pass

    # === Push-record related methods ===

    @abstractmethod
    def has_pushed_today(self, date: Optional[str] = None) -> bool:
        """
        Check whether a push already happened on the given date.

        Args:
            date: date string (YYYY-MM-DD), defaults to today

        Returns:
            True when already pushed.
        """
        pass

    @abstractmethod
    def record_push(self, report_type: str, date: Optional[str] = None) -> bool:
        """
        Record a push.

        Args:
            report_type: report type
            date: date string (YYYY-MM-DD), defaults to today

        Returns:
            Whether the record was written.
        """
        pass
|
||||
|
||||
|
||||
def convert_crawl_results_to_news_data(
    results: Dict[str, Dict],
    id_to_name: Dict[str, str],
    failed_ids: List[str],
    crawl_time: str,
    crawl_date: str,
) -> NewsData:
    """
    Convert raw crawler output into a NewsData object.

    Args:
        results: crawler output {source_id: {title: {ranks: [], url: "", mobileUrl: ""}}}
        id_to_name: source id -> display name mapping
        failed_ids: source ids whose crawl failed
        crawl_time: crawl time (HH:MM)
        crawl_date: crawl date (YYYY-MM-DD)

    Returns:
        A NewsData object describing this single crawl.
    """

    def build_item(source_id: str, title: str, raw: Any) -> NewsItem:
        """Normalize one raw entry (new dict format or legacy rank list)."""
        if isinstance(raw, dict):
            rank_list = raw.get("ranks", [])
            link = raw.get("url", "")
            mobile_link = raw.get("mobileUrl", "")
        else:
            # Legacy format: the value is just the list of ranks
            rank_list = raw if isinstance(raw, list) else []
            link = ""
            mobile_link = ""

        return NewsItem(
            title=title,
            source_id=source_id,
            source_name=id_to_name.get(source_id, source_id),
            rank=rank_list[0] if rank_list else 99,  # 99 acts as the "unranked" sentinel
            url=link,
            mobile_url=mobile_link,
            crawl_time=crawl_time,
            ranks=rank_list,
            first_time=crawl_time,
            last_time=crawl_time,
            count=1,
        )

    items = {
        source_id: [
            build_item(source_id, title, raw)
            for title, raw in titles_data.items()
        ]
        for source_id, titles_data in results.items()
    }

    return NewsData(
        date=crawl_date,
        crawl_time=crawl_time,
        items=items,
        id_to_name=id_to_name,
        failed_ids=failed_ids,
    )
|
||||
|
||||
|
||||
def convert_news_data_to_results(data: NewsData) -> tuple:
    """
    Convert a NewsData object back into the legacy results format
    (kept for compatibility with existing callers).

    Args:
        data: NewsData object

    Returns:
        (results, id_to_name, title_info) tuple
    """
    results = {}
    title_info = {}

    for source_id, news_list in data.items.items():
        source_results = {}
        source_info = {}

        for item in news_list:
            source_results[item.title] = {
                "ranks": item.ranks,
                "url": item.url,
                "mobileUrl": item.mobile_url,
            }
            source_info[item.title] = {
                "first_time": item.first_time,
                "last_time": item.last_time,
                "count": item.count,
                "ranks": item.ranks,
                "url": item.url,
                "mobileUrl": item.mobile_url,
            }

        results[source_id] = source_results
        title_info[source_id] = source_info

    return results, data.id_to_name, title_info
|
||||
@@ -0,0 +1,869 @@
|
||||
# coding=utf-8
|
||||
"""
|
||||
本地存储后端 - SQLite + TXT/HTML
|
||||
|
||||
使用 SQLite 作为主存储,支持可选的 TXT 快照和 HTML 报告
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import os
|
||||
import shutil
|
||||
import pytz
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Any
|
||||
|
||||
from trendradar.storage.base import StorageBackend, NewsItem, NewsData
|
||||
from trendradar.utils.time import (
|
||||
get_configured_time,
|
||||
format_date_folder,
|
||||
format_time_filename,
|
||||
)
|
||||
|
||||
|
||||
class LocalStorageBackend(StorageBackend):
|
||||
"""
|
||||
本地存储后端
|
||||
|
||||
使用 SQLite 数据库存储新闻数据,支持:
|
||||
- 按日期组织的 SQLite 数据库文件
|
||||
- 可选的 TXT 快照(用于调试)
|
||||
- HTML 报告生成
|
||||
"""
|
||||
|
||||
    def __init__(
        self,
        data_dir: str = "output",
        enable_txt: bool = True,
        enable_html: bool = True,
        timezone: str = "Asia/Shanghai",
    ):
        """
        Initialize the local storage backend.

        Args:
            data_dir: data directory path
            enable_txt: whether TXT snapshots are enabled
            enable_html: whether HTML reports are enabled
            timezone: timezone setting (default Asia/Shanghai)
        """
        self.data_dir = Path(data_dir)
        self.enable_txt = enable_txt
        self.enable_html = enable_html
        self.timezone = timezone
        # One cached connection per database file path; closed by cleanup().
        self._db_connections: Dict[str, sqlite3.Connection] = {}
|
||||
|
||||
    @property
    def backend_name(self) -> str:
        # Identifier this backend reports to the storage manager.
        return "local"
|
||||
|
||||
    @property
    def supports_txt(self) -> bool:
        # TXT snapshots are only available when explicitly enabled.
        return self.enable_txt
|
||||
|
||||
    def _get_configured_time(self) -> datetime:
        """Return the current time in the configured timezone."""
        return get_configured_time(self.timezone)
|
||||
|
||||
    def _format_date_folder(self, date: Optional[str] = None) -> str:
        """Format the date folder name (ISO format: YYYY-MM-DD)."""
        return format_date_folder(date, self.timezone)
|
||||
|
||||
    def _format_time_filename(self) -> str:
        """Format the time-based file name (format: HH-MM)."""
        return format_time_filename(self.timezone)
|
||||
|
||||
def _get_db_path(self, date: Optional[str] = None) -> Path:
|
||||
"""获取 SQLite 数据库路径"""
|
||||
date_folder = self._format_date_folder(date)
|
||||
db_dir = self.data_dir / date_folder
|
||||
db_dir.mkdir(parents=True, exist_ok=True)
|
||||
return db_dir / "news.db"
|
||||
|
||||
def _get_connection(self, date: Optional[str] = None) -> sqlite3.Connection:
|
||||
"""获取数据库连接(带缓存)"""
|
||||
db_path = str(self._get_db_path(date))
|
||||
|
||||
if db_path not in self._db_connections:
|
||||
conn = sqlite3.connect(db_path)
|
||||
conn.row_factory = sqlite3.Row
|
||||
self._init_tables(conn)
|
||||
self._db_connections[db_path] = conn
|
||||
|
||||
return self._db_connections[db_path]
|
||||
|
||||
    def _get_schema_path(self) -> Path:
        """Return the path of schema.sql (expected next to this module)."""
        return Path(__file__).parent / "schema.sql"
|
||||
|
||||
def _init_tables(self, conn: sqlite3.Connection) -> None:
|
||||
"""从 schema.sql 初始化数据库表结构"""
|
||||
schema_path = self._get_schema_path()
|
||||
|
||||
if schema_path.exists():
|
||||
with open(schema_path, "r", encoding="utf-8") as f:
|
||||
schema_sql = f.read()
|
||||
conn.executescript(schema_sql)
|
||||
else:
|
||||
raise FileNotFoundError(f"Schema file not found: {schema_path}")
|
||||
|
||||
conn.commit()
|
||||
|
||||
    def save_news_data(self, data: NewsData) -> bool:
        """
        Save news data into SQLite (URL is the identity key, so title
        changes for the same URL can be detected and recorded).

        Args:
            data: news data

        Returns:
            True on success, False when the save failed as a whole.
        """
        try:
            conn = self._get_connection(data.date)
            cursor = conn.cursor()

            # Current time in the configured timezone
            now_str = self._get_configured_time().strftime("%Y-%m-%d %H:%M:%S")

            # First sync platform info into the platforms table (upsert)
            for source_id, source_name in data.id_to_name.items():
                cursor.execute("""
                    INSERT INTO platforms (id, name, updated_at)
                    VALUES (?, ?, ?)
                    ON CONFLICT(id) DO UPDATE SET
                        name = excluded.name,
                        updated_at = excluded.updated_at
                """, (source_id, source_name, now_str))

            # Statistics counters
            new_count = 0
            updated_count = 0
            title_changed_count = 0
            success_sources = []

            for source_id, news_list in data.items.items():
                success_sources.append(source_id)

                for item in news_list:
                    try:
                        # Check whether the item already exists (by URL + platform_id)
                        if item.url:
                            cursor.execute("""
                                SELECT id, title FROM news_items
                                WHERE url = ? AND platform_id = ?
                            """, (item.url, source_id))
                            existing = cursor.fetchone()

                            if existing:
                                # Already present: update the record
                                existing_id, existing_title = existing

                                # Did the title change for this URL?
                                if existing_title != item.title:
                                    # Record the title change
                                    cursor.execute("""
                                        INSERT INTO title_changes
                                        (news_item_id, old_title, new_title, changed_at)
                                        VALUES (?, ?, ?, ?)
                                    """, (existing_id, existing_title, item.title, now_str))
                                    title_changed_count += 1

                                # Record rank history
                                cursor.execute("""
                                    INSERT INTO rank_history
                                    (news_item_id, rank, crawl_time, created_at)
                                    VALUES (?, ?, ?, ?)
                                """, (existing_id, item.rank, data.crawl_time, now_str))

                                # Update the existing row
                                cursor.execute("""
                                    UPDATE news_items SET
                                        title = ?,
                                        rank = ?,
                                        mobile_url = ?,
                                        last_crawl_time = ?,
                                        crawl_count = crawl_count + 1,
                                        updated_at = ?
                                    WHERE id = ?
                                """, (item.title, item.rank, item.mobile_url,
                                      data.crawl_time, now_str, existing_id))
                                updated_count += 1
                            else:
                                # Not present: insert a new row
                                cursor.execute("""
                                    INSERT INTO news_items
                                    (title, platform_id, rank, url, mobile_url,
                                     first_crawl_time, last_crawl_time, crawl_count,
                                     created_at, updated_at)
                                    VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?)
                                """, (item.title, source_id, item.rank, item.url,
                                      item.mobile_url, data.crawl_time, data.crawl_time,
                                      now_str, now_str))
                                new_id = cursor.lastrowid
                                # Record the initial rank
                                cursor.execute("""
                                    INSERT INTO rank_history
                                    (news_item_id, rank, crawl_time, created_at)
                                    VALUES (?, ?, ?, ?)
                                """, (new_id, item.rank, data.crawl_time, now_str))
                                new_count += 1
                        else:
                            # Empty URL: insert directly (deduplication impossible)
                            cursor.execute("""
                                INSERT INTO news_items
                                (title, platform_id, rank, url, mobile_url,
                                 first_crawl_time, last_crawl_time, crawl_count,
                                 created_at, updated_at)
                                VALUES (?, ?, ?, ?, ?, ?, ?, 1, ?, ?)
                            """, (item.title, source_id, item.rank, item.url,
                                  item.mobile_url, data.crawl_time, data.crawl_time,
                                  now_str, now_str))
                            new_id = cursor.lastrowid
                            # Record the initial rank
                            cursor.execute("""
                                INSERT INTO rank_history
                                (news_item_id, rank, crawl_time, created_at)
                                VALUES (?, ?, ?, ?)
                            """, (new_id, item.rank, data.crawl_time, now_str))
                            new_count += 1

                    except sqlite3.Error as e:
                        # Best-effort per item: one bad row must not abort the batch
                        print(f"保存新闻条目失败 [{item.title[:30]}...]: {e}")

            total_items = new_count + updated_count

            # Record this crawl
            cursor.execute("""
                INSERT OR REPLACE INTO crawl_records
                (crawl_time, total_items, created_at)
                VALUES (?, ?, ?)
            """, (data.crawl_time, total_items, now_str))

            # Fetch the id of the crawl record just written
            cursor.execute("""
                SELECT id FROM crawl_records WHERE crawl_time = ?
            """, (data.crawl_time,))
            record_row = cursor.fetchone()
            if record_row:
                crawl_record_id = record_row[0]

                # Record the successful sources
                for source_id in success_sources:
                    cursor.execute("""
                        INSERT OR REPLACE INTO crawl_source_status
                        (crawl_record_id, platform_id, status)
                        VALUES (?, ?, 'success')
                    """, (crawl_record_id, source_id))

                # Record the failed sources
                for failed_id in data.failed_ids:
                    # Make sure failed platforms exist in the platforms table too
                    cursor.execute("""
                        INSERT OR IGNORE INTO platforms (id, name, updated_at)
                        VALUES (?, ?, ?)
                    """, (failed_id, failed_id, now_str))

                    cursor.execute("""
                        INSERT OR REPLACE INTO crawl_source_status
                        (crawl_record_id, platform_id, status)
                        VALUES (?, ?, 'failed')
                    """, (crawl_record_id, failed_id))

            conn.commit()

            # Emit a detailed storage-statistics log line
            log_parts = [f"[本地存储] 处理完成:新增 {new_count} 条"]
            if updated_count > 0:
                log_parts.append(f"更新 {updated_count} 条")
            if title_changed_count > 0:
                log_parts.append(f"标题变更 {title_changed_count} 条")
            print(",".join(log_parts))

            return True

        except Exception as e:
            print(f"[本地存储] 保存失败: {e}")
            return False
|
||||
|
||||
    def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
        """
        Load all news data for the given date (merged across crawls).

        Args:
            date: date string, defaults to today

        Returns:
            Merged news data; None when there is no database, no rows,
            or an error occurred.
        """
        try:
            db_path = self._get_db_path(date)
            if not db_path.exists():
                return None

            conn = self._get_connection(date)
            cursor = conn.cursor()

            # Fetch every news row (id included so rank history can be joined in)
            cursor.execute("""
                SELECT n.id, n.title, n.platform_id, p.name as platform_name,
                       n.rank, n.url, n.mobile_url,
                       n.first_crawl_time, n.last_crawl_time, n.crawl_count
                FROM news_items n
                LEFT JOIN platforms p ON n.platform_id = p.id
                ORDER BY n.platform_id, n.last_crawl_time
            """)

            rows = cursor.fetchall()
            if not rows:
                return None

            # Collect every news_item_id
            news_ids = [row[0] for row in rows]

            # Batch-load the rank history for all rows at once
            rank_history_map: Dict[int, List[int]] = {}
            if news_ids:
                placeholders = ",".join("?" * len(news_ids))
                cursor.execute(f"""
                    SELECT news_item_id, rank FROM rank_history
                    WHERE news_item_id IN ({placeholders})
                    ORDER BY news_item_id, crawl_time
                """, news_ids)
                for rh_row in cursor.fetchall():
                    news_id, rank = rh_row[0], rh_row[1]
                    if news_id not in rank_history_map:
                        rank_history_map[news_id] = []
                    # Deduplicate while preserving chronological order
                    if rank not in rank_history_map[news_id]:
                        rank_history_map[news_id].append(rank)

            # Group by platform_id
            items: Dict[str, List[NewsItem]] = {}
            id_to_name: Dict[str, str] = {}
            crawl_date = self._format_date_folder(date)

            for row in rows:
                news_id = row[0]
                platform_id = row[2]
                title = row[1]
                platform_name = row[3] or platform_id  # fall back to the id when unnamed

                id_to_name[platform_id] = platform_name

                if platform_id not in items:
                    items[platform_id] = []

                # Use the rank history when present, otherwise just the current rank
                ranks = rank_history_map.get(news_id, [row[4]])

                items[platform_id].append(NewsItem(
                    title=title,
                    source_id=platform_id,
                    source_name=platform_name,
                    rank=row[4],
                    url=row[5] or "",
                    mobile_url=row[6] or "",
                    crawl_time=row[8],  # last_crawl_time
                    ranks=ranks,
                    first_time=row[7],  # first_crawl_time
                    last_time=row[8],  # last_crawl_time
                    count=row[9],  # crawl_count
                ))

            final_items = items

            # Sources that failed in any crawl of this day
            cursor.execute("""
                SELECT DISTINCT css.platform_id
                FROM crawl_source_status css
                JOIN crawl_records cr ON css.crawl_record_id = cr.id
                WHERE css.status = 'failed'
            """)
            failed_ids = [row[0] for row in cursor.fetchall()]

            # Latest crawl time of the day
            cursor.execute("""
                SELECT crawl_time FROM crawl_records
                ORDER BY crawl_time DESC
                LIMIT 1
            """)

            time_row = cursor.fetchone()
            crawl_time = time_row[0] if time_row else self._format_time_filename()

            return NewsData(
                date=crawl_date,
                crawl_time=crawl_time,
                items=final_items,
                id_to_name=id_to_name,
                failed_ids=failed_ids,
            )

        except Exception as e:
            print(f"[本地存储] 读取数据失败: {e}")
            return None
|
||||
|
||||
    def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
        """
        Load the data of the most recent crawl.

        Args:
            date: date string, defaults to today

        Returns:
            News data of the latest crawl; None when there is no database,
            no crawl record, or an error occurred.
        """
        try:
            db_path = self._get_db_path(date)
            if not db_path.exists():
                return None

            conn = self._get_connection(date)
            cursor = conn.cursor()

            # Latest recorded crawl time
            cursor.execute("""
                SELECT crawl_time FROM crawl_records
                ORDER BY crawl_time DESC
                LIMIT 1
            """)

            time_row = cursor.fetchone()
            if not time_row:
                return None

            latest_time = time_row[0]

            # News rows for that crawl time (id included for rank history).
            # NOTE(review): this filters on last_crawl_time, i.e. "items last
            # seen at the latest crawl" — confirm that is the intended snapshot.
            cursor.execute("""
                SELECT n.id, n.title, n.platform_id, p.name as platform_name,
                       n.rank, n.url, n.mobile_url,
                       n.first_crawl_time, n.last_crawl_time, n.crawl_count
                FROM news_items n
                LEFT JOIN platforms p ON n.platform_id = p.id
                WHERE n.last_crawl_time = ?
            """, (latest_time,))

            rows = cursor.fetchall()
            if not rows:
                return None

            # Collect every news_item_id
            news_ids = [row[0] for row in rows]

            # Batch-load the rank history
            rank_history_map: Dict[int, List[int]] = {}
            if news_ids:
                placeholders = ",".join("?" * len(news_ids))
                cursor.execute(f"""
                    SELECT news_item_id, rank FROM rank_history
                    WHERE news_item_id IN ({placeholders})
                    ORDER BY news_item_id, crawl_time
                """, news_ids)
                for rh_row in cursor.fetchall():
                    news_id, rank = rh_row[0], rh_row[1]
                    if news_id not in rank_history_map:
                        rank_history_map[news_id] = []
                    # Deduplicate while preserving chronological order
                    if rank not in rank_history_map[news_id]:
                        rank_history_map[news_id].append(rank)

            items: Dict[str, List[NewsItem]] = {}
            id_to_name: Dict[str, str] = {}
            crawl_date = self._format_date_folder(date)

            for row in rows:
                news_id = row[0]
                platform_id = row[2]
                platform_name = row[3] or platform_id  # fall back to the id
                id_to_name[platform_id] = platform_name

                if platform_id not in items:
                    items[platform_id] = []

                # Use the rank history when present, otherwise just the current rank
                ranks = rank_history_map.get(news_id, [row[4]])

                items[platform_id].append(NewsItem(
                    title=row[1],
                    source_id=platform_id,
                    source_name=platform_name,
                    rank=row[4],
                    url=row[5] or "",
                    mobile_url=row[6] or "",
                    crawl_time=row[8],  # last_crawl_time
                    ranks=ranks,
                    first_time=row[7],  # first_crawl_time
                    last_time=row[8],  # last_crawl_time
                    count=row[9],  # crawl_count
                ))

            # Sources that failed in this specific crawl
            cursor.execute("""
                SELECT css.platform_id
                FROM crawl_source_status css
                JOIN crawl_records cr ON css.crawl_record_id = cr.id
                WHERE cr.crawl_time = ? AND css.status = 'failed'
            """, (latest_time,))

            failed_ids = [row[0] for row in cursor.fetchall()]

            return NewsData(
                date=crawl_date,
                crawl_time=latest_time,
                items=items,
                id_to_name=id_to_name,
                failed_ids=failed_ids,
            )

        except Exception as e:
            print(f"[本地存储] 获取最新数据失败: {e}")
            return None
|
||||
|
||||
def detect_new_titles(self, current_data: NewsData) -> Dict[str, Dict]:
|
||||
"""
|
||||
检测新增的标题
|
||||
|
||||
Args:
|
||||
current_data: 当前抓取的数据
|
||||
|
||||
Returns:
|
||||
新增的标题数据 {source_id: {title: NewsItem}}
|
||||
"""
|
||||
try:
|
||||
# 获取历史数据
|
||||
historical_data = self.get_today_all_data(current_data.date)
|
||||
|
||||
if not historical_data:
|
||||
# 没有历史数据,所有都是新的
|
||||
new_titles = {}
|
||||
for source_id, news_list in current_data.items.items():
|
||||
new_titles[source_id] = {item.title: item for item in news_list}
|
||||
return new_titles
|
||||
|
||||
# 收集历史标题
|
||||
historical_titles: Dict[str, set] = {}
|
||||
for source_id, news_list in historical_data.items.items():
|
||||
historical_titles[source_id] = {item.title for item in news_list}
|
||||
|
||||
# 检测新增
|
||||
new_titles = {}
|
||||
for source_id, news_list in current_data.items.items():
|
||||
hist_set = historical_titles.get(source_id, set())
|
||||
for item in news_list:
|
||||
if item.title not in hist_set:
|
||||
if source_id not in new_titles:
|
||||
new_titles[source_id] = {}
|
||||
new_titles[source_id][item.title] = item
|
||||
|
||||
return new_titles
|
||||
|
||||
except Exception as e:
|
||||
print(f"[本地存储] 检测新标题失败: {e}")
|
||||
return {}
|
||||
|
||||
def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
|
||||
"""
|
||||
保存 TXT 快照
|
||||
|
||||
Args:
|
||||
data: 新闻数据
|
||||
|
||||
Returns:
|
||||
保存的文件路径
|
||||
"""
|
||||
if not self.enable_txt:
|
||||
return None
|
||||
|
||||
try:
|
||||
date_folder = self._format_date_folder(data.date)
|
||||
txt_dir = self.data_dir / date_folder / "txt"
|
||||
txt_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
file_path = txt_dir / f"{data.crawl_time}.txt"
|
||||
|
||||
with open(file_path, "w", encoding="utf-8") as f:
|
||||
for source_id, news_list in data.items.items():
|
||||
source_name = data.id_to_name.get(source_id, source_id)
|
||||
|
||||
# 写入来源标题
|
||||
if source_name and source_name != source_id:
|
||||
f.write(f"{source_id} | {source_name}\n")
|
||||
else:
|
||||
f.write(f"{source_id}\n")
|
||||
|
||||
# 按排名排序
|
||||
sorted_news = sorted(news_list, key=lambda x: x.rank)
|
||||
|
||||
for item in sorted_news:
|
||||
line = f"{item.rank}. {item.title}"
|
||||
if item.url:
|
||||
line += f" [URL:{item.url}]"
|
||||
if item.mobile_url:
|
||||
line += f" [MOBILE:{item.mobile_url}]"
|
||||
f.write(line + "\n")
|
||||
|
||||
f.write("\n")
|
||||
|
||||
# 写入失败的来源
|
||||
if data.failed_ids:
|
||||
f.write("==== 以下ID请求失败 ====\n")
|
||||
for failed_id in data.failed_ids:
|
||||
f.write(f"{failed_id}\n")
|
||||
|
||||
print(f"[本地存储] TXT 快照已保存: {file_path}")
|
||||
return str(file_path)
|
||||
|
||||
except Exception as e:
|
||||
print(f"[本地存储] 保存 TXT 快照失败: {e}")
|
||||
return None
|
||||
|
||||
def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
|
||||
"""
|
||||
保存 HTML 报告
|
||||
|
||||
Args:
|
||||
html_content: HTML 内容
|
||||
filename: 文件名
|
||||
is_summary: 是否为汇总报告
|
||||
|
||||
Returns:
|
||||
保存的文件路径
|
||||
"""
|
||||
if not self.enable_html:
|
||||
return None
|
||||
|
||||
try:
|
||||
date_folder = self._format_date_folder()
|
||||
html_dir = self.data_dir / date_folder / "html"
|
||||
html_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
file_path = html_dir / filename
|
||||
|
||||
with open(file_path, "w", encoding="utf-8") as f:
|
||||
f.write(html_content)
|
||||
|
||||
print(f"[本地存储] HTML 报告已保存: {file_path}")
|
||||
return str(file_path)
|
||||
|
||||
except Exception as e:
|
||||
print(f"[本地存储] 保存 HTML 报告失败: {e}")
|
||||
return None
|
||||
|
||||
def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
|
||||
"""
|
||||
检查是否是当天第一次抓取
|
||||
|
||||
Args:
|
||||
date: 日期字符串,默认为今天
|
||||
|
||||
Returns:
|
||||
是否是第一次抓取
|
||||
"""
|
||||
try:
|
||||
db_path = self._get_db_path(date)
|
||||
if not db_path.exists():
|
||||
return True
|
||||
|
||||
conn = self._get_connection(date)
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute("""
|
||||
SELECT COUNT(*) as count FROM crawl_records
|
||||
""")
|
||||
|
||||
row = cursor.fetchone()
|
||||
count = row[0] if row else 0
|
||||
|
||||
# 如果只有一条或没有记录,视为第一次抓取
|
||||
return count <= 1
|
||||
|
||||
except Exception as e:
|
||||
print(f"[本地存储] 检查首次抓取失败: {e}")
|
||||
return True
|
||||
|
||||
def get_crawl_times(self, date: Optional[str] = None) -> List[str]:
|
||||
"""
|
||||
获取指定日期的所有抓取时间列表
|
||||
|
||||
Args:
|
||||
date: 日期字符串,默认为今天
|
||||
|
||||
Returns:
|
||||
抓取时间列表(按时间排序)
|
||||
"""
|
||||
try:
|
||||
db_path = self._get_db_path(date)
|
||||
if not db_path.exists():
|
||||
return []
|
||||
|
||||
conn = self._get_connection(date)
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute("""
|
||||
SELECT crawl_time FROM crawl_records
|
||||
ORDER BY crawl_time
|
||||
""")
|
||||
|
||||
rows = cursor.fetchall()
|
||||
return [row[0] for row in rows]
|
||||
|
||||
except Exception as e:
|
||||
print(f"[本地存储] 获取抓取时间列表失败: {e}")
|
||||
return []
|
||||
|
||||
def cleanup(self) -> None:
|
||||
"""清理资源(关闭数据库连接)"""
|
||||
for db_path, conn in self._db_connections.items():
|
||||
try:
|
||||
conn.close()
|
||||
print(f"[本地存储] 关闭数据库连接: {db_path}")
|
||||
except Exception as e:
|
||||
print(f"[本地存储] 关闭连接失败 {db_path}: {e}")
|
||||
|
||||
self._db_connections.clear()
|
||||
|
||||
def cleanup_old_data(self, retention_days: int) -> int:
|
||||
"""
|
||||
清理过期数据
|
||||
|
||||
Args:
|
||||
retention_days: 保留天数(0 表示不清理)
|
||||
|
||||
Returns:
|
||||
删除的日期目录数量
|
||||
"""
|
||||
if retention_days <= 0:
|
||||
return 0
|
||||
|
||||
deleted_count = 0
|
||||
cutoff_date = self._get_configured_time() - timedelta(days=retention_days)
|
||||
|
||||
try:
|
||||
if not self.data_dir.exists():
|
||||
return 0
|
||||
|
||||
for date_folder in self.data_dir.iterdir():
|
||||
if not date_folder.is_dir() or date_folder.name.startswith('.'):
|
||||
continue
|
||||
|
||||
# 解析日期文件夹名(支持两种格式)
|
||||
folder_date = None
|
||||
try:
|
||||
# ISO 格式: YYYY-MM-DD
|
||||
date_match = re.match(r'(\d{4})-(\d{2})-(\d{2})', date_folder.name)
|
||||
if date_match:
|
||||
folder_date = datetime(
|
||||
int(date_match.group(1)),
|
||||
int(date_match.group(2)),
|
||||
int(date_match.group(3)),
|
||||
tzinfo=pytz.timezone("Asia/Shanghai")
|
||||
)
|
||||
else:
|
||||
# 旧中文格式: YYYY年MM月DD日
|
||||
date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_folder.name)
|
||||
if date_match:
|
||||
folder_date = datetime(
|
||||
int(date_match.group(1)),
|
||||
int(date_match.group(2)),
|
||||
int(date_match.group(3)),
|
||||
tzinfo=pytz.timezone("Asia/Shanghai")
|
||||
)
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
if folder_date and folder_date < cutoff_date:
|
||||
# 先关闭该日期的数据库连接
|
||||
db_path = str(self._get_db_path(date_folder.name))
|
||||
if db_path in self._db_connections:
|
||||
try:
|
||||
self._db_connections[db_path].close()
|
||||
del self._db_connections[db_path]
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# 删除整个日期目录
|
||||
try:
|
||||
shutil.rmtree(date_folder)
|
||||
deleted_count += 1
|
||||
print(f"[本地存储] 清理过期数据: {date_folder.name}")
|
||||
except Exception as e:
|
||||
print(f"[本地存储] 删除目录失败 {date_folder.name}: {e}")
|
||||
|
||||
if deleted_count > 0:
|
||||
print(f"[本地存储] 共清理 {deleted_count} 个过期日期目录")
|
||||
|
||||
return deleted_count
|
||||
|
||||
except Exception as e:
|
||||
print(f"[本地存储] 清理过期数据失败: {e}")
|
||||
return deleted_count
|
||||
|
||||
def has_pushed_today(self, date: Optional[str] = None) -> bool:
|
||||
"""
|
||||
检查指定日期是否已推送过
|
||||
|
||||
Args:
|
||||
date: 日期字符串(YYYY-MM-DD),默认为今天
|
||||
|
||||
Returns:
|
||||
是否已推送
|
||||
"""
|
||||
try:
|
||||
conn = self._get_connection(date)
|
||||
cursor = conn.cursor()
|
||||
|
||||
target_date = self._format_date_folder(date)
|
||||
|
||||
cursor.execute("""
|
||||
SELECT pushed FROM push_records WHERE date = ?
|
||||
""", (target_date,))
|
||||
|
||||
row = cursor.fetchone()
|
||||
if row:
|
||||
return bool(row[0])
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
print(f"[本地存储] 检查推送记录失败: {e}")
|
||||
return False
|
||||
|
||||
def record_push(self, report_type: str, date: Optional[str] = None) -> bool:
|
||||
"""
|
||||
记录推送
|
||||
|
||||
Args:
|
||||
report_type: 报告类型
|
||||
date: 日期字符串(YYYY-MM-DD),默认为今天
|
||||
|
||||
Returns:
|
||||
是否记录成功
|
||||
"""
|
||||
try:
|
||||
conn = self._get_connection(date)
|
||||
cursor = conn.cursor()
|
||||
|
||||
target_date = self._format_date_folder(date)
|
||||
now_str = self._get_configured_time().strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
cursor.execute("""
|
||||
INSERT INTO push_records (date, pushed, push_time, report_type, created_at)
|
||||
VALUES (?, 1, ?, ?, ?)
|
||||
ON CONFLICT(date) DO UPDATE SET
|
||||
pushed = 1,
|
||||
push_time = excluded.push_time,
|
||||
report_type = excluded.report_type
|
||||
""", (target_date, now_str, report_type, now_str))
|
||||
|
||||
conn.commit()
|
||||
|
||||
print(f"[本地存储] 推送记录已保存: {report_type} at {now_str}")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
print(f"[本地存储] 记录推送失败: {e}")
|
||||
return False
|
||||
|
||||
    def __del__(self):
        """Destructor: make sure cached database connections are closed."""
        # NOTE(review): __del__ may run during interpreter shutdown, where
        # cleanup()'s print calls can fail; Python suppresses exceptions
        # raised here (emitting a warning) — confirm that is acceptable.
        self.cleanup()
||||
@@ -0,0 +1,316 @@
|
||||
# coding=utf-8
|
||||
"""
|
||||
存储管理器 - 统一管理存储后端
|
||||
|
||||
根据环境和配置自动选择合适的存储后端
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
from trendradar.storage.base import StorageBackend, NewsData
|
||||
|
||||
|
||||
# 存储管理器单例
|
||||
_storage_manager: Optional["StorageManager"] = None
|
||||
|
||||
|
||||
class StorageManager:
    """
    Storage manager.

    Features:
    - Auto-detects the runtime environment (GitHub Actions / Docker / local)
    - Chooses the storage backend from configuration (local / remote / auto)
    - Exposes a single unified storage interface
    - Supports pulling recent data from remote storage to local
    """

    def __init__(
        self,
        backend_type: str = "auto",
        data_dir: str = "output",
        enable_txt: bool = True,
        enable_html: bool = True,
        remote_config: Optional[dict] = None,
        local_retention_days: int = 0,
        remote_retention_days: int = 0,
        pull_enabled: bool = False,
        pull_days: int = 0,
        timezone: str = "Asia/Shanghai",
    ):
        """
        Initialize the storage manager.

        Args:
            backend_type: Storage backend type (local / remote / auto)
            data_dir: Local data directory
            enable_txt: Whether TXT snapshots are enabled
            enable_html: Whether HTML reports are enabled
            remote_config: Remote storage settings (endpoint_url, bucket_name, access_key_id, ...)
            local_retention_days: Days to keep local data (0 = unlimited)
            remote_retention_days: Days to keep remote data (0 = unlimited)
            pull_enabled: Whether to auto-pull remote data on startup
            pull_days: Pull the most recent N days of data
            timezone: Timezone name (default Asia/Shanghai)
        """
        self.backend_type = backend_type
        self.data_dir = data_dir
        self.enable_txt = enable_txt
        self.enable_html = enable_html
        self.remote_config = remote_config or {}
        self.local_retention_days = local_retention_days
        self.remote_retention_days = remote_retention_days
        self.pull_enabled = pull_enabled
        self.pull_days = pull_days
        self.timezone = timezone

        # Backends are created lazily (see get_backend / pull_from_remote).
        self._backend: Optional[StorageBackend] = None
        self._remote_backend: Optional[StorageBackend] = None

    @staticmethod
    def is_github_actions() -> bool:
        """Return True when running inside GitHub Actions."""
        return os.environ.get("GITHUB_ACTIONS") == "true"

    @staticmethod
    def is_docker() -> bool:
        """Return True when running inside a Docker container."""
        # Method 1: check for the /.dockerenv marker file
        if os.path.exists("/.dockerenv"):
            return True

        # Method 2: check PID 1's cgroup (Linux)
        try:
            with open("/proc/1/cgroup", "r") as f:
                return "docker" in f.read()
        except (FileNotFoundError, PermissionError):
            pass

        # Method 3: check the environment variable
        return os.environ.get("DOCKER_CONTAINER") == "true"

    def _resolve_backend_type(self) -> str:
        """Resolve the backend type actually used ("auto" -> "local"/"remote")."""
        if self.backend_type == "auto":
            if self.is_github_actions():
                # GitHub Actions: use remote only when fully configured
                if self._has_remote_config():
                    return "remote"
                else:
                    print("[存储管理器] GitHub Actions 环境但未配置远程存储,使用本地存储")
                    return "local"
            else:
                return "local"
        return self.backend_type

    def _has_remote_config(self) -> bool:
        """Check whether a complete remote (S3-compatible) configuration exists."""
        # Explicit config takes precedence; environment variables are the fallback
        bucket_name = self.remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME")
        access_key = self.remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID")
        secret_key = self.remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY")
        endpoint = self.remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL")

        # Debug logging: report which pieces are missing
        has_config = bool(bucket_name and access_key and secret_key and endpoint)
        if not has_config:
            print(f"[存储管理器] 远程存储配置检查失败:")
            print(f"  - bucket_name: {'已配置' if bucket_name else '未配置'}")
            print(f"  - access_key_id: {'已配置' if access_key else '未配置'}")
            print(f"  - secret_access_key: {'已配置' if secret_key else '未配置'}")
            print(f"  - endpoint_url: {'已配置' if endpoint else '未配置'}")

        return has_config

    def _create_remote_backend(self) -> Optional[StorageBackend]:
        """Create the remote storage backend; return None when unavailable."""
        try:
            # Imported lazily because it requires the optional boto3 dependency
            from trendradar.storage.remote import RemoteStorageBackend

            return RemoteStorageBackend(
                bucket_name=self.remote_config.get("bucket_name") or os.environ.get("S3_BUCKET_NAME", ""),
                access_key_id=self.remote_config.get("access_key_id") or os.environ.get("S3_ACCESS_KEY_ID", ""),
                secret_access_key=self.remote_config.get("secret_access_key") or os.environ.get("S3_SECRET_ACCESS_KEY", ""),
                endpoint_url=self.remote_config.get("endpoint_url") or os.environ.get("S3_ENDPOINT_URL", ""),
                region=self.remote_config.get("region") or os.environ.get("S3_REGION", ""),
                enable_txt=self.enable_txt,
                enable_html=self.enable_html,
                timezone=self.timezone,
            )
        except ImportError as e:
            print(f"[存储管理器] 远程后端导入失败: {e}")
            print("[存储管理器] 请确保已安装 boto3: pip install boto3")
            return None
        except Exception as e:
            print(f"[存储管理器] 远程后端初始化失败: {e}")
            return None

    def get_backend(self) -> StorageBackend:
        """Return the active backend, creating it on first use.

        When the resolved type is "remote" but creation fails, silently
        falls back to the local backend.
        """
        if self._backend is None:
            resolved_type = self._resolve_backend_type()

            if resolved_type == "remote":
                self._backend = self._create_remote_backend()
                if self._backend:
                    print(f"[存储管理器] 使用远程存储后端")
                else:
                    print("[存储管理器] 回退到本地存储")
                    resolved_type = "local"

            if resolved_type == "local" or self._backend is None:
                from trendradar.storage.local import LocalStorageBackend

                self._backend = LocalStorageBackend(
                    data_dir=self.data_dir,
                    enable_txt=self.enable_txt,
                    enable_html=self.enable_html,
                    timezone=self.timezone,
                )
                print(f"[存储管理器] 使用本地存储后端 (数据目录: {self.data_dir})")

        return self._backend

    def pull_from_remote(self) -> int:
        """
        Pull data from remote storage into the local data directory.

        Returns:
            Number of files pulled successfully (0 when pulling is disabled,
            unconfigured, or the remote backend cannot be created).
        """
        if not self.pull_enabled or self.pull_days <= 0:
            return 0

        if not self._has_remote_config():
            print("[存储管理器] 未配置远程存储,无法拉取")
            return 0

        # Create the remote backend on demand (kept separate from self._backend)
        if self._remote_backend is None:
            self._remote_backend = self._create_remote_backend()

        if self._remote_backend is None:
            print("[存储管理器] 无法创建远程后端,拉取失败")
            return 0

        # Delegate to the remote backend's pull implementation
        return self._remote_backend.pull_recent_days(self.pull_days, self.data_dir)

    def save_news_data(self, data: NewsData) -> bool:
        """Save one crawl's news data via the active backend."""
        return self.get_backend().save_news_data(data)

    def get_today_all_data(self, date: Optional[str] = None) -> Optional[NewsData]:
        """Fetch all of the day's data via the active backend."""
        return self.get_backend().get_today_all_data(date)

    def get_latest_crawl_data(self, date: Optional[str] = None) -> Optional[NewsData]:
        """Fetch the latest crawl's data via the active backend."""
        return self.get_backend().get_latest_crawl_data(date)

    def detect_new_titles(self, current_data: NewsData) -> dict:
        """Detect newly appeared titles via the active backend."""
        return self.get_backend().detect_new_titles(current_data)

    def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
        """Save a TXT snapshot via the active backend."""
        return self.get_backend().save_txt_snapshot(data)

    def save_html_report(self, html_content: str, filename: str, is_summary: bool = False) -> Optional[str]:
        """Save an HTML report via the active backend."""
        return self.get_backend().save_html_report(html_content, filename, is_summary)

    def is_first_crawl_today(self, date: Optional[str] = None) -> bool:
        """Report whether this is the first crawl of the day."""
        return self.get_backend().is_first_crawl_today(date)

    def cleanup(self) -> None:
        """Release resources held by any backend that was created."""
        if self._backend:
            self._backend.cleanup()
        if self._remote_backend:
            self._remote_backend.cleanup()

    def cleanup_old_data(self) -> int:
        """
        Delete expired data locally and (when configured) remotely.

        Returns:
            Total number of date directories deleted.
        """
        total_deleted = 0

        # Clean up local data
        if self.local_retention_days > 0:
            total_deleted += self.get_backend().cleanup_old_data(self.local_retention_days)

        # Clean up remote data (only when remote storage is configured)
        if self.remote_retention_days > 0 and self._has_remote_config():
            if self._remote_backend is None:
                self._remote_backend = self._create_remote_backend()
            if self._remote_backend:
                total_deleted += self._remote_backend.cleanup_old_data(self.remote_retention_days)

        return total_deleted

    @property
    def backend_name(self) -> str:
        """Name of the currently active backend."""
        return self.get_backend().backend_name

    @property
    def supports_txt(self) -> bool:
        """Whether the active backend supports TXT snapshots."""
        return self.get_backend().supports_txt
||||
|
||||
|
||||
def get_storage_manager(
    backend_type: str = "auto",
    data_dir: str = "output",
    enable_txt: bool = True,
    enable_html: bool = True,
    remote_config: Optional[dict] = None,
    local_retention_days: int = 0,
    remote_retention_days: int = 0,
    pull_enabled: bool = False,
    pull_days: int = 0,
    timezone: str = "Asia/Shanghai",
    force_new: bool = False,
) -> StorageManager:
    """Return the module-level StorageManager singleton.

    A new instance is built on first call or when ``force_new`` is True;
    otherwise the cached instance is returned and the other arguments are
    ignored.

    Args:
        backend_type: Storage backend type (local / remote / auto)
        data_dir: Local data directory
        enable_txt: Whether TXT snapshots are enabled
        enable_html: Whether HTML reports are enabled
        remote_config: Remote storage settings
        local_retention_days: Days to keep local data (0 = unlimited)
        remote_retention_days: Days to keep remote data (0 = unlimited)
        pull_enabled: Whether to auto-pull remote data on startup
        pull_days: Pull the most recent N days of data
        timezone: Timezone name (default Asia/Shanghai)
        force_new: Force creation of a fresh instance

    Returns:
        The shared StorageManager instance.
    """
    global _storage_manager

    if force_new or _storage_manager is None:
        settings = dict(
            backend_type=backend_type,
            data_dir=data_dir,
            enable_txt=enable_txt,
            enable_html=enable_html,
            remote_config=remote_config,
            local_retention_days=local_retention_days,
            remote_retention_days=remote_retention_days,
            pull_enabled=pull_enabled,
            pull_days=pull_days,
            timezone=timezone,
        )
        _storage_manager = StorageManager(**settings)

    return _storage_manager
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,117 @@
|
||||
-- TrendRadar database schema

-- ============================================
-- Platforms table
-- Key invariant: id never changes; name may change
-- ============================================
CREATE TABLE IF NOT EXISTS platforms (
    id TEXT PRIMARY KEY,
    name TEXT NOT NULL,
    is_active INTEGER DEFAULT 1,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- ============================================
-- News items table
-- Deduplicated storage keyed by URL + platform_id
-- ============================================
CREATE TABLE IF NOT EXISTS news_items (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    title TEXT NOT NULL,
    platform_id TEXT NOT NULL,
    rank INTEGER NOT NULL,
    url TEXT DEFAULT '',
    mobile_url TEXT DEFAULT '',
    first_crawl_time TEXT NOT NULL, -- time the item was first crawled
    last_crawl_time TEXT NOT NULL, -- time the item was last crawled
    crawl_count INTEGER DEFAULT 1, -- number of crawls that saw the item
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (platform_id) REFERENCES platforms(id)
);

-- ============================================
-- Title change history table
-- Records title changes for the same URL
-- ============================================
CREATE TABLE IF NOT EXISTS title_changes (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    news_item_id INTEGER NOT NULL,
    old_title TEXT NOT NULL,
    new_title TEXT NOT NULL,
    changed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (news_item_id) REFERENCES news_items(id)
);

-- ============================================
-- Rank history table
-- Records the rank observed at each crawl
-- ============================================
CREATE TABLE IF NOT EXISTS rank_history (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    news_item_id INTEGER NOT NULL,
    rank INTEGER NOT NULL,
    crawl_time TEXT NOT NULL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (news_item_id) REFERENCES news_items(id)
);

-- ============================================
-- Crawl records table
-- One row per crawl: its time and item count
-- ============================================
CREATE TABLE IF NOT EXISTS crawl_records (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    crawl_time TEXT NOT NULL UNIQUE,
    total_items INTEGER DEFAULT 0,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- ============================================
-- Crawl source status table
-- Per-crawl success/failure status of each platform
-- ============================================
CREATE TABLE IF NOT EXISTS crawl_source_status (
    crawl_record_id INTEGER NOT NULL,
    platform_id TEXT NOT NULL,
    status TEXT NOT NULL CHECK(status IN ('success', 'failed')),
    PRIMARY KEY (crawl_record_id, platform_id),
    FOREIGN KEY (crawl_record_id) REFERENCES crawl_records(id),
    FOREIGN KEY (platform_id) REFERENCES platforms(id)
);

-- ============================================
-- Push records table
-- Backs the push_window "once_per_day" feature
-- ============================================
CREATE TABLE IF NOT EXISTS push_records (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    date TEXT NOT NULL UNIQUE,
    pushed INTEGER DEFAULT 0,
    push_time TEXT,
    report_type TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- ============================================
-- Index definitions
-- ============================================

-- Platform index
CREATE INDEX IF NOT EXISTS idx_news_platform ON news_items(platform_id);

-- Time index (for querying the latest data)
CREATE INDEX IF NOT EXISTS idx_news_crawl_time ON news_items(last_crawl_time);

-- Title index (for title search)
CREATE INDEX IF NOT EXISTS idx_news_title ON news_items(title);

-- Unique URL + platform_id index (non-empty URLs only; enforces dedup)
CREATE UNIQUE INDEX IF NOT EXISTS idx_news_url_platform
ON news_items(url, platform_id) WHERE url != '';

-- Crawl status index
CREATE INDEX IF NOT EXISTS idx_crawl_status_record ON crawl_source_status(crawl_record_id);

-- Rank history index
CREATE INDEX IF NOT EXISTS idx_rank_history_news ON rank_history(news_item_id);
||||
Reference in New Issue
Block a user