v4.0.0 大大大更新

This commit is contained in:
sansan
2025-12-13 13:44:35 +08:00
parent 97c05aa33c
commit c7bacdfff7
61 changed files with 12407 additions and 5889 deletions
+51 -32
View File
@@ -517,24 +517,55 @@ class DataService:
# 遍历日期文件夹
for date_folder in output_dir.iterdir():
if date_folder.is_dir() and not date_folder.name.startswith('.'):
# 解析日期(格式: YYYY年MM月DD日)
try:
date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_folder.name)
if date_match:
folder_date = datetime(
int(date_match.group(1)),
int(date_match.group(2)),
int(date_match.group(3))
)
available_dates.append(folder_date)
except Exception:
pass
folder_date = self._parse_date_folder_name(date_folder.name)
if folder_date:
available_dates.append(folder_date)
if not available_dates:
return (None, None)
return (min(available_dates), max(available_dates))
def _parse_date_folder_name(self, folder_name: str) -> Optional[datetime]:
"""
解析日期文件夹名称(兼容中文和ISO格式)
支持两种格式:
- 中文格式:YYYY年MM月DD日
- ISO格式:YYYY-MM-DD
Args:
folder_name: 文件夹名称
Returns:
datetime 对象,解析失败返回 None
"""
# 尝试中文格式:YYYY年MM月DD日
chinese_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', folder_name)
if chinese_match:
try:
return datetime(
int(chinese_match.group(1)),
int(chinese_match.group(2)),
int(chinese_match.group(3))
)
except ValueError:
pass
# 尝试 ISO 格式:YYYY-MM-DD
iso_match = re.match(r'(\d{4})-(\d{2})-(\d{2})', folder_name)
if iso_match:
try:
return datetime(
int(iso_match.group(1)),
int(iso_match.group(2)),
int(iso_match.group(3))
)
except ValueError:
pass
return None
def get_system_status(self) -> Dict:
"""
获取系统运行状态
@@ -553,26 +584,14 @@ class DataService:
if output_dir.exists():
# 遍历日期文件夹
for date_folder in output_dir.iterdir():
if date_folder.is_dir():
# 解析日期
try:
date_str = date_folder.name
# 格式: YYYY年MM月DD日
date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_str)
if date_match:
folder_date = datetime(
int(date_match.group(1)),
int(date_match.group(2)),
int(date_match.group(3))
)
if oldest_record is None or folder_date < oldest_record:
oldest_record = folder_date
if latest_record is None or folder_date > latest_record:
latest_record = folder_date
except:
pass
if date_folder.is_dir() and not date_folder.name.startswith('.'):
# 解析日期(兼容中文和ISO格式)
folder_date = self._parse_date_folder_name(date_folder.name)
if folder_date:
if oldest_record is None or folder_date < oldest_record:
oldest_record = folder_date
if latest_record is None or folder_date > latest_record:
latest_record = folder_date
# 计算存储大小
for item in date_folder.rglob("*"):
+315 -67
View File
@@ -2,9 +2,12 @@
文件解析服务
提供txt格式新闻数据和YAML配置文件的解析功能。
支持从 SQLite 数据库和 TXT 文件两种数据源读取。
"""
import json
import re
import sqlite3
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from datetime import datetime
@@ -145,17 +148,310 @@ class ParserService:
def get_date_folder_name(self, date: datetime = None) -> str:
    """
    Get the date folder name (supports both Chinese and ISO formats).

    Args:
        date: date object, defaults to today

    Returns:
        The folder name that actually exists on disk, preferring the
        Chinese format (YYYY年MM月DD日) and falling back to the ISO
        format (YYYY-MM-DD).
    """
    if date is None:
        date = datetime.now()
    # Delegate folder resolution so lookup logic lives in one place.
    return self._find_date_folder(date)
def _get_date_folder_name(self, date: datetime = None) -> str:
"""
获取日期文件夹名称(兼容中文和ISO格式)
Args:
date: 日期对象,默认为今天
Returns:
实际存在的文件夹名称,优先返回中文格式(YYYY年MM月DD日),
若不存在则返回 ISO 格式(YYYY-MM-DD
"""
if date is None:
date = datetime.now()
return self._find_date_folder(date)
def _find_date_folder(self, date: datetime) -> str:
"""
查找实际存在的日期文件夹
支持两种格式:
- 中文格式:YYYY年MM月DD日(优先)
- ISO格式:YYYY-MM-DD
Args:
date: 日期对象
Returns:
实际存在的文件夹名称,若都不存在则返回中文格式
"""
output_dir = self.project_root / "output"
# 中文格式:YYYY年MM月DD日
chinese_format = date.strftime("%Y年%m月%d")
# ISO格式:YYYY-MM-DD
iso_format = date.strftime("%Y-%m-%d")
# 优先检查中文格式
if (output_dir / chinese_format).exists():
return chinese_format
# 其次检查 ISO 格式
if (output_dir / iso_format).exists():
return iso_format
# 都不存在,返回中文格式(与项目现有风格一致)
return chinese_format
def _get_sqlite_db_path(self, date: datetime = None) -> Optional[Path]:
"""
获取 SQLite 数据库文件路径
Args:
date: 日期对象,默认为今天
Returns:
数据库文件路径,如果不存在则返回 None
"""
date_folder = self._get_date_folder_name(date)
db_path = self.project_root / "output" / date_folder / "news.db"
if db_path.exists():
return db_path
return None
def _get_txt_folder_path(self, date: datetime = None) -> Optional[Path]:
"""
获取 TXT 文件夹路径
Args:
date: 日期对象,默认为今天
Returns:
TXT 文件夹路径,如果不存在则返回 None
"""
date_folder = self._get_date_folder_name(date)
txt_path = self.project_root / "output" / date_folder / "txt"
if txt_path.exists() and txt_path.is_dir():
return txt_path
return None
def _read_from_txt(
    self,
    date: datetime = None,
    platform_ids: Optional[List[str]] = None
) -> Optional[Tuple[Dict, Dict, Dict]]:
    """
    Read news data from the per-day TXT folder.

    Args:
        date: date object, defaults to today
        platform_ids: platform IDs to include; None means all platforms

    Returns:
        (all_titles, id_to_name, all_timestamps) tuple, or None when the
        folder is missing, holds no .txt files, or yields no titles.
    """
    txt_folder = self._get_txt_folder_path(date)
    if txt_folder is None:
        return None
    # Collect all TXT files sorted by name — presumably the names encode
    # crawl time so sort order equals chronological order; TODO confirm
    # against the crawler's file-naming convention.
    txt_files = sorted(txt_folder.glob("*.txt"))
    if not txt_files:
        return None
    all_titles = {}       # platform_id -> {title -> aggregated info}
    id_to_name = {}       # platform_id -> display name
    all_timestamps = {}   # file name -> file mtime
    for txt_file in txt_files:
        try:
            titles_by_id, file_id_to_name = self.parse_txt_file(txt_file)
            # Record the file's modification time
            all_timestamps[txt_file.name] = txt_file.stat().st_mtime
            # Merge the platform-id -> name mapping
            id_to_name.update(file_id_to_name)
            # Merge title data across files
            for source_id, titles in titles_by_id.items():
                # Apply the optional platform filter
                if platform_ids and source_id not in platform_ids:
                    continue
                if source_id not in all_titles:
                    all_titles[source_id] = {}
                for title, data in titles.items():
                    if title not in all_titles[source_id]:
                        # First sighting of this title
                        all_titles[source_id][title] = {
                            "ranks": data.get("ranks", []),
                            "url": data.get("url", ""),
                            "mobileUrl": data.get("mobileUrl", ""),
                            "first_time": txt_file.stem,  # file stem doubles as the time label
                            "last_time": txt_file.stem,
                            "count": 1,
                        }
                    else:
                        # Title already seen in an earlier file: merge into it
                        existing = all_titles[source_id][title]
                        # Append ranks not seen before (order-preserving de-dup)
                        for rank in data.get("ranks", []):
                            if rank not in existing["ranks"]:
                                existing["ranks"].append(rank)
                        # Bump last-seen time and occurrence count
                        existing["last_time"] = txt_file.stem
                        existing["count"] += 1
                        # Keep the first non-empty URL variants
                        if not existing["url"] and data.get("url"):
                            existing["url"] = data["url"]
                        if not existing["mobileUrl"] and data.get("mobileUrl"):
                            existing["mobileUrl"] = data["mobileUrl"]
        except Exception as e:
            # Best-effort read: one bad file must not abort the whole day
            print(f"Warning: 解析 TXT 文件失败 {txt_file}: {e}")
            continue
    if not all_titles:
        return None
    return (all_titles, id_to_name, all_timestamps)
def _read_from_sqlite(
self,
date: datetime = None,
platform_ids: Optional[List[str]] = None
) -> Optional[Tuple[Dict, Dict, Dict]]:
"""
从 SQLite 数据库读取新闻数据
新表结构数据已按 URL 去重,包含:
- first_crawl_time: 首次抓取时间
- last_crawl_time: 最后抓取时间
- crawl_count: 抓取次数
Args:
date: 日期对象,默认为今天
platform_ids: 平台ID列表,None表示所有平台
Returns:
(all_titles, id_to_name, all_timestamps) 元组,如果数据库不存在返回 None
"""
db_path = self._get_sqlite_db_path(date)
if db_path is None:
return None
all_titles = {}
id_to_name = {}
all_timestamps = {}
try:
conn = sqlite3.connect(str(db_path))
conn.row_factory = sqlite3.Row
cursor = conn.cursor()
# 检查表是否存在
cursor.execute("""
SELECT name FROM sqlite_master
WHERE type='table' AND name='news_items'
""")
if not cursor.fetchone():
conn.close()
return None
# 构建查询
if platform_ids:
placeholders = ','.join(['?' for _ in platform_ids])
query = f"""
SELECT n.id, n.platform_id, p.name as platform_name, n.title,
n.rank, n.url, n.mobile_url,
n.first_crawl_time, n.last_crawl_time, n.crawl_count
FROM news_items n
LEFT JOIN platforms p ON n.platform_id = p.id
WHERE n.platform_id IN ({placeholders})
"""
cursor.execute(query, platform_ids)
else:
cursor.execute("""
SELECT n.id, n.platform_id, p.name as platform_name, n.title,
n.rank, n.url, n.mobile_url,
n.first_crawl_time, n.last_crawl_time, n.crawl_count
FROM news_items n
LEFT JOIN platforms p ON n.platform_id = p.id
""")
rows = cursor.fetchall()
# 收集所有 news_item_id 用于查询历史排名
news_ids = [row['id'] for row in rows]
rank_history_map = {}
if news_ids:
placeholders = ",".join("?" * len(news_ids))
cursor.execute(f"""
SELECT news_item_id, rank FROM rank_history
WHERE news_item_id IN ({placeholders})
ORDER BY news_item_id, crawl_time
""", news_ids)
for rh_row in cursor.fetchall():
news_id = rh_row['news_item_id']
rank = rh_row['rank']
if news_id not in rank_history_map:
rank_history_map[news_id] = []
rank_history_map[news_id].append(rank)
for row in rows:
news_id = row['id']
platform_id = row['platform_id']
platform_name = row['platform_name'] or platform_id
title = row['title']
# 更新 id_to_name
if platform_id not in id_to_name:
id_to_name[platform_id] = platform_name
# 初始化平台字典
if platform_id not in all_titles:
all_titles[platform_id] = {}
# 获取排名历史,如果为空则使用当前排名
ranks = rank_history_map.get(news_id, [row['rank']])
# 直接使用数据(已去重)
all_titles[platform_id][title] = {
"ranks": ranks,
"url": row['url'] or "",
"mobileUrl": row['mobile_url'] or "",
"first_time": row['first_crawl_time'] or "",
"last_time": row['last_crawl_time'] or "",
"count": row['crawl_count'] or 1,
}
# 获取抓取时间作为 timestamps
cursor.execute("""
SELECT crawl_time FROM crawl_records
ORDER BY crawl_time
""")
for row in cursor.fetchall():
crawl_time = row['crawl_time']
all_timestamps[f"{crawl_time}.db"] = 0 # 用虚拟时间戳
conn.close()
if not all_titles:
return None
return (all_titles, id_to_name, all_timestamps)
except Exception as e:
print(f"Warning: 从 SQLite 读取数据失败: {e}")
return None
def read_all_titles_for_date(
self,
@@ -163,7 +459,7 @@ class ParserService:
platform_ids: Optional[List[str]] = None
) -> Tuple[Dict, Dict, Dict]:
"""
读取指定日期的所有标题文件(带缓存)
读取指定日期的所有标题(带缓存)
Args:
date: 日期对象,默认为今天
@@ -193,71 +489,23 @@ class ParserService:
if cached:
return cached
# 缓存未命中,读取文件
date_folder = self.get_date_folder_name(date)
txt_dir = self.project_root / "output" / date_folder / "txt"
# 优先从 SQLite 读取
sqlite_result = self._read_from_sqlite(date, platform_ids)
if sqlite_result:
self.cache.set(cache_key, sqlite_result)
return sqlite_result
if not txt_dir.exists():
raise DataNotFoundError(
f"未找到 {date_folder} 的数据目录",
suggestion="请先运行爬虫或检查日期是否正确"
)
# SQLite 不存在,尝试从 TXT 读取
txt_result = self._read_from_txt(date, platform_ids)
if txt_result:
self.cache.set(cache_key, txt_result)
return txt_result
all_titles = {}
id_to_name = {}
all_timestamps = {}
# 读取所有txt文件
txt_files = sorted(txt_dir.glob("*.txt"))
if not txt_files:
raise DataNotFoundError(
f"{date_folder} 没有数据文件",
suggestion="请等待爬虫任务完成"
)
for txt_file in txt_files:
try:
titles_by_id, file_id_to_name = self.parse_txt_file(txt_file)
# 更新id_to_name
id_to_name.update(file_id_to_name)
# 合并标题数据
for platform_id, titles in titles_by_id.items():
# 如果指定了平台过滤
if platform_ids and platform_id not in platform_ids:
continue
if platform_id not in all_titles:
all_titles[platform_id] = {}
for title, info in titles.items():
if title in all_titles[platform_id]:
# 合并排名
all_titles[platform_id][title]["ranks"].extend(info["ranks"])
else:
all_titles[platform_id][title] = info.copy()
# 记录文件时间戳
all_timestamps[txt_file.name] = txt_file.stat().st_mtime
except Exception as e:
# 忽略单个文件的解析错误,继续处理其他文件
print(f"Warning: 解析文件 {txt_file} 失败: {e}")
continue
if not all_titles:
raise DataNotFoundError(
f"{date_folder} 没有有效的数据",
suggestion="请检查数据文件格式或重新运行爬虫"
)
# 缓存结果
result = (all_titles, id_to_name, all_timestamps)
self.cache.set(cache_key, result)
return result
# 两种数据源都不存在
raise DataNotFoundError(
f"未找到 {date_str} 的数据",
suggestion="请先运行爬虫或检查日期是否正确"
)
def parse_yaml_config(self, config_path: str = None) -> dict:
"""