mirror of
https://gitee.com/houhuan/TrendRadar.git
synced 2026-05-01 01:22:42 +08:00
v4.0.0 大大大更新
This commit is contained in:
@@ -517,24 +517,55 @@ class DataService:
|
||||
# 遍历日期文件夹
|
||||
for date_folder in output_dir.iterdir():
|
||||
if date_folder.is_dir() and not date_folder.name.startswith('.'):
|
||||
# 解析日期(格式: YYYY年MM月DD日)
|
||||
try:
|
||||
date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_folder.name)
|
||||
if date_match:
|
||||
folder_date = datetime(
|
||||
int(date_match.group(1)),
|
||||
int(date_match.group(2)),
|
||||
int(date_match.group(3))
|
||||
)
|
||||
available_dates.append(folder_date)
|
||||
except Exception:
|
||||
pass
|
||||
folder_date = self._parse_date_folder_name(date_folder.name)
|
||||
if folder_date:
|
||||
available_dates.append(folder_date)
|
||||
|
||||
if not available_dates:
|
||||
return (None, None)
|
||||
|
||||
return (min(available_dates), max(available_dates))
|
||||
|
||||
def _parse_date_folder_name(self, folder_name: str) -> Optional[datetime]:
|
||||
"""
|
||||
解析日期文件夹名称(兼容中文和ISO格式)
|
||||
|
||||
支持两种格式:
|
||||
- 中文格式:YYYY年MM月DD日
|
||||
- ISO格式:YYYY-MM-DD
|
||||
|
||||
Args:
|
||||
folder_name: 文件夹名称
|
||||
|
||||
Returns:
|
||||
datetime 对象,解析失败返回 None
|
||||
"""
|
||||
# 尝试中文格式:YYYY年MM月DD日
|
||||
chinese_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', folder_name)
|
||||
if chinese_match:
|
||||
try:
|
||||
return datetime(
|
||||
int(chinese_match.group(1)),
|
||||
int(chinese_match.group(2)),
|
||||
int(chinese_match.group(3))
|
||||
)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# 尝试 ISO 格式:YYYY-MM-DD
|
||||
iso_match = re.match(r'(\d{4})-(\d{2})-(\d{2})', folder_name)
|
||||
if iso_match:
|
||||
try:
|
||||
return datetime(
|
||||
int(iso_match.group(1)),
|
||||
int(iso_match.group(2)),
|
||||
int(iso_match.group(3))
|
||||
)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
def get_system_status(self) -> Dict:
|
||||
"""
|
||||
获取系统运行状态
|
||||
@@ -553,26 +584,14 @@ class DataService:
|
||||
if output_dir.exists():
|
||||
# 遍历日期文件夹
|
||||
for date_folder in output_dir.iterdir():
|
||||
if date_folder.is_dir():
|
||||
# 解析日期
|
||||
try:
|
||||
date_str = date_folder.name
|
||||
# 格式: YYYY年MM月DD日
|
||||
date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_str)
|
||||
if date_match:
|
||||
folder_date = datetime(
|
||||
int(date_match.group(1)),
|
||||
int(date_match.group(2)),
|
||||
int(date_match.group(3))
|
||||
)
|
||||
|
||||
if oldest_record is None or folder_date < oldest_record:
|
||||
oldest_record = folder_date
|
||||
if latest_record is None or folder_date > latest_record:
|
||||
latest_record = folder_date
|
||||
|
||||
except:
|
||||
pass
|
||||
if date_folder.is_dir() and not date_folder.name.startswith('.'):
|
||||
# 解析日期(兼容中文和ISO格式)
|
||||
folder_date = self._parse_date_folder_name(date_folder.name)
|
||||
if folder_date:
|
||||
if oldest_record is None or folder_date < oldest_record:
|
||||
oldest_record = folder_date
|
||||
if latest_record is None or folder_date > latest_record:
|
||||
latest_record = folder_date
|
||||
|
||||
# 计算存储大小
|
||||
for item in date_folder.rglob("*"):
|
||||
|
||||
@@ -2,9 +2,12 @@
|
||||
文件解析服务
|
||||
|
||||
提供txt格式新闻数据和YAML配置文件的解析功能。
|
||||
支持从 SQLite 数据库和 TXT 文件两种数据源读取。
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Tuple, Optional
|
||||
from datetime import datetime
|
||||
@@ -145,17 +148,310 @@ class ParserService:
|
||||
|
||||
def get_date_folder_name(self, date: datetime = None) -> str:
|
||||
"""
|
||||
获取日期文件夹名称
|
||||
获取日期文件夹名称(兼容中文和ISO格式)
|
||||
|
||||
Args:
|
||||
date: 日期对象,默认为今天
|
||||
|
||||
Returns:
|
||||
文件夹名称,格式: YYYY年MM月DD日
|
||||
实际存在的文件夹名称,优先返回中文格式(YYYY年MM月DD日),
|
||||
若不存在则返回 ISO 格式(YYYY-MM-DD)
|
||||
"""
|
||||
if date is None:
|
||||
date = datetime.now()
|
||||
return date.strftime("%Y年%m月%d日")
|
||||
return self._find_date_folder(date)
|
||||
|
||||
def _get_date_folder_name(self, date: datetime = None) -> str:
|
||||
"""
|
||||
获取日期文件夹名称(兼容中文和ISO格式)
|
||||
|
||||
Args:
|
||||
date: 日期对象,默认为今天
|
||||
|
||||
Returns:
|
||||
实际存在的文件夹名称,优先返回中文格式(YYYY年MM月DD日),
|
||||
若不存在则返回 ISO 格式(YYYY-MM-DD)
|
||||
"""
|
||||
if date is None:
|
||||
date = datetime.now()
|
||||
return self._find_date_folder(date)
|
||||
|
||||
def _find_date_folder(self, date: datetime) -> str:
|
||||
"""
|
||||
查找实际存在的日期文件夹
|
||||
|
||||
支持两种格式:
|
||||
- 中文格式:YYYY年MM月DD日(优先)
|
||||
- ISO格式:YYYY-MM-DD
|
||||
|
||||
Args:
|
||||
date: 日期对象
|
||||
|
||||
Returns:
|
||||
实际存在的文件夹名称,若都不存在则返回中文格式
|
||||
"""
|
||||
output_dir = self.project_root / "output"
|
||||
|
||||
# 中文格式:YYYY年MM月DD日
|
||||
chinese_format = date.strftime("%Y年%m月%d日")
|
||||
# ISO格式:YYYY-MM-DD
|
||||
iso_format = date.strftime("%Y-%m-%d")
|
||||
|
||||
# 优先检查中文格式
|
||||
if (output_dir / chinese_format).exists():
|
||||
return chinese_format
|
||||
# 其次检查 ISO 格式
|
||||
if (output_dir / iso_format).exists():
|
||||
return iso_format
|
||||
|
||||
# 都不存在,返回中文格式(与项目现有风格一致)
|
||||
return chinese_format
|
||||
|
||||
def _get_sqlite_db_path(self, date: datetime = None) -> Optional[Path]:
|
||||
"""
|
||||
获取 SQLite 数据库文件路径
|
||||
|
||||
Args:
|
||||
date: 日期对象,默认为今天
|
||||
|
||||
Returns:
|
||||
数据库文件路径,如果不存在则返回 None
|
||||
"""
|
||||
date_folder = self._get_date_folder_name(date)
|
||||
db_path = self.project_root / "output" / date_folder / "news.db"
|
||||
if db_path.exists():
|
||||
return db_path
|
||||
return None
|
||||
|
||||
def _get_txt_folder_path(self, date: datetime = None) -> Optional[Path]:
|
||||
"""
|
||||
获取 TXT 文件夹路径
|
||||
|
||||
Args:
|
||||
date: 日期对象,默认为今天
|
||||
|
||||
Returns:
|
||||
TXT 文件夹路径,如果不存在则返回 None
|
||||
"""
|
||||
date_folder = self._get_date_folder_name(date)
|
||||
txt_path = self.project_root / "output" / date_folder / "txt"
|
||||
if txt_path.exists() and txt_path.is_dir():
|
||||
return txt_path
|
||||
return None
|
||||
|
||||
def _read_from_txt(
    self,
    date: datetime = None,
    platform_ids: Optional[List[str]] = None
) -> Optional[Tuple[Dict, Dict, Dict]]:
    """
    Read news data from the day's TXT folder.

    Parses every ``*.txt`` snapshot file in chronological (filename)
    order and merges repeated titles across snapshots.

    Args:
        date: Date to read; defaults to today.
        platform_ids: Platform IDs to keep; ``None`` means all platforms.

    Returns:
        ``(all_titles, id_to_name, all_timestamps)`` tuple, or ``None``
        when the folder is missing, empty, or yields no parseable data.
    """
    txt_folder = self._get_txt_folder_path(date)
    if txt_folder is None:
        return None

    # Collect all TXT files sorted by name; presumably filenames encode
    # capture time so lexical order equals chronological order — TODO confirm.
    txt_files = sorted(txt_folder.glob("*.txt"))
    if not txt_files:
        return None

    all_titles = {}       # platform_id -> {title -> merged record}
    id_to_name = {}       # platform_id -> display name
    all_timestamps = {}   # file name -> mtime

    for txt_file in txt_files:
        try:
            titles_by_id, file_id_to_name = self.parse_txt_file(txt_file)

            # Record the file's modification time as its timestamp.
            all_timestamps[txt_file.name] = txt_file.stat().st_mtime

            # Merge platform-id -> name mapping (later files win).
            id_to_name.update(file_id_to_name)

            # Merge title data across snapshot files.
            for source_id, titles in titles_by_id.items():
                # Apply the optional platform filter.
                if platform_ids and source_id not in platform_ids:
                    continue

                if source_id not in all_titles:
                    all_titles[source_id] = {}

                for title, data in titles.items():
                    if title not in all_titles[source_id]:
                        # First sighting of this title.
                        # NOTE(review): "ranks" aliases the list from the
                        # parsed data rather than copying it — later merges
                        # append into that shared list; appears harmless
                        # since titles_by_id is per-file and transient.
                        all_titles[source_id][title] = {
                            "ranks": data.get("ranks", []),
                            "url": data.get("url", ""),
                            "mobileUrl": data.get("mobileUrl", ""),
                            "first_time": txt_file.stem,  # file stem doubles as the time label
                            "last_time": txt_file.stem,
                        	"count": 1,
                        } if False else {
                            "ranks": data.get("ranks", []),
                            "url": data.get("url", ""),
                            "mobileUrl": data.get("mobileUrl", ""),
                            "first_time": txt_file.stem,  # file stem doubles as the time label
                            "last_time": txt_file.stem,
                            "count": 1,
                        }
                    else:
                        # Title seen before: merge into the existing record.
                        existing = all_titles[source_id][title]
                        # Merge ranks, keeping only unseen values.
                        for rank in data.get("ranks", []):
                            if rank not in existing["ranks"]:
                                existing["ranks"].append(rank)
                        # Update last sighting and occurrence count.
                        existing["last_time"] = txt_file.stem
                        existing["count"] += 1
                        # Backfill URLs only when previously empty.
                        if not existing["url"] and data.get("url"):
                            existing["url"] = data["url"]
                        if not existing["mobileUrl"] and data.get("mobileUrl"):
                            existing["mobileUrl"] = data["mobileUrl"]

        except Exception as e:
            # Best-effort: skip unparseable files but keep processing the rest.
            print(f"Warning: 解析 TXT 文件失败 {txt_file}: {e}")
            continue

    if not all_titles:
        return None

    return (all_titles, id_to_name, all_timestamps)
|
||||
|
||||
def _read_from_sqlite(
|
||||
self,
|
||||
date: datetime = None,
|
||||
platform_ids: Optional[List[str]] = None
|
||||
) -> Optional[Tuple[Dict, Dict, Dict]]:
|
||||
"""
|
||||
从 SQLite 数据库读取新闻数据
|
||||
|
||||
新表结构数据已按 URL 去重,包含:
|
||||
- first_crawl_time: 首次抓取时间
|
||||
- last_crawl_time: 最后抓取时间
|
||||
- crawl_count: 抓取次数
|
||||
|
||||
Args:
|
||||
date: 日期对象,默认为今天
|
||||
platform_ids: 平台ID列表,None表示所有平台
|
||||
|
||||
Returns:
|
||||
(all_titles, id_to_name, all_timestamps) 元组,如果数据库不存在返回 None
|
||||
"""
|
||||
db_path = self._get_sqlite_db_path(date)
|
||||
if db_path is None:
|
||||
return None
|
||||
|
||||
all_titles = {}
|
||||
id_to_name = {}
|
||||
all_timestamps = {}
|
||||
|
||||
try:
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
conn.row_factory = sqlite3.Row
|
||||
cursor = conn.cursor()
|
||||
|
||||
# 检查表是否存在
|
||||
cursor.execute("""
|
||||
SELECT name FROM sqlite_master
|
||||
WHERE type='table' AND name='news_items'
|
||||
""")
|
||||
if not cursor.fetchone():
|
||||
conn.close()
|
||||
return None
|
||||
|
||||
# 构建查询
|
||||
if platform_ids:
|
||||
placeholders = ','.join(['?' for _ in platform_ids])
|
||||
query = f"""
|
||||
SELECT n.id, n.platform_id, p.name as platform_name, n.title,
|
||||
n.rank, n.url, n.mobile_url,
|
||||
n.first_crawl_time, n.last_crawl_time, n.crawl_count
|
||||
FROM news_items n
|
||||
LEFT JOIN platforms p ON n.platform_id = p.id
|
||||
WHERE n.platform_id IN ({placeholders})
|
||||
"""
|
||||
cursor.execute(query, platform_ids)
|
||||
else:
|
||||
cursor.execute("""
|
||||
SELECT n.id, n.platform_id, p.name as platform_name, n.title,
|
||||
n.rank, n.url, n.mobile_url,
|
||||
n.first_crawl_time, n.last_crawl_time, n.crawl_count
|
||||
FROM news_items n
|
||||
LEFT JOIN platforms p ON n.platform_id = p.id
|
||||
""")
|
||||
|
||||
rows = cursor.fetchall()
|
||||
|
||||
# 收集所有 news_item_id 用于查询历史排名
|
||||
news_ids = [row['id'] for row in rows]
|
||||
rank_history_map = {}
|
||||
|
||||
if news_ids:
|
||||
placeholders = ",".join("?" * len(news_ids))
|
||||
cursor.execute(f"""
|
||||
SELECT news_item_id, rank FROM rank_history
|
||||
WHERE news_item_id IN ({placeholders})
|
||||
ORDER BY news_item_id, crawl_time
|
||||
""", news_ids)
|
||||
|
||||
for rh_row in cursor.fetchall():
|
||||
news_id = rh_row['news_item_id']
|
||||
rank = rh_row['rank']
|
||||
if news_id not in rank_history_map:
|
||||
rank_history_map[news_id] = []
|
||||
rank_history_map[news_id].append(rank)
|
||||
|
||||
for row in rows:
|
||||
news_id = row['id']
|
||||
platform_id = row['platform_id']
|
||||
platform_name = row['platform_name'] or platform_id
|
||||
title = row['title']
|
||||
|
||||
# 更新 id_to_name
|
||||
if platform_id not in id_to_name:
|
||||
id_to_name[platform_id] = platform_name
|
||||
|
||||
# 初始化平台字典
|
||||
if platform_id not in all_titles:
|
||||
all_titles[platform_id] = {}
|
||||
|
||||
# 获取排名历史,如果为空则使用当前排名
|
||||
ranks = rank_history_map.get(news_id, [row['rank']])
|
||||
|
||||
# 直接使用数据(已去重)
|
||||
all_titles[platform_id][title] = {
|
||||
"ranks": ranks,
|
||||
"url": row['url'] or "",
|
||||
"mobileUrl": row['mobile_url'] or "",
|
||||
"first_time": row['first_crawl_time'] or "",
|
||||
"last_time": row['last_crawl_time'] or "",
|
||||
"count": row['crawl_count'] or 1,
|
||||
}
|
||||
|
||||
# 获取抓取时间作为 timestamps
|
||||
cursor.execute("""
|
||||
SELECT crawl_time FROM crawl_records
|
||||
ORDER BY crawl_time
|
||||
""")
|
||||
for row in cursor.fetchall():
|
||||
crawl_time = row['crawl_time']
|
||||
all_timestamps[f"{crawl_time}.db"] = 0 # 用虚拟时间戳
|
||||
|
||||
conn.close()
|
||||
|
||||
if not all_titles:
|
||||
return None
|
||||
|
||||
return (all_titles, id_to_name, all_timestamps)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Warning: 从 SQLite 读取数据失败: {e}")
|
||||
return None
|
||||
|
||||
def read_all_titles_for_date(
|
||||
self,
|
||||
@@ -163,7 +459,7 @@ class ParserService:
|
||||
platform_ids: Optional[List[str]] = None
|
||||
) -> Tuple[Dict, Dict, Dict]:
|
||||
"""
|
||||
读取指定日期的所有标题文件(带缓存)
|
||||
读取指定日期的所有标题(带缓存)
|
||||
|
||||
Args:
|
||||
date: 日期对象,默认为今天
|
||||
@@ -193,71 +489,23 @@ class ParserService:
|
||||
if cached:
|
||||
return cached
|
||||
|
||||
# 缓存未命中,读取文件
|
||||
date_folder = self.get_date_folder_name(date)
|
||||
txt_dir = self.project_root / "output" / date_folder / "txt"
|
||||
# 优先从 SQLite 读取
|
||||
sqlite_result = self._read_from_sqlite(date, platform_ids)
|
||||
if sqlite_result:
|
||||
self.cache.set(cache_key, sqlite_result)
|
||||
return sqlite_result
|
||||
|
||||
if not txt_dir.exists():
|
||||
raise DataNotFoundError(
|
||||
f"未找到 {date_folder} 的数据目录",
|
||||
suggestion="请先运行爬虫或检查日期是否正确"
|
||||
)
|
||||
# SQLite 不存在,尝试从 TXT 读取
|
||||
txt_result = self._read_from_txt(date, platform_ids)
|
||||
if txt_result:
|
||||
self.cache.set(cache_key, txt_result)
|
||||
return txt_result
|
||||
|
||||
all_titles = {}
|
||||
id_to_name = {}
|
||||
all_timestamps = {}
|
||||
|
||||
# 读取所有txt文件
|
||||
txt_files = sorted(txt_dir.glob("*.txt"))
|
||||
|
||||
if not txt_files:
|
||||
raise DataNotFoundError(
|
||||
f"{date_folder} 没有数据文件",
|
||||
suggestion="请等待爬虫任务完成"
|
||||
)
|
||||
|
||||
for txt_file in txt_files:
|
||||
try:
|
||||
titles_by_id, file_id_to_name = self.parse_txt_file(txt_file)
|
||||
|
||||
# 更新id_to_name
|
||||
id_to_name.update(file_id_to_name)
|
||||
|
||||
# 合并标题数据
|
||||
for platform_id, titles in titles_by_id.items():
|
||||
# 如果指定了平台过滤
|
||||
if platform_ids and platform_id not in platform_ids:
|
||||
continue
|
||||
|
||||
if platform_id not in all_titles:
|
||||
all_titles[platform_id] = {}
|
||||
|
||||
for title, info in titles.items():
|
||||
if title in all_titles[platform_id]:
|
||||
# 合并排名
|
||||
all_titles[platform_id][title]["ranks"].extend(info["ranks"])
|
||||
else:
|
||||
all_titles[platform_id][title] = info.copy()
|
||||
|
||||
# 记录文件时间戳
|
||||
all_timestamps[txt_file.name] = txt_file.stat().st_mtime
|
||||
|
||||
except Exception as e:
|
||||
# 忽略单个文件的解析错误,继续处理其他文件
|
||||
print(f"Warning: 解析文件 {txt_file} 失败: {e}")
|
||||
continue
|
||||
|
||||
if not all_titles:
|
||||
raise DataNotFoundError(
|
||||
f"{date_folder} 没有有效的数据",
|
||||
suggestion="请检查数据文件格式或重新运行爬虫"
|
||||
)
|
||||
|
||||
# 缓存结果
|
||||
result = (all_titles, id_to_name, all_timestamps)
|
||||
self.cache.set(cache_key, result)
|
||||
|
||||
return result
|
||||
# 两种数据源都不存在
|
||||
raise DataNotFoundError(
|
||||
f"未找到 {date_str} 的数据",
|
||||
suggestion="请先运行爬虫或检查日期是否正确"
|
||||
)
|
||||
|
||||
def parse_yaml_config(self, config_path: str = None) -> dict:
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user