mirror of
https://gitee.com/houhuan/TrendRadar.git
synced 2025-12-21 12:47:16 +08:00
fix: 解决某厂商的 chunked encoding 问题, 版本更新
This commit is contained in:
parent
35a63fe80c
commit
9a07ee57ff
@ -14,9 +14,7 @@ from typing import Dict, List, Tuple, Optional
|
|||||||
import requests
|
import requests
|
||||||
|
|
||||||
from trendradar.context import AppContext
|
from trendradar.context import AppContext
|
||||||
|
from trendradar import __version__
|
||||||
# 版本号直接定义,避免循环导入
|
|
||||||
VERSION = "4.0.0"
|
|
||||||
from trendradar.core import load_config
|
from trendradar.core import load_config
|
||||||
from trendradar.crawler import DataFetcher
|
from trendradar.crawler import DataFetcher
|
||||||
from trendradar.storage import convert_crawl_results_to_news_data
|
from trendradar.storage import convert_crawl_results_to_news_data
|
||||||
@ -105,7 +103,7 @@ class NewsAnalyzer:
|
|||||||
# 加载配置
|
# 加载配置
|
||||||
print("正在加载配置...")
|
print("正在加载配置...")
|
||||||
config = load_config()
|
config = load_config()
|
||||||
print(f"TrendRadar v{VERSION} 配置加载完成")
|
print(f"TrendRadar v{__version__} 配置加载完成")
|
||||||
print(f"监控平台数量: {len(config['PLATFORMS'])}")
|
print(f"监控平台数量: {len(config['PLATFORMS'])}")
|
||||||
print(f"时区: {config.get('TIMEZONE', 'Asia/Shanghai')}")
|
print(f"时区: {config.get('TIMEZONE', 'Asia/Shanghai')}")
|
||||||
|
|
||||||
@ -174,15 +172,15 @@ class NewsAnalyzer:
|
|||||||
"""检查版本更新"""
|
"""检查版本更新"""
|
||||||
try:
|
try:
|
||||||
need_update, remote_version = check_version_update(
|
need_update, remote_version = check_version_update(
|
||||||
VERSION, self.ctx.config["VERSION_CHECK_URL"], self.proxy_url
|
__version__, self.ctx.config["VERSION_CHECK_URL"], self.proxy_url
|
||||||
)
|
)
|
||||||
|
|
||||||
if need_update and remote_version:
|
if need_update and remote_version:
|
||||||
self.update_info = {
|
self.update_info = {
|
||||||
"current_version": VERSION,
|
"current_version": __version__,
|
||||||
"remote_version": remote_version,
|
"remote_version": remote_version,
|
||||||
}
|
}
|
||||||
print(f"发现新版本: {remote_version} (当前: {VERSION})")
|
print(f"发现新版本: {remote_version} (当前: {__version__})")
|
||||||
else:
|
else:
|
||||||
print("版本检查完成,当前为最新版本")
|
print("版本检查完成,当前为最新版本")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@ -93,7 +93,11 @@ class RemoteStorageBackend(StorageBackend):
|
|||||||
|
|
||||||
# 初始化 S3 客户端
|
# 初始化 S3 客户端
|
||||||
# 使用 virtual-hosted style addressing(主流)
|
# 使用 virtual-hosted style addressing(主流)
|
||||||
s3_config = BotoConfig(s3={"addressing_style": "virtual"})
|
# 使用 s3 签名版本(v2),避免 s3v4 签名可能导致的 chunked encoding 问题
|
||||||
|
s3_config = BotoConfig(
|
||||||
|
s3={"addressing_style": "virtual"},
|
||||||
|
signature_version='s3', # 使用 S3v2 签名,兼容性更好
|
||||||
|
)
|
||||||
|
|
||||||
client_kwargs = {
|
client_kwargs = {
|
||||||
"endpoint_url": endpoint_url,
|
"endpoint_url": endpoint_url,
|
||||||
@ -133,7 +137,7 @@ class RemoteStorageBackend(StorageBackend):
|
|||||||
return format_time_filename(self.timezone)
|
return format_time_filename(self.timezone)
|
||||||
|
|
||||||
def _get_remote_db_key(self, date: Optional[str] = None) -> str:
|
def _get_remote_db_key(self, date: Optional[str] = None) -> str:
|
||||||
"""获取 R2 中 SQLite 文件的对象键"""
|
"""获取远程存储中 SQLite 文件的对象键"""
|
||||||
date_folder = self._format_date_folder(date)
|
date_folder = self._format_date_folder(date)
|
||||||
return f"news/{date_folder}.db"
|
return f"news/{date_folder}.db"
|
||||||
|
|
||||||
@ -144,10 +148,10 @@ class RemoteStorageBackend(StorageBackend):
|
|||||||
|
|
||||||
def _check_object_exists(self, r2_key: str) -> bool:
|
def _check_object_exists(self, r2_key: str) -> bool:
|
||||||
"""
|
"""
|
||||||
检查 R2 中对象是否存在
|
检查远程存储中对象是否存在
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
r2_key: R2 对象键
|
r2_key: 远程对象键
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
是否存在
|
是否存在
|
||||||
@ -157,7 +161,7 @@ class RemoteStorageBackend(StorageBackend):
|
|||||||
return True
|
return True
|
||||||
except ClientError as e:
|
except ClientError as e:
|
||||||
error_code = e.response.get("Error", {}).get("Code", "")
|
error_code = e.response.get("Error", {}).get("Code", "")
|
||||||
# R2/S3 可能返回 404, NoSuchKey, 或其他变体
|
# S3 兼容存储可能返回 404, NoSuchKey, 或其他变体
|
||||||
if error_code in ("404", "NoSuchKey", "Not Found"):
|
if error_code in ("404", "NoSuchKey", "Not Found"):
|
||||||
return False
|
return False
|
||||||
# 其他错误(如权限问题)也视为不存在,但打印警告
|
# 其他错误(如权限问题)也视为不存在,但打印警告
|
||||||
@ -169,7 +173,10 @@ class RemoteStorageBackend(StorageBackend):
|
|||||||
|
|
||||||
def _download_sqlite(self, date: Optional[str] = None) -> Optional[Path]:
|
def _download_sqlite(self, date: Optional[str] = None) -> Optional[Path]:
|
||||||
"""
|
"""
|
||||||
从 R2 下载当天的 SQLite 文件到本地临时目录
|
从远程存储下载当天的 SQLite 文件到本地临时目录
|
||||||
|
|
||||||
|
使用 get_object + iter_chunks 替代 download_file,
|
||||||
|
以正确处理腾讯云 COS 的 chunked transfer encoding。
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
date: 日期字符串
|
date: 日期字符串
|
||||||
@ -189,13 +196,18 @@ class RemoteStorageBackend(StorageBackend):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.s3_client.download_file(self.bucket_name, r2_key, str(local_path))
|
# 使用 get_object + iter_chunks 替代 download_file
|
||||||
|
# iter_chunks 会自动处理 chunked transfer encoding
|
||||||
|
response = self.s3_client.get_object(Bucket=self.bucket_name, Key=r2_key)
|
||||||
|
with open(local_path, 'wb') as f:
|
||||||
|
for chunk in response['Body'].iter_chunks(chunk_size=1024*1024):
|
||||||
|
f.write(chunk)
|
||||||
self._downloaded_files.append(local_path)
|
self._downloaded_files.append(local_path)
|
||||||
print(f"[远程存储] 已下载: {r2_key} -> {local_path}")
|
print(f"[远程存储] 已下载: {r2_key} -> {local_path}")
|
||||||
return local_path
|
return local_path
|
||||||
except ClientError as e:
|
except ClientError as e:
|
||||||
error_code = e.response.get("Error", {}).get("Code", "")
|
error_code = e.response.get("Error", {}).get("Code", "")
|
||||||
# R2/S3 可能返回不同的错误码
|
# S3 兼容存储可能返回不同的错误码
|
||||||
if error_code in ("404", "NoSuchKey", "Not Found"):
|
if error_code in ("404", "NoSuchKey", "Not Found"):
|
||||||
print(f"[远程存储] 文件不存在,将创建新数据库: {r2_key}")
|
print(f"[远程存储] 文件不存在,将创建新数据库: {r2_key}")
|
||||||
return None
|
return None
|
||||||
@ -208,7 +220,7 @@ class RemoteStorageBackend(StorageBackend):
|
|||||||
|
|
||||||
def _upload_sqlite(self, date: Optional[str] = None) -> bool:
|
def _upload_sqlite(self, date: Optional[str] = None) -> bool:
|
||||||
"""
|
"""
|
||||||
上传本地 SQLite 文件到 R2
|
上传本地 SQLite 文件到远程存储
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
date: 日期字符串
|
date: 日期字符串
|
||||||
@ -228,7 +240,20 @@ class RemoteStorageBackend(StorageBackend):
|
|||||||
local_size = local_path.stat().st_size
|
local_size = local_path.stat().st_size
|
||||||
print(f"[远程存储] 准备上传: {local_path} ({local_size} bytes) -> {r2_key}")
|
print(f"[远程存储] 准备上传: {local_path} ({local_size} bytes) -> {r2_key}")
|
||||||
|
|
||||||
self.s3_client.upload_file(str(local_path), self.bucket_name, r2_key)
|
# 读取文件内容为 bytes 后上传
|
||||||
|
# 避免传入文件对象时 requests 库使用 chunked transfer encoding
|
||||||
|
# 腾讯云 COS 等 S3 兼容服务可能无法正确处理 chunked encoding
|
||||||
|
with open(local_path, 'rb') as f:
|
||||||
|
file_content = f.read()
|
||||||
|
|
||||||
|
# 使用 put_object 并明确设置 ContentLength,确保不使用 chunked encoding
|
||||||
|
self.s3_client.put_object(
|
||||||
|
Bucket=self.bucket_name,
|
||||||
|
Key=r2_key,
|
||||||
|
Body=file_content,
|
||||||
|
ContentLength=local_size,
|
||||||
|
ContentType='application/octet-stream',
|
||||||
|
)
|
||||||
print(f"[远程存储] 已上传: {local_path} -> {r2_key}")
|
print(f"[远程存储] 已上传: {local_path} -> {r2_key}")
|
||||||
|
|
||||||
# 验证上传成功
|
# 验证上传成功
|
||||||
@ -236,7 +261,7 @@ class RemoteStorageBackend(StorageBackend):
|
|||||||
print(f"[远程存储] 上传验证成功: {r2_key}")
|
print(f"[远程存储] 上传验证成功: {r2_key}")
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
print(f"[远程存储] 上传验证失败: 文件未在 R2 中找到")
|
print(f"[远程存储] 上传验证失败: 文件未在远程存储中找到")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -252,7 +277,7 @@ class RemoteStorageBackend(StorageBackend):
|
|||||||
# 确保目录存在
|
# 确保目录存在
|
||||||
local_path.parent.mkdir(parents=True, exist_ok=True)
|
local_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
# 如果本地不存在,尝试从 R2 下载
|
# 如果本地不存在,尝试从远程存储下载
|
||||||
if not local_path.exists():
|
if not local_path.exists():
|
||||||
self._download_sqlite(date)
|
self._download_sqlite(date)
|
||||||
|
|
||||||
@ -282,9 +307,9 @@ class RemoteStorageBackend(StorageBackend):
|
|||||||
|
|
||||||
def save_news_data(self, data: NewsData) -> bool:
|
def save_news_data(self, data: NewsData) -> bool:
|
||||||
"""
|
"""
|
||||||
保存新闻数据到 R2(以 URL 为唯一标识,支持标题更新检测)
|
保存新闻数据到远程存储(以 URL 为唯一标识,支持标题更新检测)
|
||||||
|
|
||||||
流程:下载现有数据库 → 插入/更新数据 → 上传回 R2
|
流程:下载现有数据库 → 插入/更新数据 → 上传回远程存储
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
data: 新闻数据
|
data: 新闻数据
|
||||||
@ -466,12 +491,12 @@ class RemoteStorageBackend(StorageBackend):
|
|||||||
log_parts.append(f"(去重后总计: {final_count} 条)")
|
log_parts.append(f"(去重后总计: {final_count} 条)")
|
||||||
print(",".join(log_parts))
|
print(",".join(log_parts))
|
||||||
|
|
||||||
# 上传到 R2
|
# 上传到远程存储
|
||||||
if self._upload_sqlite(data.date):
|
if self._upload_sqlite(data.date):
|
||||||
print(f"[远程存储] 数据已同步到 R2")
|
print(f"[远程存储] 数据已同步到远程存储")
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
print(f"[远程存储] 上传 R2 失败")
|
print(f"[远程存储] 上传远程存储失败")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -695,7 +720,7 @@ class RemoteStorageBackend(StorageBackend):
|
|||||||
return {}
|
return {}
|
||||||
|
|
||||||
def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
|
def save_txt_snapshot(self, data: NewsData) -> Optional[str]:
|
||||||
"""保存 TXT 快照(R2 模式下默认不支持)"""
|
"""保存 TXT 快照(远程存储模式下默认不支持)"""
|
||||||
if not self.enable_txt:
|
if not self.enable_txt:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -817,7 +842,7 @@ class RemoteStorageBackend(StorageBackend):
|
|||||||
|
|
||||||
def cleanup_old_data(self, retention_days: int) -> int:
|
def cleanup_old_data(self, retention_days: int) -> int:
|
||||||
"""
|
"""
|
||||||
清理 R2 上的过期数据
|
清理远程存储上的过期数据
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
retention_days: 保留天数(0 表示不清理)
|
retention_days: 保留天数(0 表示不清理)
|
||||||
@ -832,7 +857,7 @@ class RemoteStorageBackend(StorageBackend):
|
|||||||
cutoff_date = self._get_configured_time() - timedelta(days=retention_days)
|
cutoff_date = self._get_configured_time() - timedelta(days=retention_days)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# 列出 R2 中 news/ 前缀下的所有对象
|
# 列出远程存储中 news/ 前缀下的所有对象
|
||||||
paginator = self.s3_client.get_paginator('list_objects_v2')
|
paginator = self.s3_client.get_paginator('list_objects_v2')
|
||||||
pages = paginator.paginate(Bucket=self.bucket_name, Prefix="news/")
|
pages = paginator.paginate(Bucket=self.bucket_name, Prefix="news/")
|
||||||
|
|
||||||
@ -964,12 +989,12 @@ class RemoteStorageBackend(StorageBackend):
|
|||||||
|
|
||||||
print(f"[远程存储] 推送记录已保存: {report_type} at {now_str}")
|
print(f"[远程存储] 推送记录已保存: {report_type} at {now_str}")
|
||||||
|
|
||||||
# 上传到 R2 确保记录持久化
|
# 上传到远程存储 确保记录持久化
|
||||||
if self._upload_sqlite(date):
|
if self._upload_sqlite(date):
|
||||||
print(f"[远程存储] 推送记录已同步到 R2")
|
print(f"[远程存储] 推送记录已同步到远程存储")
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
print(f"[远程存储] 推送记录同步到 R2 失败")
|
print(f"[远程存储] 推送记录同步到远程存储失败")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -1030,14 +1055,13 @@ class RemoteStorageBackend(StorageBackend):
|
|||||||
print(f"[远程存储] 跳过(远程不存在): {date_str}")
|
print(f"[远程存储] 跳过(远程不存在): {date_str}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 下载
|
# 下载(使用 get_object + iter_chunks 处理 chunked encoding)
|
||||||
try:
|
try:
|
||||||
local_date_dir.mkdir(parents=True, exist_ok=True)
|
local_date_dir.mkdir(parents=True, exist_ok=True)
|
||||||
self.s3_client.download_file(
|
response = self.s3_client.get_object(Bucket=self.bucket_name, Key=remote_key)
|
||||||
self.bucket_name,
|
with open(local_db_path, 'wb') as f:
|
||||||
remote_key,
|
for chunk in response['Body'].iter_chunks(chunk_size=1024*1024):
|
||||||
str(local_db_path)
|
f.write(chunk)
|
||||||
)
|
|
||||||
print(f"[远程存储] 已拉取: {remote_key} -> {local_db_path}")
|
print(f"[远程存储] 已拉取: {remote_key} -> {local_db_path}")
|
||||||
pulled_count += 1
|
pulled_count += 1
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user