mirror of
https://gitee.com/houhuan/TrendRadar.git
synced 2026-05-01 01:22:42 +08:00
v4.0.0 Big big big update
@@ -0,0 +1,8 @@
# coding=utf-8
"""
Crawler module - data fetching functionality
"""

from trendradar.crawler.fetcher import DataFetcher

__all__ = ["DataFetcher"]
@@ -0,0 +1,184 @@
# coding=utf-8
"""
Data fetcher module

Fetches news data from the NewsNow API. Supports:
- fetching data for a single platform
- batch crawling of multiple platforms
- an automatic retry mechanism
- proxy support
"""

import json
import random
import time
from typing import Dict, List, Tuple, Optional, Union

import requests


class DataFetcher:
    """Data fetcher"""

    # Default API URL
    DEFAULT_API_URL = "https://newsnow.busiyi.world/api/s"

    # Default request headers
    DEFAULT_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Connection": "keep-alive",
        "Cache-Control": "no-cache",
    }

    def __init__(
        self,
        proxy_url: Optional[str] = None,
        api_url: Optional[str] = None,
    ):
        """
        Initialize the data fetcher

        Args:
            proxy_url: proxy server URL (optional)
            api_url: API base URL (optional; defaults to DEFAULT_API_URL)
        """
        self.proxy_url = proxy_url
        self.api_url = api_url or self.DEFAULT_API_URL

    def fetch_data(
        self,
        id_info: Union[str, Tuple[str, str]],
        max_retries: int = 2,
        min_retry_wait: int = 3,
        max_retry_wait: int = 5,
    ) -> Tuple[Optional[str], str, str]:
        """
        Fetch data for the given ID, with retries

        Args:
            id_info: platform ID, or a (platform ID, alias) tuple
            max_retries: maximum number of retries
            min_retry_wait: minimum retry wait time (seconds)
            max_retry_wait: maximum retry wait time (seconds)

        Returns:
            (response text, platform ID, alias) tuple; response text is None on failure
        """
        if isinstance(id_info, tuple):
            id_value, alias = id_info
        else:
            id_value = id_info
            alias = id_value

        url = f"{self.api_url}?id={id_value}&latest"

        proxies = None
        if self.proxy_url:
            proxies = {"http": self.proxy_url, "https": self.proxy_url}

        retries = 0
        while retries <= max_retries:
            try:
                response = requests.get(
                    url,
                    proxies=proxies,
                    headers=self.DEFAULT_HEADERS,
                    timeout=10,
                )
                response.raise_for_status()

                data_text = response.text
                data_json = json.loads(data_text)

                status = data_json.get("status", "unknown")
                if status not in ["success", "cache"]:
                    raise ValueError(f"Unexpected response status: {status}")

                status_info = "latest data" if status == "success" else "cached data"
                print(f"Fetched {id_value} successfully ({status_info})")
                return data_text, id_value, alias

            except Exception as e:
                retries += 1
                if retries <= max_retries:
                    # Jittered backoff: uniform base wait plus a term that grows with each retry
                    base_wait = random.uniform(min_retry_wait, max_retry_wait)
                    additional_wait = (retries - 1) * random.uniform(1, 2)
                    wait_time = base_wait + additional_wait
                    print(f"Request for {id_value} failed: {e}. Retrying in {wait_time:.2f}s...")
                    time.sleep(wait_time)
                else:
                    print(f"Request for {id_value} failed: {e}")
                    return None, id_value, alias

        return None, id_value, alias

    def crawl_websites(
        self,
        ids_list: List[Union[str, Tuple[str, str]]],
        request_interval: int = 100,
    ) -> Tuple[Dict, Dict, List]:
        """
        Crawl data from multiple websites

        Args:
            ids_list: list of platform IDs; each element is a string or a (platform ID, alias) tuple
            request_interval: interval between requests (milliseconds)

        Returns:
            (results dict, ID-to-name mapping, list of failed IDs) tuple
        """
        results = {}
        id_to_name = {}
        failed_ids = []

        for i, id_info in enumerate(ids_list):
            if isinstance(id_info, tuple):
                id_value, name = id_info
            else:
                id_value = id_info
                name = id_value

            id_to_name[id_value] = name
            response, _, _ = self.fetch_data(id_info)

            if response:
                try:
                    data = json.loads(response)
                    results[id_value] = {}

                    for index, item in enumerate(data.get("items", []), 1):
                        title = item.get("title")
                        # Skip invalid titles (None, float, empty string)
                        if title is None or isinstance(title, float) or not str(title).strip():
                            continue
                        title = str(title).strip()
                        url = item.get("url", "")
                        mobile_url = item.get("mobileUrl", "")

                        # A title that appears more than once accumulates all of its ranks
                        if title in results[id_value]:
                            results[id_value][title]["ranks"].append(index)
                        else:
                            results[id_value][title] = {
                                "ranks": [index],
                                "url": url,
                                "mobileUrl": mobile_url,
                            }
                except json.JSONDecodeError:
                    print(f"Failed to parse response for {id_value}")
                    failed_ids.append(id_value)
                except Exception as e:
                    print(f"Error processing data for {id_value}: {e}")
                    failed_ids.append(id_value)
            else:
                failed_ids.append(id_value)

            # Wait between requests (except after the last one)
            if i < len(ids_list) - 1:
                actual_interval = request_interval + random.randint(-10, 20)
                actual_interval = max(50, actual_interval)
                time.sleep(actual_interval / 1000)

        print(f"Succeeded: {list(results.keys())}, failed: {failed_ids}")
        return results, id_to_name, failed_ids
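
For reference, a minimal usage sketch of the DataFetcher added in this commit. The platform IDs ("weibo", "zhihu") and the proxy URL below are illustrative assumptions, not values taken from the commit; the import path follows the package's __init__.py re-export.

# Minimal usage sketch. Platform IDs and proxy URL are assumed examples,
# not values confirmed by this commit.
from trendradar.crawler import DataFetcher

fetcher = DataFetcher()  # or DataFetcher(proxy_url="http://127.0.0.1:7890")

# Batch-crawl two platforms; entries may be plain IDs or (ID, alias) tuples.
results, id_to_name, failed_ids = fetcher.crawl_websites(
    [("weibo", "Weibo Hot Search"), "zhihu"],
    request_interval=200,  # milliseconds between requests
)

# results maps: platform ID -> title -> {"ranks": [...], "url": ..., "mobileUrl": ...}
for platform_id, titles in results.items():
    for title, info in titles.items():
        print(id_to_name[platform_id], info["ranks"], title)

Note that crawl_websites already sleeps between requests, so callers do not need their own throttling.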
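The retry wait in fetch_data is a uniform base delay plus a jitter term that grows linearly with the retry count. The standalone function below (retry_wait is an illustrative name, not part of the commit) mirrors that formula so the bounds are easy to check.

import random

def retry_wait(retries: int, min_wait: float = 3, max_wait: float = 5) -> float:
    """Mirror of the wait-time formula in DataFetcher.fetch_data:
    a base drawn from U(min_wait, max_wait) plus (retries - 1) * U(1, 2).
    With the defaults, retry 1 waits 3-5 s and retry 2 waits 4-7 s."""
    base_wait = random.uniform(min_wait, max_wait)
    additional_wait = (retries - 1) * random.uniform(1, 2)
    return base_wait + additional_wait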