戒酒的李白

Add a visual control panel for the crawler, supporting customization of topics and parameter configuration.
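The panel drives the backend through a JSON payload; a representative example of the request body the new page POSTs to /api/spider/start (field names taken from the code below, values illustrative):

# Illustrative payload for POST /api/spider/start
config = {
    'topics': ['科技', '财经'],  # chosen predefined and/or custom topics
    'parameters': {
        'crawlDepth': 3,   # pages per topic (1-10)
        'interval': 5,     # seconds between requests
        'maxRetries': 3,   # retries per page
        'timeout': 30      # request timeout in seconds
    }
}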
... ... @@ -99,8 +99,10 @@ app.secret_key = 'this is secret_key you know ?' # Set the Flask secret key, used
# Import blueprints
from views.page import page
from views.user import user
from views.spider_control import spider_bp
app.register_blueprint(page.pb)  # register the page blueprint
app.register_blueprint(user.ub)  # register the user blueprint
app.register_blueprint(spider_bp)  # register the spider-control blueprint
# Home route: clears the session
@app.route('/')
... ...
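Because the WebSocket route in views/spider_control.py below is registered on a flask-sock Sock instance (Flask blueprints have no websocket decorator), app.py would also need to initialise that extension; a minimal sketch, assuming flask-sock is installed and `sock` is the instance exported by views/spider_control.py:

from views.spider_control import sock
sock.init_app(app)  # attaches the /ws/spider-status WebSocket route to the app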
... ... @@ -3,6 +3,13 @@ from spiderDataPackage.spiderContent import start as spiderContent
from spiderDataPackage.spiderComments import start as spiderComments
from spiderDataPackage.settings import navAddr
import os
import requests
import time
import random
from bs4 import BeautifulSoup
from datetime import datetime
from utils.logger import spider_logger as logging  # project logger, used in place of the stdlib logging module
def spiderData():
    if not os.path.exists(navAddr):
... ... @@ -13,5 +20,131 @@ def spiderData():
    print('Crawling article comment data')
    spiderComments()
class SpiderData:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.base_url = 'https://s.weibo.com'

    def crawl_topic(self, topic, depth=3, interval=5, max_retries=3, timeout=30):
        """
        Crawl Weibo posts for the given topic
        :param topic: topic to crawl
        :param depth: crawl depth (number of pages)
        :param interval: delay between requests (seconds)
        :param max_retries: maximum retries per page
        :param timeout: request timeout (seconds)
        """
logging.info(f"开始爬取话题: {topic}")
for page in range(1, depth + 1):
retries = 0
while retries < max_retries:
try:
url = f"{self.base_url}/weibo?q={topic}&page={page}"
response = requests.get(url, headers=self.headers, timeout=timeout)
if response.status_code == 200:
self._parse_page(response.text)
logging.info(f"成功爬取话题 {topic} 第 {page} 页")
break
else:
logging.warning(f"请求失败,状态码: {response.status_code}")
retries += 1
except requests.RequestException as e:
logging.error(f"请求异常: {e}")
retries += 1
if retries < max_retries:
sleep_time = interval * (1 + random.random())
logging.info(f"等待 {sleep_time:.2f} 秒后重试...")
time.sleep(sleep_time)
if retries == max_retries:
logging.error(f"话题 {topic} 第 {page} 页爬取失败,已达到最大重试次数")
continue
# 在页面之间添加随机延迟
if page < depth:
sleep_time = interval * (1 + random.random())
logging.info(f"等待 {sleep_time:.2f} 秒后继续...")
time.sleep(sleep_time)

    def _parse_page(self, html_content):
        """
        Parse a result page and save the extracted data
        :param html_content: HTML content of the page
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            weibo_items = soup.find_all('div', class_='card-wrap')
            for item in weibo_items:
                try:
                    # Extract the post text
                    content = item.find('p', class_='txt')
                    if not content:
                        continue
                    # Extract the author info
                    user_info = item.find('a', class_='name')
                    if not user_info:
                        continue
                    # Extract the publish time
                    time_info = item.find('p', class_='from')
                    # Extract the interaction counts
                    actions = item.find_all('li', class_='action')
                    # Build the record
                    weibo_data = {
                        'content': content.text.strip(),
                        'user_name': user_info.text.strip(),
                        'publish_time': time_info.text.strip() if time_info else '',
                        'forward_count': self._extract_number(actions[0].text) if len(actions) > 0 else 0,
                        'comment_count': self._extract_number(actions[1].text) if len(actions) > 1 else 0,
                        'like_count': self._extract_number(actions[2].text) if len(actions) > 2 else 0,
                        'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    }
                    # Save to the database
                    self._save_to_database(weibo_data)
                except Exception as e:
                    logging.error(f"Error parsing a Weibo item: {e}")
                    continue
        except Exception as e:
            logging.error(f"Error parsing page: {e}")

    def _extract_number(self, text):
        """
        Extract a number from text
        :param text: text containing digits
        :return: the extracted number, or 0 if none found
        """
        try:
            return int(''.join(filter(str.isdigit, text)))
        except ValueError:
            return 0

    def _save_to_database(self, data):
        """
        Persist a record to the database
        :param data: the record to save
        """
        try:
            # TODO: implement the database persistence logic
            logging.info(f"Saving data: {data}")
        except Exception as e:
            logging.error(f"Error saving data: {e}")

if __name__ == '__main__':
    spiderData()
\ No newline at end of file
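For reference, a minimal way to drive the new class directly, assuming the module is importable as spider.spiderData (the import path used by views/spider_control.py below):

from spider.spiderData import SpiderData

spider = SpiderData()
# Crawl two result pages for one topic with conservative pacing
spider.crawl_topic('科技', depth=2, interval=5, max_retries=3, timeout=30)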
... ...
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Crawler Control Panel</title>
    <link href="https://cdn.bootcdn.net/ajax/libs/twitter-bootstrap/5.0.2/css/bootstrap.min.css" rel="stylesheet">
    <link href="https://cdn.bootcdn.net/ajax/libs/font-awesome/5.15.4/css/all.min.css" rel="stylesheet">
    <style>
        .topic-item {
            margin: 5px;
            padding: 8px 15px;
            border-radius: 20px;
            background-color: #f8f9fa;
            display: inline-block;
            cursor: pointer;
        }
        .topic-item.selected {
            background-color: #0d6efd;
            color: white;
        }
        .custom-topic-input {
            margin: 10px 0;
        }
        .parameter-section {
            margin: 20px 0;
            padding: 20px;
            border-radius: 10px;
            background-color: #f8f9fa;
        }
    </style>
</head>
<body>
    <div class="container mt-5">
        <h2 class="mb-4">Crawler Control Panel</h2>
        <!-- Topic selection -->
        <div class="card mb-4">
            <div class="card-header">
                <h5 class="mb-0">Select Topics</h5>
            </div>
            <div class="card-body">
                <div id="predefinedTopics" class="mb-3">
                    <!-- Predefined topics are loaded dynamically via JavaScript -->
                </div>
                <div class="custom-topic-input">
                    <h6>Add a custom topic</h6>
                    <div class="input-group">
                        <input type="text" class="form-control" id="customTopic" placeholder="Enter a custom topic">
                        <button class="btn btn-primary" onclick="addCustomTopic()">
                            <i class="fas fa-plus"></i> Add
                        </button>
                    </div>
                </div>
                <div id="selectedTopics" class="mt-3">
                    <h6>Selected topics:</h6>
                    <div id="selectedTopicsList" class="mt-2">
                        <!-- Selected topics are rendered here -->
                    </div>
                </div>
            </div>
        </div>
        <!-- Crawler parameter configuration -->
        <div class="card mb-4">
            <div class="card-header">
                <h5 class="mb-0">Crawler Parameters</h5>
            </div>
            <div class="card-body">
                <div class="row">
                    <div class="col-md-6">
                        <div class="mb-3">
                            <label for="crawlDepth" class="form-label">Crawl depth</label>
                            <input type="number" class="form-control" id="crawlDepth" value="3" min="1" max="10">
                            <small class="text-muted">Pages to crawl per topic (1-10)</small>
                        </div>
                    </div>
                    <div class="col-md-6">
                        <div class="mb-3">
                            <label for="interval" class="form-label">Crawl interval (seconds)</label>
                            <input type="number" class="form-control" id="interval" value="5" min="1">
                            <small class="text-muted">Delay between consecutive requests</small>
                        </div>
                    </div>
                </div>
                <div class="row">
                    <div class="col-md-6">
                        <div class="mb-3">
                            <label for="maxRetries" class="form-label">Maximum retries</label>
                            <input type="number" class="form-control" id="maxRetries" value="3" min="1">
                        </div>
                    </div>
                    <div class="col-md-6">
                        <div class="mb-3">
                            <label for="timeout" class="form-label">Request timeout (seconds)</label>
                            <input type="number" class="form-control" id="timeout" value="30" min="1">
                        </div>
                    </div>
                </div>
            </div>
        </div>
        <!-- Action buttons -->
        <div class="d-flex justify-content-between mb-5">
            <button class="btn btn-primary" onclick="startCrawling()">
                <i class="fas fa-play"></i> Start Crawling
            </button>
            <button class="btn btn-secondary" onclick="saveConfig()">
                <i class="fas fa-save"></i> Save Config
            </button>
        </div>
        <!-- Crawler status and log -->
        <div class="card">
            <div class="card-header">
                <h5 class="mb-0">Crawler Status</h5>
            </div>
            <div class="card-body">
                <div class="progress mb-3">
                    <div id="crawlProgress" class="progress-bar" role="progressbar" style="width: 0%"></div>
                </div>
                <div id="crawlLogContainer" class="border p-3 bg-light" style="height: 200px; overflow-y: auto;">
                    <pre id="crawlLog" class="mb-0"></pre>
                </div>
            </div>
        </div>
    </div>
<script src="https://cdn.bootcdn.net/ajax/libs/jquery/3.6.0/jquery.min.js"></script>
<script src="https://cdn.bootcdn.net/ajax/libs/twitter-bootstrap/5.0.2/js/bootstrap.bundle.min.js"></script>
<script>
// 预定义话题列表
const predefinedTopics = [
'热门', '社会', '科技', '娱乐', '体育', '财经',
'教育', '健康', '军事', '文化', '汽车', '美食'
];
// 已选择的话题
let selectedTopics = new Set();
// 初始化页面
window.onload = function() {
loadPredefinedTopics();
};
// 加载预定义话题
function loadPredefinedTopics() {
const topicsDiv = document.getElementById('predefinedTopics');
predefinedTopics.forEach(topic => {
const topicElement = document.createElement('span');
topicElement.className = 'topic-item';
topicElement.textContent = topic;
topicElement.onclick = () => toggleTopic(topic, topicElement);
topicsDiv.appendChild(topicElement);
});
}
// 切换话题选择状态
function toggleTopic(topic, element) {
if (selectedTopics.has(topic)) {
selectedTopics.delete(topic);
element.classList.remove('selected');
} else {
selectedTopics.add(topic);
element.classList.add('selected');
}
updateSelectedTopicsList();
}
        // Add a custom topic
        function addCustomTopic() {
            const input = document.getElementById('customTopic');
            const topic = input.value.trim();
            if (topic) {
                selectedTopics.add(topic);
                input.value = '';
                updateSelectedTopicsList();
            }
        }
        // Re-render the selected-topics list
        function updateSelectedTopicsList() {
            const listDiv = document.getElementById('selectedTopicsList');
            listDiv.innerHTML = '';
            selectedTopics.forEach(topic => {
                const topicElement = document.createElement('span');
                topicElement.className = 'topic-item selected';
                topicElement.textContent = topic;
                // Clicking a selected topic removes it
                topicElement.onclick = () => {
                    selectedTopics.delete(topic);
                    updateSelectedTopicsList();
                };
                listDiv.appendChild(topicElement);
            });
            // Keep the predefined chips in sync, so removing a topic here also un-highlights it above
            document.querySelectorAll('#predefinedTopics .topic-item').forEach(el => {
                el.classList.toggle('selected', selectedTopics.has(el.textContent));
            });
        }
        // Start crawling
        function startCrawling() {
            if (selectedTopics.size === 0) {
                alert('Please select at least one topic!');
                return;
            }
            const config = {
                topics: Array.from(selectedTopics),
                parameters: {
                    crawlDepth: parseInt(document.getElementById('crawlDepth').value),
                    interval: parseInt(document.getElementById('interval').value),
                    maxRetries: parseInt(document.getElementById('maxRetries').value),
                    timeout: parseInt(document.getElementById('timeout').value)
                }
            };
            // Send the crawler configuration to the backend
            fetch('/api/spider/start', {
                method: 'POST',
                headers: {
                    'Content-Type': 'application/json'
                },
                body: JSON.stringify(config)
            })
            .then(response => response.json())
            .then(data => {
                if (data.success) {
                    updateCrawlLog('Crawler task started...');
                } else {
                    updateCrawlLog('Failed to start: ' + data.message);
                }
            })
            .catch(error => {
                updateCrawlLog('Error: ' + error.message);
            });
        }
        // Save the configuration
        function saveConfig() {
            const config = {
                topics: Array.from(selectedTopics),
                parameters: {
                    crawlDepth: parseInt(document.getElementById('crawlDepth').value),
                    interval: parseInt(document.getElementById('interval').value),
                    maxRetries: parseInt(document.getElementById('maxRetries').value),
                    timeout: parseInt(document.getElementById('timeout').value)
                }
            };
            fetch('/api/spider/save-config', {
                method: 'POST',
                headers: {
                    'Content-Type': 'application/json'
                },
                body: JSON.stringify(config)
            })
            .then(response => response.json())
            .then(data => {
                if (data.success) {
                    alert('Configuration saved!');
                } else {
                    alert('Save failed: ' + data.message);
                }
            })
            .catch(error => {
                alert('Error saving configuration: ' + error.message);
            });
        }
        // Append a line to the crawl log
        function updateCrawlLog(message) {
            const log = document.getElementById('crawlLog');
            const container = document.getElementById('crawlLogContainer');
            const timestamp = new Date().toLocaleTimeString();
            log.textContent += `[${timestamp}] ${message}\n`;
            // Scroll the overflow container (not the <pre>) so the newest line stays visible
            container.scrollTop = container.scrollHeight;
        }
        // WebSocket connection for live crawler status updates (wss when the page is served over https)
        const wsProtocol = window.location.protocol === 'https:' ? 'wss' : 'ws';
        const ws = new WebSocket(`${wsProtocol}://${window.location.host}/ws/spider-status`);
        ws.onmessage = function(event) {
            const data = JSON.parse(event.data);
            if (data.type === 'progress') {
                document.getElementById('crawlProgress').style.width = data.value + '%';
            } else if (data.type === 'log') {
                updateCrawlLog(data.message);
            }
        };
    </script>
</body>
</html>
\ No newline at end of file
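The status feed exchanges two frame shapes, consumed by the ws.onmessage handler above and produced by spider_worker below; for reference, a sketch of the JSON frames (values illustrative):

# Frames broadcast on /ws/spider-status
{'type': 'progress', 'value': 66}                         # percentage; drives the progress bar
{'type': 'log', 'message': 'Starting crawl for topic X'}  # appended to the on-page log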
... ...
from flask import Blueprint, jsonify, request, render_template
from flask_sock import Sock  # assumed dependency: flask-sock, since Flask blueprints have no WebSocket support of their own
import json
import os
import threading
import logging
from spider.spiderData import SpiderData

# Create the blueprint
spider_bp = Blueprint('spider', __name__)
# WebSocket extension; app.py must call sock.init_app(app)
sock = Sock()
# Create the logger
logger = logging.getLogger('spider_control')
logger.setLevel(logging.INFO)
# Set of live WebSocket connections
websocket_connections = set()
# Default configuration
DEFAULT_CONFIG = {
    'crawlDepth': 3,
    'interval': 5,
    'maxRetries': 3,
    'timeout': 30
}
def load_config():
    """Load the crawler configuration."""
    config_path = os.path.join(os.path.dirname(__file__), '../spider/config.json')
    try:
        if os.path.exists(config_path):
            with open(config_path, 'r', encoding='utf-8') as f:
                return json.load(f)
    except Exception as e:
        logger.error(f"Failed to load config file: {e}")
    return DEFAULT_CONFIG

def save_config(config):
    """Save the crawler configuration."""
    config_path = os.path.join(os.path.dirname(__file__), '../spider/config.json')
    try:
        with open(config_path, 'w', encoding='utf-8') as f:
            json.dump(config, f, ensure_ascii=False, indent=4)
        return True
    except Exception as e:
        logger.error(f"Failed to save config file: {e}")
        return False
def broadcast_message(message):
    """Broadcast a message to every connected WebSocket client."""
    if not websocket_connections:
        return
    payload = json.dumps(message)
    for websocket in websocket_connections.copy():
        try:
            websocket.send(payload)  # flask-sock connections send synchronously, so no event loop is needed
        except Exception as e:
            logger.error(f"Failed to send WebSocket message: {e}")
            websocket_connections.discard(websocket)
def spider_worker(topics, parameters):
    """Crawler worker thread."""
    total_topics = len(topics)
    completed_topics = 0
    try:
        spider = SpiderData()
        for topic in topics:
            try:
                # Update the progress bar
                progress = int((completed_topics / total_topics) * 100)
                broadcast_message({
                    'type': 'progress',
                    'value': progress
                })
                # Log the start of this topic
                broadcast_message({
                    'type': 'log',
                    'message': f'Starting crawl for topic: {topic}'
                })
                # Run the crawl
                spider.crawl_topic(
                    topic=topic,
                    depth=parameters['crawlDepth'],
                    interval=parameters['interval'],
                    max_retries=parameters['maxRetries'],
                    timeout=parameters['timeout']
                )
                completed_topics += 1
                # Log completion of this topic
                broadcast_message({
                    'type': 'log',
                    'message': f'Finished crawling topic {topic}'
                })
            except Exception as e:
                # Log the per-topic error
                broadcast_message({
                    'type': 'log',
                    'message': f'Error while crawling topic {topic}: {str(e)}'
                })
        # Final progress update
        broadcast_message({
            'type': 'progress',
            'value': 100
        })
        # Completion message
        broadcast_message({
            'type': 'log',
            'message': 'All topics crawled'
        })
    except Exception as e:
        # Log a task-level error
        broadcast_message({
            'type': 'log',
            'message': f'Crawler task failed: {str(e)}'
        })
@spider_bp.route('/spider/control')
def spider_control():
    """Render the crawler control page."""
    return render_template('spider_control.html')
@spider_bp.route('/api/spider/start', methods=['POST'])
def start_spider():
    """Start a crawler task."""
    try:
        data = request.get_json()
        topics = data.get('topics', [])
        parameters = data.get('parameters', DEFAULT_CONFIG)
        if not topics:
            return jsonify({
                'success': False,
                'message': 'Please select at least one topic'
            })
        # Start the crawler thread
        thread = threading.Thread(
            target=spider_worker,
            args=(topics, parameters),
            daemon=True
        )
        thread.start()
        return jsonify({
            'success': True,
            'message': 'Crawler task started'
        })
    except Exception as e:
        logger.error(f"Failed to start crawler task: {e}")
        return jsonify({
            'success': False,
            'message': str(e)
        })
@spider_bp.route('/api/spider/save-config', methods=['POST'])
def save_spider_config():
    """Save the crawler configuration."""
    try:
        config = request.get_json()
        if save_config(config):
            return jsonify({
                'success': True,
                'message': 'Configuration saved'
            })
        else:
            return jsonify({
                'success': False,
                'message': 'Failed to save configuration'
            })
    except Exception as e:
        logger.error(f"Failed to save configuration: {e}")
        return jsonify({
            'success': False,
            'message': str(e)
        })
@sock.route('/ws/spider-status')
def spider_status_socket(ws):
    """WebSocket handler: register the client, then block until it disconnects.

    Registered on the flask-sock extension rather than the blueprint, because
    Flask blueprints have no websocket decorator and websockets.WebSocketServerProtocol
    cannot be instantiated directly as the previous version attempted.
    """
    websocket_connections.add(ws)
    try:
        while True:
            # receive() blocks and raises when the client closes the connection;
            # incoming messages are ignored, the socket is push-only
            ws.receive()
    except Exception:
        pass
    finally:
        websocket_connections.discard(ws)
\ No newline at end of file
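To smoke-test the two endpoints without the browser UI; a minimal sketch, assuming the Flask app runs locally on port 5000 (the port is an assumption, not fixed by this diff):

import requests

config = {
    'topics': ['科技'],
    'parameters': {'crawlDepth': 2, 'interval': 5, 'maxRetries': 3, 'timeout': 30}
}
# Start a crawl; the endpoint returns immediately and the work runs in a daemon thread
print(requests.post('http://127.0.0.1:5000/api/spider/start', json=config).json())
# Persist the same settings to spider/config.json
print(requests.post('http://127.0.0.1:5000/api/spider/save-config', json=config).json())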
... ...