戒酒的李白

Add a visual control panel for the crawler, supporting customization of topics and parameter configuration.
@@ -99,8 +99,10 @@ app.secret_key = 'this is secret_key you know ?'  # set the Flask secret key, used
 # import blueprints
 from views.page import page
 from views.user import user
+from views.spider_control import spider_bp
 app.register_blueprint(page.pb)    # register the page blueprint
 app.register_blueprint(user.ub)    # register the user blueprint
+app.register_blueprint(spider_bp)  # register the spider-control blueprint
 
 # home route: clears the session
 @app.route('/')
@@ -3,6 +3,12 @@ from spiderDataPackage.spiderContent import start as spiderContent
 from spiderDataPackage.spiderComments import start as spiderComments
 from spiderDataPackage.settings import navAddr
 import os
+import requests
+import time
+import random
+from bs4 import BeautifulSoup
+from datetime import datetime
+from utils.logger import spider_logger as logging  # project logger, used below under the name `logging`
 
 def spiderData():
     if not os.path.exists(navAddr):
@@ -13,5 +19,131 @@ def spiderData():
     print('Crawling article comment data')
     spiderComments()
 
+class SpiderData:
+    def __init__(self):
+        self.headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+        self.base_url = 'https://s.weibo.com'
+
+    def crawl_topic(self, topic, depth=3, interval=5, max_retries=3, timeout=30):
+        """
+        Crawl Weibo posts for the given topic.
+
+        :param topic: topic to crawl
+        :param depth: crawl depth (number of result pages)
+        :param interval: delay between requests (seconds)
+        :param max_retries: maximum number of retries per page
+        :param timeout: request timeout (seconds)
+        """
+        logging.info(f"Start crawling topic: {topic}")
+
+        for page in range(1, depth + 1):
+            retries = 0
+            while retries < max_retries:
+                try:
+                    url = f"{self.base_url}/weibo?q={topic}&page={page}"
+                    response = requests.get(url, headers=self.headers, timeout=timeout)
+
+                    if response.status_code == 200:
+                        self._parse_page(response.text)
+                        logging.info(f"Crawled page {page} of topic {topic}")
+                        break
+                    else:
+                        logging.warning(f"Request failed with status code: {response.status_code}")
+                        retries += 1
+
+                except requests.RequestException as e:
+                    logging.error(f"Request exception: {e}")
+                    retries += 1
+
+                if retries < max_retries:
+                    sleep_time = interval * (1 + random.random())
+                    logging.info(f"Waiting {sleep_time:.2f} s before retrying...")
+                    time.sleep(sleep_time)
+
+            if retries == max_retries:
+                logging.error(f"Giving up on page {page} of topic {topic}: maximum retries reached")
+                continue
+
+            # random delay between pages
+            if page < depth:
+                sleep_time = interval * (1 + random.random())
+                logging.info(f"Waiting {sleep_time:.2f} s before the next page...")
+                time.sleep(sleep_time)
+
+    def _parse_page(self, html_content):
+        """
+        Parse a result page and store the extracted posts.
+
+        :param html_content: HTML content of the page
+        """
+        try:
+            soup = BeautifulSoup(html_content, 'html.parser')
+            weibo_items = soup.find_all('div', class_='card-wrap')
+
+            for item in weibo_items:
+                try:
+                    # extract the post text
+                    content = item.find('p', class_='txt')
+                    if not content:
+                        continue
+
+                    # extract the author
+                    user_info = item.find('a', class_='name')
+                    if not user_info:
+                        continue
+
+                    # extract the publish time
+                    time_info = item.find('p', class_='from')
+
+                    # extract the interaction counters
+                    actions = item.find_all('li', class_='action')
+
+                    # build the record
+                    weibo_data = {
+                        'content': content.text.strip(),
+                        'user_name': user_info.text.strip(),
+                        'publish_time': time_info.text.strip() if time_info else '',
+                        'forward_count': self._extract_number(actions[0].text) if len(actions) > 0 else 0,
+                        'comment_count': self._extract_number(actions[1].text) if len(actions) > 1 else 0,
+                        'like_count': self._extract_number(actions[2].text) if len(actions) > 2 else 0,
+                        'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+                    }
+
+                    # persist the record
+                    self._save_to_database(weibo_data)
+
+                except Exception as e:
+                    logging.error(f"Error while parsing a post item: {e}")
+                    continue
+
+        except Exception as e:
+            logging.error(f"Error while parsing the page: {e}")
+
+    def _extract_number(self, text):
+        """
+        Extract a number from a piece of text.
+
+        :param text: text containing the number
+        :return: the extracted number, or 0 if none is found
+        """
+        try:
+            return int(''.join(filter(str.isdigit, text)))
+        except ValueError:
+            return 0
+
+    def _save_to_database(self, data):
+        """
+        Save a record to the database.
+
+        :param data: record dict to save
+        """
+        try:
+            # TODO: implement the database write
+            logging.info(f"Saving record: {data}")
+        except Exception as e:
+            logging.error(f"Error while saving record: {e}")
+
 if __name__ == '__main__':
     spiderData()
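
The new SpiderData class can also be driven directly, independent of the control panel. A minimal sketch, assuming the `spider.spiderData` module path that the control blueprint below imports from (the topic value is only an example):

    from spider.spiderData import SpiderData

    spider = SpiderData()
    # crawl two result pages for one topic, using the panel's default pacing
    spider.crawl_topic('科技', depth=2, interval=5, max_retries=3, timeout=30)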
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Crawler Control Panel</title>
+    <link href="https://cdn.bootcdn.net/ajax/libs/twitter-bootstrap/5.0.2/css/bootstrap.min.css" rel="stylesheet">
+    <link href="https://cdn.bootcdn.net/ajax/libs/font-awesome/5.15.4/css/all.min.css" rel="stylesheet">
+    <style>
+        .topic-item {
+            margin: 5px;
+            padding: 8px 15px;
+            border-radius: 20px;
+            background-color: #f8f9fa;
+            display: inline-block;
+            cursor: pointer;
+        }
+        .topic-item.selected {
+            background-color: #0d6efd;
+            color: white;
+        }
+        .custom-topic-input {
+            margin: 10px 0;
+        }
+        .parameter-section {
+            margin: 20px 0;
+            padding: 20px;
+            border-radius: 10px;
+            background-color: #f8f9fa;
+        }
+    </style>
+</head>
+<body>
+    <div class="container mt-5">
+        <h2 class="mb-4">Crawler Control Panel</h2>
+
+        <!-- topic selection -->
+        <div class="card mb-4">
+            <div class="card-header">
+                <h5 class="mb-0">Select Topics</h5>
+            </div>
+            <div class="card-body">
+                <div id="predefinedTopics" class="mb-3">
+                    <!-- predefined topics are loaded dynamically via JavaScript -->
+                </div>
+
+                <div class="custom-topic-input">
+                    <h6>Add a custom topic</h6>
+                    <div class="input-group">
+                        <input type="text" class="form-control" id="customTopic" placeholder="Enter a custom topic">
+                        <button class="btn btn-primary" onclick="addCustomTopic()">
+                            <i class="fas fa-plus"></i> Add
+                        </button>
+                    </div>
+                </div>
+
+                <div id="selectedTopics" class="mt-3">
+                    <h6>Selected topics:</h6>
+                    <div id="selectedTopicsList" class="mt-2">
+                        <!-- selected topics are rendered here -->
+                    </div>
+                </div>
+            </div>
+        </div>
+
+        <!-- crawler parameter configuration -->
+        <div class="card mb-4">
+            <div class="card-header">
+                <h5 class="mb-0">Crawler Parameters</h5>
+            </div>
+            <div class="card-body">
+                <div class="row">
+                    <div class="col-md-6">
+                        <div class="mb-3">
+                            <label for="crawlDepth" class="form-label">Crawl depth</label>
+                            <input type="number" class="form-control" id="crawlDepth" value="3" min="1" max="10">
+                            <small class="text-muted">Pages to crawl per topic (1-10)</small>
+                        </div>
+                    </div>
+                    <div class="col-md-6">
+                        <div class="mb-3">
+                            <label for="interval" class="form-label">Crawl interval (seconds)</label>
+                            <input type="number" class="form-control" id="interval" value="5" min="1">
+                            <small class="text-muted">Delay between consecutive requests</small>
+                        </div>
+                    </div>
+                </div>
+
+                <div class="row">
+                    <div class="col-md-6">
+                        <div class="mb-3">
+                            <label for="maxRetries" class="form-label">Max retries</label>
+                            <input type="number" class="form-control" id="maxRetries" value="3" min="1">
+                        </div>
+                    </div>
+                    <div class="col-md-6">
+                        <div class="mb-3">
+                            <label for="timeout" class="form-label">Request timeout (seconds)</label>
+                            <input type="number" class="form-control" id="timeout" value="30" min="1">
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </div>
+
+        <!-- action buttons -->
+        <div class="d-flex justify-content-between mb-5">
+            <button class="btn btn-primary" onclick="startCrawling()">
+                <i class="fas fa-play"></i> Start Crawling
+            </button>
+            <button class="btn btn-secondary" onclick="saveConfig()">
+                <i class="fas fa-save"></i> Save Config
+            </button>
+        </div>
+
+        <!-- crawler status and log -->
+        <div class="card">
+            <div class="card-header">
+                <h5 class="mb-0">Crawler Status</h5>
+            </div>
+            <div class="card-body">
+                <div class="progress mb-3">
+                    <div id="crawlProgress" class="progress-bar" role="progressbar" style="width: 0%"></div>
+                </div>
+                <div class="border p-3 bg-light" style="height: 200px; overflow-y: auto;">
+                    <pre id="crawlLog" class="mb-0"></pre>
+                </div>
+            </div>
+        </div>
+    </div>
+
+    <script src="https://cdn.bootcdn.net/ajax/libs/jquery/3.6.0/jquery.min.js"></script>
+    <script src="https://cdn.bootcdn.net/ajax/libs/twitter-bootstrap/5.0.2/js/bootstrap.bundle.min.js"></script>
+    <script>
+        // predefined topic list (Weibo search terms, kept in Chinese)
+        const predefinedTopics = [
+            '热门', '社会', '科技', '娱乐', '体育', '财经',
+            '教育', '健康', '军事', '文化', '汽车', '美食'
+        ];
+
+        // currently selected topics
+        let selectedTopics = new Set();
+
+        // initialize the page
+        window.onload = function() {
+            loadPredefinedTopics();
+        };
+
+        // render the predefined topics
+        function loadPredefinedTopics() {
+            const topicsDiv = document.getElementById('predefinedTopics');
+            predefinedTopics.forEach(topic => {
+                const topicElement = document.createElement('span');
+                topicElement.className = 'topic-item';
+                topicElement.textContent = topic;
+                topicElement.onclick = () => toggleTopic(topic, topicElement);
+                topicsDiv.appendChild(topicElement);
+            });
+        }
+
+        // toggle a topic's selected state
+        function toggleTopic(topic, element) {
+            if (selectedTopics.has(topic)) {
+                selectedTopics.delete(topic);
+                element.classList.remove('selected');
+            } else {
+                selectedTopics.add(topic);
+                element.classList.add('selected');
+            }
+            updateSelectedTopicsList();
+        }
+
+        // add a custom topic
+        function addCustomTopic() {
+            const input = document.getElementById('customTopic');
+            const topic = input.value.trim();
+            if (topic) {
+                selectedTopics.add(topic);
+                input.value = '';
+                updateSelectedTopicsList();
+            }
+        }
+
+        // re-render the selected-topics list
+        function updateSelectedTopicsList() {
+            const listDiv = document.getElementById('selectedTopicsList');
+            listDiv.innerHTML = '';
+            selectedTopics.forEach(topic => {
+                const topicElement = document.createElement('span');
+                topicElement.className = 'topic-item selected';
+                topicElement.textContent = topic;
+                topicElement.onclick = () => {
+                    selectedTopics.delete(topic);
+                    updateSelectedTopicsList();
+                };
+                listDiv.appendChild(topicElement);
+            });
+        }
+
+        // start crawling
+        function startCrawling() {
+            if (selectedTopics.size === 0) {
+                alert('Please select at least one topic!');
+                return;
+            }
+
+            const config = {
+                topics: Array.from(selectedTopics),
+                parameters: {
+                    crawlDepth: parseInt(document.getElementById('crawlDepth').value),
+                    interval: parseInt(document.getElementById('interval').value),
+                    maxRetries: parseInt(document.getElementById('maxRetries').value),
+                    timeout: parseInt(document.getElementById('timeout').value)
+                }
+            };
+
+            // send the crawler configuration to the backend
+            fetch('/api/spider/start', {
+                method: 'POST',
+                headers: {
+                    'Content-Type': 'application/json'
+                },
+                body: JSON.stringify(config)
+            })
+            .then(response => response.json())
+            .then(data => {
+                if (data.success) {
+                    updateCrawlLog('Crawler task started...');
+                } else {
+                    updateCrawlLog('Failed to start: ' + data.message);
+                }
+            })
+            .catch(error => {
+                updateCrawlLog('Error: ' + error.message);
+            });
+        }
+
+        // save the configuration
+        function saveConfig() {
+            const config = {
+                topics: Array.from(selectedTopics),
+                parameters: {
+                    crawlDepth: parseInt(document.getElementById('crawlDepth').value),
+                    interval: parseInt(document.getElementById('interval').value),
+                    maxRetries: parseInt(document.getElementById('maxRetries').value),
+                    timeout: parseInt(document.getElementById('timeout').value)
+                }
+            };
+
+            fetch('/api/spider/save-config', {
+                method: 'POST',
+                headers: {
+                    'Content-Type': 'application/json'
+                },
+                body: JSON.stringify(config)
+            })
+            .then(response => response.json())
+            .then(data => {
+                if (data.success) {
+                    alert('Configuration saved!');
+                } else {
+                    alert('Failed to save: ' + data.message);
+                }
+            })
+            .catch(error => {
+                alert('Error while saving: ' + error.message);
+            });
+        }
+
+        // append a line to the crawl log
+        function updateCrawlLog(message) {
+            const log = document.getElementById('crawlLog');
+            const timestamp = new Date().toLocaleTimeString();
+            log.textContent += `[${timestamp}] ${message}\n`;
+            // the <pre> itself does not scroll; its wrapper div does
+            log.parentElement.scrollTop = log.parentElement.scrollHeight;
+        }
+
+        // WebSocket connection for live crawler status updates
+        const ws = new WebSocket(`ws://${window.location.host}/ws/spider-status`);
+
+        ws.onmessage = function(event) {
+            const data = JSON.parse(event.data);
+            if (data.type === 'progress') {
+                document.getElementById('crawlProgress').style.width = data.value + '%';
+            } else if (data.type === 'log') {
+                updateCrawlLog(data.message);
+            }
+        };
+    </script>
+</body>
+</html>
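
The JSON contract between `startCrawling()` and the backend can be exercised without the page. A minimal sketch using Python's requests, assuming the Flask dev server's default address (the topic values are illustrative):

    import requests

    config = {
        'topics': ['科技', '教育'],      # any mix of predefined and custom topics
        'parameters': {
            'crawlDepth': 3,             # pages per topic
            'interval': 5,               # seconds between requests
            'maxRetries': 3,
            'timeout': 30                # per-request timeout in seconds
        }
    }
    resp = requests.post('http://localhost:5000/api/spider/start', json=config)
    print(resp.json())                   # e.g. {'success': True, 'message': 'Crawler task started'}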
+from flask import Blueprint, jsonify, request, render_template
+import json
+import os
+import threading
+import logging
+from flask_sock import Sock  # Flask has no built-in WebSocket support; flask-sock provides it
+from spider.spiderData import SpiderData
+
+# create the blueprint
+spider_bp = Blueprint('spider', __name__)
+
+# flask-sock instance; wire it up with sock.init_app(app) where the blueprint is registered
+sock = Sock()
+
+# create the logger
+logger = logging.getLogger('spider_control')
+logger.setLevel(logging.INFO)
+
+# set of open WebSocket connections
+websocket_connections = set()
+
+# default configuration
+DEFAULT_CONFIG = {
+    'crawlDepth': 3,
+    'interval': 5,
+    'maxRetries': 3,
+    'timeout': 30
+}
+
+def load_config():
+    """Load the crawler configuration."""
+    config_path = os.path.join(os.path.dirname(__file__), '../spider/config.json')
+    try:
+        if os.path.exists(config_path):
+            with open(config_path, 'r', encoding='utf-8') as f:
+                return json.load(f)
+    except Exception as e:
+        logger.error(f"Failed to load the config file: {e}")
+    return DEFAULT_CONFIG
+
+def save_config(config):
+    """Save the crawler configuration."""
+    config_path = os.path.join(os.path.dirname(__file__), '../spider/config.json')
+    try:
+        with open(config_path, 'w', encoding='utf-8') as f:
+            json.dump(config, f, ensure_ascii=False, indent=4)
+        return True
+    except Exception as e:
+        logger.error(f"Failed to save the config file: {e}")
+        return False
+
+def broadcast_message(message):
+    """Broadcast a message to every connected WebSocket client."""
+    if not websocket_connections:
+        return
+
+    payload = json.dumps(message)
+    for websocket in websocket_connections.copy():
+        try:
+            websocket.send(payload)
+        except Exception as e:
+            logger.error(f"Failed to send a WebSocket message: {e}")
+            websocket_connections.discard(websocket)
+
+def spider_worker(topics, parameters):
+    """Worker thread that runs the crawl."""
+    total_topics = len(topics)
+    completed_topics = 0
+
+    try:
+        spider = SpiderData()
+
+        for topic in topics:
+            try:
+                # report progress
+                progress = int((completed_topics / total_topics) * 100)
+                broadcast_message({
+                    'type': 'progress',
+                    'value': progress
+                })
+
+                # log the start of this topic
+                broadcast_message({
+                    'type': 'log',
+                    'message': f'Start crawling topic: {topic}'
+                })
+
+                # run the crawl
+                spider.crawl_topic(
+                    topic=topic,
+                    depth=parameters['crawlDepth'],
+                    interval=parameters['interval'],
+                    max_retries=parameters['maxRetries'],
+                    timeout=parameters['timeout']
+                )
+
+                completed_topics += 1
+
+                # log the completion of this topic
+                broadcast_message({
+                    'type': 'log',
+                    'message': f'Finished crawling topic {topic}'
+                })
+
+            except Exception as e:
+                # log the error
+                broadcast_message({
+                    'type': 'log',
+                    'message': f'Error while crawling topic {topic}: {str(e)}'
+                })
+
+        # report final progress
+        broadcast_message({
+            'type': 'progress',
+            'value': 100
+        })
+
+        # report overall completion
+        broadcast_message({
+            'type': 'log',
+            'message': 'All topics crawled'
+        })
+
+    except Exception as e:
+        # log the failure
+        broadcast_message({
+            'type': 'log',
+            'message': f'Crawler task failed: {str(e)}'
+        })
+
+@spider_bp.route('/spider/control')
+def spider_control():
+    """Render the crawler control page."""
+    return render_template('spider_control.html')
+
+@spider_bp.route('/api/spider/start', methods=['POST'])
+def start_spider():
+    """Start a crawler task."""
+    try:
+        data = request.get_json()
+        topics = data.get('topics', [])
+        parameters = data.get('parameters', DEFAULT_CONFIG)
+
+        if not topics:
+            return jsonify({
+                'success': False,
+                'message': 'Please select at least one topic'
+            })
+
+        # start the crawler thread
+        thread = threading.Thread(
+            target=spider_worker,
+            args=(topics, parameters),
+            daemon=True
+        )
+        thread.start()
+
+        return jsonify({
+            'success': True,
+            'message': 'Crawler task started'
+        })
+
+    except Exception as e:
+        logger.error(f"Failed to start the crawler task: {e}")
+        return jsonify({
+            'success': False,
+            'message': str(e)
+        })
+
+@spider_bp.route('/api/spider/save-config', methods=['POST'])
+def save_spider_config():
+    """Save the crawler configuration."""
+    try:
+        config = request.get_json()
+        if save_config(config):
+            return jsonify({
+                'success': True,
+                'message': 'Configuration saved'
+            })
+        else:
+            return jsonify({
+                'success': False,
+                'message': 'Failed to save the configuration'
+            })
+    except Exception as e:
+        logger.error(f"Failed to save the configuration: {e}")
+        return jsonify({
+            'success': False,
+            'message': str(e)
+        })
+
+@sock.route('/ws/spider-status')
+def spider_status_socket(ws):
+    """Handle a WebSocket connection for live status updates."""
+    websocket_connections.add(ws)
+    try:
+        # keep the connection open; receive() blocks until the client
+        # sends something or disconnects, while updates are pushed from
+        # the crawler thread via broadcast_message()
+        while True:
+            ws.receive()
+    except Exception as e:
+        logger.error(f"WebSocket connection closed: {e}")
+    finally:
+        websocket_connections.discard(ws)
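
Because flask-sock only attaches its routes once it is bound to the application, the app.py change above would also need the Sock instance wired in. A minimal sketch, assuming the `app` object from the app.py diff at the top:

    # app.py, next to the existing blueprint registrations
    from views.spider_control import spider_bp, sock

    app.register_blueprint(spider_bp)  # register the spider-control blueprint
    sock.init_app(app)                 # attach the /ws/spider-status WebSocket route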