Add a visual control panel for the crawler, supporting customization of topics and parameter configuration.
Showing 4 changed files with 639 additions and 0 deletions.
@@ -99,8 +99,10 @@ app.secret_key = 'this is secret_key you know ?'  # set the Flask secret key, used…
 # import blueprints
 from views.page import page
 from views.user import user
+from views.spider_control import spider_bp
 app.register_blueprint(page.pb)  # register the page blueprint
 app.register_blueprint(user.ub)  # register the user blueprint
+app.register_blueprint(spider_bp)  # register the spider-control blueprint

 # index route: clears the session
 @app.route('/')
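Note: Flask itself has no WebSocket support, so the /ws/spider-status endpoint in views/spider_control.py (below) is written against the flask-sock extension. That implies one extra line of wiring in app.py beyond the blueprint registration above. A minimal sketch, assuming flask-sock is installed and spider_control exposes its Sock instance as `sock`:

# Sketch: wiring the WebSocket extension (assumes flask-sock is installed).
from views.spider_control import spider_bp, sock

app.register_blueprint(spider_bp)  # REST endpoints + control page
sock.init_app(app)                 # activates the /ws/spider-status route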
@@ -3,6 +3,13 @@ from spiderDataPackage.spiderContent import start as spiderContent
 from spiderDataPackage.spiderComments import start as spiderComments
 from spiderDataPackage.settings import navAddr
 import os
+import requests
+import time
+import random
+from bs4 import BeautifulSoup
+from datetime import datetime
+# project logger, deliberately aliased as `logging` so the calls below read like the stdlib
+from utils.logger import spider_logger as logging

 def spiderData():
     if not os.path.exists(navAddr):
@@ -13,5 +20,131 @@ def spiderData():
     print('crawling article comment data')
     spiderComments()

+class SpiderData:
+    def __init__(self):
+        self.headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+        self.base_url = 'https://s.weibo.com'
+
+    def crawl_topic(self, topic, depth=3, interval=5, max_retries=3, timeout=30):
+        """
+        Crawl Weibo posts for a given topic.
+
+        :param topic: topic to crawl
+        :param depth: crawl depth (number of result pages)
+        :param interval: delay between requests, in seconds
+        :param max_retries: maximum number of retries per page
+        :param timeout: request timeout, in seconds
+        """
+        logging.info(f"starting crawl for topic: {topic}")
+
+        for page in range(1, depth + 1):
+            retries = 0
+            while retries < max_retries:
+                try:
+                    url = f"{self.base_url}/weibo?q={topic}&page={page}"
+                    response = requests.get(url, headers=self.headers, timeout=timeout)
+
+                    if response.status_code == 200:
+                        self._parse_page(response.text)
+                        logging.info(f"successfully crawled page {page} of topic {topic}")
+                        break
+                    else:
+                        logging.warning(f"request failed, status code: {response.status_code}")
+                        retries += 1
+
+                except requests.RequestException as e:
+                    logging.error(f"request exception: {e}")
+                    retries += 1
+
+                if retries < max_retries:
+                    # randomized backoff: between 1x and 2x the base interval
+                    sleep_time = interval * (1 + random.random())
+                    logging.info(f"waiting {sleep_time:.2f}s before retrying...")
+                    time.sleep(sleep_time)
+
+            if retries == max_retries:
+                logging.error(f"failed to crawl page {page} of topic {topic}: max retries reached")
+                continue
+
+            # randomized delay between pages to avoid hammering the server
+            if page < depth:
+                sleep_time = interval * (1 + random.random())
+                logging.info(f"waiting {sleep_time:.2f}s before the next page...")
+                time.sleep(sleep_time)
+
+    def _parse_page(self, html_content):
+        """
+        Parse one result page and save the extracted data.
+
+        :param html_content: HTML of the page
+        """
+        try:
+            soup = BeautifulSoup(html_content, 'html.parser')
+            weibo_items = soup.find_all('div', class_='card-wrap')
+
+            for item in weibo_items:
+                try:
+                    # post text
+                    content = item.find('p', class_='txt')
+                    if not content:
+                        continue
+
+                    # author info
+                    user_info = item.find('a', class_='name')
+                    if not user_info:
+                        continue
+
+                    # publish time
+                    time_info = item.find('p', class_='from')
+
+                    # interaction counters (forward / comment / like)
+                    actions = item.find_all('li', class_='action')
+
+                    # assemble the record
+                    weibo_data = {
+                        'content': content.text.strip(),
+                        'user_name': user_info.text.strip(),
+                        'publish_time': time_info.text.strip() if time_info else '',
+                        'forward_count': self._extract_number(actions[0].text) if len(actions) > 0 else 0,
+                        'comment_count': self._extract_number(actions[1].text) if len(actions) > 1 else 0,
+                        'like_count': self._extract_number(actions[2].text) if len(actions) > 2 else 0,
+                        'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+                    }
+
+                    # save to the database
+                    self._save_to_database(weibo_data)
+
+                except Exception as e:
+                    logging.error(f"error while parsing a weibo item: {e}")
+                    continue
+
+        except Exception as e:
+            logging.error(f"error while parsing the page: {e}")
+
+    def _extract_number(self, text):
+        """
+        Extract an integer from a text fragment.
+
+        :param text: text that may contain digits
+        :return: the extracted number, or 0 if none is found
+        """
+        try:
+            return int(''.join(filter(str.isdigit, text)))
+        except ValueError:
+            return 0
+
+    def _save_to_database(self, data):
+        """
+        Save one record to the database.
+
+        :param data: dictionary of fields to save
+        """
+        try:
+            # TODO: implement the database write (see the sketch below)
+            logging.info(f"saving data: {data}")
+        except Exception as e:
+            logging.error(f"error while saving data: {e}")
+
 if __name__ == '__main__':
     spiderData()
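_save_to_database above only logs the record and leaves the actual write as a TODO. A minimal sketch of the method using the standard-library sqlite3 module; the spider.db filename and the weibo table schema are illustrative assumptions, not part of this commit:

import sqlite3

def _save_to_database(self, data):
    """Persist one parsed post (sketch; file name and schema are assumed)."""
    conn = sqlite3.connect('spider.db')  # hypothetical database file
    try:
        conn.execute(
            '''CREATE TABLE IF NOT EXISTS weibo (
                   content TEXT, user_name TEXT, publish_time TEXT,
                   forward_count INTEGER, comment_count INTEGER,
                   like_count INTEGER, crawl_time TEXT)'''
        )
        conn.execute(
            'INSERT INTO weibo VALUES (?, ?, ?, ?, ?, ?, ?)',
            (data['content'], data['user_name'], data['publish_time'],
             data['forward_count'], data['comment_count'],
             data['like_count'], data['crawl_time']),
        )
        conn.commit()
    finally:
        conn.close()

With a real write in place, SpiderData().crawl_topic('科技', depth=2) fetches two result pages for the topic and persists every parsed post.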
templates/spider_control.html (new file, mode 100644)
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Crawler Control Panel</title>
    <link href="https://cdn.bootcdn.net/ajax/libs/twitter-bootstrap/5.0.2/css/bootstrap.min.css" rel="stylesheet">
    <link href="https://cdn.bootcdn.net/ajax/libs/font-awesome/5.15.4/css/all.min.css" rel="stylesheet">
    <style>
        .topic-item {
            margin: 5px;
            padding: 8px 15px;
            border-radius: 20px;
            background-color: #f8f9fa;
            display: inline-block;
            cursor: pointer;
        }
        .topic-item.selected {
            background-color: #0d6efd;
            color: white;
        }
        .custom-topic-input {
            margin: 10px 0;
        }
        .parameter-section {
            margin: 20px 0;
            padding: 20px;
            border-radius: 10px;
            background-color: #f8f9fa;
        }
    </style>
</head>
<body>
    <div class="container mt-5">
        <h2 class="mb-4">Crawler Control Panel</h2>

        <!-- topic selection -->
        <div class="card mb-4">
            <div class="card-header">
                <h5 class="mb-0">Select Topics</h5>
            </div>
            <div class="card-body">
                <div id="predefinedTopics" class="mb-3">
                    <!-- predefined topics are loaded dynamically via JavaScript -->
                </div>

                <div class="custom-topic-input">
                    <h6>Add a custom topic</h6>
                    <div class="input-group">
                        <input type="text" class="form-control" id="customTopic" placeholder="Enter a custom topic">
                        <button class="btn btn-primary" onclick="addCustomTopic()">
                            <i class="fas fa-plus"></i> Add
                        </button>
                    </div>
                </div>

                <div id="selectedTopics" class="mt-3">
                    <h6>Selected topics:</h6>
                    <div id="selectedTopicsList" class="mt-2">
                        <!-- selected topics are rendered here -->
                    </div>
                </div>
            </div>
        </div>

        <!-- crawler parameter configuration -->
        <div class="card mb-4">
            <div class="card-header">
                <h5 class="mb-0">Crawler Parameters</h5>
            </div>
            <div class="card-body">
                <div class="row">
                    <div class="col-md-6">
                        <div class="mb-3">
                            <label for="crawlDepth" class="form-label">Crawl depth</label>
                            <input type="number" class="form-control" id="crawlDepth" value="3" min="1" max="10">
                            <small class="text-muted">Pages to crawl per topic (1-10)</small>
                        </div>
                    </div>
                    <div class="col-md-6">
                        <div class="mb-3">
                            <label for="interval" class="form-label">Crawl interval (seconds)</label>
                            <input type="number" class="form-control" id="interval" value="5" min="1">
                            <small class="text-muted">Delay between requests</small>
                        </div>
                    </div>
                </div>

                <div class="row">
                    <div class="col-md-6">
                        <div class="mb-3">
                            <label for="maxRetries" class="form-label">Max retries</label>
                            <input type="number" class="form-control" id="maxRetries" value="3" min="1">
                        </div>
                    </div>
                    <div class="col-md-6">
                        <div class="mb-3">
                            <label for="timeout" class="form-label">Request timeout (seconds)</label>
                            <input type="number" class="form-control" id="timeout" value="30" min="1">
                        </div>
                    </div>
                </div>
            </div>
        </div>

        <!-- action buttons -->
        <div class="d-flex justify-content-between mb-5">
            <button class="btn btn-primary" onclick="startCrawling()">
                <i class="fas fa-play"></i> Start crawling
            </button>
            <button class="btn btn-secondary" onclick="saveConfig()">
                <i class="fas fa-save"></i> Save config
            </button>
        </div>

        <!-- crawler status and log -->
        <div class="card">
            <div class="card-header">
                <h5 class="mb-0">Crawler Status</h5>
            </div>
            <div class="card-body">
                <div class="progress mb-3">
                    <div id="crawlProgress" class="progress-bar" role="progressbar" style="width: 0%"></div>
                </div>
                <div id="crawlLogContainer" class="border p-3 bg-light" style="height: 200px; overflow-y: auto;">
                    <pre id="crawlLog" class="mb-0"></pre>
                </div>
            </div>
        </div>
    </div>
    <script src="https://cdn.bootcdn.net/ajax/libs/jquery/3.6.0/jquery.min.js"></script>
    <script src="https://cdn.bootcdn.net/ajax/libs/twitter-bootstrap/5.0.2/js/bootstrap.bundle.min.js"></script>
    <script>
        // predefined topics -- kept in Chinese because they are sent verbatim
        // as search keywords to s.weibo.com
        const predefinedTopics = [
            '热门', '社会', '科技', '娱乐', '体育', '财经',
            '教育', '健康', '军事', '文化', '汽车', '美食'
        ];

        // currently selected topics
        let selectedTopics = new Set();

        // initialize the page
        window.onload = function() {
            loadPredefinedTopics();
        };

        // render the predefined topic chips
        function loadPredefinedTopics() {
            const topicsDiv = document.getElementById('predefinedTopics');
            predefinedTopics.forEach(topic => {
                const topicElement = document.createElement('span');
                topicElement.className = 'topic-item';
                topicElement.textContent = topic;
                topicElement.onclick = () => toggleTopic(topic, topicElement);
                topicsDiv.appendChild(topicElement);
            });
        }

        // toggle a topic's selected state
        function toggleTopic(topic, element) {
            if (selectedTopics.has(topic)) {
                selectedTopics.delete(topic);
                element.classList.remove('selected');
            } else {
                selectedTopics.add(topic);
                element.classList.add('selected');
            }
            updateSelectedTopicsList();
        }

        // add a custom topic
        function addCustomTopic() {
            const input = document.getElementById('customTopic');
            const topic = input.value.trim();
            if (topic) {
                selectedTopics.add(topic);
                input.value = '';
                updateSelectedTopicsList();
            }
        }

        // re-render the list of selected topics
        function updateSelectedTopicsList() {
            const listDiv = document.getElementById('selectedTopicsList');
            listDiv.innerHTML = '';
            selectedTopics.forEach(topic => {
                const topicElement = document.createElement('span');
                topicElement.className = 'topic-item selected';
                topicElement.textContent = topic;
                topicElement.onclick = () => {
                    selectedTopics.delete(topic);
                    updateSelectedTopicsList();
                };
                listDiv.appendChild(topicElement);
            });
        }

        // start crawling
        function startCrawling() {
            if (selectedTopics.size === 0) {
                alert('Please select at least one topic!');
                return;
            }

            const config = {
                topics: Array.from(selectedTopics),
                parameters: {
                    crawlDepth: parseInt(document.getElementById('crawlDepth').value),
                    interval: parseInt(document.getElementById('interval').value),
                    maxRetries: parseInt(document.getElementById('maxRetries').value),
                    timeout: parseInt(document.getElementById('timeout').value)
                }
            };

            // send the crawl configuration to the backend
            fetch('/api/spider/start', {
                method: 'POST',
                headers: {
                    'Content-Type': 'application/json'
                },
                body: JSON.stringify(config)
            })
            .then(response => response.json())
            .then(data => {
                if (data.success) {
                    updateCrawlLog('Crawl task started...');
                } else {
                    updateCrawlLog('Failed to start: ' + data.message);
                }
            })
            .catch(error => {
                updateCrawlLog('Error: ' + error.message);
            });
        }

        // save the configuration
        function saveConfig() {
            const config = {
                topics: Array.from(selectedTopics),
                parameters: {
                    crawlDepth: parseInt(document.getElementById('crawlDepth').value),
                    interval: parseInt(document.getElementById('interval').value),
                    maxRetries: parseInt(document.getElementById('maxRetries').value),
                    timeout: parseInt(document.getElementById('timeout').value)
                }
            };

            fetch('/api/spider/save-config', {
                method: 'POST',
                headers: {
                    'Content-Type': 'application/json'
                },
                body: JSON.stringify(config)
            })
            .then(response => response.json())
            .then(data => {
                if (data.success) {
                    alert('Configuration saved!');
                } else {
                    alert('Save failed: ' + data.message);
                }
            })
            .catch(error => {
                alert('Error while saving: ' + error.message);
            });
        }

        // append a line to the crawl log
        function updateCrawlLog(message) {
            const log = document.getElementById('crawlLog');
            const timestamp = new Date().toLocaleTimeString();
            // textContent avoids interpreting log output as HTML
            log.textContent += `[${timestamp}] ${message}\n`;
            // the scrollbar lives on the container div, not on the <pre>
            const container = document.getElementById('crawlLogContainer');
            container.scrollTop = container.scrollHeight;
        }

        // WebSocket connection for live crawler status updates
        // (use wss:// when the page itself is served over https)
        const wsProtocol = window.location.protocol === 'https:' ? 'wss' : 'ws';
        const ws = new WebSocket(`${wsProtocol}://${window.location.host}/ws/spider-status`);

        ws.onmessage = function(event) {
            const data = JSON.parse(event.data);
            if (data.type === 'progress') {
                document.getElementById('crawlProgress').style.width = data.value + '%';
            } else if (data.type === 'log') {
                updateCrawlLog(data.message);
            }
        };
    </script>
</body>
</html>
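For reference, the /api/spider/start endpoint can be exercised without the UI using the same JSON shape startCrawling() builds; a quick test from Python against a local dev server (host and port are assumptions):

import requests

config = {
    'topics': ['科技', '财经'],          # Weibo search keywords
    'parameters': {'crawlDepth': 3, 'interval': 5,
                   'maxRetries': 3, 'timeout': 30},
}
resp = requests.post('http://127.0.0.1:5000/api/spider/start', json=config)
print(resp.json())   # e.g. {'success': True, 'message': 'crawl task started'}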
views/spider_control.py (new file, mode 100644)
from flask import Blueprint, jsonify, request, render_template
import json
import os
import threading
import logging
# Flask has no native WebSocket support; the /ws/spider-status route below
# uses the flask-sock extension (the Sock instance must also be initialized
# on the app with sock.init_app(app))
from flask_sock import Sock
from spider.spiderData import SpiderData

# create the blueprint and the WebSocket extension
spider_bp = Blueprint('spider', __name__)
sock = Sock()

# create a logger
logger = logging.getLogger('spider_control')
logger.setLevel(logging.INFO)

# set of active WebSocket connections
websocket_connections = set()

# default crawler parameters
DEFAULT_CONFIG = {
    'crawlDepth': 3,
    'interval': 5,
    'maxRetries': 3,
    'timeout': 30
}

def load_config():
    """Load the crawler configuration, falling back to the defaults."""
    config_path = os.path.join(os.path.dirname(__file__), '../spider/config.json')
    try:
        if os.path.exists(config_path):
            with open(config_path, 'r', encoding='utf-8') as f:
                return json.load(f)
    except Exception as e:
        logger.error(f"failed to load config file: {e}")
    return DEFAULT_CONFIG

def save_config(config):
    """Persist the crawler configuration to spider/config.json."""
    config_path = os.path.join(os.path.dirname(__file__), '../spider/config.json')
    try:
        with open(config_path, 'w', encoding='utf-8') as f:
            json.dump(config, f, ensure_ascii=False, indent=4)
        return True
    except Exception as e:
        logger.error(f"failed to save config file: {e}")
        return False

def broadcast_message(message):
    """Broadcast a message to every connected WebSocket client."""
    payload = json.dumps(message)
    for ws in websocket_connections.copy():
        try:
            ws.send(payload)
        except Exception as e:
            # drop clients whose connection has gone away
            logger.error(f"failed to send WebSocket message: {e}")
            websocket_connections.discard(ws)

def spider_worker(topics, parameters):
    """Background thread: crawl each topic and report progress over WebSocket."""
    total_topics = len(topics)
    completed_topics = 0

    try:
        spider = SpiderData()

        for topic in topics:
            try:
                # report progress before starting this topic
                progress = int((completed_topics / total_topics) * 100)
                broadcast_message({'type': 'progress', 'value': progress})

                broadcast_message({'type': 'log',
                                   'message': f'starting crawl for topic: {topic}'})

                # run the crawl with the user-supplied parameters
                spider.crawl_topic(
                    topic=topic,
                    depth=parameters['crawlDepth'],
                    interval=parameters['interval'],
                    max_retries=parameters['maxRetries'],
                    timeout=parameters['timeout']
                )

                completed_topics += 1

                broadcast_message({'type': 'log',
                                   'message': f'topic {topic} crawled'})

            except Exception as e:
                broadcast_message({'type': 'log',
                                   'message': f'error while crawling topic {topic}: {e}'})

        # final progress and completion message
        broadcast_message({'type': 'progress', 'value': 100})
        broadcast_message({'type': 'log', 'message': 'all topics crawled'})

    except Exception as e:
        broadcast_message({'type': 'log',
                           'message': f'crawl task failed: {e}'})

@spider_bp.route('/spider/control')
def spider_control():
    """Render the crawler control page."""
    return render_template('spider_control.html')

@spider_bp.route('/api/spider/start', methods=['POST'])
def start_spider():
    """Start a crawl task in a background thread."""
    try:
        data = request.get_json()
        topics = data.get('topics', [])
        parameters = data.get('parameters', DEFAULT_CONFIG)

        if not topics:
            return jsonify({
                'success': False,
                'message': 'please select at least one topic'
            })

        # run the crawl in a daemon thread so the request returns immediately
        thread = threading.Thread(
            target=spider_worker,
            args=(topics, parameters),
            daemon=True
        )
        thread.start()

        return jsonify({
            'success': True,
            'message': 'crawl task started'
        })

    except Exception as e:
        logger.error(f"failed to start crawl task: {e}")
        return jsonify({
            'success': False,
            'message': str(e)
        })

@spider_bp.route('/api/spider/save-config', methods=['POST'])
def save_spider_config():
    """Save the crawler configuration posted by the control panel."""
    try:
        config = request.get_json()
        if save_config(config):
            return jsonify({
                'success': True,
                'message': 'configuration saved'
            })
        else:
            return jsonify({
                'success': False,
                'message': 'failed to save configuration'
            })
    except Exception as e:
        logger.error(f"failed to save configuration: {e}")
        return jsonify({
            'success': False,
            'message': str(e)
        })

@sock.route('/ws/spider-status')
def spider_status_socket(ws):
    """Track a WebSocket client for status updates (flask-sock handler)."""
    websocket_connections.add(ws)
    try:
        # this endpoint only pushes data; receive() blocks until the client
        # disconnects, at which point flask-sock raises ConnectionClosed
        while True:
            ws.receive()
    finally:
        websocket_connections.discard(ws)
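save_config writes whatever the panel posts verbatim, so after a save spider/config.json holds the nested shape produced by the frontend rather than the flat DEFAULT_CONFIG. An illustrative file (values are examples):

# Illustrative spider/config.json after "Save config" (values are examples):
{
    "topics": ["热门", "科技"],
    "parameters": {
        "crawlDepth": 3,
        "interval": 5,
        "maxRetries": 3,
        "timeout": 30
    }
}

Since load_config falls back to the flat DEFAULT_CONFIG when the file is missing, any consumer of load_config has to handle both shapes.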