Showing 2 changed files with 210 additions and 166 deletions
The first changed file is the spider module that defines `SpiderData`:

```diff
@@ -11,6 +11,9 @@ from bs4 import BeautifulSoup
 from datetime import datetime
 from utils.logger import spider_logger as logging
 from utils.db_manager import DatabaseManager
+from cachetools import TTLCache, LRUCache
+from typing import List, Dict, Any
+import pandas as pd
 
 def spiderData():
     if not os.path.exists(navAddr):
```
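The new `pandas` dependency is used only to reshape the buffered dicts into rows for `executemany()` (see `_flush_buffer` below). A minimal sketch of the same shaping without pandas, with made-up records for illustration:

```python
# Illustrative only: build executemany() rows from buffered dicts without
# pandas. The records and columns here are stand-ins, not project data.
insert_buffer = [
    {'content': 'hello', 'user_name': 'alice', 'forward_count': 3},
    {'content': 'world', 'user_name': 'bob', 'forward_count': 7},
]

columns = list(insert_buffer[0])  # dict keys, in insertion order
rows = [tuple(rec[col] for col in columns) for rec in insert_buffer]

sql = (f"INSERT INTO article ({', '.join(columns)}) "
       f"VALUES ({', '.join(['%s'] * len(columns))})")
print(sql)   # INSERT INTO article (content, user_name, forward_count) VALUES (%s, %s, %s)
print(rows)  # [('hello', 'alice', 3), ('world', 'bob', 7)]
```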
```diff
@@ -28,17 +31,71 @@ class SpiderData:
         }
         self.base_url = 'https://s.weibo.com'
         self.db = DatabaseManager()
+
+        # Initialize the caches
+        self.data_cache = TTLCache(maxsize=1000, ttl=3600)  # 1-hour TTL cache
+        self.html_cache = LRUCache(maxsize=100)  # LRU cache for the 100 most recent pages
+
+        # Buffer for batched inserts
+        self.insert_buffer = []
+        self.buffer_size = 50  # flush to the database every 50 records
+
+    def _get_cached_page(self, url: str) -> str:
+        """Return the cached content of a page, if any."""
+        return self.html_cache.get(url)
+
+    def _cache_page(self, url: str, content: str):
+        """Cache the content of a page."""
+        self.html_cache[url] = content
+
+    def _get_cached_data(self, key: str) -> Dict[str, Any]:
+        """Return cached data for a key, if any."""
+        return self.data_cache.get(key)
 
-    def crawl_topic(self, topic, depth=3, interval=5, max_retries=3, timeout=30):
-        """
-        Crawl Weibo posts for the given topic.
+    def _cache_data(self, key: str, data: Dict[str, Any]):
+        """Cache a data record."""
+        self.data_cache[key] = data
+
+    def _flush_buffer(self):
+        """Batch-insert the buffered records into the database."""
+        if not self.insert_buffer:
+            return
 
-        :param topic: the topic to crawl
-        :param depth: crawl depth (number of pages)
-        :param interval: interval between requests (seconds)
-        :param max_retries: maximum number of retries
-        :param timeout: request timeout (seconds)
-        """
+        connection = None  # bind before the try so the except block never hits an unbound name
+        try:
+            connection = self.db.get_connection()
+            with connection.cursor() as cursor:
+                # Use pandas to shape the buffered dicts into rows for a bulk insert
+                df = pd.DataFrame(self.insert_buffer)
+
+                # Build the bulk-insert SQL
+                columns = ', '.join(df.columns)
+                values = ', '.join(['%s'] * len(df.columns))
+                sql = f"""
+                    INSERT INTO article ({columns})
+                    VALUES ({values})
+                    ON DUPLICATE KEY UPDATE
+                        forward_count = VALUES(forward_count),
+                        comment_count = VALUES(comment_count),
+                        like_count = VALUES(like_count),
+                        crawl_time = VALUES(crawl_time)
+                """
+
+                # Execute the bulk insert
+                cursor.executemany(sql, df.values.tolist())
+                connection.commit()
+
+            logging.info(f"Batch-inserted {len(self.insert_buffer)} records")
+            self.insert_buffer.clear()
+
+        except Exception as e:
+            logging.error(f"Batch insert failed: {e}")
+            if connection:
+                connection.rollback()
+
+    def crawl_topic(self, topic: str, depth: int = 3, interval: int = 5,
+                    max_retries: int = 3, timeout: int = 30):
+        """Crawl Weibo posts for the given topic."""
         # Validate parameters
         if not isinstance(depth, int) or depth < 1 or depth > 10:
             raise ValueError("Crawl depth must be between 1 and 10 pages")
```
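For reference, the two cachetools containers evict on different criteria: `TTLCache` drops entries after a fixed lifetime, while `LRUCache` evicts the least recently used entry once it is full. A standalone sketch (keys are made up, sizes shrunk for demonstration):

```python
from cachetools import LRUCache, TTLCache

html_cache = LRUCache(maxsize=2)        # capacity-bound eviction
html_cache['page1'] = '<html>1</html>'
html_cache['page2'] = '<html>2</html>'
html_cache['page3'] = '<html>3</html>'  # evicts 'page1', the least recently used
print('page1' in html_cache)            # False

data_cache = TTLCache(maxsize=1000, ttl=3600)  # time-bound eviction (1 hour)
data_cache['topic:42'] = {'like_count': 7}
print(data_cache.get('topic:42'))       # {'like_count': 7} until the TTL lapses
```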
```diff
@@ -56,9 +113,19 @@ class SpiderData:
         while retries < max_retries:
             try:
                 url = f"{self.base_url}/weibo?q={topic}&page={page}"
+
+                # Check the cache first
+                cached_content = self._get_cached_page(url)
+                if cached_content:
+                    self._parse_page(cached_content)
+                    logging.info(f"Using cached data: {topic}, page {page}")
+                    break
+
                 response = requests.get(url, headers=self.headers, timeout=timeout)
 
                 if response.status_code == 200:
+                    # Cache the page content
+                    self._cache_page(url, response.text)
                     self._parse_page(response.text)
                     logging.info(f"Crawled topic {topic}, page {page}")
                     break
```
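This loop is the cache-aside pattern: consult the cache, fall back to the network, then populate the cache. One caveat: a cache hit re-runs `_parse_page`, which re-buffers the same rows, and the `ON DUPLICATE KEY UPDATE` clause in `_flush_buffer` only deduplicates them if the `article` table has a suitable unique key. A condensed, hypothetical version of the same step (`fetch()` is illustrative, not a project function):

```python
import requests
from cachetools import LRUCache

html_cache = LRUCache(maxsize=100)

def fetch(url: str, timeout: int = 30) -> str:
    cached = html_cache.get(url)
    if cached is not None:
        return cached                   # hit: skip the network round-trip
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    html_cache[url] = response.text     # miss: populate for next time
    return response.text
```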
```diff
@@ -84,13 +151,12 @@ class SpiderData:
             sleep_time = interval * (1 + random.random())
             logging.info(f"Waiting {sleep_time:.2f} s before continuing...")
             time.sleep(sleep_time)
-
-    def _parse_page(self, html_content):
-        """
-        Parse the page content and save the data.
 
-        :param html_content: the page's HTML content
-        """
+        # Flush the remaining buffer once all pages are done
+        self._flush_buffer()
+
+    def _parse_page(self, html_content: str):
+        """Parse the page content and save the data."""
         try:
             soup = BeautifulSoup(html_content, 'html.parser')
             weibo_items = soup.find_all('div', class_='card-wrap')
```
```diff
@@ -124,8 +190,12 @@ class SpiderData:
                     'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                 }
 
-                # Save to the database
-                self._save_to_database(weibo_data)
+                # Append to the insert buffer
+                self.insert_buffer.append(weibo_data)
+
+                # Flush once the buffer reaches the threshold
+                if len(self.insert_buffer) >= self.buffer_size:
+                    self._flush_buffer()
 
             except Exception as e:
                 logging.error(f"Error while parsing a Weibo item: {e}")
```
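Since parsed rows now sit in memory until the buffer fills, up to `buffer_size - 1` records can be lost if the process dies between flushes. `crawl_topic` flushes at the end of each run; a process-exit hook is a cheap extra safety net. A sketch, assuming `SpiderData` as defined in this diff:

```python
import atexit

spider = SpiderData()
# Flush whatever is still buffered on normal interpreter shutdown; a hard
# crash can still lose the in-memory rows.
atexit.register(spider._flush_buffer)
```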
```diff
@@ -134,52 +204,12 @@ class SpiderData:
         except Exception as e:
             logging.error(f"Error while parsing the page: {e}")
 
-    def _extract_number(self, text):
-        """
-        Extract a number from the text.
-
-        :param text: text containing digits
-        :return: the extracted number, or 0 if none is found
-        """
+    def _extract_number(self, text: str) -> int:
+        """Extract a number from the text, or 0 if none is found."""
         try:
             return int(''.join(filter(str.isdigit, text)))
         except ValueError:
             return 0
-
-    def _save_to_database(self, data):
-        """
-        Save a record to the database.
-
-        :param data: dict of data to save
-        """
-        connection = None
-        try:
-            connection = self.db.get_connection()
-
-            with connection.cursor() as cursor:
-                # Insert the article record
-                sql = """
-                    INSERT INTO article (content, user_name, publish_time, forward_count,
-                                         comment_count, like_count, crawl_time)
-                    VALUES (%s, %s, %s, %s, %s, %s, %s)
-                """
-                cursor.execute(sql, (
-                    data['content'],
-                    data['user_name'],
-                    data['publish_time'],
-                    data['forward_count'],
-                    data['comment_count'],
-                    data['like_count'],
-                    data['crawl_time']
-                ))
-
-                connection.commit()
-                logging.info(f"Saved Weibo record: {data['content'][:30]}...")
-
-        except Exception as e:
-            logging.error(f"Error while saving data: {e}")
-            if connection:
-                connection.rollback()
 
 if __name__ == '__main__':
     spiderData()
```
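The typed `_extract_number` keeps the original digit-concatenation logic, which is worth spelling out: thousands separators are stripped as intended, but unit suffixes such as 万 (a ×10,000 multiplier common in Weibo counts) are silently dropped:

```python
# Behavior of _extract_number exactly as written in the diff.
def _extract_number(text: str) -> int:
    try:
        return int(''.join(filter(str.isdigit, text)))
    except ValueError:
        return 0

print(_extract_number('转发 1,234'))  # 1234 (comma stripped, as intended)
print(_extract_number('10万'))        # 10   (the x10000 multiplier is lost)
print(_extract_number('评论'))        # 0    (no digits -> ValueError -> 0)
```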
The second changed file is the Flask blueprint that exposes the spider control routes:

```diff
@@ -10,6 +10,10 @@ import logging
 from spider.spiderData import SpiderData
 from openai import OpenAI
 from anthropic import Anthropic
+import aiohttp
+from concurrent.futures import ThreadPoolExecutor
+from ratelimit import limits, sleep_and_retry
+from tenacity import retry, stop_after_attempt, wait_exponential
 
 # Create the blueprint
 spider_bp = Blueprint('spider', __name__)
```
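Of the new imports, `aiohttp` is not used anywhere in the hunks shown. The `ratelimit` pair works as follows: `@limits` raises `RateLimitException` once the call budget for the window is spent, and `@sleep_and_retry` catches that and sleeps until the window resets. A toy example with a deliberately small budget (`ping` and its limits are illustrative, not the project's values):

```python
from ratelimit import limits, sleep_and_retry

@sleep_and_retry            # sleep through the window instead of raising
@limits(calls=2, period=1)  # budget: 2 calls per second
def ping(i: int) -> int:
    return i

for i in range(5):
    print(ping(i))          # the loop pauses once each 2-call budget is spent
```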
```diff
@@ -24,137 +28,150 @@ websocket_connections = set()
 # Create the message queue
 message_queue = Queue()
 
+# Create a thread pool
+thread_pool = ThreadPoolExecutor(max_workers=3)
+
+# Create an asyncio event loop
+loop = asyncio.new_event_loop()
+asyncio.set_event_loop(loop)
+
 # Default configuration
 DEFAULT_CONFIG = {
     'crawlDepth': 3,
     'interval': 5,
     'maxRetries': 3,
-    'timeout': 30
+    'timeout': 30,
+    'maxConcurrent': 2
 }
 
-def load_config():
-    """Load the spider configuration."""
-    config_path = os.path.join(os.path.dirname(__file__), '../spider/config.json')
-    try:
-        if os.path.exists(config_path):
-            with open(config_path, 'r', encoding='utf-8') as f:
-                return json.load(f)
-    except Exception as e:
-        logger.error(f"Failed to load the config file: {e}")
-    return DEFAULT_CONFIG
+# Rate-limiting decorator
+@sleep_and_retry
+@limits(calls=100, period=60)  # at most 100 requests per minute
+def rate_limited_request():
+    pass
 
-def save_config(config):
-    """Save the spider configuration."""
-    config_path = os.path.join(os.path.dirname(__file__), '../spider/config.json')
-    try:
-        with open(config_path, 'w', encoding='utf-8') as f:
-            json.dump(config, f, ensure_ascii=False, indent=4)
-        return True
-    except Exception as e:
-        logger.error(f"Failed to save the config file: {e}")
-        return False
+class SpiderWorker:
+    def __init__(self, topics, parameters):
+        self.topics = topics
+        self.parameters = parameters
+        self.total_topics = len(topics)
+        self.completed_topics = 0
+        self.spider = SpiderData()
+        self.message_buffer = []
+        self.message_buffer_size = 10
+        self.semaphore = asyncio.Semaphore(parameters.get('maxConcurrent', DEFAULT_CONFIG['maxConcurrent']))
+
+    async def send_message(self, message):
+        """Send messages asynchronously, buffered for efficiency."""
+        self.message_buffer.append(message)
+        if len(self.message_buffer) >= self.message_buffer_size:
+            await self.flush_messages()
+
+    async def flush_messages(self):
+        """Flush the message buffer."""
+        if not self.message_buffer:
+            return
+
+        try:
+            await broadcast_message(self.message_buffer)
+            self.message_buffer.clear()
+        except Exception as e:
+            logger.error(f"Failed to send messages: {e}")
+
+    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
+    async def crawl_single_topic(self, topic):
+        """Crawl a single topic."""
+        try:
+            rate_limited_request()
+
+            await self.send_message({
+                'type': 'log',
+                'message': f'Started crawling topic: {topic}'
+            })
+
+            async with self.semaphore:
+                await asyncio.get_event_loop().run_in_executor(
+                    thread_pool,
+                    self.spider.crawl_topic,
+                    topic,
+                    self.parameters['crawlDepth'],
+                    self.parameters['interval'],
+                    self.parameters['maxRetries'],
+                    self.parameters['timeout']
+                )
+
+            self.completed_topics += 1
+            progress = int((self.completed_topics / self.total_topics) * 100)
+
+            await self.send_message({
+                'type': 'progress',
+                'value': progress
+            })
+
+            await self.send_message({
+                'type': 'log',
+                'message': f'Finished crawling topic {topic}'
+            })
+
+        except Exception as e:
+            logger.error(f"Failed to crawl topic {topic}: {e}")
+            await self.send_message({
+                'type': 'log',
+                'message': f'Error while crawling topic {topic}: {str(e)}'
+            })
+            raise
+
+    async def run(self):
+        """Run the crawl job."""
+        try:
+            tasks = [self.crawl_single_topic(topic) for topic in self.topics]
+            await asyncio.gather(*tasks)
+            await self.flush_messages()
+
+            await self.send_message({
+                'type': 'log',
+                'message': 'All topics crawled'
+            })
+
+        except Exception as e:
+            logger.error(f"Crawl job failed: {e}")
+            await self.send_message({
+                'type': 'log',
+                'message': f'Crawl job failed: {str(e)}'
+            })
+        finally:
+            await self.flush_messages()
 
-async def broadcast_message(message):
+async def broadcast_message(messages):
     """Broadcast messages to all WebSocket connections."""
     if not websocket_connections:
         return
 
     for websocket in websocket_connections.copy():
         try:
-            await websocket.send(json.dumps(message))
+            if isinstance(messages, list):
+                for message in messages:
+                    await websocket.send(json.dumps(message))
+            else:
+                await websocket.send(json.dumps(messages))
         except websockets.exceptions.ConnectionClosed:
             websocket_connections.remove(websocket)
         except Exception as e:
             logger.error(f"Failed to send WebSocket message: {e}")
             websocket_connections.remove(websocket)
 
-def spider_worker(topics, parameters):
-    """Spider worker thread."""
-    total_topics = len(topics)
-    completed_topics = 0
-
-    async def send_message(message):
-        """Wrapper for sending a message asynchronously."""
-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
-        try:
-            await broadcast_message(message)
-        finally:
-            loop.close()
-
-    try:
-        spider = SpiderData()
-
-        for topic in topics:
-            try:
-                # Update the progress
-                progress = int((completed_topics / total_topics) * 100)
-                asyncio.run(send_message({
-                    'type': 'progress',
-                    'value': progress
-                }))
-
-                # Log the start of the crawl
-                asyncio.run(send_message({
-                    'type': 'log',
-                    'message': f'Started crawling topic: {topic}'
-                }))
-
-                # Run the crawl
-                spider.crawl_topic(
-                    topic=topic,
-                    depth=parameters['crawlDepth'],
-                    interval=parameters['interval'],
-                    max_retries=parameters['maxRetries'],
-                    timeout=parameters['timeout']
-                )
-
-                completed_topics += 1
-
-                # Log the completion of the crawl
-                asyncio.run(send_message({
-                    'type': 'log',
-                    'message': f'Finished crawling topic {topic}'
-                }))
-
-            except Exception as e:
-                # Log the error
-                asyncio.run(send_message({
-                    'type': 'log',
-                    'message': f'Error while crawling topic {topic}: {str(e)}'
-                }))
-
-        # Update the final progress
-        asyncio.run(send_message({
-            'type': 'progress',
-            'value': 100
-        }))
-
-        # Send the completion message
-        asyncio.run(send_message({
-            'type': 'log',
-            'message': 'All topics crawled'
-        }))
-
-    except Exception as e:
-        # Log the error
-        asyncio.run(send_message({
-            'type': 'log',
-            'message': f'Crawl job failed: {str(e)}'
-        }))
-
 @spider_bp.route('/spider/control')
 def spider_control():
     """Render the spider control page."""
     return render_template('spider_control.html')
 
 @spider_bp.route('/api/spider/start', methods=['POST'])
-def start_spider():
+async def start_spider():
     """Start a crawl job."""
     try:
         data = request.get_json()
         topics = data.get('topics', [])
-        parameters = data.get('parameters', DEFAULT_CONFIG)
+        parameters = {**DEFAULT_CONFIG, **data.get('parameters', {})}
 
         if not topics:
             return jsonify({
```
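Two things stand out in this hunk. First, the module-level `loop` is created and set but nothing ever runs it, so coroutines scheduled on it will not execute. Second, `rate_limited_request()` is a synchronous call inside a coroutine: when `@sleep_and_retry` sleeps, it blocks the event-loop thread, not just the current task; the diff offloads `crawl_topic` to the thread pool, but the rate limiter still sleeps on the loop. A hedged sketch of one way to give the loop a home (standard asyncio API, not the project's code):

```python
# Sketch: run the module-level loop in a dedicated daemon thread so that
# coroutines handed to it actually execute. `submit` is a hypothetical helper.
import asyncio
import threading

loop = asyncio.new_event_loop()
threading.Thread(target=loop.run_forever, daemon=True).start()

def submit(coro):
    """Schedule a coroutine on the background loop from any thread."""
    return asyncio.run_coroutine_threadsafe(coro, loop)

# e.g. future = submit(worker.run()); future.result() blocks until it finishes
```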
```diff
@@ -162,13 +179,11 @@ def start_spider():
             'message': 'Please select at least one topic'
         })
 
-        # Start the spider thread
-        thread = threading.Thread(
-            target=spider_worker,
-            args=(topics, parameters),
-            daemon=True
-        )
-        thread.start()
+        # Create the spider worker
+        worker = SpiderWorker(topics, parameters)
+
+        # Run the crawl job on the event loop
+        asyncio.create_task(worker.run())
 
         return jsonify({
             'success': True,
```
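A caveat on this final hunk: `async def` views require `flask[async]`, and Flask runs each such view on a short-lived event loop, so a task created with `asyncio.create_task()` can be cancelled as soon as the view returns. A hedged alternative that builds on the background-loop sketch above; the route body is illustrative, not the project's code:

```python
@spider_bp.route('/api/spider/start', methods=['POST'])
def start_spider():
    data = request.get_json()
    topics = data.get('topics', [])
    parameters = {**DEFAULT_CONFIG, **data.get('parameters', {})}

    if not topics:
        return jsonify({'success': False, 'message': 'Please select at least one topic'})

    worker = SpiderWorker(topics, parameters)
    # Hand the coroutine to the long-lived background loop instead of the
    # request-scoped one, so the crawl outlives the HTTP request.
    asyncio.run_coroutine_threadsafe(worker.run(), loop)

    return jsonify({'success': True})
```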