万朱浩 / Venue-Ops
Authored by 戒酒的李白
2025-03-04 10:45:37 +0800
Commit f85298c02191dcb00a49c5fa6014a298bb9b37ad
1 parent e0719583
Spider System Performance & Stability Enhancement
Showing 2 changed files with 195 additions and 151 deletions
spider/spiderData.py
views/spider_control.py
spider/spiderData.py
@@ -11,6 +11,9 @@ from bs4 import BeautifulSoup
 from datetime import datetime
 from utils.logger import spider_logger as logging
 from utils.db_manager import DatabaseManager
+from cachetools import TTLCache, LRUCache
+from typing import List, Dict, Any
+import pandas as pd

 def spiderData():
     if not os.path.exists(navAddr):
@@ -29,16 +32,69 @@ class SpiderData:
         self.base_url = 'https://s.weibo.com'
         self.db = DatabaseManager()

-    def crawl_topic(self, topic, depth=3, interval=5, max_retries=3, timeout=30):
-        """
-        Crawl Weibo posts for the specified topic
-
-        :param topic: topic to crawl
-        :param depth: crawl depth (number of pages)
-        :param interval: interval between requests (seconds)
-        :param max_retries: maximum number of retries
-        :param timeout: request timeout (seconds)
-        """
+        # Initialize caches
+        self.data_cache = TTLCache(maxsize=1000, ttl=3600)  # 1-hour TTL cache
+        self.html_cache = LRUCache(maxsize=100)  # LRU cache of the 100 most recent pages
+
+        # Buffer for batched inserts
+        self.insert_buffer = []
+        self.buffer_size = 50  # bulk-insert every 50 records
+
+    def _get_cached_page(self, url: str) -> str:
+        """Get cached page content."""
+        return self.html_cache.get(url)
+
+    def _cache_page(self, url: str, content: str):
+        """Cache page content."""
+        self.html_cache[url] = content
+
+    def _get_cached_data(self, key: str) -> Dict[str, Any]:
+        """Get cached data."""
+        return self.data_cache.get(key)
+
+    def _cache_data(self, key: str, data: Dict[str, Any]):
+        """Cache data."""
+        self.data_cache[key] = data
+
+    def _flush_buffer(self):
+        """Bulk-insert the buffered records into the database."""
+        if not self.insert_buffer:
+            return
+        try:
+            connection = self.db.get_connection()
+            with connection.cursor() as cursor:
+                # Use pandas for an efficient bulk insert
+                df = pd.DataFrame(self.insert_buffer)
+                # Build the bulk-insert SQL
+                columns = ', '.join(df.columns)
+                values = ', '.join(['%s'] * len(df.columns))
+                sql = f"""
+                    INSERT INTO article ({columns})
+                    VALUES ({values})
+                    ON DUPLICATE KEY UPDATE
+                        forward_count = VALUES(forward_count),
+                        comment_count = VALUES(comment_count),
+                        like_count = VALUES(like_count),
+                        crawl_time = VALUES(crawl_time)
+                """
+                # Execute the bulk insert
+                cursor.executemany(sql, df.values.tolist())
+                connection.commit()
+                logging.info(f"Bulk-inserted {len(self.insert_buffer)} records")
+                self.insert_buffer.clear()
+        except Exception as e:
+            logging.error(f"Bulk insert failed: {e}")
+            if connection:
+                connection.rollback()
+
+    def crawl_topic(self, topic: str, depth: int = 3, interval: int = 5,
+                    max_retries: int = 3, timeout: int = 30):
+        """Crawl Weibo posts for the specified topic."""
+        # Validate parameters
+        if not isinstance(depth, int) or depth < 1 or depth > 10:
+            raise ValueError("Crawl depth must be between 1 and 10 pages")
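For readers unfamiliar with cachetools: both cache types added to `__init__` behave like dictionaries with an eviction policy, which is what keeps the helper methods above to one line each. A minimal standalone sketch of the eviction behavior (keys and values here are illustrative, not from the commit):

import time
from cachetools import TTLCache, LRUCache

html_cache = LRUCache(maxsize=2)             # evicts the least recently used entry
data_cache = TTLCache(maxsize=100, ttl=1)    # entries expire after ttl seconds

html_cache['a'] = '<html>A</html>'
html_cache['b'] = '<html>B</html>'
_ = html_cache['a']                          # touch 'a'; 'b' is now least recently used
html_cache['c'] = '<html>C</html>'           # inserting 'c' evicts 'b'
print(html_cache.get('b'))                   # None -> treat as a miss and re-fetch

data_cache['k'] = {'likes': 3}
time.sleep(1.1)
print(data_cache.get('k'))                   # None -> the entry has expired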
@@ -56,9 +112,19 @@ class SpiderData:
         while retries < max_retries:
             try:
                 url = f"{self.base_url}/weibo?q={topic}&page={page}"
+                # Check the cache first
+                cached_content = self._get_cached_page(url)
+                if cached_content:
+                    self._parse_page(cached_content)
+                    logging.info(f"Using cached data: {topic}, page {page}")
+                    break
+
                 response = requests.get(url, headers=self.headers, timeout=timeout)
                 if response.status_code == 200:
+                    # Cache the page content
+                    self._cache_page(url, response.text)
                     self._parse_page(response.text)
                     logging.info(f"Crawled topic {topic}, page {page}")
                     break
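The added lines implement a cache-aside read: consult the LRU cache, fall back to the network on a miss, then populate the cache for next time. Reduced to its essentials, the pattern looks like this (fetch_page is an illustrative name, not a function from the diff):

import requests
from cachetools import LRUCache

html_cache = LRUCache(maxsize=100)

def fetch_page(url: str, timeout: int = 30) -> str:
    # Cache-aside: serve the cached copy when present...
    cached = html_cache.get(url)
    if cached is not None:
        return cached
    # ...otherwise fetch, then populate the cache for the next caller
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    html_cache[url] = response.text
    return response.text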
@@ -85,12 +151,11 @@ class SpiderData:
             logging.info(f"Waiting {sleep_time:.2f} seconds before continuing...")
             time.sleep(sleep_time)

-    def _parse_page(self, html_content):
-        """
-        Parse the page content and save the data
-
-        :param html_content: HTML content of the page
-        """
+        # Flush the buffer one final time
+        self._flush_buffer()
+
+    def _parse_page(self, html_content: str):
+        """Parse the page content and save the data."""
         try:
             soup = BeautifulSoup(html_content, 'html.parser')
             weibo_items = soup.find_all('div', class_='card-wrap')
@@ -124,8 +189,12 @@ class SpiderData:
                     'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                 }
-                # Save to the database
-                self._save_to_database(weibo_data)
+                # Append to the insert buffer
+                self.insert_buffer.append(weibo_data)
+                # Bulk-insert once the buffer reaches the threshold
+                if len(self.insert_buffer) >= self.buffer_size:
+                    self._flush_buffer()
             except Exception as e:
                 logging.error(f"Error while parsing a Weibo item: {e}")
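The buffering above trades one database round-trip per record for a single executemany call per batch of 50. The same idea in a self-contained sketch (sqlite3 stands in for the project's MySQL connection, and the table schema is illustrative):

import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE article (content TEXT, like_count INTEGER)')

insert_buffer = []
BUFFER_SIZE = 3  # the commit uses 50

def flush_buffer():
    # One round-trip for the whole batch instead of one per record
    if not insert_buffer:
        return
    conn.executemany('INSERT INTO article (content, like_count) VALUES (?, ?)',
                     insert_buffer)
    conn.commit()
    insert_buffer.clear()

for i in range(7):
    insert_buffer.append((f'post {i}', i))
    if len(insert_buffer) >= BUFFER_SIZE:
        flush_buffer()
flush_buffer()  # final flush, as crawl_topic now does after the page loop

print(conn.execute('SELECT COUNT(*) FROM article').fetchone()[0])  # 7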
@@ -134,52 +203,12 @@ class SpiderData:
         except Exception as e:
             logging.error(f"Error while parsing the page: {e}")

-    def _extract_number(self, text):
-        """
-        Extract a number from text
-
-        :param text: text containing a number
-        :return: the extracted number, or 0 if none is found
-        """
+    def _extract_number(self, text: str) -> int:
+        """Extract a number from text, returning 0 if none is found."""
         try:
             return int(''.join(filter(str.isdigit, text)))
         except ValueError:
             return 0

-    def _save_to_database(self, data):
-        """
-        Save data to the database
-
-        :param data: dictionary of data to save
-        """
-        connection = None
-        try:
-            connection = self.db.get_connection()
-            with connection.cursor() as cursor:
-                # Insert the article data
-                sql = """
-                    INSERT INTO article (content, user_name, publish_time, forward_count,
-                                         comment_count, like_count, crawl_time)
-                    VALUES (%s, %s, %s, %s, %s, %s, %s)
-                """
-                cursor.execute(sql, (
-                    data['content'], data['user_name'], data['publish_time'],
-                    data['forward_count'], data['comment_count'],
-                    data['like_count'], data['crawl_time']
-                ))
-                connection.commit()
-                logging.info(f"Saved Weibo data: {data['content'][:30]}...")
-        except Exception as e:
-            logging.error(f"Error while saving data: {e}")
-            if connection:
-                connection.rollback()

 if __name__ == '__main__':
     spiderData()
\ No newline at end of file
views/spider_control.py
@@ -10,6 +10,10 @@ import logging
 from spider.spiderData import SpiderData
 from openai import OpenAI
 from anthropic import Anthropic
+import aiohttp
+from concurrent.futures import ThreadPoolExecutor
+from ratelimit import limits, sleep_and_retry
+from tenacity import retry, stop_after_attempt, wait_exponential

 # Create the blueprint
 spider_bp = Blueprint('spider', __name__)
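The ratelimit and tenacity imports are put to use further down in this diff. For orientation, here is a minimal sketch of how the two decorators compose on a plain function (fetch and flaky_fetch are illustrative names):

from ratelimit import limits, sleep_and_retry
from tenacity import retry, stop_after_attempt, wait_exponential

@sleep_and_retry                  # sleep until the period resets instead of raising
@limits(calls=100, period=60)     # at most 100 calls per 60 seconds
def fetch(url: str) -> str:
    return f'fetched {url}'       # placeholder for a real HTTP request

@retry(stop=stop_after_attempt(3),
       wait=wait_exponential(multiplier=1, min=4, max=10))
def flaky_fetch(url: str) -> str:
    # retried up to 3 times, backing off 4s then 8s, capped at 10s
    return fetch(url)

print(flaky_fetch('https://example.com'))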
@@ -24,124 +28,137 @@ websocket_connections = set()
 # Create the message queue
 message_queue = Queue()

+# Create the thread pool
+thread_pool = ThreadPoolExecutor(max_workers=3)
+
 # Create the async event loop
 loop = asyncio.new_event_loop()
 asyncio.set_event_loop(loop)

 # Default configuration
 DEFAULT_CONFIG = {
     'crawlDepth': 3,
     'interval': 5,
     'maxRetries': 3,
-    'timeout': 30
+    'timeout': 30,
+    'maxConcurrent': 2
 }

 def load_config():
     """Load the spider configuration."""
     config_path = os.path.join(os.path.dirname(__file__), '../spider/config.json')
     try:
         if os.path.exists(config_path):
             with open(config_path, 'r', encoding='utf-8') as f:
                 return json.load(f)
     except Exception as e:
         logger.error(f"Failed to load the config file: {e}")
     return DEFAULT_CONFIG

 def save_config(config):
     """Save the spider configuration."""
     config_path = os.path.join(os.path.dirname(__file__), '../spider/config.json')
     try:
         with open(config_path, 'w', encoding='utf-8') as f:
             json.dump(config, f, ensure_ascii=False, indent=4)
         return True
     except Exception as e:
         logger.error(f"Failed to save the config file: {e}")
         return False

-async def broadcast_message(message):
-    """Broadcast a message to all WebSocket connections."""
-    if not websocket_connections:
-        return
-    for websocket in websocket_connections.copy():
-        try:
-            await websocket.send(json.dumps(message))
-        except websockets.exceptions.ConnectionClosed:
-            websocket_connections.remove(websocket)
-        except Exception as e:
-            logger.error(f"Failed to send WebSocket message: {e}")
-            websocket_connections.remove(websocket)
+# Rate-limiting decorator
+@sleep_and_retry
+@limits(calls=100, period=60)  # at most 100 requests per minute
+def rate_limited_request():
+    pass
+
+class SpiderWorker:
+    def __init__(self, topics, parameters):
+        self.topics = topics
+        self.parameters = parameters
+        self.total_topics = len(topics)
+        self.completed_topics = 0
+        self.spider = SpiderData()
+        self.message_buffer = []
+        self.message_buffer_size = 10
+        self.semaphore = asyncio.Semaphore(
+            parameters.get('maxConcurrent', DEFAULT_CONFIG['maxConcurrent']))
+
+    async def send_message(self, message):
+        """Send a message asynchronously, buffered for efficiency."""
+        self.message_buffer.append(message)
+        if len(self.message_buffer) >= self.message_buffer_size:
+            await self.flush_messages()
+
+    async def flush_messages(self):
+        """Flush the message buffer."""
+        if not self.message_buffer:
+            return
+        await broadcast_message(self.message_buffer)
+        self.message_buffer.clear()

-def spider_worker(topics, parameters):
-    """Spider worker thread."""
-    total_topics = len(topics)
-    completed_topics = 0
-
-    async def send_message(message):
-        """Wrapper for sending a message asynchronously."""
-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
-        try:
-            await broadcast_message(message)
-        except Exception as e:
-            logger.error(f"Failed to send message: {e}")
-        finally:
-            loop.close()
-
-    spider = SpiderData()
-
-    for topic in topics:
-        try:
-            # Update progress
-            progress = int((completed_topics / total_topics) * 100)
-            asyncio.run(send_message({'type': 'progress', 'value': progress}))
-
-            # Log the start of the crawl
-            asyncio.run(send_message({
-                'type': 'log',
-                'message': f'Starting to crawl topic: {topic}'
-            }))
-
-            # Run the crawl
-            spider.crawl_topic(
-                topic=topic,
-                depth=parameters['crawlDepth'],
-                interval=parameters['interval'],
-                max_retries=parameters['maxRetries'],
-                timeout=parameters['timeout']
-            )
-
-            completed_topics += 1
-
-            # Log completion of the crawl
-            asyncio.run(send_message({
-                'type': 'log',
-                'message': f'Finished crawling topic {topic}'
-            }))
-        except Exception as e:
-            # Log the error
-            asyncio.run(send_message({
-                'type': 'log',
-                'message': f'Error while crawling topic {topic}: {str(e)}'
-            }))
-
-    # Update the final progress
-    asyncio.run(send_message({'type': 'progress', 'value': 100}))
-
-    # Send the completion message
-    asyncio.run(send_message({
-        'type': 'log',
-        'message': 'All topics crawled'
-    }))
+    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
+    async def crawl_single_topic(self, topic):
+        """Crawl a single topic."""
+        try:
+            rate_limited_request()
+            await self.send_message({
+                'type': 'log',
+                'message': f'Starting to crawl topic: {topic}'
+            })
+            async with self.semaphore:
+                await asyncio.get_event_loop().run_in_executor(
+                    thread_pool,
+                    self.spider.crawl_topic,
+                    topic,
+                    self.parameters['crawlDepth'],
+                    self.parameters['interval'],
+                    self.parameters['maxRetries'],
+                    self.parameters['timeout']
+                )
+            self.completed_topics += 1
+            progress = int((self.completed_topics / self.total_topics) * 100)
+            await self.send_message({'type': 'progress', 'value': progress})
+            await self.send_message({
+                'type': 'log',
+                'message': f'Finished crawling topic {topic}'
+            })
+        except Exception as e:
+            logger.error(f"Failed to crawl topic {topic}: {e}")
+            await self.send_message({
+                'type': 'log',
+                'message': f'Error while crawling topic {topic}: {str(e)}'
+            })
+            raise
+
+    async def run(self):
+        """Run the crawl job."""
+        try:
+            tasks = [self.crawl_single_topic(topic) for topic in self.topics]
+            await asyncio.gather(*tasks)
+            await self.flush_messages()
+            await self.send_message({
+                'type': 'log',
+                'message': 'All topics crawled'
+            })
+        except Exception as e:
+            logger.error(f"Crawl job failed: {e}")
+            await self.send_message({
+                'type': 'log',
+                'message': f'Crawl job failed: {str(e)}'
+            })
+        finally:
+            await self.flush_messages()
+
+async def broadcast_message(messages):
+    """Broadcast a message, or a list of messages, to all WebSocket connections."""
+    if not websocket_connections:
+        return
+    for websocket in websocket_connections.copy():
+        try:
+            if isinstance(messages, list):
+                for message in messages:
+                    await websocket.send(json.dumps(message))
+            else:
+                await websocket.send(json.dumps(messages))
+        except websockets.exceptions.ConnectionClosed:
+            websocket_connections.remove(websocket)
+        except Exception as e:
+            logger.error(f"Failed to send WebSocket message: {e}")
+            websocket_connections.remove(websocket)

 @spider_bp.route('/spider/control')
 def spider_control():
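SpiderWorker's core trick is gating blocking crawl calls behind an asyncio.Semaphore and handing them to a thread pool with run_in_executor, so at most maxConcurrent crawls run at once without blocking the event loop. A self-contained sketch of that pattern (blocking_fetch is an illustrative stand-in for SpiderData.crawl_topic):

import asyncio
import time
from concurrent.futures import ThreadPoolExecutor

thread_pool = ThreadPoolExecutor(max_workers=3)

def blocking_fetch(topic: str) -> str:
    time.sleep(0.1)  # stands in for blocking I/O such as requests.get
    return f'data for {topic}'

async def crawl(topic: str, semaphore: asyncio.Semaphore) -> str:
    async with semaphore:  # at most maxConcurrent crawls at a time
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(thread_pool, blocking_fetch, topic)

async def main():
    semaphore = asyncio.Semaphore(2)  # mirrors DEFAULT_CONFIG['maxConcurrent']
    results = await asyncio.gather(*(crawl(t, semaphore) for t in ['a', 'b', 'c']))
    print(results)

asyncio.run(main())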
@@ -149,12 +166,12 @@ def spider_control():
     return render_template('spider_control.html')

 @spider_bp.route('/api/spider/start', methods=['POST'])
-def start_spider():
+async def start_spider():
     """Start a crawl job."""
     try:
         data = request.get_json()
         topics = data.get('topics', [])
-        parameters = data.get('parameters', DEFAULT_CONFIG)
+        parameters = {**DEFAULT_CONFIG, **data.get('parameters', {})}

         if not topics:
             return jsonify({
@@ -162,13 +179,11 @@ def start_spider():
             'message': 'Please select at least one topic'
         })

-        # Start the spider thread
-        thread = threading.Thread(
-            target=spider_worker,
-            args=(topics, parameters),
-            daemon=True
-        )
-        thread.start()
+        # Create the spider worker
+        worker = SpiderWorker(topics, parameters)
+        # Run the crawl job on the event loop
+        asyncio.create_task(worker.run())

         return jsonify({
             'success': True,
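Worth noting: asyncio.create_task only works when an event loop is already running, which for a Flask view means an async-capable setup (Flask 2.0+ installed with the async extra). An alternative that works from a plain sync view is to run the module-level loop on a daemon thread and hand coroutines to it thread-safely; a sketch under that assumption (job is an illustrative stand-in for worker.run()):

import asyncio
import threading

loop = asyncio.new_event_loop()
threading.Thread(target=loop.run_forever, daemon=True).start()

async def job():
    await asyncio.sleep(0.1)  # stands in for worker.run()
    return 'done'

# Safe to call from a non-async context such as a sync Flask view
future = asyncio.run_coroutine_threadsafe(job(), loop)
print(future.result())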