戒酒的李白

Spider System Performance & Stability Enhancement

@@ -11,6 +11,9 @@ from bs4 import BeautifulSoup
 from datetime import datetime
 from utils.logger import spider_logger as logging
 from utils.db_manager import DatabaseManager
+from cachetools import TTLCache, LRUCache
+from typing import List, Dict, Any
+import pandas as pd

 def spiderData():
     if not os.path.exists(navAddr):
@@ -28,17 +31,71 @@ class SpiderData:
         }
         self.base_url = 'https://s.weibo.com'
         self.db = DatabaseManager()
+
+        # Initialize the caches
+        self.data_cache = TTLCache(maxsize=1000, ttl=3600)  # 1-hour TTL cache
+        self.html_cache = LRUCache(maxsize=100)  # LRU cache of the 100 most recent pages
+
+        # Buffer for batched inserts
+        self.insert_buffer = []
+        self.buffer_size = 50  # flush to the database every 50 records
+
+    def _get_cached_page(self, url: str) -> str:
+        """Return the cached content for a URL, if present."""
+        return self.html_cache.get(url)
+
+    def _cache_page(self, url: str, content: str):
+        """Cache page content."""
+        self.html_cache[url] = content
+
+    def _get_cached_data(self, key: str) -> Dict[str, Any]:
+        """Return cached data for a key, if present."""
+        return self.data_cache.get(key)

-    def crawl_topic(self, topic, depth=3, interval=5, max_retries=3, timeout=30):
-        """
-        Crawl Weibo posts for the given topic
+    def _cache_data(self, key: str, data: Dict[str, Any]):
+        """Cache data under a key."""
+        self.data_cache[key] = data
+
+    def _flush_buffer(self):
+        """Batch-insert the buffered records into the database."""
+        if not self.insert_buffer:
+            return

-        :param topic: topic to crawl
-        :param depth: crawl depth (number of pages)
-        :param interval: delay between requests (seconds)
-        :param max_retries: maximum number of retries
-        :param timeout: request timeout (seconds)
-        """
+        connection = None
+        try:
+            connection = self.db.get_connection()
+            with connection.cursor() as cursor:
+                # Use pandas to shape the buffered dicts into rows for the bulk insert
+                df = pd.DataFrame(self.insert_buffer)
+
+                # Build the bulk-insert SQL
+                columns = ', '.join(df.columns)
+                values = ', '.join(['%s'] * len(df.columns))
+                sql = f"""
+                    INSERT INTO article ({columns})
+                    VALUES ({values})
+                    ON DUPLICATE KEY UPDATE
+                        forward_count = VALUES(forward_count),
+                        comment_count = VALUES(comment_count),
+                        like_count = VALUES(like_count),
+                        crawl_time = VALUES(crawl_time)
+                """
+
+                # Execute the bulk insert
+                cursor.executemany(sql, df.values.tolist())
+                connection.commit()
+
+            logging.info(f"Bulk-inserted {len(self.insert_buffer)} records")
+            self.insert_buffer.clear()
+
+        except Exception as e:
+            logging.error(f"Bulk insert failed: {e}")
+            if connection:
+                connection.rollback()
+
+    def crawl_topic(self, topic: str, depth: int = 3, interval: int = 5,
+                    max_retries: int = 3, timeout: int = 30):
+        """Crawl Weibo posts for the given topic."""
         # Validate parameters
         if not isinstance(depth, int) or depth < 1 or depth > 10:
             raise ValueError("Crawl depth must be between 1 and 10 pages")
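The pandas dependency in `_flush_buffer` is only used to shape the buffered dicts into rows. For reference, a minimal sketch of the same batched upsert without pandas, assuming a PyMySQL-style connection and that every buffered record shares the same keys (which `_parse_page` produces):

```python
def flush_buffer_plain(connection, buffer):
    """Batched INSERT ... ON DUPLICATE KEY UPDATE without pandas (sketch)."""
    if not buffer:
        return
    columns = list(buffer[0].keys())
    placeholders = ', '.join(['%s'] * len(columns))
    sql = (
        f"INSERT INTO article ({', '.join(columns)}) VALUES ({placeholders}) "
        "ON DUPLICATE KEY UPDATE forward_count = VALUES(forward_count), "
        "comment_count = VALUES(comment_count), like_count = VALUES(like_count), "
        "crawl_time = VALUES(crawl_time)"
    )
    rows = [[record[col] for col in columns] for record in buffer]
    with connection.cursor() as cursor:
        # PyMySQL, for one, rewrites executemany on INSERT ... VALUES
        # into a single multi-row statement.
        cursor.executemany(sql, rows)
    connection.commit()
    buffer.clear()
```

Either way, one round trip per 50 records replaces 50 single-row INSERTs, which is where the speedup comes from.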
@@ -56,9 +113,19 @@ class SpiderData:
         while retries < max_retries:
             try:
                 url = f"{self.base_url}/weibo?q={topic}&page={page}"
+
+                # Check the page cache first
+                cached_content = self._get_cached_page(url)
+                if cached_content:
+                    self._parse_page(cached_content)
+                    logging.info(f"Served from cache: topic {topic}, page {page}")
+                    break
+
                 response = requests.get(url, headers=self.headers, timeout=timeout)

                 if response.status_code == 200:
+                    # Cache the raw page content
+                    self._cache_page(url, response.text)
                     self._parse_page(response.text)
                     logging.info(f"Crawled topic {topic}, page {page}")
                     break
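The cache-first branch works because cachetools containers behave like dicts with an eviction policy bolted on. A standalone sketch of the semantics the code relies on (keys here are illustrative):

```python
from cachetools import LRUCache, TTLCache

html_cache = LRUCache(maxsize=2)        # evicts the least recently used entry
html_cache['a'] = '<html>A</html>'
html_cache['b'] = '<html>B</html>'
_ = html_cache['a']                     # touching 'a' makes 'b' the eviction candidate
html_cache['c'] = '<html>C</html>'      # 'b' is evicted
assert html_cache.get('b') is None      # .get() returns None on a miss

data_cache = TTLCache(maxsize=1000, ttl=3600)
data_cache['topic:ai'] = {'posts': 42}
# An entry silently disappears once it is older than ttl seconds.
```

Note that the page cache has no TTL, so within a single process a re-crawl of the same topic and page is served the cached copy indefinitely; only `data_cache` ages its entries out.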
@@ -84,13 +151,12 @@ class SpiderData:
             sleep_time = interval * (1 + random.random())
             logging.info(f"Waiting {sleep_time:.2f} seconds before continuing...")
             time.sleep(sleep_time)
-
-    def _parse_page(self, html_content):
-        """
-        Parse the page content and save the data

-        :param html_content: HTML content of the page
-        """
+        # Flush whatever is still in the buffer at the end
+        self._flush_buffer()
+
+    def _parse_page(self, html_content: str):
+        """Parse the page content and save the data."""
         try:
             soup = BeautifulSoup(html_content, 'html.parser')
             weibo_items = soup.find_all('div', class_='card-wrap')
@@ -124,8 +190,12 @@ class SpiderData:
                         'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                     }

-                    # Save to the database
-                    self._save_to_database(weibo_data)
+                    # Append to the insert buffer
+                    self.insert_buffer.append(weibo_data)
+
+                    # Flush once the buffer reaches the threshold
+                    if len(self.insert_buffer) >= self.buffer_size:
+                        self._flush_buffer()

                 except Exception as e:
                     logging.error(f"Error while parsing a Weibo item: {e}")
@@ -134,52 +204,12 @@ class SpiderData:
         except Exception as e:
             logging.error(f"Error while parsing the page: {e}")

-    def _extract_number(self, text):
-        """
-        Extract a number from text
-
-        :param text: text containing digits
-        :return: the extracted number, or 0 if none is found
-        """
+    def _extract_number(self, text: str) -> int:
+        """Extract a number from text, returning 0 if none is found."""
         try:
             return int(''.join(filter(str.isdigit, text)))
         except ValueError:
             return 0
-
-    def _save_to_database(self, data):
-        """
-        Save a single record to the database
-
-        :param data: dict of fields to save
-        """
-        connection = None
-        try:
-            connection = self.db.get_connection()
-
-            with connection.cursor() as cursor:
-                # Insert the article row
-                sql = """
-                    INSERT INTO article (content, user_name, publish_time, forward_count,
-                                         comment_count, like_count, crawl_time)
-                    VALUES (%s, %s, %s, %s, %s, %s, %s)
-                """
-                cursor.execute(sql, (
-                    data['content'],
-                    data['user_name'],
-                    data['publish_time'],
-                    data['forward_count'],
-                    data['comment_count'],
-                    data['like_count'],
-                    data['crawl_time']
-                ))
-
-            connection.commit()
-            logging.info(f"Saved Weibo record: {data['content'][:30]}...")
-
-        except Exception as e:
-            logging.error(f"Error while saving data: {e}")
-            if connection:
-                connection.rollback()

 if __name__ == '__main__':
     spiderData()
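With buffering in place, the one behavioral change callers might notice is that records can sit in memory until a crawl finishes. A short usage sketch (the topic string is illustrative):

```python
from spider.spiderData import SpiderData

spider = SpiderData()
# crawl_topic flushes the insert buffer itself before returning,
# so no manual flush is needed afterwards.
spider.crawl_topic('AI', depth=3, interval=5, max_retries=3, timeout=30)
```

If the process dies mid-crawl, up to `buffer_size - 1` unflushed records are lost; that is the trade-off accepted for the batching.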
@@ -10,6 +10,10 @@ import logging
 from spider.spiderData import SpiderData
 from openai import OpenAI
 from anthropic import Anthropic
+import aiohttp
+from concurrent.futures import ThreadPoolExecutor
+from ratelimit import limits, sleep_and_retry
+from tenacity import retry, stop_after_attempt, wait_exponential

 # Create the blueprint
 spider_bp = Blueprint('spider', __name__)
@@ -24,137 +28,150 @@ websocket_connections = set()
 # Create the message queue
 message_queue = Queue()

+# Thread pool for running blocking crawls off the event loop
+thread_pool = ThreadPoolExecutor(max_workers=3)
+
+# Create a dedicated asyncio event loop
+loop = asyncio.new_event_loop()
+asyncio.set_event_loop(loop)
+
 # Default configuration
 DEFAULT_CONFIG = {
     'crawlDepth': 3,
     'interval': 5,
     'maxRetries': 3,
-    'timeout': 30
+    'timeout': 30,
+    'maxConcurrent': 2
 }

-def load_config():
-    """Load the spider configuration"""
-    config_path = os.path.join(os.path.dirname(__file__), '../spider/config.json')
-    try:
-        if os.path.exists(config_path):
-            with open(config_path, 'r', encoding='utf-8') as f:
-                return json.load(f)
-    except Exception as e:
-        logger.error(f"Failed to load the config file: {e}")
-    return DEFAULT_CONFIG
+# Rate-limiting decorator: at most 100 requests per minute
+@sleep_and_retry
+@limits(calls=100, period=60)
+def rate_limited_request():
+    pass

-def save_config(config):
-    """Save the spider configuration"""
-    config_path = os.path.join(os.path.dirname(__file__), '../spider/config.json')
-    try:
-        with open(config_path, 'w', encoding='utf-8') as f:
-            json.dump(config, f, ensure_ascii=False, indent=4)
-        return True
-    except Exception as e:
-        logger.error(f"Failed to save the config file: {e}")
-        return False
+class SpiderWorker:
+    def __init__(self, topics, parameters):
+        self.topics = topics
+        self.parameters = parameters
+        self.total_topics = len(topics)
+        self.completed_topics = 0
+        self.spider = SpiderData()
+        self.message_buffer = []
+        self.message_buffer_size = 10
+        self.semaphore = asyncio.Semaphore(parameters.get('maxConcurrent', DEFAULT_CONFIG['maxConcurrent']))
+
+    async def send_message(self, message):
+        """Queue a message; flush once the buffer fills up."""
+        self.message_buffer.append(message)
+        if len(self.message_buffer) >= self.message_buffer_size:
+            await self.flush_messages()
+
+    async def flush_messages(self):
+        """Flush the message buffer."""
+        if not self.message_buffer:
+            return
+
+        try:
+            await broadcast_message(self.message_buffer)
+            self.message_buffer.clear()
+        except Exception as e:
+            logger.error(f"Failed to send messages: {e}")
+
+    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
+    async def crawl_single_topic(self, topic):
+        """Crawl a single topic."""
+        try:
+            rate_limited_request()
+
+            await self.send_message({
+                'type': 'log',
+                'message': f'Started crawling topic: {topic}'
+            })
+
+            async with self.semaphore:
+                await asyncio.get_running_loop().run_in_executor(
+                    thread_pool,
+                    self.spider.crawl_topic,
+                    topic,
+                    self.parameters['crawlDepth'],
+                    self.parameters['interval'],
+                    self.parameters['maxRetries'],
+                    self.parameters['timeout']
+                )
+
+            self.completed_topics += 1
+            progress = int((self.completed_topics / self.total_topics) * 100)
+
+            await self.send_message({
+                'type': 'progress',
+                'value': progress
+            })
+
+            await self.send_message({
+                'type': 'log',
+                'message': f'Finished crawling topic {topic}'
+            })
+
+        except Exception as e:
+            logger.error(f"Failed to crawl topic {topic}: {e}")
+            await self.send_message({
+                'type': 'log',
+                'message': f'Error while crawling topic {topic}: {str(e)}'
+            })
+            raise
+
+    async def run(self):
+        """Run the crawl job."""
+        try:
+            tasks = [self.crawl_single_topic(topic) for topic in self.topics]
+            await asyncio.gather(*tasks)
+            await self.flush_messages()
+
+            await self.send_message({
+                'type': 'log',
+                'message': 'All topics crawled'
+            })
+
+        except Exception as e:
+            logger.error(f"Crawl job failed: {e}")
+            await self.send_message({
+                'type': 'log',
+                'message': f'Crawl job failed: {str(e)}'
+            })
+        finally:
+            await self.flush_messages()

-async def broadcast_message(message):
+async def broadcast_message(messages):
     """Broadcast messages to all WebSocket connections"""
     if not websocket_connections:
         return

     for websocket in websocket_connections.copy():
         try:
-            await websocket.send(json.dumps(message))
+            if isinstance(messages, list):
+                for message in messages:
+                    await websocket.send(json.dumps(message))
+            else:
+                await websocket.send(json.dumps(messages))
         except websockets.exceptions.ConnectionClosed:
             websocket_connections.remove(websocket)
         except Exception as e:
             logger.error(f"Failed to send WebSocket message: {e}")
             websocket_connections.remove(websocket)

-def spider_worker(topics, parameters):
-    """Spider worker thread"""
-    total_topics = len(topics)
-    completed_topics = 0
-
-    async def send_message(message):
-        """Async wrapper for sending a message"""
-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
-        try:
-            await broadcast_message(message)
-        finally:
-            loop.close()
-
-    try:
-        spider = SpiderData()
-
-        for topic in topics:
-            try:
-                # Update progress
-                progress = int((completed_topics / total_topics) * 100)
-                asyncio.run(send_message({
-                    'type': 'progress',
-                    'value': progress
-                }))
-
-                # Log the start of the crawl
-                asyncio.run(send_message({
-                    'type': 'log',
-                    'message': f'Started crawling topic: {topic}'
-                }))
-
-                # Run the crawl
-                spider.crawl_topic(
-                    topic=topic,
-                    depth=parameters['crawlDepth'],
-                    interval=parameters['interval'],
-                    max_retries=parameters['maxRetries'],
-                    timeout=parameters['timeout']
-                )
-
-                completed_topics += 1
-
-                # Log completion
-                asyncio.run(send_message({
-                    'type': 'log',
-                    'message': f'Finished crawling topic {topic}'
-                }))
-
-            except Exception as e:
-                # Log the error
-                asyncio.run(send_message({
-                    'type': 'log',
-                    'message': f'Error while crawling topic {topic}: {str(e)}'
-                }))
-
-        # Final progress update
-        asyncio.run(send_message({
-            'type': 'progress',
-            'value': 100
-        }))
-
-        # Send the completion message
-        asyncio.run(send_message({
-            'type': 'log',
-            'message': 'All topics crawled'
-        }))
-
-    except Exception as e:
-        # Log the error
-        asyncio.run(send_message({
-            'type': 'log',
-            'message': f'Crawl job failed: {str(e)}'
-        }))
-
 @spider_bp.route('/spider/control')
 def spider_control():
     """Render the spider control page"""
     return render_template('spider_control.html')

 @spider_bp.route('/api/spider/start', methods=['POST'])
-def start_spider():
+async def start_spider():
     """Start a crawl job"""
     try:
         data = request.get_json()
         topics = data.get('topics', [])
-        parameters = data.get('parameters', DEFAULT_CONFIG)
+        parameters = {**DEFAULT_CONFIG, **data.get('parameters', {})}

         if not topics:
             return jsonify({
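The two decorator stacks above do different jobs: `ratelimit` gates call frequency, while `tenacity` retries failures with exponential backoff. A standalone sketch of how each behaves, using the same numbers as the diff:

```python
from ratelimit import limits, sleep_and_retry
from tenacity import retry, stop_after_attempt, wait_exponential

@sleep_and_retry               # over the limit: sleep until the window resets, then proceed
@limits(calls=100, period=60)  # at most 100 calls per 60-second window
def rate_limited_request():
    pass

@retry(stop=stop_after_attempt(3),
       wait=wait_exponential(multiplier=1, min=4, max=10))
async def flaky_crawl():
    # tenacity detects coroutine functions and awaits between attempts;
    # waits grow exponentially, clamped to the 4-10 second range.
    rate_limited_request()
    ...
```

One caveat worth knowing: `sleep_and_retry` sleeps the calling thread, so when `rate_limited_request()` fires inside the async `crawl_single_topic` it can stall the event loop for the remainder of the rate window; dispatching it through the executor alongside the crawl would avoid that.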
@@ -162,13 +179,11 @@ def start_spider():
             'message': 'Please select at least one topic'
         })

-        # Start the spider thread
-        thread = threading.Thread(
-            target=spider_worker,
-            args=(topics, parameters),
-            daemon=True
-        )
-        thread.start()
+        # Create the spider worker
+        worker = SpiderWorker(topics, parameters)
+
+        # Run the crawl job on the event loop
+        asyncio.create_task(worker.run())

         return jsonify({
             'success': True,
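A note on the task hand-off: `asyncio.create_task` requires a running loop and ties the task's lifetime to it, and with Flask's async views that per-request loop is torn down as soon as the request returns, so a long crawl may be cancelled mid-flight. One hedged alternative, reusing the module-level `loop` the diff already creates, is to park it on a daemon thread and hand coroutines over thread-safely (a sketch; `start_worker` is an illustrative helper, not part of the diff):

```python
import asyncio
import threading

# Keep the dedicated loop running forever on a background thread so
# crawl jobs outlive the HTTP request that started them.
threading.Thread(target=loop.run_forever, daemon=True).start()

def start_worker(worker):
    # Thread-safe scheduling onto the dedicated loop; returns a
    # concurrent.futures.Future that can be polled for completion.
    return asyncio.run_coroutine_threadsafe(worker.run(), loop)
```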