Optimize the crawler configuration page, add multi-account parallel functionality, adapt AI configuration features, and include database configuration options.
Showing 3 changed files with 1014 additions and 36 deletions
| @@ -93,8 +93,20 @@ class SpiderData: | @@ -93,8 +93,20 @@ class SpiderData: | ||
| 93 | connection.rollback() | 93 | connection.rollback() |
| 94 | 94 | ||
| 95 | def crawl_topic(self, topic: str, depth: int = 3, interval: int = 5, | 95 | def crawl_topic(self, topic: str, depth: int = 3, interval: int = 5, |
| 96 | - max_retries: int = 3, timeout: int = 30): | ||
| 97 | - """爬取指定话题的微博内容""" | 96 | + max_retries: int = 3, timeout: int = 30, cookie: str = None, |
| 97 | + filter_callback = None): | ||
| 98 | + """ | ||
| 99 | + 爬取指定话题的微博内容 | ||
| 100 | + | ||
| 101 | + Args: | ||
| 102 | + topic: 话题关键词 | ||
| 103 | + depth: 爬取深度(页数) | ||
| 104 | + interval: 请求间隔(秒) | ||
| 105 | + max_retries: 最大重试次数 | ||
| 106 | + timeout: 请求超时时间(秒) | ||
| 107 | + cookie: 用户Cookie | ||
| 108 | + filter_callback: 筛选回调函数,返回True表示保留该条微博 | ||
| 109 | + """ | ||
| 98 | # 参数验证 | 110 | # 参数验证 |
| 99 | if not isinstance(depth, int) or depth < 1 or depth > 10: | 111 | if not isinstance(depth, int) or depth < 1 or depth > 10: |
| 100 | raise ValueError("爬取深度必须在1-10页之间") | 112 | raise ValueError("爬取深度必须在1-10页之间") |
| @@ -105,6 +117,10 @@ class SpiderData: | @@ -105,6 +117,10 @@ class SpiderData: | ||
| 105 | if not isinstance(timeout, int) or timeout < 10 or timeout > 60: | 117 | if not isinstance(timeout, int) or timeout < 10 or timeout > 60: |
| 106 | raise ValueError("请求超时时间必须在10-60秒之间") | 118 | raise ValueError("请求超时时间必须在10-60秒之间") |
| 107 | 119 | ||
| 120 | + # 更新请求头中的Cookie | ||
| 121 | + if cookie: | ||
| 122 | + self.headers['Cookie'] = cookie | ||
| 123 | + | ||
| 108 | logging.info(f"开始爬取话题: {topic}, 参数: depth={depth}, interval={interval}, max_retries={max_retries}, timeout={timeout}") | 124 | logging.info(f"开始爬取话题: {topic}, 参数: depth={depth}, interval={interval}, max_retries={max_retries}, timeout={timeout}") |
| 109 | 125 | ||
| 110 | for page in range(1, depth + 1): | 126 | for page in range(1, depth + 1): |
| @@ -116,7 +132,7 @@ class SpiderData: | @@ -116,7 +132,7 @@ class SpiderData: | ||
| 116 | # 检查缓存 | 132 | # 检查缓存 |
| 117 | cached_content = self._get_cached_page(url) | 133 | cached_content = self._get_cached_page(url) |
| 118 | if cached_content: | 134 | if cached_content: |
| 119 | - self._parse_page(cached_content) | 135 | + self._parse_page(cached_content, filter_callback) |
| 120 | logging.info(f"使用缓存数据: {topic} 第 {page} 页") | 136 | logging.info(f"使用缓存数据: {topic} 第 {page} 页") |
| 121 | break | 137 | break |
| 122 | 138 | ||
| @@ -125,7 +141,7 @@ class SpiderData: | @@ -125,7 +141,7 @@ class SpiderData: | ||
| 125 | if response.status_code == 200: | 141 | if response.status_code == 200: |
| 126 | # 缓存页面内容 | 142 | # 缓存页面内容 |
| 127 | self._cache_page(url, response.text) | 143 | self._cache_page(url, response.text) |
| 128 | - self._parse_page(response.text) | 144 | + self._parse_page(response.text, filter_callback) |
| 129 | logging.info(f"成功爬取话题 {topic} 第 {page} 页") | 145 | logging.info(f"成功爬取话题 {topic} 第 {page} 页") |
| 130 | break | 146 | break |
| 131 | else: | 147 | else: |
| @@ -154,8 +170,14 @@ class SpiderData: | @@ -154,8 +170,14 @@ class SpiderData: | ||
| 154 | # 最后刷新缓冲区 | 170 | # 最后刷新缓冲区 |
| 155 | self._flush_buffer() | 171 | self._flush_buffer() |
| 156 | 172 | ||
| 157 | - def _parse_page(self, html_content: str): | ||
| 158 | - """解析页面内容并保存数据""" | 173 | + def _parse_page(self, html_content: str, filter_callback = None): |
| 174 | + """ | ||
| 175 | + 解析页面内容并保存数据 | ||
| 176 | + | ||
| 177 | + Args: | ||
| 178 | + html_content: HTML页面内容 | ||
| 179 | + filter_callback: 筛选回调函数 | ||
| 180 | + """ | ||
| 159 | try: | 181 | try: |
| 160 | soup = BeautifulSoup(html_content, 'html.parser') | 182 | soup = BeautifulSoup(html_content, 'html.parser') |
| 161 | weibo_items = soup.find_all('div', class_='card-wrap') | 183 | weibo_items = soup.find_all('div', class_='card-wrap') |
| @@ -178,6 +200,19 @@ class SpiderData: | @@ -178,6 +200,19 @@ class SpiderData: | ||
| 178 | # 提取互动数据 | 200 | # 提取互动数据 |
| 179 | actions = item.find_all('li', class_='action') | 201 | actions = item.find_all('li', class_='action') |
| 180 | 202 | ||
| 203 | + # 提取用户认证状态 | ||
| 204 | + user_verified = bool(item.find('i', class_='icon-vip')) | ||
| 205 | + | ||
| 206 | + # 提取是否原创 | ||
| 207 | + is_original = not bool(item.find('span', class_='repost')) | ||
| 208 | + | ||
| 209 | + # 提取是否包含媒体 | ||
| 210 | + has_media = bool(item.find('div', class_='media')) | ||
| 211 | + | ||
| 212 | + # 提取发布位置 | ||
| 213 | + location = item.find('a', class_='location') | ||
| 214 | + location_text = location.text.strip() if location else '' | ||
| 215 | + | ||
| 181 | # 构建数据字典 | 216 | # 构建数据字典 |
| 182 | weibo_data = { | 217 | weibo_data = { |
| 183 | 'content': content.text.strip(), | 218 | 'content': content.text.strip(), |
| @@ -186,15 +221,22 @@ class SpiderData: | @@ -186,15 +221,22 @@ class SpiderData: | ||
| 186 | 'forward_count': self._extract_number(actions[0].text) if len(actions) > 0 else 0, | 221 | 'forward_count': self._extract_number(actions[0].text) if len(actions) > 0 else 0, |
| 187 | 'comment_count': self._extract_number(actions[1].text) if len(actions) > 1 else 0, | 222 | 'comment_count': self._extract_number(actions[1].text) if len(actions) > 1 else 0, |
| 188 | 'like_count': self._extract_number(actions[2].text) if len(actions) > 2 else 0, | 223 | 'like_count': self._extract_number(actions[2].text) if len(actions) > 2 else 0, |
| 224 | + 'read_count': self._extract_number(actions[3].text) if len(actions) > 3 else 0, | ||
| 225 | + 'user_verified': user_verified, | ||
| 226 | + 'is_original': is_original, | ||
| 227 | + 'has_media': has_media, | ||
| 228 | + 'location': location_text, | ||
| 189 | 'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S') | 229 | 'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S') |
| 190 | } | 230 | } |
| 191 | 231 | ||
| 192 | - # 添加到插入缓冲区 | ||
| 193 | - self.insert_buffer.append(weibo_data) | ||
| 194 | - | ||
| 195 | - # 如果缓冲区达到阈值,执行批量插入 | ||
| 196 | - if len(self.insert_buffer) >= self.buffer_size: | ||
| 197 | - self._flush_buffer() | 232 | + # 如果有筛选回调函数,则进行筛选 |
| 233 | + if filter_callback is None or filter_callback(weibo_data): | ||
| 234 | + # 添加到插入缓冲区 | ||
| 235 | + self.insert_buffer.append(weibo_data) | ||
| 236 | + | ||
| 237 | + # 如果缓冲区达到阈值,执行批量插入 | ||
| 238 | + if len(self.insert_buffer) >= self.buffer_size: | ||
| 239 | + self._flush_buffer() | ||
| 198 | 240 | ||
| 199 | except Exception as e: | 241 | except Exception as e: |
| 200 | logging.error(f"解析微博项时出错: {e}") | 242 | logging.error(f"解析微博项时出错: {e}") |
| @@ -103,6 +103,214 @@ | @@ -103,6 +103,214 @@ | ||
| 103 | </div> | 103 | </div> |
| 104 | </div> | 104 | </div> |
| 105 | 105 | ||
| 106 | + <!-- 内容筛选配置 --> | ||
| 107 | + <div class="card mb-4"> | ||
| 108 | + <div class="card-header d-flex justify-content-between align-items-center"> | ||
| 109 | + <h5 class="mb-0">内容筛选配置</h5> | ||
| 110 | + <button class="btn btn-sm btn-outline-primary" type="button" data-bs-toggle="collapse" data-bs-target="#filterHelp"> | ||
| 111 | + <i class="fas fa-question-circle"></i> 帮助 | ||
| 112 | + </button> | ||
| 113 | + </div> | ||
| 114 | + <div class="collapse" id="filterHelp"> | ||
| 115 | + <div class="card-body bg-light"> | ||
| 116 | + <h6>筛选条件说明:</h6> | ||
| 117 | + <ul> | ||
| 118 | + <li>数值条件:设置大于某个值进行筛选,如点赞数>1000</li> | ||
| 119 | + <li>正则匹配:使用正则表达式匹配内容,如包含特定关键词</li> | ||
| 120 | + <li>多个条件之间是"与"的关系,即同时满足才会保留</li> | ||
| 121 | + </ul> | ||
| 122 | + <div class="alert alert-info"> | ||
| 123 | + <i class="fas fa-info-circle"></i> 提示:合理设置筛选条件可以提高数据质量 | ||
| 124 | + </div> | ||
| 125 | + </div> | ||
| 126 | + </div> | ||
| 127 | + <div class="card-body"> | ||
| 128 | + <!-- 互动数据筛选 --> | ||
| 129 | + <h6 class="mb-3">互动数据筛选</h6> | ||
| 130 | + <div class="row"> | ||
| 131 | + <div class="col-md-3"> | ||
| 132 | + <div class="mb-3"> | ||
| 133 | + <label class="form-label">点赞数大于</label> | ||
| 134 | + <input type="number" class="form-control" id="minLikes" value="0" min="0"> | ||
| 135 | + </div> | ||
| 136 | + </div> | ||
| 137 | + <div class="col-md-3"> | ||
| 138 | + <div class="mb-3"> | ||
| 139 | + <label class="form-label">评论数大于</label> | ||
| 140 | + <input type="number" class="form-control" id="minComments" value="0" min="0"> | ||
| 141 | + </div> | ||
| 142 | + </div> | ||
| 143 | + <div class="col-md-3"> | ||
| 144 | + <div class="mb-3"> | ||
| 145 | + <label class="form-label">转发数大于</label> | ||
| 146 | + <input type="number" class="form-control" id="minReposts" value="0" min="0"> | ||
| 147 | + </div> | ||
| 148 | + </div> | ||
| 149 | + <div class="col-md-3"> | ||
| 150 | + <div class="mb-3"> | ||
| 151 | + <label class="form-label">阅读数大于</label> | ||
| 152 | + <input type="number" class="form-control" id="minReads" value="0" min="0"> | ||
| 153 | + </div> | ||
| 154 | + </div> | ||
| 155 | + </div> | ||
| 156 | + | ||
| 157 | + <!-- 内容正则筛选 --> | ||
| 158 | + <h6 class="mb-3 mt-4">内容正则筛选</h6> | ||
| 159 | + <div id="regexFilters"> | ||
| 160 | + <!-- 正则表达式筛选器列表 --> | ||
| 161 | + </div> | ||
| 162 | + <button class="btn btn-outline-primary btn-sm mt-2" onclick="addRegexFilter()"> | ||
| 163 | + <i class="fas fa-plus"></i> 添加正则筛选 | ||
| 164 | + </button> | ||
| 165 | + | ||
| 166 | + <!-- 高级筛选选项 --> | ||
| 167 | + <h6 class="mb-3 mt-4">高级选项</h6> | ||
| 168 | + <div class="form-check mb-2"> | ||
| 169 | + <input class="form-check-input" type="checkbox" id="filterOriginal"> | ||
| 170 | + <label class="form-check-label" for="filterOriginal"> | ||
| 171 | + 仅爬取原创内容 | ||
| 172 | + </label> | ||
| 173 | + </div> | ||
| 174 | + <div class="form-check mb-2"> | ||
| 175 | + <input class="form-check-input" type="checkbox" id="filterWithMedia"> | ||
| 176 | + <label class="form-check-label" for="filterWithMedia"> | ||
| 177 | + 必须包含图片或视频 | ||
| 178 | + </label> | ||
| 179 | + </div> | ||
| 180 | + <div class="form-check"> | ||
| 181 | + <input class="form-check-input" type="checkbox" id="filterVerified"> | ||
| 182 | + <label class="form-check-label" for="filterVerified"> | ||
| 183 | + 仅认证用户的内容 | ||
| 184 | + </label> | ||
| 185 | + </div> | ||
| 186 | + </div> | ||
| 187 | + </div> | ||
| 188 | + | ||
| 189 | + <!-- 账号配置 --> | ||
| 190 | + <div class="card mb-4"> | ||
| 191 | + <div class="card-header d-flex justify-content-between align-items-center"> | ||
| 192 | + <h5 class="mb-0">账号配置</h5> | ||
| 193 | + <div> | ||
| 194 | + <button class="btn btn-sm btn-outline-primary me-2" type="button" data-bs-toggle="collapse" data-bs-target="#accountHelp"> | ||
| 195 | + <i class="fas fa-question-circle"></i> 帮助 | ||
| 196 | + </button> | ||
| 197 | + <button class="btn btn-sm btn-success" onclick="addAccount()"> | ||
| 198 | + <i class="fas fa-plus"></i> 添加账号 | ||
| 199 | + </button> | ||
| 200 | + </div> | ||
| 201 | + </div> | ||
| 202 | + <div class="collapse" id="accountHelp"> | ||
| 203 | + <div class="card-body bg-light"> | ||
| 204 | + <h6>如何获取Cookie?</h6> | ||
| 205 | + <ol> | ||
| 206 | + <li>登录微博网页版</li> | ||
| 207 | + <li>按F12打开开发者工具</li> | ||
| 208 | + <li>切换到Network标签页</li> | ||
| 209 | + <li>刷新页面,找到请求头中的Cookie值</li> | ||
| 210 | + </ol> | ||
| 211 | + <div class="alert alert-warning"> | ||
| 212 | + <i class="fas fa-exclamation-triangle"></i> 注意:请勿泄露您的Cookie信息! | ||
| 213 | + </div> | ||
| 214 | + <div class="alert alert-info"> | ||
| 215 | + <i class="fas fa-info-circle"></i> 提示:添加多个账号可以提高爬取效率,系统会自动在账号间轮换。 | ||
| 216 | + </div> | ||
| 217 | + </div> | ||
| 218 | + </div> | ||
| 219 | + <div class="card-body"> | ||
| 220 | + <div id="accountsList"> | ||
| 221 | + <!-- 账号列表将通过JavaScript动态生成 --> | ||
| 222 | + </div> | ||
| 223 | + <div class="alert alert-warning mt-3" id="noAccountsWarning" style="display: none;"> | ||
| 224 | + <i class="fas fa-exclamation-triangle"></i> 请至少添加一个账号 | ||
| 225 | + </div> | ||
| 226 | + </div> | ||
| 227 | + </div> | ||
| 228 | + | ||
| 229 | + <!-- 并行配置 --> | ||
| 230 | + <div class="card mb-4"> | ||
| 231 | + <div class="card-header"> | ||
| 232 | + <h5 class="mb-0">并行配置</h5> | ||
| 233 | + </div> | ||
| 234 | + <div class="card-body"> | ||
| 235 | + <div class="row"> | ||
| 236 | + <div class="col-md-6"> | ||
| 237 | + <div class="mb-3"> | ||
| 238 | + <label for="maxConcurrent" class="form-label">最大并行数</label> | ||
| 239 | + <input type="number" class="form-control" id="maxConcurrent" value="2" min="1" max="5"> | ||
| 240 | + <small class="text-muted">同时进行爬取的最大话题数(1-5)</small> | ||
| 241 | + </div> | ||
| 242 | + </div> | ||
| 243 | + <div class="col-md-6"> | ||
| 244 | + <div class="mb-3"> | ||
| 245 | + <label for="requestsPerMinute" class="form-label">每分钟请求数限制</label> | ||
| 246 | + <input type="number" class="form-control" id="requestsPerMinute" value="60" min="30" max="120"> | ||
| 247 | + <small class="text-muted">避免请求过于频繁(30-120)</small> | ||
| 248 | + </div> | ||
| 249 | + </div> | ||
| 250 | + </div> | ||
| 251 | + </div> | ||
| 252 | + </div> | ||
| 253 | + | ||
| 254 | + <!-- 数据库配置 --> | ||
| 255 | + <div class="card mb-4"> | ||
| 256 | + <div class="card-header"> | ||
| 257 | + <h5 class="mb-0">数据库配置</h5> | ||
| 258 | + </div> | ||
| 259 | + <div class="card-body"> | ||
| 260 | + <div class="row"> | ||
| 261 | + <div class="col-md-6"> | ||
| 262 | + <div class="mb-3"> | ||
| 263 | + <label for="dbType" class="form-label">数据库类型</label> | ||
| 264 | + <select class="form-select" id="dbType"> | ||
| 265 | + <option value="mysql">MySQL</option> | ||
| 266 | + <option value="postgresql">PostgreSQL</option> | ||
| 267 | + <option value="mongodb">MongoDB</option> | ||
| 268 | + </select> | ||
| 269 | + </div> | ||
| 270 | + </div> | ||
| 271 | + <div class="col-md-6"> | ||
| 272 | + <div class="mb-3"> | ||
| 273 | + <label for="dbHost" class="form-label">主机地址</label> | ||
| 274 | + <input type="text" class="form-control" id="dbHost" value="localhost"> | ||
| 275 | + </div> | ||
| 276 | + </div> | ||
| 277 | + </div> | ||
| 278 | + <div class="row"> | ||
| 279 | + <div class="col-md-6"> | ||
| 280 | + <div class="mb-3"> | ||
| 281 | + <label for="dbPort" class="form-label">端口</label> | ||
| 282 | + <input type="number" class="form-control" id="dbPort" value="3306"> | ||
| 283 | + </div> | ||
| 284 | + </div> | ||
| 285 | + <div class="col-md-6"> | ||
| 286 | + <div class="mb-3"> | ||
| 287 | + <label for="dbName" class="form-label">数据库名</label> | ||
| 288 | + <input type="text" class="form-control" id="dbName" value="weibo_data"> | ||
| 289 | + </div> | ||
| 290 | + </div> | ||
| 291 | + </div> | ||
| 292 | + <div class="row"> | ||
| 293 | + <div class="col-md-6"> | ||
| 294 | + <div class="mb-3"> | ||
| 295 | + <label for="dbUser" class="form-label">用户名</label> | ||
| 296 | + <input type="text" class="form-control" id="dbUser"> | ||
| 297 | + </div> | ||
| 298 | + </div> | ||
| 299 | + <div class="col-md-6"> | ||
| 300 | + <div class="mb-3"> | ||
| 301 | + <label for="dbPassword" class="form-label">密码</label> | ||
| 302 | + <input type="password" class="form-control" id="dbPassword"> | ||
| 303 | + </div> | ||
| 304 | + </div> | ||
| 305 | + </div> | ||
| 306 | + <div class="d-flex justify-content-end"> | ||
| 307 | + <button class="btn btn-primary" onclick="testDbConnection()"> | ||
| 308 | + <i class="fas fa-database"></i> 测试连接 | ||
| 309 | + </button> | ||
| 310 | + </div> | ||
| 311 | + </div> | ||
| 312 | + </div> | ||
| 313 | + | ||
| 106 | <!-- AI配置助手 --> | 314 | <!-- AI配置助手 --> |
| 107 | <div class="card mb-4"> | 315 | <div class="card mb-4"> |
| 108 | <div class="card-header"> | 316 | <div class="card-header"> |
| @@ -237,13 +445,43 @@ | @@ -237,13 +445,43 @@ | ||
| 237 | return; | 445 | return; |
| 238 | } | 446 | } |
| 239 | 447 | ||
| 448 | + // 验证必要的配置 | ||
| 449 | + if (!validateConfig()) { | ||
| 450 | + return; | ||
| 451 | + } | ||
| 452 | + | ||
| 240 | const config = { | 453 | const config = { |
| 241 | topics: Array.from(selectedTopics), | 454 | topics: Array.from(selectedTopics), |
| 242 | parameters: { | 455 | parameters: { |
| 243 | crawlDepth: parseInt(document.getElementById('crawlDepth').value), | 456 | crawlDepth: parseInt(document.getElementById('crawlDepth').value), |
| 244 | interval: parseInt(document.getElementById('interval').value), | 457 | interval: parseInt(document.getElementById('interval').value), |
| 245 | maxRetries: parseInt(document.getElementById('maxRetries').value), | 458 | maxRetries: parseInt(document.getElementById('maxRetries').value), |
| 246 | - timeout: parseInt(document.getElementById('timeout').value) | 459 | + timeout: parseInt(document.getElementById('timeout').value), |
| 460 | + maxConcurrent: parseInt(document.getElementById('maxConcurrent').value), | ||
| 461 | + requestsPerMinute: parseInt(document.getElementById('requestsPerMinute').value) | ||
| 462 | + }, | ||
| 463 | + filters: { | ||
| 464 | + interaction: { | ||
| 465 | + minLikes: parseInt(document.getElementById('minLikes').value) || 0, | ||
| 466 | + minComments: parseInt(document.getElementById('minComments').value) || 0, | ||
| 467 | + minReposts: parseInt(document.getElementById('minReposts').value) || 0, | ||
| 468 | + minReads: parseInt(document.getElementById('minReads').value) || 0 | ||
| 469 | + }, | ||
| 470 | + regex: getRegexFilters(), | ||
| 471 | + options: { | ||
| 472 | + originalOnly: document.getElementById('filterOriginal').checked, | ||
| 473 | + withMediaOnly: document.getElementById('filterWithMedia').checked, | ||
| 474 | + verifiedOnly: document.getElementById('filterVerified').checked | ||
| 475 | + } | ||
| 476 | + }, | ||
| 477 | + accounts: getAccountsConfig(), | ||
| 478 | + database: { | ||
| 479 | + type: document.getElementById('dbType').value, | ||
| 480 | + host: document.getElementById('dbHost').value, | ||
| 481 | + port: parseInt(document.getElementById('dbPort').value), | ||
| 482 | + name: document.getElementById('dbName').value, | ||
| 483 | + user: document.getElementById('dbUser').value, | ||
| 484 | + password: document.getElementById('dbPassword').value | ||
| 247 | } | 485 | } |
| 248 | }; | 486 | }; |
| 249 | 487 | ||
| @@ -268,6 +506,335 @@ | @@ -268,6 +506,335 @@ | ||
| 268 | }); | 506 | }); |
| 269 | } | 507 | } |
| 270 | 508 | ||
| 509 | + // 账号管理相关函数 | ||
| 510 | + let accounts = []; | ||
| 511 | + let accountIdCounter = 0; | ||
| 512 | + | ||
| 513 | + function createAccountElement(account) { | ||
| 514 | + const accountDiv = document.createElement('div'); | ||
| 515 | + accountDiv.className = 'border rounded p-3 mb-3 position-relative account-item'; | ||
| 516 | + accountDiv.dataset.id = account.id; | ||
| 517 | + | ||
| 518 | + const deleteButton = document.createElement('button'); | ||
| 519 | + deleteButton.className = 'btn btn-sm btn-danger position-absolute top-0 end-0 m-2'; | ||
| 520 | + deleteButton.innerHTML = '<i class="fas fa-times"></i>'; | ||
| 521 | + deleteButton.onclick = () => removeAccount(account.id); | ||
| 522 | + | ||
| 523 | + const content = ` | ||
| 524 | + <div class="row"> | ||
| 525 | + <div class="col-md-6"> | ||
| 526 | + <div class="mb-3"> | ||
| 527 | + <label class="form-label">用户名</label> | ||
| 528 | + <input type="text" class="form-control account-username" value="${account.username || ''}" placeholder="微博用户名"> | ||
| 529 | + </div> | ||
| 530 | + </div> | ||
| 531 | + <div class="col-md-6"> | ||
| 532 | + <div class="mb-3"> | ||
| 533 | + <label class="form-label">密码</label> | ||
| 534 | + <input type="password" class="form-control account-password" value="${account.password || ''}" placeholder="微博密码"> | ||
| 535 | + </div> | ||
| 536 | + </div> | ||
| 537 | + </div> | ||
| 538 | + <div class="mb-3"> | ||
| 539 | + <label class="form-label">Cookie</label> | ||
| 540 | + <textarea class="form-control account-cookie" rows="2" placeholder="请输入微博Cookie">${account.cookie || ''}</textarea> | ||
| 541 | + </div> | ||
| 542 | + <div class="form-check mb-3"> | ||
| 543 | + <input class="form-check-input account-save-cookie" type="checkbox" ${account.saveCookie ? 'checked' : ''}> | ||
| 544 | + <label class="form-check-label"> | ||
| 545 | + 保存Cookie(加密存储) | ||
| 546 | + </label> | ||
| 547 | + </div> | ||
| 548 | + <div class="account-status alert alert-info"> | ||
| 549 | + 状态:待验证 | ||
| 550 | + <button class="btn btn-sm btn-outline-primary ms-2" onclick="validateAccount(${account.id})"> | ||
| 551 | + <i class="fas fa-check-circle"></i> 验证账号 | ||
| 552 | + </button> | ||
| 553 | + </div> | ||
| 554 | + `; | ||
| 555 | + | ||
| 556 | + accountDiv.innerHTML = content; | ||
| 557 | + accountDiv.appendChild(deleteButton); | ||
| 558 | + return accountDiv; | ||
| 559 | + } | ||
| 560 | + | ||
| 561 | + function addAccount() { | ||
| 562 | + const account = { | ||
| 563 | + id: accountIdCounter++, | ||
| 564 | + username: '', | ||
| 565 | + password: '', | ||
| 566 | + cookie: '', | ||
| 567 | + saveCookie: false, | ||
| 568 | + status: 'pending' | ||
| 569 | + }; | ||
| 570 | + accounts.push(account); | ||
| 571 | + | ||
| 572 | + const accountsList = document.getElementById('accountsList'); | ||
| 573 | + accountsList.appendChild(createAccountElement(account)); | ||
| 574 | + updateAccountsWarning(); | ||
| 575 | + } | ||
| 576 | + | ||
| 577 | + function removeAccount(id) { | ||
| 578 | + accounts = accounts.filter(account => account.id !== id); | ||
| 579 | + const accountElement = document.querySelector(`.account-item[data-id="${id}"]`); | ||
| 580 | + if (accountElement) { | ||
| 581 | + accountElement.remove(); | ||
| 582 | + } | ||
| 583 | + updateAccountsWarning(); | ||
| 584 | + } | ||
| 585 | + | ||
| 586 | + function updateAccountsWarning() { | ||
| 587 | + const warning = document.getElementById('noAccountsWarning'); | ||
| 588 | + warning.style.display = accounts.length === 0 ? 'block' : 'none'; | ||
| 589 | + } | ||
| 590 | + | ||
| 591 | + function getAccountsConfig() { | ||
| 592 | + return accounts.map(account => { | ||
| 593 | + const accountElement = document.querySelector(`.account-item[data-id="${account.id}"]`); | ||
| 594 | + return { | ||
| 595 | + username: accountElement.querySelector('.account-username').value, | ||
| 596 | + password: accountElement.querySelector('.account-password').value, | ||
| 597 | + cookie: accountElement.querySelector('.account-cookie').value, | ||
| 598 | + saveCookie: accountElement.querySelector('.account-save-cookie').checked | ||
| 599 | + }; | ||
| 600 | + }); | ||
| 601 | + } | ||
| 602 | + | ||
| 603 | + async function validateAccount(id) { | ||
| 604 | + const accountElement = document.querySelector(`.account-item[data-id="${id}"]`); | ||
| 605 | + const statusElement = accountElement.querySelector('.account-status'); | ||
| 606 | + const cookie = accountElement.querySelector('.account-cookie').value.trim(); | ||
| 607 | + | ||
| 608 | + if (!cookie) { | ||
| 609 | + statusElement.className = 'account-status alert alert-danger'; | ||
| 610 | + statusElement.innerHTML = '状态:验证失败 - Cookie不能为空'; | ||
| 611 | + return; | ||
| 612 | + } | ||
| 613 | + | ||
| 614 | + statusElement.className = 'account-status alert alert-warning'; | ||
| 615 | + statusElement.innerHTML = '状态:验证中...'; | ||
| 616 | + | ||
| 617 | + try { | ||
| 618 | + const response = await fetch('/api/spider/validate-account', { | ||
| 619 | + method: 'POST', | ||
| 620 | + headers: { | ||
| 621 | + 'Content-Type': 'application/json' | ||
| 622 | + }, | ||
| 623 | + body: JSON.stringify({ | ||
| 624 | + cookie: cookie | ||
| 625 | + }) | ||
| 626 | + }); | ||
| 627 | + | ||
| 628 | + const data = await response.json(); | ||
| 629 | + if (data.success) { | ||
| 630 | + statusElement.className = 'account-status alert alert-success'; | ||
| 631 | + statusElement.innerHTML = '状态:验证成功'; | ||
| 632 | + } else { | ||
| 633 | + statusElement.className = 'account-status alert alert-danger'; | ||
| 634 | + statusElement.innerHTML = `状态:验证失败 - ${data.message}`; | ||
| 635 | + } | ||
| 636 | + } catch (error) { | ||
| 637 | + statusElement.className = 'account-status alert alert-danger'; | ||
| 638 | + statusElement.innerHTML = `状态:验证失败 - ${error.message}`; | ||
| 639 | + } | ||
| 640 | + } | ||
| 641 | + | ||
| 642 | + // 正则筛选器管理 | ||
| 643 | + let regexFilters = []; | ||
| 644 | + let regexFilterIdCounter = 0; | ||
| 645 | + | ||
| 646 | + function createRegexFilterElement(filter) { | ||
| 647 | + const filterDiv = document.createElement('div'); | ||
| 648 | + filterDiv.className = 'border rounded p-3 mb-3 position-relative regex-filter-item'; | ||
| 649 | + filterDiv.dataset.id = filter.id; | ||
| 650 | + | ||
| 651 | + const deleteButton = document.createElement('button'); | ||
| 652 | + deleteButton.className = 'btn btn-sm btn-danger position-absolute top-0 end-0 m-2'; | ||
| 653 | + deleteButton.innerHTML = '<i class="fas fa-times"></i>'; | ||
| 654 | + deleteButton.onclick = () => removeRegexFilter(filter.id); | ||
| 655 | + | ||
| 656 | + const content = ` | ||
| 657 | + <div class="row"> | ||
| 658 | + <div class="col-md-6"> | ||
| 659 | + <div class="mb-3"> | ||
| 660 | + <label class="form-label">正则表达式</label> | ||
| 661 | + <input type="text" class="form-control regex-pattern" value="${filter.pattern || ''}" placeholder="输入正则表达式"> | ||
| 662 | + </div> | ||
| 663 | + </div> | ||
| 664 | + <div class="col-md-6"> | ||
| 665 | + <div class="mb-3"> | ||
| 666 | + <label class="form-label">匹配目标</label> | ||
| 667 | + <select class="form-select regex-target"> | ||
| 668 | + <option value="content" ${filter.target === 'content' ? 'selected' : ''}>微博内容</option> | ||
| 669 | + <option value="author" ${filter.target === 'author' ? 'selected' : ''}>作者名</option> | ||
| 670 | + <option value="location" ${filter.target === 'location' ? 'selected' : ''}>发布位置</option> | ||
| 671 | + </select> | ||
| 672 | + </div> | ||
| 673 | + </div> | ||
| 674 | + </div> | ||
| 675 | + <div class="form-check"> | ||
| 676 | + <input class="form-check-input regex-inverse" type="checkbox" ${filter.inverse ? 'checked' : ''}> | ||
| 677 | + <label class="form-check-label"> | ||
| 678 | + 反向匹配(不包含匹配项) | ||
| 679 | + </label> | ||
| 680 | + </div> | ||
| 681 | + `; | ||
| 682 | + | ||
| 683 | + filterDiv.innerHTML = content; | ||
| 684 | + filterDiv.appendChild(deleteButton); | ||
| 685 | + return filterDiv; | ||
| 686 | + } | ||
| 687 | + | ||
| 688 | + function addRegexFilter() { | ||
| 689 | + const filter = { | ||
| 690 | + id: regexFilterIdCounter++, | ||
| 691 | + pattern: '', | ||
| 692 | + target: 'content', | ||
| 693 | + inverse: false | ||
| 694 | + }; | ||
| 695 | + regexFilters.push(filter); | ||
| 696 | + | ||
| 697 | + const filtersList = document.getElementById('regexFilters'); | ||
| 698 | + filtersList.appendChild(createRegexFilterElement(filter)); | ||
| 699 | + } | ||
| 700 | + | ||
| 701 | + function removeRegexFilter(id) { | ||
| 702 | + regexFilters = regexFilters.filter(filter => filter.id !== id); | ||
| 703 | + const filterElement = document.querySelector(`.regex-filter-item[data-id="${id}"]`); | ||
| 704 | + if (filterElement) { | ||
| 705 | + filterElement.remove(); | ||
| 706 | + } | ||
| 707 | + } | ||
| 708 | + | ||
| 709 | + function getRegexFilters() { | ||
| 710 | + return regexFilters.map(filter => { | ||
| 711 | + const filterElement = document.querySelector(`.regex-filter-item[data-id="${filter.id}"]`); | ||
| 712 | + return { | ||
| 713 | + pattern: filterElement.querySelector('.regex-pattern').value, | ||
| 714 | + target: filterElement.querySelector('.regex-target').value, | ||
| 715 | + inverse: filterElement.querySelector('.regex-inverse').checked | ||
| 716 | + }; | ||
| 717 | + }).filter(filter => filter.pattern.trim() !== ''); | ||
| 718 | + } | ||
| 719 | + | ||
| 720 | + // 验证配置 | ||
| 721 | + function validateConfig() { | ||
| 722 | + // 验证正则表达式 | ||
| 723 | + const invalidRegex = regexFilters.some(filter => { | ||
| 724 | + const filterElement = document.querySelector(`.regex-filter-item[data-id="${filter.id}"]`); | ||
| 725 | + const pattern = filterElement.querySelector('.regex-pattern').value.trim(); | ||
| 726 | + if (pattern !== '') { | ||
| 727 | + try { | ||
| 728 | + new RegExp(pattern); | ||
| 729 | + return false; | ||
| 730 | + } catch (e) { | ||
| 731 | + alert(`正则表达式 "${pattern}" 格式无效!`); | ||
| 732 | + return true; | ||
| 733 | + } | ||
| 734 | + } | ||
| 735 | + return false; | ||
| 736 | + }); | ||
| 737 | + | ||
| 738 | + if (invalidRegex) { | ||
| 739 | + return false; | ||
| 740 | + } | ||
| 741 | + | ||
| 742 | + // 验证是否有账号配置 | ||
| 743 | + if (accounts.length === 0) { | ||
| 744 | + alert('请至少添加一个账号!'); | ||
| 745 | + return false; | ||
| 746 | + } | ||
| 747 | + | ||
| 748 | + // 验证每个账号是否都有Cookie | ||
| 749 | + const invalidAccounts = accounts.filter(account => { | ||
| 750 | + const accountElement = document.querySelector(`.account-item[data-id="${account.id}"]`); | ||
| 751 | + return !accountElement.querySelector('.account-cookie').value.trim(); | ||
| 752 | + }); | ||
| 753 | + | ||
| 754 | + if (invalidAccounts.length > 0) { | ||
| 755 | + alert('存在未配置Cookie的账号,请检查!'); | ||
| 756 | + return false; | ||
| 757 | + } | ||
| 758 | + | ||
| 759 | + // 验证并行配置 | ||
| 760 | + const maxConcurrent = parseInt(document.getElementById('maxConcurrent').value); | ||
| 761 | + const requestsPerMinute = parseInt(document.getElementById('requestsPerMinute').value); | ||
| 762 | + if (maxConcurrent < 1 || maxConcurrent > 5) { | ||
| 763 | + alert('最大并行数必须在1-5之间!'); | ||
| 764 | + return false; | ||
| 765 | + } | ||
| 766 | + if (requestsPerMinute < 30 || requestsPerMinute > 120) { | ||
| 767 | + alert('每分钟请求数必须在30-120之间!'); | ||
| 768 | + return false; | ||
| 769 | + } | ||
| 770 | + | ||
| 771 | + // 验证数据库配置 | ||
| 772 | + const dbConfig = { | ||
| 773 | + host: document.getElementById('dbHost').value.trim(), | ||
| 774 | + port: document.getElementById('dbPort').value.trim(), | ||
| 775 | + name: document.getElementById('dbName').value.trim(), | ||
| 776 | + user: document.getElementById('dbUser').value.trim(), | ||
| 777 | + password: document.getElementById('dbPassword').value.trim() | ||
| 778 | + }; | ||
| 779 | + | ||
| 780 | + if (!dbConfig.host || !dbConfig.port || !dbConfig.name || !dbConfig.user || !dbConfig.password) { | ||
| 781 | + alert('请完整填写数据库配置信息!'); | ||
| 782 | + return false; | ||
| 783 | + } | ||
| 784 | + | ||
| 785 | + return true; | ||
| 786 | + } | ||
| 787 | + | ||
| 788 | + // 测试数据库连接 | ||
| 789 | + async function testDbConnection() { | ||
| 790 | + const dbConfig = { | ||
| 791 | + type: document.getElementById('dbType').value, | ||
| 792 | + host: document.getElementById('dbHost').value, | ||
| 793 | + port: parseInt(document.getElementById('dbPort').value), | ||
| 794 | + name: document.getElementById('dbName').value, | ||
| 795 | + user: document.getElementById('dbUser').value, | ||
| 796 | + password: document.getElementById('dbPassword').value | ||
| 797 | + }; | ||
| 798 | + | ||
| 799 | + try { | ||
| 800 | + const response = await fetch('/api/spider/test-db', { | ||
| 801 | + method: 'POST', | ||
| 802 | + headers: { | ||
| 803 | + 'Content-Type': 'application/json' | ||
| 804 | + }, | ||
| 805 | + body: JSON.stringify(dbConfig) | ||
| 806 | + }); | ||
| 807 | + | ||
| 808 | + const data = await response.json(); | ||
| 809 | + if (data.success) { | ||
| 810 | + alert('数据库连接测试成功!'); | ||
| 811 | + } else { | ||
| 812 | + alert('数据库连接测试失败:' + data.message); | ||
| 813 | + } | ||
| 814 | + } catch (error) { | ||
| 815 | + alert('测试连接时发生错误:' + error.message); | ||
| 816 | + } | ||
| 817 | + } | ||
| 818 | + | ||
// When the database type changes, pre-fill the port field with that
// database's conventional default port. Unknown types leave the port untouched.
document.getElementById('dbType').addEventListener('change', function() {
    const defaultPorts = new Map([
        ['mysql', '3306'],
        ['postgresql', '5432'],
        ['mongodb', '27017']
    ]);

    const suggestedPort = defaultPorts.get(this.value);
    if (suggestedPort !== undefined) {
        document.getElementById('dbPort').value = suggestedPort;
    }
});
| 837 | + | ||
| 271 | // 保存配置 | 838 | // 保存配置 |
| 272 | function saveConfig() { | 839 | function saveConfig() { |
| 273 | const config = { | 840 | const config = { |
| @@ -14,6 +14,12 @@ import aiohttp | @@ -14,6 +14,12 @@ import aiohttp | ||
| 14 | from concurrent.futures import ThreadPoolExecutor | 14 | from concurrent.futures import ThreadPoolExecutor |
| 15 | from ratelimit import limits, sleep_and_retry | 15 | from ratelimit import limits, sleep_and_retry |
| 16 | from tenacity import retry, stop_after_attempt, wait_exponential | 16 | from tenacity import retry, stop_after_attempt, wait_exponential |
| 17 | +import pymysql | ||
| 18 | +import psycopg2 | ||
| 19 | +from pymongo import MongoClient | ||
| 20 | +from cryptography.fernet import Fernet | ||
| 21 | +import base64 | ||
| 22 | +import re | ||
| 17 | 23 | ||
| 18 | # 创建蓝图 | 24 | # 创建蓝图 |
| 19 | spider_bp = Blueprint('spider', __name__) | 25 | spider_bp = Blueprint('spider', __name__) |
| @@ -22,6 +28,10 @@ spider_bp = Blueprint('spider', __name__) | @@ -22,6 +28,10 @@ spider_bp = Blueprint('spider', __name__) | ||
| 22 | logger = logging.getLogger('spider_control') | 28 | logger = logging.getLogger('spider_control') |
| 23 | logger.setLevel(logging.INFO) | 29 | logger.setLevel(logging.INFO) |
| 24 | 30 | ||
| 31 | +# 加密密钥 | ||
| 32 | +ENCRYPTION_KEY = Fernet.generate_key() | ||
| 33 | +cipher_suite = Fernet(ENCRYPTION_KEY) | ||
| 34 | + | ||
| 25 | # 存储WebSocket连接的集合 | 35 | # 存储WebSocket连接的集合 |
| 26 | websocket_connections = set() | 36 | websocket_connections = set() |
| 27 | 37 | ||
| @@ -41,14 +51,93 @@ DEFAULT_CONFIG = { | @@ -41,14 +51,93 @@ DEFAULT_CONFIG = { | ||
| 41 | 'interval': 5, | 51 | 'interval': 5, |
| 42 | 'maxRetries': 3, | 52 | 'maxRetries': 3, |
| 43 | 'timeout': 30, | 53 | 'timeout': 30, |
| 44 | - 'maxConcurrent': 2 | 54 | + 'maxConcurrent': 2, |
| 55 | + 'requestsPerMinute': 60 | ||
| 45 | } | 56 | } |
| 46 | 57 | ||
| 47 | -# 限流装饰器 | ||
| 48 | -@sleep_and_retry | ||
| 49 | -@limits(calls=100, period=60) # 每分钟最多100个请求 | ||
| 50 | -def rate_limited_request(): | ||
def encrypt_data(data):
    """Encrypt a sensitive string with the module-level cipher.

    Args:
        data: Text to encrypt.

    Returns:
        The encrypted token as a string, or None when ``data`` is falsy
        (None or empty).
    """
    if data:
        token = cipher_suite.encrypt(data.encode())
        return token.decode()
    return None
| 63 | + | ||
def decrypt_data(encrypted_data):
    """Decrypt a token previously produced with the module-level cipher.

    Args:
        encrypted_data: Encrypted token as a string.

    Returns:
        The decrypted text, or None when ``encrypted_data`` is falsy
        (None or empty).
    """
    if encrypted_data:
        plaintext = cipher_suite.decrypt(encrypted_data.encode())
        return plaintext.decode()
    return None
| 69 | + | ||
@spider_bp.route('/api/spider/test-db', methods=['POST'])
def test_db_connection():
    """Check whether a database can be reached with the submitted settings.

    Expects a JSON body with ``type`` (``mysql``, ``postgresql`` or
    ``mongodb``), ``host``, ``port``, ``name``, ``user`` and ``password``.
    A connection is opened and immediately closed again.

    Returns:
        A JSON response with ``success`` (bool) and ``message`` (str).
    """
    # Bound every connection attempt so an unreachable host cannot tie up
    # the request for the driver's (much longer) default timeout.
    connect_timeout = 5  # seconds

    try:
        # silent=True: a missing or malformed body is reported as incomplete
        # configuration instead of failing on None.get(...).
        data = request.get_json(silent=True) or {}
        db_type = data.get('type')
        host = data.get('host')
        port = data.get('port')
        db_name = data.get('name')
        user = data.get('user')
        password = data.get('password')

        if not all([db_type, host, port, db_name, user, password]):
            return jsonify({
                'success': False,
                'message': '请提供完整的数据库配置信息'
            })

        # The drivers expect an integer port; accept numeric strings too.
        try:
            port = int(port)
        except (TypeError, ValueError):
            return jsonify({
                'success': False,
                'message': '数据库端口必须是整数'
            })

        try:
            if db_type == 'mysql':
                connection = pymysql.connect(
                    host=host,
                    port=port,
                    user=user,
                    password=password,
                    database=db_name,
                    connect_timeout=connect_timeout
                )
                connection.close()
            elif db_type == 'postgresql':
                connection = psycopg2.connect(
                    host=host,
                    port=port,
                    database=db_name,
                    user=user,
                    password=password,
                    connect_timeout=connect_timeout
                )
                connection.close()
            elif db_type == 'mongodb':
                client = MongoClient(
                    host=host,
                    port=port,
                    username=user,
                    password=password,
                    authSource=db_name,
                    serverSelectionTimeoutMS=connect_timeout * 1000
                )
                try:
                    # MongoClient connects lazily; server_info() forces a
                    # round trip so that failures surface here.
                    client.server_info()
                finally:
                    # Close the client even when the probe fails.
                    client.close()
            else:
                return jsonify({
                    'success': False,
                    'message': '不支持的数据库类型'
                })

            return jsonify({
                'success': True,
                'message': '数据库连接测试成功'
            })

        except Exception as e:
            logger.error(f"数据库连接测试失败: {str(e)}")
            return jsonify({
                'success': False,
                'message': f'数据库连接失败: {str(e)}'
            })

    except Exception as e:
        logger.error(f"处理数据库测试请求时出错: {str(e)}")
        return jsonify({
            'success': False,
            'message': str(e)
        })
| 52 | 141 | ||
| 53 | class SpiderWorker: | 142 | class SpiderWorker: |
| 54 | def __init__(self, topics, parameters): | 143 | def __init__(self, topics, parameters): |
| @@ -60,6 +149,50 @@ class SpiderWorker: | @@ -60,6 +149,50 @@ class SpiderWorker: | ||
| 60 | self.message_buffer = [] | 149 | self.message_buffer = [] |
| 61 | self.message_buffer_size = 10 | 150 | self.message_buffer_size = 10 |
| 62 | self.semaphore = asyncio.Semaphore(parameters.get('maxConcurrent', DEFAULT_CONFIG['maxConcurrent'])) | 151 | self.semaphore = asyncio.Semaphore(parameters.get('maxConcurrent', DEFAULT_CONFIG['maxConcurrent'])) |
| 152 | + self.rate_limiter = asyncio.Semaphore(parameters.get('requestsPerMinute', DEFAULT_CONFIG['requestsPerMinute'])) | ||
| 153 | + self.accounts = parameters.get('accounts', []) | ||
| 154 | + self.current_account_index = 0 | ||
| 155 | + self.account_lock = asyncio.Lock() | ||
| 156 | + | ||
| 157 | + # 添加筛选条件 | ||
| 158 | + self.filters = parameters.get('filters', {}) | ||
| 159 | + self.interaction_filters = self.filters.get('interaction', {}) | ||
| 160 | + self.regex_filters = self.filters.get('regex', []) | ||
| 161 | + self.filter_options = self.filters.get('options', {}) | ||
| 162 | + | ||
| 163 | + # 初始化正则表达式 | ||
| 164 | + self.compiled_regex = [] | ||
| 165 | + for regex_filter in self.regex_filters: | ||
| 166 | + try: | ||
| 167 | + pattern = regex_filter['pattern'] | ||
| 168 | + if pattern: | ||
| 169 | + self.compiled_regex.append({ | ||
| 170 | + 'regex': re.compile(pattern), | ||
| 171 | + 'target': regex_filter['target'], | ||
| 172 | + 'inverse': regex_filter['inverse'] | ||
| 173 | + }) | ||
| 174 | + except re.error as e: | ||
| 175 | + logger.error(f"正则表达式编译失败: {pattern}, 错误: {e}") | ||
| 176 | + | ||
def get_next_account(self):
    """Return the next account, cycling through ``self.accounts`` in order.

    Returns:
        The account entry at the current index; the index is then advanced
        and wraps around at the end of the list.

    Raises:
        ValueError: If no accounts are configured.
    """
    # self.account_lock is an asyncio.Lock, which only supports
    # ``async with``; a plain ``with`` on it fails at runtime. This method
    # is synchronous and contains no await points, so when it is called
    # from a coroutine on the event loop it cannot be interleaved with
    # another call and needs no lock.
    if not self.accounts:
        raise ValueError("没有可用的账号")

    account = self.accounts[self.current_account_index]
    self.current_account_index = (self.current_account_index + 1) % len(self.accounts)
    return account
| 186 | + | ||
async def acquire_rate_limit(self):
    """Take one rate-limit token and schedule its release.

    Waits until ``self.rate_limiter`` has a free slot, then starts a
    background task running ``release_rate_limit`` to hand the slot back.
    """
    await self.rate_limiter.acquire()

    # The event loop keeps only weak references to tasks, so a task nobody
    # else references may be garbage-collected before it finishes, and the
    # token would then never be released. Keep a strong reference until the
    # task is done.
    pending = getattr(self, '_rate_limit_release_tasks', None)
    if pending is None:
        pending = set()
        self._rate_limit_release_tasks = pending

    release_task = asyncio.create_task(self.release_rate_limit())
    pending.add(release_task)
    release_task.add_done_callback(pending.discard)

async def release_rate_limit(self):
    """Give a rate-limit token back after a 60-second delay."""
    await asyncio.sleep(60)  # hold the token for one minute
    self.rate_limiter.release()
| 63 | 196 | ||
| 64 | async def send_message(self, message): | 197 | async def send_message(self, message): |
| 65 | """异步发送消息,使用缓冲区优化""" | 198 | """异步发送消息,使用缓冲区优化""" |
| @@ -82,22 +215,43 @@ class SpiderWorker: | @@ -82,22 +215,43 @@ class SpiderWorker: | ||
| 82 | async def crawl_single_topic(self, topic): | 215 | async def crawl_single_topic(self, topic): |
| 83 | """爬取单个话题""" | 216 | """爬取单个话题""" |
| 84 | try: | 217 | try: |
| 85 | - rate_limited_request() | 218 | + await self.acquire_rate_limit() |
| 219 | + | ||
| 220 | + # 获取当前要使用的账号 | ||
| 221 | + account = self.get_next_account() | ||
| 86 | 222 | ||
| 87 | await self.send_message({ | 223 | await self.send_message({ |
| 88 | 'type': 'log', | 224 | 'type': 'log', |
| 89 | - 'message': f'开始爬取话题: {topic}' | 225 | + 'message': f'使用账号 {account["username"]} 开始爬取话题: {topic}' |
| 90 | }) | 226 | }) |
| 91 | 227 | ||
| 228 | + filtered_count = 0 | ||
| 229 | + total_count = 0 | ||
| 230 | + | ||
| 92 | async with self.semaphore: | 231 | async with self.semaphore: |
| 232 | + # 创建一个回调函数来处理爬取的数据 | ||
| 233 | + def process_post(post): | ||
| 234 | + nonlocal filtered_count, total_count | ||
| 235 | + total_count += 1 | ||
| 236 | + | ||
| 237 | + # 应用筛选条件 | ||
| 238 | + if self.apply_filters(post): | ||
| 239 | + filtered_count += 1 | ||
| 240 | + return True | ||
| 241 | + return False | ||
| 242 | + | ||
| 243 | + # 调用爬虫并传入回调函数 | ||
| 93 | await asyncio.get_event_loop().run_in_executor( | 244 | await asyncio.get_event_loop().run_in_executor( |
| 94 | thread_pool, | 245 | thread_pool, |
| 95 | - self.spider.crawl_topic, | ||
| 96 | - topic, | ||
| 97 | - self.parameters['crawlDepth'], | ||
| 98 | - self.parameters['interval'], | ||
| 99 | - self.parameters['maxRetries'], | ||
| 100 | - self.parameters['timeout'] | 246 | + lambda: self.spider.crawl_topic( |
| 247 | + topic, | ||
| 248 | + self.parameters['crawlDepth'], | ||
| 249 | + self.parameters['interval'], | ||
| 250 | + self.parameters['maxRetries'], | ||
| 251 | + self.parameters['timeout'], | ||
| 252 | + account['cookie'], | ||
| 253 | + process_post # 传入回调函数 | ||
| 254 | + ) | ||
| 101 | ) | 255 | ) |
| 102 | 256 | ||
| 103 | self.completed_topics += 1 | 257 | self.completed_topics += 1 |
| @@ -108,9 +262,10 @@ class SpiderWorker: | @@ -108,9 +262,10 @@ class SpiderWorker: | ||
| 108 | 'value': progress | 262 | 'value': progress |
| 109 | }) | 263 | }) |
| 110 | 264 | ||
| 265 | + # 发送筛选统计信息 | ||
| 111 | await self.send_message({ | 266 | await self.send_message({ |
| 112 | 'type': 'log', | 267 | 'type': 'log', |
| 113 | - 'message': f'话题 {topic} 爬取完成' | 268 | + 'message': f'话题 {topic} 爬取完成,共爬取 {total_count} 条微博,符合筛选条件 {filtered_count} 条' |
| 114 | }) | 269 | }) |
| 115 | 270 | ||
| 116 | except Exception as e: | 271 | except Exception as e: |
| @@ -142,6 +297,116 @@ class SpiderWorker: | @@ -142,6 +297,116 @@ class SpiderWorker: | ||
| 142 | finally: | 297 | finally: |
| 143 | await self.flush_messages() | 298 | await self.flush_messages() |
| 144 | 299 | ||
def apply_filters(self, post):
    """Decide whether a single post passes every configured filter.

    The interaction-metric, regex and advanced-option checks run in that
    order, stopping at the first one that fails.

    Args:
        post: Dictionary describing one post.

    Returns:
        bool: True when all checks pass; False when any check fails or an
        exception is raised while checking.
    """
    try:
        passed = (
            self._check_interaction_metrics(post)
            and self._check_regex_filters(post)
            and self._check_advanced_options(post)
        )
        return bool(passed)
    except Exception as e:
        logger.error(f"应用筛选条件时出错: {e}")
        return False
| 328 | + | ||
| 329 | + def _check_interaction_metrics(self, post): | ||
| 330 | + """检查互动指标是否满足条件""" | ||
| 331 | + try: | ||
| 332 | + # 获取互动指标的最小值要求 | ||
| 333 | + min_likes = self.interaction_filters.get('minLikes', 0) | ||
| 334 | + min_comments = self.interaction_filters.get('minComments', 0) | ||
| 335 | + min_reposts = self.interaction_filters.get('minReposts', 0) | ||
| 336 | + min_reads = self.interaction_filters.get('minReads', 0) | ||
| 337 | + | ||
| 338 | + # 检查是否满足所有条件 | ||
| 339 | + if post.get('like_count', 0) < min_likes: | ||
| 340 | + return False | ||
| 341 | + if post.get('comment_count', 0) < min_comments: | ||
| 342 | + return False | ||
| 343 | + if post.get('forward_count', 0) < min_reposts: | ||
| 344 | + return False | ||
| 345 | + if post.get('read_count', 0) < min_reads: | ||
| 346 | + return False | ||
| 347 | + | ||
| 348 | + return True | ||
| 349 | + | ||
| 350 | + except Exception as e: | ||
| 351 | + logger.error(f"检查互动指标时出错: {e}") | ||
| 352 | + return False | ||
| 353 | + | ||
| 354 | + def _check_regex_filters(self, post): | ||
| 355 | + """检查正则表达式匹配""" | ||
| 356 | + try: | ||
| 357 | + for regex_filter in self.compiled_regex: | ||
| 358 | + regex = regex_filter['regex'] | ||
| 359 | + target = regex_filter['target'] | ||
| 360 | + inverse = regex_filter['inverse'] | ||
| 361 | + | ||
| 362 | + # 获取目标文本 | ||
| 363 | + if target == 'content': | ||
| 364 | + text = post.get('content', '') | ||
| 365 | + elif target == 'author': | ||
| 366 | + text = post.get('user_name', '') | ||
| 367 | + elif target == 'location': | ||
| 368 | + text = post.get('location', '') | ||
| 369 | + else: | ||
| 370 | + continue | ||
| 371 | + | ||
| 372 | + # 执行匹配 | ||
| 373 | + match = bool(regex.search(text)) | ||
| 374 | + | ||
| 375 | + # 如果是反向匹配,取反结果 | ||
| 376 | + if inverse: | ||
| 377 | + match = not match | ||
| 378 | + | ||
| 379 | + # 如果不满足条件,返回False | ||
| 380 | + if not match: | ||
| 381 | + return False | ||
| 382 | + | ||
| 383 | + return True | ||
| 384 | + | ||
| 385 | + except Exception as e: | ||
| 386 | + logger.error(f"检查正则匹配时出错: {e}") | ||
| 387 | + return False | ||
| 388 | + | ||
| 389 | + def _check_advanced_options(self, post): | ||
| 390 | + """检查高级筛选选项""" | ||
| 391 | + try: | ||
| 392 | + # 检查是否只要原创内容 | ||
| 393 | + if self.filter_options.get('originalOnly') and not post.get('is_original', False): | ||
| 394 | + return False | ||
| 395 | + | ||
| 396 | + # 检查是否必须包含媒体 | ||
| 397 | + if self.filter_options.get('withMediaOnly') and not post.get('has_media', False): | ||
| 398 | + return False | ||
| 399 | + | ||
| 400 | + # 检查是否只要认证用户 | ||
| 401 | + if self.filter_options.get('verifiedOnly') and not post.get('user_verified', False): | ||
| 402 | + return False | ||
| 403 | + | ||
| 404 | + return True | ||
| 405 | + | ||
| 406 | + except Exception as e: | ||
| 407 | + logger.error(f"检查高级选项时出错: {e}") | ||
| 408 | + return False | ||
| 409 | + | ||
| 145 | async def broadcast_message(messages): | 410 | async def broadcast_message(messages): |
| 146 | """广播消息到所有WebSocket连接""" | 411 | """广播消息到所有WebSocket连接""" |
| 147 | if not websocket_connections: | 412 | if not websocket_connections: |
| @@ -172,6 +437,7 @@ async def start_spider(): | @@ -172,6 +437,7 @@ async def start_spider(): | ||
| 172 | data = request.get_json() | 437 | data = request.get_json() |
| 173 | topics = data.get('topics', []) | 438 | topics = data.get('topics', []) |
| 174 | parameters = {**DEFAULT_CONFIG, **data.get('parameters', {})} | 439 | parameters = {**DEFAULT_CONFIG, **data.get('parameters', {})} |
| 440 | + accounts = data.get('accounts', []) | ||
| 175 | 441 | ||
| 176 | if not topics: | 442 | if not topics: |
| 177 | return jsonify({ | 443 | return jsonify({ |
| @@ -179,6 +445,20 @@ async def start_spider(): | @@ -179,6 +445,20 @@ async def start_spider(): | ||
| 179 | 'message': '请选择至少一个话题' | 445 | 'message': '请选择至少一个话题' |
| 180 | }) | 446 | }) |
| 181 | 447 | ||
| 448 | + if not accounts: | ||
| 449 | + return jsonify({ | ||
| 450 | + 'success': False, | ||
| 451 | + 'message': '请配置至少一个账号' | ||
| 452 | + }) | ||
| 453 | + | ||
| 454 | + # 处理账号Cookie的加密存储 | ||
| 455 | + for account in accounts: | ||
| 456 | + if account.get('saveCookie'): | ||
| 457 | + account['cookie'] = encrypt_data(account['cookie']) | ||
| 458 | + | ||
| 459 | + # 将账号信息添加到参数中 | ||
| 460 | + parameters['accounts'] = accounts | ||
| 461 | + | ||
| 182 | # 创建爬虫工作器 | 462 | # 创建爬虫工作器 |
| 183 | worker = SpiderWorker(topics, parameters) | 463 | worker = SpiderWorker(topics, parameters) |
| 184 | 464 | ||
| @@ -298,17 +578,36 @@ def generate_ai_config(): | @@ -298,17 +578,36 @@ def generate_ai_config(): | ||
| 298 | 578 | ||
| 299 | # 构建AI提示 | 579 | # 构建AI提示 |
| 300 | system_prompt = """你是一个专业的爬虫配置助手。请根据用户的自然语言描述,生成合适的微博爬虫配置。 | 580 | system_prompt = """你是一个专业的爬虫配置助手。请根据用户的自然语言描述,生成合适的微博爬虫配置。 |
| 581 | + | ||
| 301 | 配置应包含以下内容: | 582 | 配置应包含以下内容: |
| 302 | 1. 要爬取的话题列表 | 583 | 1. 要爬取的话题列表 |
| 303 | -2. 爬虫参数(爬取深度、间隔时间、重试次数、超时时间) | 584 | +2. 爬虫参数配置 |
| 585 | + - 爬取深度(crawlDepth):1-10页 | ||
| 586 | + - 间隔时间(interval):3-30秒 | ||
| 587 | + - 重试次数(maxRetries):1-5次 | ||
| 588 | + - 超时时间(timeout):10-60秒 | ||
| 589 | + - 最大并行数(maxConcurrent):1-5 | ||
| 590 | + - 每分钟请求数限制(requestsPerMinute):30-120 | ||
| 591 | + | ||
| 592 | +3. 内容筛选条件 | ||
| 593 | + a) 互动数据筛选(设为0表示不启用) | ||
| 594 | + - 最小点赞数(minLikes) | ||
| 595 | + - 最小评论数(minComments) | ||
| 596 | + - 最小转发数(minReposts) | ||
| 597 | + - 最小阅读数(minReads) | ||
| 598 | + | ||
| 599 | + b) 正则表达式筛选(数组,可以有多个规则) | ||
| 600 | + - pattern: 正则表达式模式 | ||
| 601 | + - target: 匹配目标(content/author/location) | ||
| 602 | + - inverse: 是否反向匹配(true/false) | ||
| 603 | + | ||
| 604 | + c) 高级筛选选项(布尔值) | ||
| 605 | + - originalOnly: 是否只要原创内容 | ||
| 606 | + - withMediaOnly: 是否必须包含媒体 | ||
| 607 | + - verifiedOnly: 是否只要认证用户 | ||
| 304 | 608 | ||
| 305 | 请先用通俗易懂的语言解释你的配置建议,然后在最后提供一个JSON格式的具体配置。 | 609 | 请先用通俗易懂的语言解释你的配置建议,然后在最后提供一个JSON格式的具体配置。 |
| 306 | -注意: | ||
| 307 | -- 爬取深度(crawlDepth)范围:1-10页 | ||
| 308 | -- 间隔时间(interval)范围:3-30秒 | ||
| 309 | -- 重试次数(maxRetries)范围:1-5次 | ||
| 310 | -- 超时时间(timeout)范围:10-60秒 | ||
| 311 | -- 所有参数都必须是整数 | 610 | +所有数值参数必须是整数,并且在指定范围内。 |
| 312 | 611 | ||
| 313 | 示例输出格式: | 612 | 示例输出格式: |
| 314 | 根据您的需求,我建议... | 613 | 根据您的需求,我建议... |
| @@ -319,7 +618,29 @@ def generate_ai_config(): | @@ -319,7 +618,29 @@ def generate_ai_config(): | ||
| 319 | "crawlDepth": 5, | 618 | "crawlDepth": 5, |
| 320 | "interval": 5, | 619 | "interval": 5, |
| 321 | "maxRetries": 3, | 620 | "maxRetries": 3, |
| 322 | - "timeout": 30 | 621 | + "timeout": 30, |
| 622 | + "maxConcurrent": 2, | ||
| 623 | + "requestsPerMinute": 60 | ||
| 624 | + }, | ||
| 625 | + "filters": { | ||
| 626 | + "interaction": { | ||
| 627 | + "minLikes": 1000, | ||
| 628 | + "minComments": 100, | ||
| 629 | + "minReposts": 50, | ||
| 630 | + "minReads": 10000 | ||
| 631 | + }, | ||
| 632 | + "regex": [ | ||
| 633 | + { | ||
| 634 | + "pattern": "关键词", | ||
| 635 | + "target": "content", | ||
| 636 | + "inverse": false | ||
| 637 | + } | ||
| 638 | + ], | ||
| 639 | + "options": { | ||
| 640 | + "originalOnly": true, | ||
| 641 | + "withMediaOnly": false, | ||
| 642 | + "verifiedOnly": true | ||
| 643 | + } | ||
| 323 | } | 644 | } |
| 324 | }""" | 645 | }""" |
| 325 | 646 | ||
| @@ -368,4 +689,52 @@ def generate_ai_config(): | @@ -368,4 +689,52 @@ def generate_ai_config(): | ||
| 368 | return jsonify({ | 689 | return jsonify({ |
| 369 | 'success': False, | 690 | 'success': False, |
| 370 | 'message': str(e) | 691 | 'message': str(e) |
| 692 | + }) | ||
| 693 | + | ||
@spider_bp.route('/api/spider/validate-account', methods=['POST'])
async def validate_account():
    """Check whether a submitted Weibo cookie is accepted by the site.

    Expects a JSON body with a ``cookie`` field. The cookie is sent with a
    GET request to the Weibo profile-info endpoint; it counts as valid when
    the reply has status 200 and its JSON contains a non-empty
    ``data.user`` entry.

    Returns:
        A JSON response with ``success`` (bool) and ``message`` (str).
    """
    try:
        # silent=True: treat a missing or malformed body like a missing cookie.
        payload = request.get_json(silent=True) or {}
        cookie = payload.get('cookie')

        if not cookie:
            return jsonify({
                'success': False,
                'message': 'Cookie不能为空'
            })

        # Send the probe request.
        try:
            # Bound the whole request so a stalled upstream cannot hang
            # this handler.
            timeout = aiohttp.ClientTimeout(total=10)
            async with aiohttp.ClientSession(timeout=timeout) as session:
                headers = {
                    'Cookie': cookie,
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
                }
                async with session.get('https://weibo.com/ajax/profile/info', headers=headers) as response:
                    if response.status == 200:
                        # A 200 reply that is not JSON (for example an HTML
                        # page) means the cookie was not accepted; it is not
                        # an internal error.
                        try:
                            profile = await response.json(content_type=None)
                        except ValueError:
                            profile = None

                        user_info = None
                        if isinstance(profile, dict):
                            profile_data = profile.get('data')
                            if isinstance(profile_data, dict):
                                user_info = profile_data.get('user')

                        if user_info:
                            return jsonify({
                                'success': True,
                                'message': '账号验证成功'
                            })

                    return jsonify({
                        'success': False,
                        'message': 'Cookie无效或已过期'
                    })
        except Exception as e:
            logger.error(f"验证账号时发生错误: {e}")
            return jsonify({
                'success': False,
                'message': f'验证过程出错: {str(e)}'
            })

    except Exception as e:
        logger.error(f"处理账号验证请求时出错: {e}")
        return jsonify({
            'success': False,
            'message': str(e)
        })
-
Please register or login to post a comment