戒酒的李白

Optimize the crawler configuration page, add multi-account parallel functionality, adapt AI configuration features, and include database configuration options.
@@ -93,8 +93,20 @@ class SpiderData: @@ -93,8 +93,20 @@ class SpiderData:
93 connection.rollback() 93 connection.rollback()
94 94
95 def crawl_topic(self, topic: str, depth: int = 3, interval: int = 5, 95 def crawl_topic(self, topic: str, depth: int = 3, interval: int = 5,
96 - max_retries: int = 3, timeout: int = 30):  
97 - """爬取指定话题的微博内容""" 96 + max_retries: int = 3, timeout: int = 30, cookie: str = None,
  97 + filter_callback = None):
  98 + """
  99 + 爬取指定话题的微博内容
  100 +
  101 + Args:
  102 + topic: 话题关键词
  103 + depth: 爬取深度(页数)
  104 + interval: 请求间隔(秒)
  105 + max_retries: 最大重试次数
  106 + timeout: 请求超时时间(秒)
  107 + cookie: 用户Cookie
  108 + filter_callback: 筛选回调函数,返回True表示保留该条微博
  109 + """
98 # 参数验证 110 # 参数验证
99 if not isinstance(depth, int) or depth < 1 or depth > 10: 111 if not isinstance(depth, int) or depth < 1 or depth > 10:
100 raise ValueError("爬取深度必须在1-10页之间") 112 raise ValueError("爬取深度必须在1-10页之间")
@@ -105,6 +117,10 @@ class SpiderData: @@ -105,6 +117,10 @@ class SpiderData:
105 if not isinstance(timeout, int) or timeout < 10 or timeout > 60: 117 if not isinstance(timeout, int) or timeout < 10 or timeout > 60:
106 raise ValueError("请求超时时间必须在10-60秒之间") 118 raise ValueError("请求超时时间必须在10-60秒之间")
107 119
  120 + # 更新请求头中的Cookie
  121 + if cookie:
  122 + self.headers['Cookie'] = cookie
  123 +
108 logging.info(f"开始爬取话题: {topic}, 参数: depth={depth}, interval={interval}, max_retries={max_retries}, timeout={timeout}") 124 logging.info(f"开始爬取话题: {topic}, 参数: depth={depth}, interval={interval}, max_retries={max_retries}, timeout={timeout}")
109 125
110 for page in range(1, depth + 1): 126 for page in range(1, depth + 1):
@@ -116,7 +132,7 @@ class SpiderData: @@ -116,7 +132,7 @@ class SpiderData:
116 # 检查缓存 132 # 检查缓存
117 cached_content = self._get_cached_page(url) 133 cached_content = self._get_cached_page(url)
118 if cached_content: 134 if cached_content:
119 - self._parse_page(cached_content) 135 + self._parse_page(cached_content, filter_callback)
120 logging.info(f"使用缓存数据: {topic} 第 {page} 页") 136 logging.info(f"使用缓存数据: {topic} 第 {page} 页")
121 break 137 break
122 138
@@ -125,7 +141,7 @@ class SpiderData: @@ -125,7 +141,7 @@ class SpiderData:
125 if response.status_code == 200: 141 if response.status_code == 200:
126 # 缓存页面内容 142 # 缓存页面内容
127 self._cache_page(url, response.text) 143 self._cache_page(url, response.text)
128 - self._parse_page(response.text) 144 + self._parse_page(response.text, filter_callback)
129 logging.info(f"成功爬取话题 {topic} 第 {page} 页") 145 logging.info(f"成功爬取话题 {topic} 第 {page} 页")
130 break 146 break
131 else: 147 else:
@@ -154,8 +170,14 @@ class SpiderData: @@ -154,8 +170,14 @@ class SpiderData:
154 # 最后刷新缓冲区 170 # 最后刷新缓冲区
155 self._flush_buffer() 171 self._flush_buffer()
156 172
157 - def _parse_page(self, html_content: str):  
158 - """解析页面内容并保存数据""" 173 + def _parse_page(self, html_content: str, filter_callback = None):
  174 + """
  175 + 解析页面内容并保存数据
  176 +
  177 + Args:
  178 + html_content: HTML页面内容
  179 + filter_callback: 筛选回调函数
  180 + """
159 try: 181 try:
160 soup = BeautifulSoup(html_content, 'html.parser') 182 soup = BeautifulSoup(html_content, 'html.parser')
161 weibo_items = soup.find_all('div', class_='card-wrap') 183 weibo_items = soup.find_all('div', class_='card-wrap')
@@ -178,6 +200,19 @@ class SpiderData: @@ -178,6 +200,19 @@ class SpiderData:
178 # 提取互动数据 200 # 提取互动数据
179 actions = item.find_all('li', class_='action') 201 actions = item.find_all('li', class_='action')
180 202
  203 + # 提取用户认证状态
  204 + user_verified = bool(item.find('i', class_='icon-vip'))
  205 +
  206 + # 提取是否原创
  207 + is_original = not bool(item.find('span', class_='repost'))
  208 +
  209 + # 提取是否包含媒体
  210 + has_media = bool(item.find('div', class_='media'))
  211 +
  212 + # 提取发布位置
  213 + location = item.find('a', class_='location')
  214 + location_text = location.text.strip() if location else ''
  215 +
181 # 构建数据字典 216 # 构建数据字典
182 weibo_data = { 217 weibo_data = {
183 'content': content.text.strip(), 218 'content': content.text.strip(),
@@ -186,9 +221,16 @@ class SpiderData: @@ -186,9 +221,16 @@ class SpiderData:
186 'forward_count': self._extract_number(actions[0].text) if len(actions) > 0 else 0, 221 'forward_count': self._extract_number(actions[0].text) if len(actions) > 0 else 0,
187 'comment_count': self._extract_number(actions[1].text) if len(actions) > 1 else 0, 222 'comment_count': self._extract_number(actions[1].text) if len(actions) > 1 else 0,
188 'like_count': self._extract_number(actions[2].text) if len(actions) > 2 else 0, 223 'like_count': self._extract_number(actions[2].text) if len(actions) > 2 else 0,
  224 + 'read_count': self._extract_number(actions[3].text) if len(actions) > 3 else 0,
  225 + 'user_verified': user_verified,
  226 + 'is_original': is_original,
  227 + 'has_media': has_media,
  228 + 'location': location_text,
189 'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S') 229 'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
190 } 230 }
191 231
  232 + # 如果有筛选回调函数,则进行筛选
  233 + if filter_callback is None or filter_callback(weibo_data):
192 # 添加到插入缓冲区 234 # 添加到插入缓冲区
193 self.insert_buffer.append(weibo_data) 235 self.insert_buffer.append(weibo_data)
194 236
@@ -103,6 +103,214 @@ @@ -103,6 +103,214 @@
103 </div> 103 </div>
104 </div> 104 </div>
105 105
  106 + <!-- 内容筛选配置 -->
  107 + <div class="card mb-4">
  108 + <div class="card-header d-flex justify-content-between align-items-center">
  109 + <h5 class="mb-0">内容筛选配置</h5>
  110 + <button class="btn btn-sm btn-outline-primary" type="button" data-bs-toggle="collapse" data-bs-target="#filterHelp">
  111 + <i class="fas fa-question-circle"></i> 帮助
  112 + </button>
  113 + </div>
  114 + <div class="collapse" id="filterHelp">
  115 + <div class="card-body bg-light">
  116 + <h6>筛选条件说明:</h6>
  117 + <ul>
  118 + <li>数值条件:设置大于某个值进行筛选,如点赞数>1000</li>
  119 + <li>正则匹配:使用正则表达式匹配内容,如包含特定关键词</li>
  120 + <li>多个条件之间是"与"的关系,即同时满足才会保留</li>
  121 + </ul>
  122 + <div class="alert alert-info">
  123 + <i class="fas fa-info-circle"></i> 提示:合理设置筛选条件可以提高数据质量
  124 + </div>
  125 + </div>
  126 + </div>
  127 + <div class="card-body">
  128 + <!-- 互动数据筛选 -->
  129 + <h6 class="mb-3">互动数据筛选</h6>
  130 + <div class="row">
  131 + <div class="col-md-3">
  132 + <div class="mb-3">
  133 + <label class="form-label">点赞数大于</label>
  134 + <input type="number" class="form-control" id="minLikes" value="0" min="0">
  135 + </div>
  136 + </div>
  137 + <div class="col-md-3">
  138 + <div class="mb-3">
  139 + <label class="form-label">评论数大于</label>
  140 + <input type="number" class="form-control" id="minComments" value="0" min="0">
  141 + </div>
  142 + </div>
  143 + <div class="col-md-3">
  144 + <div class="mb-3">
  145 + <label class="form-label">转发数大于</label>
  146 + <input type="number" class="form-control" id="minReposts" value="0" min="0">
  147 + </div>
  148 + </div>
  149 + <div class="col-md-3">
  150 + <div class="mb-3">
  151 + <label class="form-label">阅读数大于</label>
  152 + <input type="number" class="form-control" id="minReads" value="0" min="0">
  153 + </div>
  154 + </div>
  155 + </div>
  156 +
  157 + <!-- 内容正则筛选 -->
  158 + <h6 class="mb-3 mt-4">内容正则筛选</h6>
  159 + <div id="regexFilters">
  160 + <!-- 正则表达式筛选器列表 -->
  161 + </div>
  162 + <button class="btn btn-outline-primary btn-sm mt-2" onclick="addRegexFilter()">
  163 + <i class="fas fa-plus"></i> 添加正则筛选
  164 + </button>
  165 +
  166 + <!-- 高级筛选选项 -->
  167 + <h6 class="mb-3 mt-4">高级选项</h6>
  168 + <div class="form-check mb-2">
  169 + <input class="form-check-input" type="checkbox" id="filterOriginal">
  170 + <label class="form-check-label" for="filterOriginal">
  171 + 仅爬取原创内容
  172 + </label>
  173 + </div>
  174 + <div class="form-check mb-2">
  175 + <input class="form-check-input" type="checkbox" id="filterWithMedia">
  176 + <label class="form-check-label" for="filterWithMedia">
  177 + 必须包含图片或视频
  178 + </label>
  179 + </div>
  180 + <div class="form-check">
  181 + <input class="form-check-input" type="checkbox" id="filterVerified">
  182 + <label class="form-check-label" for="filterVerified">
  183 + 仅认证用户的内容
  184 + </label>
  185 + </div>
  186 + </div>
  187 + </div>
  188 +
  189 + <!-- 账号配置 -->
  190 + <div class="card mb-4">
  191 + <div class="card-header d-flex justify-content-between align-items-center">
  192 + <h5 class="mb-0">账号配置</h5>
  193 + <div>
  194 + <button class="btn btn-sm btn-outline-primary me-2" type="button" data-bs-toggle="collapse" data-bs-target="#accountHelp">
  195 + <i class="fas fa-question-circle"></i> 帮助
  196 + </button>
  197 + <button class="btn btn-sm btn-success" onclick="addAccount()">
  198 + <i class="fas fa-plus"></i> 添加账号
  199 + </button>
  200 + </div>
  201 + </div>
  202 + <div class="collapse" id="accountHelp">
  203 + <div class="card-body bg-light">
  204 + <h6>如何获取Cookie?</h6>
  205 + <ol>
  206 + <li>登录微博网页版</li>
  207 + <li>按F12打开开发者工具</li>
  208 + <li>切换到Network标签页</li>
  209 + <li>刷新页面,找到请求头中的Cookie值</li>
  210 + </ol>
  211 + <div class="alert alert-warning">
  212 + <i class="fas fa-exclamation-triangle"></i> 注意:请勿泄露您的Cookie信息!
  213 + </div>
  214 + <div class="alert alert-info">
  215 + <i class="fas fa-info-circle"></i> 提示:添加多个账号可以提高爬取效率,系统会自动在账号间轮换。
  216 + </div>
  217 + </div>
  218 + </div>
  219 + <div class="card-body">
  220 + <div id="accountsList">
  221 + <!-- 账号列表将通过JavaScript动态生成 -->
  222 + </div>
  223 + <div class="alert alert-warning mt-3" id="noAccountsWarning" style="display: none;">
  224 + <i class="fas fa-exclamation-triangle"></i> 请至少添加一个账号
  225 + </div>
  226 + </div>
  227 + </div>
  228 +
  229 + <!-- 并行配置 -->
  230 + <div class="card mb-4">
  231 + <div class="card-header">
  232 + <h5 class="mb-0">并行配置</h5>
  233 + </div>
  234 + <div class="card-body">
  235 + <div class="row">
  236 + <div class="col-md-6">
  237 + <div class="mb-3">
  238 + <label for="maxConcurrent" class="form-label">最大并行数</label>
  239 + <input type="number" class="form-control" id="maxConcurrent" value="2" min="1" max="5">
  240 + <small class="text-muted">同时进行爬取的最大话题数(1-5)</small>
  241 + </div>
  242 + </div>
  243 + <div class="col-md-6">
  244 + <div class="mb-3">
  245 + <label for="requestsPerMinute" class="form-label">每分钟请求数限制</label>
  246 + <input type="number" class="form-control" id="requestsPerMinute" value="60" min="30" max="120">
  247 + <small class="text-muted">避免请求过于频繁(30-120)</small>
  248 + </div>
  249 + </div>
  250 + </div>
  251 + </div>
  252 + </div>
  253 +
  254 + <!-- 数据库配置 -->
  255 + <div class="card mb-4">
  256 + <div class="card-header">
  257 + <h5 class="mb-0">数据库配置</h5>
  258 + </div>
  259 + <div class="card-body">
  260 + <div class="row">
  261 + <div class="col-md-6">
  262 + <div class="mb-3">
  263 + <label for="dbType" class="form-label">数据库类型</label>
  264 + <select class="form-select" id="dbType">
  265 + <option value="mysql">MySQL</option>
  266 + <option value="postgresql">PostgreSQL</option>
  267 + <option value="mongodb">MongoDB</option>
  268 + </select>
  269 + </div>
  270 + </div>
  271 + <div class="col-md-6">
  272 + <div class="mb-3">
  273 + <label for="dbHost" class="form-label">主机地址</label>
  274 + <input type="text" class="form-control" id="dbHost" value="localhost">
  275 + </div>
  276 + </div>
  277 + </div>
  278 + <div class="row">
  279 + <div class="col-md-6">
  280 + <div class="mb-3">
  281 + <label for="dbPort" class="form-label">端口</label>
  282 + <input type="number" class="form-control" id="dbPort" value="3306">
  283 + </div>
  284 + </div>
  285 + <div class="col-md-6">
  286 + <div class="mb-3">
  287 + <label for="dbName" class="form-label">数据库名</label>
  288 + <input type="text" class="form-control" id="dbName" value="weibo_data">
  289 + </div>
  290 + </div>
  291 + </div>
  292 + <div class="row">
  293 + <div class="col-md-6">
  294 + <div class="mb-3">
  295 + <label for="dbUser" class="form-label">用户名</label>
  296 + <input type="text" class="form-control" id="dbUser">
  297 + </div>
  298 + </div>
  299 + <div class="col-md-6">
  300 + <div class="mb-3">
  301 + <label for="dbPassword" class="form-label">密码</label>
  302 + <input type="password" class="form-control" id="dbPassword">
  303 + </div>
  304 + </div>
  305 + </div>
  306 + <div class="d-flex justify-content-end">
  307 + <button class="btn btn-primary" onclick="testDbConnection()">
  308 + <i class="fas fa-database"></i> 测试连接
  309 + </button>
  310 + </div>
  311 + </div>
  312 + </div>
  313 +
106 <!-- AI配置助手 --> 314 <!-- AI配置助手 -->
107 <div class="card mb-4"> 315 <div class="card mb-4">
108 <div class="card-header"> 316 <div class="card-header">
@@ -237,13 +445,43 @@ @@ -237,13 +445,43 @@
237 return; 445 return;
238 } 446 }
239 447
  448 + // 验证必要的配置
  449 + if (!validateConfig()) {
  450 + return;
  451 + }
  452 +
240 const config = { 453 const config = {
241 topics: Array.from(selectedTopics), 454 topics: Array.from(selectedTopics),
242 parameters: { 455 parameters: {
243 crawlDepth: parseInt(document.getElementById('crawlDepth').value), 456 crawlDepth: parseInt(document.getElementById('crawlDepth').value),
244 interval: parseInt(document.getElementById('interval').value), 457 interval: parseInt(document.getElementById('interval').value),
245 maxRetries: parseInt(document.getElementById('maxRetries').value), 458 maxRetries: parseInt(document.getElementById('maxRetries').value),
246 - timeout: parseInt(document.getElementById('timeout').value) 459 + timeout: parseInt(document.getElementById('timeout').value),
  460 + maxConcurrent: parseInt(document.getElementById('maxConcurrent').value),
  461 + requestsPerMinute: parseInt(document.getElementById('requestsPerMinute').value)
  462 + },
  463 + filters: {
  464 + interaction: {
  465 + minLikes: parseInt(document.getElementById('minLikes').value) || 0,
  466 + minComments: parseInt(document.getElementById('minComments').value) || 0,
  467 + minReposts: parseInt(document.getElementById('minReposts').value) || 0,
  468 + minReads: parseInt(document.getElementById('minReads').value) || 0
  469 + },
  470 + regex: getRegexFilters(),
  471 + options: {
  472 + originalOnly: document.getElementById('filterOriginal').checked,
  473 + withMediaOnly: document.getElementById('filterWithMedia').checked,
  474 + verifiedOnly: document.getElementById('filterVerified').checked
  475 + }
  476 + },
  477 + accounts: getAccountsConfig(),
  478 + database: {
  479 + type: document.getElementById('dbType').value,
  480 + host: document.getElementById('dbHost').value,
  481 + port: parseInt(document.getElementById('dbPort').value),
  482 + name: document.getElementById('dbName').value,
  483 + user: document.getElementById('dbUser').value,
  484 + password: document.getElementById('dbPassword').value
247 } 485 }
248 }; 486 };
249 487
@@ -268,6 +506,335 @@ @@ -268,6 +506,335 @@
268 }); 506 });
269 } 507 }
270 508
  509 + // 账号管理相关函数
  510 + let accounts = [];
  511 + let accountIdCounter = 0;
  512 +
  513 + function createAccountElement(account) {
  514 + const accountDiv = document.createElement('div');
  515 + accountDiv.className = 'border rounded p-3 mb-3 position-relative account-item';
  516 + accountDiv.dataset.id = account.id;
  517 +
  518 + const deleteButton = document.createElement('button');
  519 + deleteButton.className = 'btn btn-sm btn-danger position-absolute top-0 end-0 m-2';
  520 + deleteButton.innerHTML = '<i class="fas fa-times"></i>';
  521 + deleteButton.onclick = () => removeAccount(account.id);
  522 +
  523 + const content = `
  524 + <div class="row">
  525 + <div class="col-md-6">
  526 + <div class="mb-3">
  527 + <label class="form-label">用户名</label>
  528 + <input type="text" class="form-control account-username" value="${account.username || ''}" placeholder="微博用户名">
  529 + </div>
  530 + </div>
  531 + <div class="col-md-6">
  532 + <div class="mb-3">
  533 + <label class="form-label">密码</label>
  534 + <input type="password" class="form-control account-password" value="${account.password || ''}" placeholder="微博密码">
  535 + </div>
  536 + </div>
  537 + </div>
  538 + <div class="mb-3">
  539 + <label class="form-label">Cookie</label>
  540 + <textarea class="form-control account-cookie" rows="2" placeholder="请输入微博Cookie">${account.cookie || ''}</textarea>
  541 + </div>
  542 + <div class="form-check mb-3">
  543 + <input class="form-check-input account-save-cookie" type="checkbox" ${account.saveCookie ? 'checked' : ''}>
  544 + <label class="form-check-label">
  545 + 保存Cookie(加密存储)
  546 + </label>
  547 + </div>
  548 + <div class="account-status alert alert-info">
  549 + 状态:待验证
  550 + <button class="btn btn-sm btn-outline-primary ms-2" onclick="validateAccount(${account.id})">
  551 + <i class="fas fa-check-circle"></i> 验证账号
  552 + </button>
  553 + </div>
  554 + `;
  555 +
  556 + accountDiv.innerHTML = content;
  557 + accountDiv.appendChild(deleteButton);
  558 + return accountDiv;
  559 + }
  560 +
  561 + function addAccount() {
  562 + const account = {
  563 + id: accountIdCounter++,
  564 + username: '',
  565 + password: '',
  566 + cookie: '',
  567 + saveCookie: false,
  568 + status: 'pending'
  569 + };
  570 + accounts.push(account);
  571 +
  572 + const accountsList = document.getElementById('accountsList');
  573 + accountsList.appendChild(createAccountElement(account));
  574 + updateAccountsWarning();
  575 + }
  576 +
  577 + function removeAccount(id) {
  578 + accounts = accounts.filter(account => account.id !== id);
  579 + const accountElement = document.querySelector(`.account-item[data-id="${id}"]`);
  580 + if (accountElement) {
  581 + accountElement.remove();
  582 + }
  583 + updateAccountsWarning();
  584 + }
  585 +
  586 + function updateAccountsWarning() {
  587 + const warning = document.getElementById('noAccountsWarning');
  588 + warning.style.display = accounts.length === 0 ? 'block' : 'none';
  589 + }
  590 +
  591 + function getAccountsConfig() {
  592 + return accounts.map(account => {
  593 + const accountElement = document.querySelector(`.account-item[data-id="${account.id}"]`);
  594 + return {
  595 + username: accountElement.querySelector('.account-username').value,
  596 + password: accountElement.querySelector('.account-password').value,
  597 + cookie: accountElement.querySelector('.account-cookie').value,
  598 + saveCookie: accountElement.querySelector('.account-save-cookie').checked
  599 + };
  600 + });
  601 + }
  602 +
  603 + async function validateAccount(id) {
  604 + const accountElement = document.querySelector(`.account-item[data-id="${id}"]`);
  605 + const statusElement = accountElement.querySelector('.account-status');
  606 + const cookie = accountElement.querySelector('.account-cookie').value.trim();
  607 +
  608 + if (!cookie) {
  609 + statusElement.className = 'account-status alert alert-danger';
  610 + statusElement.innerHTML = '状态:验证失败 - Cookie不能为空';
  611 + return;
  612 + }
  613 +
  614 + statusElement.className = 'account-status alert alert-warning';
  615 + statusElement.innerHTML = '状态:验证中...';
  616 +
  617 + try {
  618 + const response = await fetch('/api/spider/validate-account', {
  619 + method: 'POST',
  620 + headers: {
  621 + 'Content-Type': 'application/json'
  622 + },
  623 + body: JSON.stringify({
  624 + cookie: cookie
  625 + })
  626 + });
  627 +
  628 + const data = await response.json();
  629 + if (data.success) {
  630 + statusElement.className = 'account-status alert alert-success';
  631 + statusElement.innerHTML = '状态:验证成功';
  632 + } else {
  633 + statusElement.className = 'account-status alert alert-danger';
  634 + statusElement.innerHTML = `状态:验证失败 - ${data.message}`;
  635 + }
  636 + } catch (error) {
  637 + statusElement.className = 'account-status alert alert-danger';
  638 + statusElement.innerHTML = `状态:验证失败 - ${error.message}`;
  639 + }
  640 + }
  641 +
  642 + // 正则筛选器管理
  643 + let regexFilters = [];
  644 + let regexFilterIdCounter = 0;
  645 +
  646 + function createRegexFilterElement(filter) {
  647 + const filterDiv = document.createElement('div');
  648 + filterDiv.className = 'border rounded p-3 mb-3 position-relative regex-filter-item';
  649 + filterDiv.dataset.id = filter.id;
  650 +
  651 + const deleteButton = document.createElement('button');
  652 + deleteButton.className = 'btn btn-sm btn-danger position-absolute top-0 end-0 m-2';
  653 + deleteButton.innerHTML = '<i class="fas fa-times"></i>';
  654 + deleteButton.onclick = () => removeRegexFilter(filter.id);
  655 +
  656 + const content = `
  657 + <div class="row">
  658 + <div class="col-md-6">
  659 + <div class="mb-3">
  660 + <label class="form-label">正则表达式</label>
  661 + <input type="text" class="form-control regex-pattern" value="${filter.pattern || ''}" placeholder="输入正则表达式">
  662 + </div>
  663 + </div>
  664 + <div class="col-md-6">
  665 + <div class="mb-3">
  666 + <label class="form-label">匹配目标</label>
  667 + <select class="form-select regex-target">
  668 + <option value="content" ${filter.target === 'content' ? 'selected' : ''}>微博内容</option>
  669 + <option value="author" ${filter.target === 'author' ? 'selected' : ''}>作者名</option>
  670 + <option value="location" ${filter.target === 'location' ? 'selected' : ''}>发布位置</option>
  671 + </select>
  672 + </div>
  673 + </div>
  674 + </div>
  675 + <div class="form-check">
  676 + <input class="form-check-input regex-inverse" type="checkbox" ${filter.inverse ? 'checked' : ''}>
  677 + <label class="form-check-label">
  678 + 反向匹配(不包含匹配项)
  679 + </label>
  680 + </div>
  681 + `;
  682 +
  683 + filterDiv.innerHTML = content;
  684 + filterDiv.appendChild(deleteButton);
  685 + return filterDiv;
  686 + }
  687 +
  688 + function addRegexFilter() {
  689 + const filter = {
  690 + id: regexFilterIdCounter++,
  691 + pattern: '',
  692 + target: 'content',
  693 + inverse: false
  694 + };
  695 + regexFilters.push(filter);
  696 +
  697 + const filtersList = document.getElementById('regexFilters');
  698 + filtersList.appendChild(createRegexFilterElement(filter));
  699 + }
  700 +
  701 + function removeRegexFilter(id) {
  702 + regexFilters = regexFilters.filter(filter => filter.id !== id);
  703 + const filterElement = document.querySelector(`.regex-filter-item[data-id="${id}"]`);
  704 + if (filterElement) {
  705 + filterElement.remove();
  706 + }
  707 + }
  708 +
  709 + function getRegexFilters() {
  710 + return regexFilters.map(filter => {
  711 + const filterElement = document.querySelector(`.regex-filter-item[data-id="${filter.id}"]`);
  712 + return {
  713 + pattern: filterElement.querySelector('.regex-pattern').value,
  714 + target: filterElement.querySelector('.regex-target').value,
  715 + inverse: filterElement.querySelector('.regex-inverse').checked
  716 + };
  717 + }).filter(filter => filter.pattern.trim() !== '');
  718 + }
  719 +
  720 + // 验证配置
  721 + function validateConfig() {
  722 + // 验证正则表达式
  723 + const invalidRegex = regexFilters.some(filter => {
  724 + const filterElement = document.querySelector(`.regex-filter-item[data-id="${filter.id}"]`);
  725 + const pattern = filterElement.querySelector('.regex-pattern').value.trim();
  726 + if (pattern !== '') {
  727 + try {
  728 + new RegExp(pattern);
  729 + return false;
  730 + } catch (e) {
  731 + alert(`正则表达式 "${pattern}" 格式无效!`);
  732 + return true;
  733 + }
  734 + }
  735 + return false;
  736 + });
  737 +
  738 + if (invalidRegex) {
  739 + return false;
  740 + }
  741 +
  742 + // 验证是否有账号配置
  743 + if (accounts.length === 0) {
  744 + alert('请至少添加一个账号!');
  745 + return false;
  746 + }
  747 +
  748 + // 验证每个账号是否都有Cookie
  749 + const invalidAccounts = accounts.filter(account => {
  750 + const accountElement = document.querySelector(`.account-item[data-id="${account.id}"]`);
  751 + return !accountElement.querySelector('.account-cookie').value.trim();
  752 + });
  753 +
  754 + if (invalidAccounts.length > 0) {
  755 + alert('存在未配置Cookie的账号,请检查!');
  756 + return false;
  757 + }
  758 +
  759 + // 验证并行配置
  760 + const maxConcurrent = parseInt(document.getElementById('maxConcurrent').value);
  761 + const requestsPerMinute = parseInt(document.getElementById('requestsPerMinute').value);
  762 + if (maxConcurrent < 1 || maxConcurrent > 5) {
  763 + alert('最大并行数必须在1-5之间!');
  764 + return false;
  765 + }
  766 + if (requestsPerMinute < 30 || requestsPerMinute > 120) {
  767 + alert('每分钟请求数必须在30-120之间!');
  768 + return false;
  769 + }
  770 +
  771 + // 验证数据库配置
  772 + const dbConfig = {
  773 + host: document.getElementById('dbHost').value.trim(),
  774 + port: document.getElementById('dbPort').value.trim(),
  775 + name: document.getElementById('dbName').value.trim(),
  776 + user: document.getElementById('dbUser').value.trim(),
  777 + password: document.getElementById('dbPassword').value.trim()
  778 + };
  779 +
  780 + if (!dbConfig.host || !dbConfig.port || !dbConfig.name || !dbConfig.user || !dbConfig.password) {
  781 + alert('请完整填写数据库配置信息!');
  782 + return false;
  783 + }
  784 +
  785 + return true;
  786 + }
  787 +
  788 + // 测试数据库连接
  789 + async function testDbConnection() {
  790 + const dbConfig = {
  791 + type: document.getElementById('dbType').value,
  792 + host: document.getElementById('dbHost').value,
  793 + port: parseInt(document.getElementById('dbPort').value),
  794 + name: document.getElementById('dbName').value,
  795 + user: document.getElementById('dbUser').value,
  796 + password: document.getElementById('dbPassword').value
  797 + };
  798 +
  799 + try {
  800 + const response = await fetch('/api/spider/test-db', {
  801 + method: 'POST',
  802 + headers: {
  803 + 'Content-Type': 'application/json'
  804 + },
  805 + body: JSON.stringify(dbConfig)
  806 + });
  807 +
  808 + const data = await response.json();
  809 + if (data.success) {
  810 + alert('数据库连接测试成功!');
  811 + } else {
  812 + alert('数据库连接测试失败:' + data.message);
  813 + }
  814 + } catch (error) {
  815 + alert('测试连接时发生错误:' + error.message);
  816 + }
  817 + }
  818 +
  819 + // 监听数据库类型变化
  820 + document.getElementById('dbType').addEventListener('change', function() {
  821 + const dbType = this.value;
  822 + const portInput = document.getElementById('dbPort');
  823 +
  824 + // 根据数据库类型设置默认端口
  825 + switch(dbType) {
  826 + case 'mysql':
  827 + portInput.value = '3306';
  828 + break;
  829 + case 'postgresql':
  830 + portInput.value = '5432';
  831 + break;
  832 + case 'mongodb':
  833 + portInput.value = '27017';
  834 + break;
  835 + }
  836 + });
  837 +
271 // 保存配置 838 // 保存配置
272 function saveConfig() { 839 function saveConfig() {
273 const config = { 840 const config = {
@@ -14,6 +14,12 @@ import aiohttp @@ -14,6 +14,12 @@ import aiohttp
14 from concurrent.futures import ThreadPoolExecutor 14 from concurrent.futures import ThreadPoolExecutor
15 from ratelimit import limits, sleep_and_retry 15 from ratelimit import limits, sleep_and_retry
16 from tenacity import retry, stop_after_attempt, wait_exponential 16 from tenacity import retry, stop_after_attempt, wait_exponential
  17 +import pymysql
  18 +import psycopg2
  19 +from pymongo import MongoClient
  20 +from cryptography.fernet import Fernet
  21 +import base64
  22 +import re
17 23
18 # 创建蓝图 24 # 创建蓝图
19 spider_bp = Blueprint('spider', __name__) 25 spider_bp = Blueprint('spider', __name__)
@@ -22,6 +28,10 @@ spider_bp = Blueprint('spider', __name__) @@ -22,6 +28,10 @@ spider_bp = Blueprint('spider', __name__)
22 logger = logging.getLogger('spider_control') 28 logger = logging.getLogger('spider_control')
23 logger.setLevel(logging.INFO) 29 logger.setLevel(logging.INFO)
24 30
  31 +# 加密密钥
  32 +ENCRYPTION_KEY = Fernet.generate_key()
  33 +cipher_suite = Fernet(ENCRYPTION_KEY)
  34 +
25 # 存储WebSocket连接的集合 35 # 存储WebSocket连接的集合
26 websocket_connections = set() 36 websocket_connections = set()
27 37
@@ -41,14 +51,93 @@ DEFAULT_CONFIG = { @@ -41,14 +51,93 @@ DEFAULT_CONFIG = {
41 'interval': 5, 51 'interval': 5,
42 'maxRetries': 3, 52 'maxRetries': 3,
43 'timeout': 30, 53 'timeout': 30,
44 - 'maxConcurrent': 2 54 + 'maxConcurrent': 2,
  55 + 'requestsPerMinute': 60
45 } 56 }
46 57
47 -# 限流装饰器  
48 -@sleep_and_retry  
49 -@limits(calls=100, period=60) # 每分钟最多100个请求  
50 -def rate_limited_request():  
51 - pass 58 +def encrypt_data(data):
  59 + """加密敏感数据"""
  60 + if not data:
  61 + return None
  62 + return cipher_suite.encrypt(data.encode()).decode()
  63 +
  64 +def decrypt_data(encrypted_data):
  65 + """解密敏感数据"""
  66 + if not encrypted_data:
  67 + return None
  68 + return cipher_suite.decrypt(encrypted_data.encode()).decode()
  69 +
  70 +@spider_bp.route('/api/spider/test-db', methods=['POST'])
  71 +def test_db_connection():
  72 + """测试数据库连接"""
  73 + try:
  74 + data = request.get_json()
  75 + db_type = data.get('type')
  76 + host = data.get('host')
  77 + port = data.get('port')
  78 + db_name = data.get('name')
  79 + user = data.get('user')
  80 + password = data.get('password')
  81 +
  82 + if not all([db_type, host, port, db_name, user, password]):
  83 + return jsonify({
  84 + 'success': False,
  85 + 'message': '请提供完整的数据库配置信息'
  86 + })
  87 +
  88 + try:
  89 + if db_type == 'mysql':
  90 + connection = pymysql.connect(
  91 + host=host,
  92 + port=port,
  93 + user=user,
  94 + password=password,
  95 + database=db_name
  96 + )
  97 + connection.close()
  98 + elif db_type == 'postgresql':
  99 + connection = psycopg2.connect(
  100 + host=host,
  101 + port=port,
  102 + database=db_name,
  103 + user=user,
  104 + password=password
  105 + )
  106 + connection.close()
  107 + elif db_type == 'mongodb':
  108 + client = MongoClient(
  109 + host=host,
  110 + port=port,
  111 + username=user,
  112 + password=password,
  113 + authSource=db_name
  114 + )
  115 + client.server_info() # 测试连接
  116 + client.close()
  117 + else:
  118 + return jsonify({
  119 + 'success': False,
  120 + 'message': '不支持的数据库类型'
  121 + })
  122 +
  123 + return jsonify({
  124 + 'success': True,
  125 + 'message': '数据库连接测试成功'
  126 + })
  127 +
  128 + except Exception as e:
  129 + logger.error(f"数据库连接测试失败: {str(e)}")
  130 + return jsonify({
  131 + 'success': False,
  132 + 'message': f'数据库连接失败: {str(e)}'
  133 + })
  134 +
  135 + except Exception as e:
  136 + logger.error(f"处理数据库测试请求时出错: {str(e)}")
  137 + return jsonify({
  138 + 'success': False,
  139 + 'message': str(e)
  140 + })
52 141
53 class SpiderWorker: 142 class SpiderWorker:
54 def __init__(self, topics, parameters): 143 def __init__(self, topics, parameters):
@@ -60,6 +149,50 @@ class SpiderWorker: @@ -60,6 +149,50 @@ class SpiderWorker:
60 self.message_buffer = [] 149 self.message_buffer = []
61 self.message_buffer_size = 10 150 self.message_buffer_size = 10
62 self.semaphore = asyncio.Semaphore(parameters.get('maxConcurrent', DEFAULT_CONFIG['maxConcurrent'])) 151 self.semaphore = asyncio.Semaphore(parameters.get('maxConcurrent', DEFAULT_CONFIG['maxConcurrent']))
  152 + self.rate_limiter = asyncio.Semaphore(parameters.get('requestsPerMinute', DEFAULT_CONFIG['requestsPerMinute']))
  153 + self.accounts = parameters.get('accounts', [])
  154 + self.current_account_index = 0
  155 + self.account_lock = asyncio.Lock()
  156 +
  157 + # 添加筛选条件
  158 + self.filters = parameters.get('filters', {})
  159 + self.interaction_filters = self.filters.get('interaction', {})
  160 + self.regex_filters = self.filters.get('regex', [])
  161 + self.filter_options = self.filters.get('options', {})
  162 +
  163 + # 初始化正则表达式
  164 + self.compiled_regex = []
  165 + for regex_filter in self.regex_filters:
  166 + try:
  167 + pattern = regex_filter['pattern']
  168 + if pattern:
  169 + self.compiled_regex.append({
  170 + 'regex': re.compile(pattern),
  171 + 'target': regex_filter['target'],
  172 + 'inverse': regex_filter['inverse']
  173 + })
  174 + except re.error as e:
  175 + logger.error(f"正则表达式编译失败: {pattern}, 错误: {e}")
  176 +
  177 + def get_next_account(self):
  178 + """获取下一个可用账号"""
  179 + with self.account_lock:
  180 + if not self.accounts:
  181 + raise ValueError("没有可用的账号")
  182 +
  183 + account = self.accounts[self.current_account_index]
  184 + self.current_account_index = (self.current_account_index + 1) % len(self.accounts)
  185 + return account
  186 +
  187 + async def acquire_rate_limit(self):
  188 + """获取速率限制令牌"""
  189 + await self.rate_limiter.acquire()
  190 + asyncio.create_task(self.release_rate_limit())
  191 +
  192 + async def release_rate_limit(self):
  193 + """释放速率限制令牌"""
  194 + await asyncio.sleep(60) # 1分钟后释放
  195 + self.rate_limiter.release()
63 196
64 async def send_message(self, message): 197 async def send_message(self, message):
65 """异步发送消息,使用缓冲区优化""" 198 """异步发送消息,使用缓冲区优化"""
@@ -82,22 +215,43 @@ class SpiderWorker: @@ -82,22 +215,43 @@ class SpiderWorker:
82 async def crawl_single_topic(self, topic): 215 async def crawl_single_topic(self, topic):
83 """爬取单个话题""" 216 """爬取单个话题"""
84 try: 217 try:
85 - rate_limited_request() 218 + await self.acquire_rate_limit()
  219 +
  220 + # 获取当前要使用的账号
  221 + account = self.get_next_account()
86 222
87 await self.send_message({ 223 await self.send_message({
88 'type': 'log', 224 'type': 'log',
89 - 'message': f'开始爬取话题: {topic}' 225 + 'message': f'使用账号 {account["username"]} 开始爬取话题: {topic}'
90 }) 226 })
91 227
  228 + filtered_count = 0
  229 + total_count = 0
  230 +
92 async with self.semaphore: 231 async with self.semaphore:
  232 + # 创建一个回调函数来处理爬取的数据
  233 + def process_post(post):
  234 + nonlocal filtered_count, total_count
  235 + total_count += 1
  236 +
  237 + # 应用筛选条件
  238 + if self.apply_filters(post):
  239 + filtered_count += 1
  240 + return True
  241 + return False
  242 +
  243 + # 调用爬虫并传入回调函数
93 await asyncio.get_event_loop().run_in_executor( 244 await asyncio.get_event_loop().run_in_executor(
94 thread_pool, 245 thread_pool,
95 - self.spider.crawl_topic, 246 + lambda: self.spider.crawl_topic(
96 topic, 247 topic,
97 self.parameters['crawlDepth'], 248 self.parameters['crawlDepth'],
98 self.parameters['interval'], 249 self.parameters['interval'],
99 self.parameters['maxRetries'], 250 self.parameters['maxRetries'],
100 - self.parameters['timeout'] 251 + self.parameters['timeout'],
  252 + account['cookie'],
  253 + process_post # 传入回调函数
  254 + )
101 ) 255 )
102 256
103 self.completed_topics += 1 257 self.completed_topics += 1
@@ -108,9 +262,10 @@ class SpiderWorker: @@ -108,9 +262,10 @@ class SpiderWorker:
108 'value': progress 262 'value': progress
109 }) 263 })
110 264
  265 + # 发送筛选统计信息
111 await self.send_message({ 266 await self.send_message({
112 'type': 'log', 267 'type': 'log',
113 - 'message': f'话题 {topic} 爬取完成' 268 + 'message': f'话题 {topic} 爬取完成,共爬取 {total_count} 条微博,符合筛选条件 {filtered_count} 条'
114 }) 269 })
115 270
116 except Exception as e: 271 except Exception as e:
@@ -142,6 +297,116 @@ class SpiderWorker: @@ -142,6 +297,116 @@ class SpiderWorker:
142 finally: 297 finally:
143 await self.flush_messages() 298 await self.flush_messages()
144 299
  300 + def apply_filters(self, post):
  301 + """
  302 + 应用筛选条件到单条微博
  303 +
  304 + Args:
  305 + post: 微博数据字典
  306 +
  307 + Returns:
  308 + bool: 是否通过筛选
  309 + """
  310 + try:
  311 + # 1. 检查互动数据
  312 + if not self._check_interaction_metrics(post):
  313 + return False
  314 +
  315 + # 2. 检查正则匹配
  316 + if not self._check_regex_filters(post):
  317 + return False
  318 +
  319 + # 3. 检查高级选项
  320 + if not self._check_advanced_options(post):
  321 + return False
  322 +
  323 + return True
  324 +
  325 + except Exception as e:
  326 + logger.error(f"应用筛选条件时出错: {e}")
  327 + return False
  328 +
  329 + def _check_interaction_metrics(self, post):
  330 + """检查互动指标是否满足条件"""
  331 + try:
  332 + # 获取互动指标的最小值要求
  333 + min_likes = self.interaction_filters.get('minLikes', 0)
  334 + min_comments = self.interaction_filters.get('minComments', 0)
  335 + min_reposts = self.interaction_filters.get('minReposts', 0)
  336 + min_reads = self.interaction_filters.get('minReads', 0)
  337 +
  338 + # 检查是否满足所有条件
  339 + if post.get('like_count', 0) < min_likes:
  340 + return False
  341 + if post.get('comment_count', 0) < min_comments:
  342 + return False
  343 + if post.get('forward_count', 0) < min_reposts:
  344 + return False
  345 + if post.get('read_count', 0) < min_reads:
  346 + return False
  347 +
  348 + return True
  349 +
  350 + except Exception as e:
  351 + logger.error(f"检查互动指标时出错: {e}")
  352 + return False
  353 +
  354 + def _check_regex_filters(self, post):
  355 + """检查正则表达式匹配"""
  356 + try:
  357 + for regex_filter in self.compiled_regex:
  358 + regex = regex_filter['regex']
  359 + target = regex_filter['target']
  360 + inverse = regex_filter['inverse']
  361 +
  362 + # 获取目标文本
  363 + if target == 'content':
  364 + text = post.get('content', '')
  365 + elif target == 'author':
  366 + text = post.get('user_name', '')
  367 + elif target == 'location':
  368 + text = post.get('location', '')
  369 + else:
  370 + continue
  371 +
  372 + # 执行匹配
  373 + match = bool(regex.search(text))
  374 +
  375 + # 如果是反向匹配,取反结果
  376 + if inverse:
  377 + match = not match
  378 +
  379 + # 如果不满足条件,返回False
  380 + if not match:
  381 + return False
  382 +
  383 + return True
  384 +
  385 + except Exception as e:
  386 + logger.error(f"检查正则匹配时出错: {e}")
  387 + return False
  388 +
  389 + def _check_advanced_options(self, post):
  390 + """检查高级筛选选项"""
  391 + try:
  392 + # 检查是否只要原创内容
  393 + if self.filter_options.get('originalOnly') and not post.get('is_original', False):
  394 + return False
  395 +
  396 + # 检查是否必须包含媒体
  397 + if self.filter_options.get('withMediaOnly') and not post.get('has_media', False):
  398 + return False
  399 +
  400 + # 检查是否只要认证用户
  401 + if self.filter_options.get('verifiedOnly') and not post.get('user_verified', False):
  402 + return False
  403 +
  404 + return True
  405 +
  406 + except Exception as e:
  407 + logger.error(f"检查高级选项时出错: {e}")
  408 + return False
  409 +
145 async def broadcast_message(messages): 410 async def broadcast_message(messages):
146 """广播消息到所有WebSocket连接""" 411 """广播消息到所有WebSocket连接"""
147 if not websocket_connections: 412 if not websocket_connections:
@@ -172,6 +437,7 @@ async def start_spider(): @@ -172,6 +437,7 @@ async def start_spider():
172 data = request.get_json() 437 data = request.get_json()
173 topics = data.get('topics', []) 438 topics = data.get('topics', [])
174 parameters = {**DEFAULT_CONFIG, **data.get('parameters', {})} 439 parameters = {**DEFAULT_CONFIG, **data.get('parameters', {})}
  440 + accounts = data.get('accounts', [])
175 441
176 if not topics: 442 if not topics:
177 return jsonify({ 443 return jsonify({
@@ -179,6 +445,20 @@ async def start_spider(): @@ -179,6 +445,20 @@ async def start_spider():
179 'message': '请选择至少一个话题' 445 'message': '请选择至少一个话题'
180 }) 446 })
181 447
  448 + if not accounts:
  449 + return jsonify({
  450 + 'success': False,
  451 + 'message': '请配置至少一个账号'
  452 + })
  453 +
  454 + # 处理账号Cookie的加密存储
  455 + for account in accounts:
  456 + if account.get('saveCookie'):
  457 + account['cookie'] = encrypt_data(account['cookie'])
  458 +
  459 + # 将账号信息添加到参数中
  460 + parameters['accounts'] = accounts
  461 +
182 # 创建爬虫工作器 462 # 创建爬虫工作器
183 worker = SpiderWorker(topics, parameters) 463 worker = SpiderWorker(topics, parameters)
184 464
@@ -298,17 +578,36 @@ def generate_ai_config(): @@ -298,17 +578,36 @@ def generate_ai_config():
298 578
299 # 构建AI提示 579 # 构建AI提示
300 system_prompt = """你是一个专业的爬虫配置助手。请根据用户的自然语言描述,生成合适的微博爬虫配置。 580 system_prompt = """你是一个专业的爬虫配置助手。请根据用户的自然语言描述,生成合适的微博爬虫配置。
  581 +
301 配置应包含以下内容: 582 配置应包含以下内容:
302 1. 要爬取的话题列表 583 1. 要爬取的话题列表
303 -2. 爬虫参数(爬取深度、间隔时间、重试次数、超时时间) 584 +2. 爬虫参数配置
  585 + - 爬取深度(crawlDepth):1-10页
  586 + - 间隔时间(interval):3-30秒
  587 + - 重试次数(maxRetries):1-5次
  588 + - 超时时间(timeout):10-60秒
  589 + - 最大并行数(maxConcurrent):1-5
  590 + - 每分钟请求数限制(requestsPerMinute):30-120
  591 +
  592 +3. 内容筛选条件
  593 + a) 互动数据筛选(设为0表示不启用)
  594 + - 最小点赞数(minLikes)
  595 + - 最小评论数(minComments)
  596 + - 最小转发数(minReposts)
  597 + - 最小阅读数(minReads)
  598 +
  599 + b) 正则表达式筛选(数组,可以有多个规则)
  600 + - pattern: 正则表达式模式
  601 + - target: 匹配目标(content/author/location)
  602 + - inverse: 是否反向匹配(true/false)
  603 +
  604 + c) 高级筛选选项(布尔值)
  605 + - originalOnly: 是否只要原创内容
  606 + - withMediaOnly: 是否必须包含媒体
  607 + - verifiedOnly: 是否只要认证用户
304 608
305 请先用通俗易懂的语言解释你的配置建议,然后在最后提供一个JSON格式的具体配置。 609 请先用通俗易懂的语言解释你的配置建议,然后在最后提供一个JSON格式的具体配置。
306 -注意:  
307 -- 爬取深度(crawlDepth)范围:1-10页  
308 -- 间隔时间(interval)范围:3-30秒  
309 -- 重试次数(maxRetries)范围:1-5次  
310 -- 超时时间(timeout)范围:10-60秒  
311 -- 所有参数都必须是整数 610 +所有数值参数必须是整数,并且在指定范围内。
312 611
313 示例输出格式: 612 示例输出格式:
314 根据您的需求,我建议... 613 根据您的需求,我建议...
@@ -319,7 +618,29 @@ def generate_ai_config(): @@ -319,7 +618,29 @@ def generate_ai_config():
319 "crawlDepth": 5, 618 "crawlDepth": 5,
320 "interval": 5, 619 "interval": 5,
321 "maxRetries": 3, 620 "maxRetries": 3,
322 - "timeout": 30 621 + "timeout": 30,
  622 + "maxConcurrent": 2,
  623 + "requestsPerMinute": 60
  624 + },
  625 + "filters": {
  626 + "interaction": {
  627 + "minLikes": 1000,
  628 + "minComments": 100,
  629 + "minReposts": 50,
  630 + "minReads": 10000
  631 + },
  632 + "regex": [
  633 + {
  634 + "pattern": "关键词",
  635 + "target": "content",
  636 + "inverse": false
  637 + }
  638 + ],
  639 + "options": {
  640 + "originalOnly": true,
  641 + "withMediaOnly": false,
  642 + "verifiedOnly": true
  643 + }
323 } 644 }
324 }""" 645 }"""
325 646
@@ -369,3 +690,51 @@ def generate_ai_config(): @@ -369,3 +690,51 @@ def generate_ai_config():
369 'success': False, 690 'success': False,
370 'message': str(e) 691 'message': str(e)
371 }) 692 })
  693 +
@spider_bp.route('/api/spider/validate-account', methods=['POST'])
async def validate_account():
    """Check whether a submitted Weibo cookie belongs to a logged-in account.

    Reads ``cookie`` from the JSON body and requests the Weibo profile
    endpoint with it. The cookie is accepted when the endpoint answers with
    HTTP 200 and a JSON body containing a non-empty ``data.user`` entry.

    Returns:
        A JSON response ``{'success': bool, 'message': str}``. Failures are
        reported in the payload rather than raised.
    """
    # Upper bound for the whole outbound request, so a stalled upstream
    # cannot keep this handler waiting indefinitely.
    request_timeout_seconds = 10

    try:
        # silent=True: a missing or malformed body is reported through the
        # normal "cookie is empty" reply.
        payload = request.get_json(silent=True) or {}
        cookie = payload.get('cookie')

        if not cookie:
            return jsonify({
                'success': False,
                'message': 'Cookie不能为空'
            })

        try:
            timeout = aiohttp.ClientTimeout(total=request_timeout_seconds)
            async with aiohttp.ClientSession(timeout=timeout) as session:
                headers = {
                    'Cookie': cookie,
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
                }
                async with session.get('https://weibo.com/ajax/profile/info', headers=headers) as response:
                    if response.status == 200:
                        profile = await response.json()
                        # A null or non-dict level anywhere in the body
                        # means "no user", not an internal error.
                        profile_data = profile.get('data') if isinstance(profile, dict) else None
                        user = profile_data.get('user') if isinstance(profile_data, dict) else None
                        if user:
                            return jsonify({
                                'success': True,
                                'message': '账号验证成功'
                            })

                    return jsonify({
                        'success': False,
                        'message': 'Cookie无效或已过期'
                    })
        except Exception as e:
            logger.error(f"验证账号时发生错误: {e}")
            return jsonify({
                'success': False,
                'message': f'验证过程出错: {str(e)}'
            })

    except Exception as e:
        logger.error(f"处理账号验证请求时出错: {e}")
        return jsonify({
            'success': False,
            'message': str(e)
        })