戒酒的李白

Add AI-powered Spider Configuration Assistant.

... ... @@ -103,6 +103,39 @@
</div>
</div>
<!-- AI配置助手 -->
<div class="card mb-4">
<div class="card-header">
<h5 class="mb-0">
<i class="fas fa-robot"></i> AI配置助手
</h5>
</div>
<div class="card-body">
<div class="mb-3">
<label for="aiPrompt" class="form-label">用自然语言描述您的爬虫需求</label>
<textarea class="form-control" id="aiPrompt" rows="3"
placeholder="例如:我想爬取最近一周关于人工智能的热门微博,重点关注转发量超过1000的内容,每个话题爬取前5页内容。"></textarea>
</div>
<div class="d-flex justify-content-between align-items-center">
<button class="btn btn-primary" onclick="generateConfig()">
<i class="fas fa-magic"></i> 生成配置
</button>
<div class="form-check">
<input class="form-check-input" type="checkbox" id="autoApply" checked>
<label class="form-check-label" for="autoApply">
自动应用生成的配置
</label>
</div>
</div>
<div id="aiResponse" class="mt-3" style="display: none;">
<div class="alert alert-info">
<h6 class="alert-heading">AI助手建议:</h6>
<p id="aiSuggestion" class="mb-0"></p>
</div>
</div>
</div>
</div>
<!-- 操作按钮 -->
<div class="d-flex justify-content-between mb-5">
<button class="btn btn-primary" onclick="startCrawling()">
... ... @@ -286,6 +319,63 @@
updateCrawlLog(data.message);
}
};
// AI配置生成
async function generateConfig() {
    const prompt = document.getElementById('aiPrompt').value.trim();
    if (!prompt) {
        alert('请输入您的爬虫需求描述!');
        return;
    }
    const aiResponse = document.getElementById('aiResponse');
    const aiSuggestion = document.getElementById('aiSuggestion');
    const button = document.querySelector('button[onclick="generateConfig()"]');
    // Disable the trigger button to prevent double submission while the
    // request is in flight; re-enabled in `finally`.
    if (button) button.disabled = true;
    try {
        const response = await fetch('/api/spider/ai-config', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json'
            },
            body: JSON.stringify({ prompt })
        });
        // Surface HTTP-level failures explicitly; otherwise a 500 error page
        // would make response.json() throw an opaque parse error.
        if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
        }
        const data = await response.json();
        if (!data.success) {
            throw new Error(data.message);
        }
        // 显示AI建议
        aiSuggestion.textContent = data.suggestion;
        aiResponse.style.display = 'block';
        // 如果选择自动应用配置
        if (document.getElementById('autoApply').checked) {
            // 清除现有选择, then apply the topics the AI proposed.
            selectedTopics.clear();
            data.config.topics.forEach(topic => selectedTopics.add(topic));
            // 更新参数
            const params = data.config.parameters;
            document.getElementById('crawlDepth').value = params.crawlDepth;
            document.getElementById('interval').value = params.interval;
            document.getElementById('maxRetries').value = params.maxRetries;
            document.getElementById('timeout').value = params.timeout;
            // 更新UI
            updateSelectedTopicsList();
            // 添加提示
            updateCrawlLog('AI配置已自动应用');
        }
    } catch (error) {
        aiSuggestion.textContent = '生成配置时出错:' + error.message;
        aiResponse.style.display = 'block';
    } finally {
        if (button) button.disabled = false;
    }
}
</script>
</body>
</html>
\ No newline at end of file
... ...
... ... @@ -8,6 +8,8 @@ import asyncio
import websockets
import logging
from spider.spiderData import SpiderData
from openai import OpenAI
from anthropic import Anthropic
# 创建蓝图
spider_bp = Blueprint('spider', __name__)
... ... @@ -210,4 +212,132 @@ async def spider_status_socket():
finally:
websocket_connections.remove(websocket)
except Exception as e:
logger.error(f"WebSocket连接处理失败: {e}")
\ No newline at end of file
logger.error(f"WebSocket连接处理失败: {e}")
def get_ai_client():
    """Return a descriptor for the first available AI backend.

    Environment variables are checked in priority order (Anthropic
    first, then OpenAI) and the matching SDK client is constructed.

    Returns:
        dict: ``{'type': 'anthropic' | 'openai', 'client': <SDK client>}``.

    Raises:
        ValueError: if neither API key environment variable is set.
    """
    anthropic_key = os.getenv('ANTHROPIC_API_KEY')
    if anthropic_key:
        return {
            'type': 'anthropic',
            'client': Anthropic(api_key=anthropic_key),
        }
    openai_key = os.getenv('OPENAI_API_KEY')
    if openai_key:
        return {
            'type': 'openai',
            'client': OpenAI(api_key=openai_key),
        }
    raise ValueError("未找到可用的AI API密钥")
def parse_ai_response(response_text):
    """Split an AI reply into its JSON config and the leading suggestion text.

    The model is prompted to emit free-form advice followed by a JSON
    object; the span from the first ``{`` to the last ``}`` is parsed as
    the config and everything before it is the human-readable suggestion.

    Args:
        response_text: Raw text returned by the AI model.

    Returns:
        tuple: ``(config, suggestion)`` where ``config`` is the parsed
        dict and ``suggestion`` is the stripped leading text.

    Raises:
        ValueError: if no JSON object is found, it fails to parse, or it
            lacks the required topics/parameters fields.
    """
    try:
        # Locate the JSON payload: first '{' through last '}'.
        start = response_text.find('{')
        end = response_text.rfind('}') + 1
        if start == -1 or end == 0:
            raise ValueError("未找到有效的JSON配置")
        json_str = response_text[start:end]
        config = json.loads(json_str)
        # Validate the expected structure before handing it to callers.
        if not isinstance(config.get('topics'), list):
            raise ValueError("配置必须包含话题列表")
        parameters = config.get('parameters', {})
        required_keys = ('crawlDepth', 'interval', 'maxRetries', 'timeout')
        if not all(key in parameters for key in required_keys):
            raise ValueError("配置缺少必要的参数")
        # 提取建议文本(JSON之前的部分)
        suggestion = response_text[:start].strip()
        return config, suggestion
    except Exception as e:
        # Chain the original exception so the real cause stays in the traceback.
        raise ValueError(f"解析AI响应失败: {str(e)}") from e
@spider_bp.route('/api/spider/ai-config', methods=['POST'])
def generate_ai_config():
    """使用AI生成爬虫配置

    POST endpoint: accepts ``{"prompt": "<natural-language request>"}``
    and returns ``{"success": bool, "config": dict, "suggestion": str}``
    (or ``{"success": False, "message": str}`` on failure).
    """
    try:
        # get_json(silent=True) returns None instead of raising when the
        # body is missing or not JSON, so malformed requests get a clean
        # error payload rather than a framework-level 400/415.
        payload = request.get_json(silent=True) or {}
        prompt = payload.get('prompt', '')
        if not prompt:
            return jsonify({
                'success': False,
                'message': '请提供爬虫需求描述'
            })
        # 构建AI提示
        system_prompt = """你是一个专业的爬虫配置助手。请根据用户的自然语言描述,生成合适的微博爬虫配置。
配置应包含以下内容:
1. 要爬取的话题列表
2. 爬虫参数(爬取深度、间隔时间、重试次数、超时时间)
请先用通俗易懂的语言解释你的配置建议,然后在最后提供一个JSON格式的具体配置。
注意:
- 爬取深度(crawlDepth)范围:1-10页
- 间隔时间(interval)范围:3-30秒
- 重试次数(maxRetries)范围:1-5次
- 超时时间(timeout)范围:10-60秒
- 所有参数都必须是整数
示例输出格式:
根据您的需求,我建议...
{
    "topics": ["话题1", "话题2"],
    "parameters": {
        "crawlDepth": 5,
        "interval": 5,
        "maxRetries": 3,
        "timeout": 30
    }
}"""
        # 获取AI客户端
        ai = get_ai_client()
        try:
            if ai['type'] == 'anthropic':
                # The Anthropic Messages API does not accept a "system"
                # role inside `messages`; the system prompt must be passed
                # as the top-level `system` parameter.
                response = ai['client'].messages.create(
                    model="claude-3-sonnet-20240229",
                    max_tokens=1000,
                    system=system_prompt,
                    messages=[
                        {"role": "user", "content": prompt}
                    ]
                )
                response_text = response.content[0].text
            else:  # OpenAI
                response = ai['client'].chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": prompt}
                    ]
                )
                response_text = response.choices[0].message.content
            # 解析AI响应
            config, suggestion = parse_ai_response(response_text)
            return jsonify({
                'success': True,
                'config': config,
                'suggestion': suggestion
            })
        except Exception as e:
            logger.error(f"AI服务调用失败: {e}")
            return jsonify({
                'success': False,
                'message': f"AI配置生成失败: {str(e)}"
            })
    except Exception as e:
        logger.error(f"生成配置失败: {e}")
        return jsonify({
            'success': False,
            'message': str(e)
        })
\ No newline at end of file
... ...