style(sentiment_analyzer): format file

ghmark675 · 666ghj
Commit aa11c529c8707615a2f40d8a301cd740f345dbad (short: aa11c529) · 1 parent: 71f4b3ad
Showing 1 changed file with 178 additions and 114 deletions
InsightEngine/tools/sentiment_analyzer.py
--- a/InsightEngine/tools/sentiment_analyzer.py
+++ b/InsightEngine/tools/sentiment_analyzer.py
@@ -11,6 +11,7 @@ import re
 try:
     import torch
+
     TORCH_AVAILABLE = True
 except ImportError:
     torch = None  # type: ignore
@@ -18,6 +19,7 @@ except ImportError:
 try:
     from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
     TRANSFORMERS_AVAILABLE = True
 except ImportError:
     AutoTokenizer = None  # type: ignore
@@ -28,6 +30,7 @@ except ImportError:
 # INFO：若想跳过情感分析，可手动切换此开关为False
 SENTIMENT_ANALYSIS_ENABLED = True
+
 def _describe_missing_dependencies() -> str:
     missing = []
     if not TORCH_AVAILABLE:
@@ -36,14 +39,21 @@ def _describe_missing_dependencies() -> str:
         missing.append("Transformers")
     return " / ".join(missing)
+
 # 添加项目根目录到路径，以便导入WeiboMultilingualSentiment
-project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-weibo_sentiment_path = os.path.join(project_root, "SentimentAnalysisModel", "WeiboMultilingualSentiment")
+project_root = os.path.dirname(
+    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+)
+weibo_sentiment_path = os.path.join(
+    project_root, "SentimentAnalysisModel", "WeiboMultilingualSentiment"
+)
 sys.path.append(weibo_sentiment_path)
+
 @dataclass
 class SentimentResult:
     """情感分析结果数据类"""
+
     text: str
     sentiment_label: str
     confidence: float
@@ -53,9 +63,10 @@ class SentimentResult:
     analysis_performed: bool = True
-@dataclass 
+@dataclass
 class BatchSentimentResult:
     """批量情感分析结果数据类"""
+
     results: List[SentimentResult]
     total_processed: int
     success_count: int
@@ -69,7 +80,7 @@ class WeiboMultilingualSentimentAnalyzer:
     多语言情感分析器
     封装WeiboMultilingualSentiment模型，为AI Agent提供情感分析功能
     """
-    
+
     def __init__(self):
         """初始化情感分析器"""
         self.model = None
@@ -78,14 +89,14 @@ class WeiboMultilingualSentimentAnalyzer:
         self.is_initialized = False
         self.is_disabled = False
         self.disable_reason: Optional[str] = None
-        
+
         # 情感标签映射（5级分类）
         self.sentiment_map = {
-            0: "非常负面", 
-            1: "负面", 
-            2: "中性", 
-            3: "正面", 
-            4: "非常正面"
+            0: "非常负面",
+            1: "负面",
+            2: "中性",
+            3: "正面",
+            4: "非常正面",
         }
         if not SENTIMENT_ANALYSIS_ENABLED:
@@ -96,9 +107,13 @@ class WeiboMultilingualSentimentAnalyzer:
         if self.is_disabled:
             reason = self.disable_reason or "Sentiment analysis disabled."
-            print(f"WeiboMultilingualSentimentAnalyzer initialized but disabled: {reason}")
+            print(
+                f"WeiboMultilingualSentimentAnalyzer initialized but disabled: {reason}"
+            )
         else:
-            print("WeiboMultilingualSentimentAnalyzer 已创建，调用 initialize() 来加载模型")
+            print(
+                "WeiboMultilingualSentimentAnalyzer 已创建，调用 initialize() 来加载模型"
+            )
     def disable(self, reason: Optional[str] = None, drop_state: bool = False) -> None:
         """Disable sentiment analysis, optionally clearing loaded resources."""
@@ -130,14 +145,18 @@ class WeiboMultilingualSentimentAnalyzer:
         if torch.cuda.is_available():
             return torch.device("cuda")
         mps_backend = getattr(torch.backends, "mps", None)
-        if mps_backend and getattr(mps_backend, "is_available", lambda: False)() and getattr(mps_backend, "is_built", lambda: False)():
+        if (
+            mps_backend
+            and getattr(mps_backend, "is_available", lambda: False)()
+            and getattr(mps_backend, "is_built", lambda: False)()
+        ):
             return torch.device("mps")
         return torch.device("cpu")
-    
+
     def initialize(self) -> bool:
         """
         初始化模型和分词器
-        
+
         Returns:
             是否初始化成功
         """
@@ -155,31 +174,35 @@ class WeiboMultilingualSentimentAnalyzer:
         if self.is_initialized:
             print("模型已经初始化，无需重复加载")
             return True
-            
+
         try:
             print("正在加载多语言情感分析模型...")
-            
+
             # 使用多语言情感分析模型
             model_name = "tabularisai/multilingual-sentiment-analysis"
             local_model_path = os.path.join(weibo_sentiment_path, "model")
-            
+
             # 检查本地是否已有模型
             if os.path.exists(local_model_path):
                 print("从本地加载模型...")
                 self.tokenizer = AutoTokenizer.from_pretrained(local_model_path)
-                self.model = AutoModelForSequenceClassification.from_pretrained(local_model_path)
+                self.model = AutoModelForSequenceClassification.from_pretrained(
+                    local_model_path
+                )
             else:
                 print("首次使用，正在下载模型到本地...")
                 # 下载并保存到本地
                 self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-                self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
-                
+                self.model = AutoModelForSequenceClassification.from_pretrained(
+                    model_name
+                )
+
                 # 保存到本地
                 os.makedirs(local_model_path, exist_ok=True)
                 self.tokenizer.save_pretrained(local_model_path)
                 self.model.save_pretrained(local_model_path)
                 print(f"模型已保存到: {local_model_path}")
-            
+
             # 设置设备
             device = self._select_device()
             if device is None:
@@ -198,46 +221,46 @@ class WeiboMultilingualSentimentAnalyzer:
                 print("检测到 Apple MPS 设备，已使用 MPS 进行推理。")
             else:
                 print("未检测到 GPU，自动使用 CPU 进行推理。")
-            
+
             print(f"模型加载成功! 使用设备: {self.device}")
             print("支持语言: 中文、英文、西班牙文、阿拉伯文、日文、韩文等22种语言")
             print("情感等级: 非常负面、负面、中性、正面、非常正面")
-            
+
             return True
-            
+
         except Exception as e:
             error_message = f"模型加载失败: {e}"
             print(error_message)
             print("请检查网络连接或模型文件")
             self.disable(error_message, drop_state=True)
             return False
-    
+
     def _preprocess_text(self, text: str) -> str:
         """
         文本预处理
-        
+
         Args:
             text: 输入文本
-            
+
         Returns:
             处理后的文本
         """
         # 基本文本清理
         if not text or not text.strip():
             return ""
-        
+
         # 去除多余空格
-        text = re.sub(r'\s+', ' ', text.strip())
-        
+        text = re.sub(r"\s+", " ", text.strip())
+
         return text
-    
+
     def analyze_single_text(self, text: str) -> SentimentResult:
         """
         对单个文本进行情感分析
-        
+
         Args:
             text: 要分析的文本
-            
+
         Returns:
             SentimentResult对象
         """
@@ -249,7 +272,7 @@ class WeiboMultilingualSentimentAnalyzer:
                 probability_distribution={},
                 success=False,
                 error_message=self.disable_reason or "情感分析功能已禁用",
-                analysis_performed=False
+                analysis_performed=False,
             )
         if not self.is_initialized:
@@ -260,7 +283,7 @@ class WeiboMultilingualSentimentAnalyzer:
                 probability_distribution={},
                 success=False,
                 error_message="模型未初始化，请先调用initialize() 方法",
-                analysis_performed=False
+                analysis_performed=False,
             )
         try:
@@ -275,7 +298,7 @@ class WeiboMultilingualSentimentAnalyzer:
                     probability_distribution={},
                     success=False,
                     error_message="输入文本为空或无效内容",
-                    analysis_performed=False
+                    analysis_performed=False,
                 )
             # 分词编码
@@ -284,7 +307,7 @@ class WeiboMultilingualSentimentAnalyzer:
                 max_length=512,
                 padding=True,
                 truncation=True,
-                return_tensors='pt'
+                return_tensors="pt",
             )
             # 转移到设备
@@ -311,7 +334,7 @@ class WeiboMultilingualSentimentAnalyzer:
                 sentiment_label=label,
                 confidence=confidence,
                 probability_distribution=prob_dist,
-                success=True
+                success=True,
             )
         except Exception as e:
@@ -322,17 +345,19 @@ class WeiboMultilingualSentimentAnalyzer:
                 probability_distribution={},
                 success=False,
                 error_message=f"预测时发生错误: {str(e)}",
-                analysis_performed=False
+                analysis_performed=False,
             )
-    def analyze_batch(self, texts: List[str], show_progress: bool = True) -> BatchSentimentResult:
+    def analyze_batch(
+        self, texts: List[str], show_progress: bool = True
+    ) -> BatchSentimentResult:
         """
         批量情感分析
-        
+
         Args:
             texts: 文本列表
             show_progress: 是否显示进度
-            
+
         Returns:
             BatchSentimentResult对象
         """
@@ -343,9 +368,9 @@ class WeiboMultilingualSentimentAnalyzer:
                 success_count=0,
                 failed_count=0,
                 average_confidence=0.0,
-                analysis_performed=not self.is_disabled and self.is_initialized
+                analysis_performed=not self.is_disabled and self.is_initialized,
             )
-        
+
         if self.is_disabled or not self.is_initialized:
             passthrough_results = [
                 SentimentResult(
@@ -355,7 +380,7 @@ class WeiboMultilingualSentimentAnalyzer:
                     probability_distribution={},
                     success=False,
                     error_message=self.disable_reason or "情感分析功能不可用",
-                    analysis_performed=False
+                    analysis_performed=False,
                 )
                 for text in texts
             ]
@@ -365,42 +390,44 @@ class WeiboMultilingualSentimentAnalyzer:
                 success_count=0,
                 failed_count=len(texts),
                 average_confidence=0.0,
-                analysis_performed=False
+                analysis_performed=False,
             )
-        
+
         results = []
         success_count = 0
         total_confidence = 0.0
-        
+
         for i, text in enumerate(texts):
             if show_progress and len(texts) > 1:
-                print(f"处理进度: {i+1}/{len(texts)}")
-            
+                print(f"处理进度: {i + 1}/{len(texts)}")
+
             result = self.analyze_single_text(text)
             results.append(result)
-            
+
             if result.success:
                 success_count += 1
                 total_confidence += result.confidence
-        
-        average_confidence = total_confidence / success_count if success_count > 0 else 0.0
+
+        average_confidence = (
+            total_confidence / success_count if success_count > 0 else 0.0
+        )
         failed_count = len(texts) - success_count
-        
+
         return BatchSentimentResult(
             results=results,
             total_processed=len(texts),
             success_count=success_count,
             failed_count=failed_count,
             average_confidence=average_confidence,
-            analysis_performed=True
+            analysis_performed=True,
         )
-    
+
     def _build_passthrough_analysis(
         self,
         original_data: List[Dict[str, Any]],
         reason: str,
         texts: Optional[List[str]] = None,
-        results: Optional[List[SentimentResult]] = None
+        results: Optional[List[SentimentResult]] = None,
     ) -> Dict[str, Any]:
         """
         构建在情感分析不可用时的透传结果
@@ -416,33 +443,36 @@ class WeiboMultilingualSentimentAnalyzer:
                 "sentiment_distribution": {},
                 "high_confidence_results": [],
                 "summary": f"情感分析未执行：{reason}",
-                "original_texts": original_data
+                "original_texts": original_data,
             }
         }
-        
+
         if texts is not None:
             response["sentiment_analysis"]["passthrough_texts"] = texts
-        
+
         if results is not None:
             response["sentiment_analysis"]["results"] = [
                 result.__dict__ if isinstance(result, SentimentResult) else result
                 for result in results
             ]
-        
+
         return response
-    
-    def analyze_query_results(self, query_results: List[Dict[str, Any]], 
-                            text_field: str = "content", 
-                            min_confidence: float = 0.5) -> Dict[str, Any]:
+
+    def analyze_query_results(
+        self,
+        query_results: List[Dict[str, Any]],
+        text_field: str = "content",
+        min_confidence: float = 0.5,
+    ) -> Dict[str, Any]:
         """
         对查询结果进行情感分析
         专门用于分析从MediaCrawlerDB返回的查询结果
-        
+
         Args:
             query_results: 查询结果列表，每个元素包含文本内容
             text_field: 文本内容字段名，默认为"content"
             min_confidence: 最小置信度阈值
-            
+
         Returns:
             包含情感分析结果的字典
         """
@@ -452,14 +482,14 @@ class WeiboMultilingualSentimentAnalyzer:
                     "total_analyzed": 0,
                     "sentiment_distribution": {},
                     "high_confidence_results": [],
-                    "summary": "没有内容需要分析"
+                    "summary": "没有内容需要分析",
                 }
             }
-        
+
         # 提取文本内容
         texts_to_analyze = []
         original_data = []
-        
+
         for item in query_results:
             # 尝试多个可能的文本字段
             text_content = ""
@@ -467,49 +497,52 @@ class WeiboMultilingualSentimentAnalyzer:
                 if field in item and item[field]:
                     text_content = str(item[field])
                     break
-            
+
             if text_content.strip():
                 texts_to_analyze.append(text_content)
                 original_data.append(item)
-        
+
         if not texts_to_analyze:
             return {
                 "sentiment_analysis": {
                     "total_analyzed": 0,
                     "sentiment_distribution": {},
                     "high_confidence_results": [],
-                    "summary": "查询结果中没有找到可分析的文本内容"
+                    "summary": "查询结果中没有找到可分析的文本内容",
                 }
             }
-        
+
         if self.is_disabled:
             return self._build_passthrough_analysis(
                 original_data=original_data,
                 reason=self.disable_reason or "情感分析模型不可用",
-                texts=texts_to_analyze
+                texts=texts_to_analyze,
             )
-        
+
         # 执行批量情感分析
         print(f"正在对{len(texts_to_analyze)}条内容进行情感分析...")
         batch_result = self.analyze_batch(texts_to_analyze, show_progress=True)
-        
+
         if not batch_result.analysis_performed:
             reason = self.disable_reason or "情感分析功能不可用"
             if batch_result.results:
-                candidate_error = next((r.error_message for r in batch_result.results if r.error_message), None)
+                candidate_error = next(
+                    (r.error_message for r in batch_result.results if r.error_message),
+                    None,
+                )
                 if candidate_error:
                     reason = candidate_error
             return self._build_passthrough_analysis(
                 original_data=original_data,
                 reason=reason,
                 texts=texts_to_analyze,
-                results=batch_result.results
+                results=batch_result.results,
             )
-        
+
         # 统计情感分布
         sentiment_distribution = {}
         high_confidence_results = []
-        
+
         for result, original_item in zip(batch_result.results, original_data):
             if result.success:
                 # 统计情感分布
@@ -517,24 +550,28 @@ class WeiboMultilingualSentimentAnalyzer:
                 if sentiment not in sentiment_distribution:
                     sentiment_distribution[sentiment] = 0
                 sentiment_distribution[sentiment] += 1
-                
+
                 # 收集高置信度结果
                 if result.confidence >= min_confidence:
-                    high_confidence_results.append({
-                        "original_data": original_item,
-                        "sentiment": result.sentiment_label,
-                        "confidence": result.confidence,
-                        "text_preview": result.text[:100] + "..." if len(result.text) > 100 else result.text
-                    })
-        
+                    high_confidence_results.append(
+                        {
+                            "original_data": original_item,
+                            "sentiment": result.sentiment_label,
+                            "confidence": result.confidence,
+                            "text_preview": result.text[:100] + "..."
+                            if len(result.text) > 100
+                            else result.text,
+                        }
+                    )
+
         # 生成情感分析摘要
         total_analyzed = batch_result.success_count
         if total_analyzed > 0:
             dominant_sentiment = max(sentiment_distribution.items(), key=lambda x: x[1])
-            sentiment_summary = f"共分析{total_analyzed}条内容，主要情感倾向为'{dominant_sentiment[0]}'({dominant_sentiment[1]}条，占{dominant_sentiment[1]/total_analyzed*100:.1f}%)"
+            sentiment_summary = f"共分析{total_analyzed}条内容，主要情感倾向为'{dominant_sentiment[0]}'({dominant_sentiment[1]}条，占{dominant_sentiment[1] / total_analyzed * 100:.1f}%)"
         else:
             sentiment_summary = "情感分析失败"
-        
+
         return {
             "sentiment_analysis": {
                 "total_analyzed": total_analyzed,
@@ -542,28 +579,46 @@ class WeiboMultilingualSentimentAnalyzer:
                 "average_confidence": round(batch_result.average_confidence, 4),
                 "sentiment_distribution": sentiment_distribution,
                 "high_confidence_results": high_confidence_results,  # 返回所有高置信度结果，不做限制
-                "summary": sentiment_summary
+                "summary": sentiment_summary,
             }
         }
-    
+
     def get_model_info(self) -> Dict[str, Any]:
         """
         获取模型信息
-        
+
         Returns:
             模型信息字典
         """
         return {
             "model_name": "tabularisai/multilingual-sentiment-analysis",
             "supported_languages": [
-                "中文", "英文", "西班牙文", "阿拉伯文", "日文", "韩文", 
-                "德文", "法文", "意大利文", "葡萄牙文", "俄文", "荷兰文",
-                "波兰文", "土耳其文", "丹麦文", "希腊文", "芬兰文", 
-                "瑞典文", "挪威文", "匈牙利文", "捷克文", "保加利亚文"
+                "中文",
+                "英文",
+                "西班牙文",
+                "阿拉伯文",
+                "日文",
+                "韩文",
+                "德文",
+                "法文",
+                "意大利文",
+                "葡萄牙文",
+                "俄文",
+                "荷兰文",
+                "波兰文",
+                "土耳其文",
+                "丹麦文",
+                "希腊文",
+                "芬兰文",
+                "瑞典文",
+                "挪威文",
+                "匈牙利文",
+                "捷克文",
+                "保加利亚文",
             ],
             "sentiment_levels": list(self.sentiment_map.values()),
             "is_initialized": self.is_initialized,
-            "device": str(self.device) if self.device else "未设置"
+            "device": str(self.device) if self.device else "未设置",
         }
@@ -576,20 +631,23 @@ def enable_sentiment_analysis() -> bool:
     return multilingual_sentiment_analyzer.enable()
-def disable_sentiment_analysis(reason: Optional[str] = None, drop_state: bool = False) -> None:
+def disable_sentiment_analysis(
+    reason: Optional[str] = None, drop_state: bool = False
+) -> None:
     """Public helper to disable sentiment analysis at runtime."""
     multilingual_sentiment_analyzer.disable(reason=reason, drop_state=drop_state)
-def analyze_sentiment(text_or_texts: Union[str, List[str]], 
-                     initialize_if_needed: bool = True) -> Union[SentimentResult, BatchSentimentResult]:
+def analyze_sentiment(
+    text_or_texts: Union[str, List[str]], initialize_if_needed: bool = True
+) -> Union[SentimentResult, BatchSentimentResult]:
     """
     便捷的情感分析函数
-    
+
     Args:
         text_or_texts: 单个文本或文本列表
         initialize_if_needed: 如果模型未初始化，是否自动初始化
-        
+
     Returns:
         SentimentResult或BatchSentimentResult
     """
@@ -599,7 +657,7 @@ def analyze_sentiment(text_or_texts: Union[str, List[str]],
         and not multilingual_sentiment_analyzer.is_disabled
     ):
         multilingual_sentiment_analyzer.initialize()
-    
+
     if isinstance(text_or_texts, str):
         return multilingual_sentiment_analyzer.analyze_single_text(text_or_texts)
     else:
@@ -610,24 +668,30 @@ def analyze_sentiment(text_or_texts: Union[str, List[str]],
 if __name__ == "__main__":
     # 测试代码
     analyzer = WeiboMultilingualSentimentAnalyzer()
-    
+
     if analyzer.initialize():
         # 测试单个文本
         result = analyzer.analyze_single_text("今天天气真好，心情特别棒！")
-        print(f"单个文本分析: {result.sentiment_label} (置信度: {result.confidence:.4f})")
-        
+        print(
+            f"单个文本分析: {result.sentiment_label} (置信度: {result.confidence:.4f})"
+        )
+
         # 测试批量文本
         test_texts = [
             "这家餐厅的菜味道非常棒！",
             "服务态度太差了，很失望",
             "I absolutely love this product!",
-            "The customer service was disappointing."
+            "The customer service was disappointing.",
         ]
-        
+
         batch_result = analyzer.analyze_batch(test_texts)
-        print(f"\n批量分析: 成功 {batch_result.success_count}/{batch_result.total_processed}")
-        
+        print(
+            f"\n批量分析: 成功 {batch_result.success_count}/{batch_result.total_processed}"
+        )
+
         for result in batch_result.results:
-            print(f"'{result.text[:30]}...' -> {result.sentiment_label} ({result.confidence:.4f})")
+            print(
+                f"'{result.text[:30]}...' -> {result.sentiment_label} ({result.confidence:.4f})"
+            )
     else:
         print("模型初始化失败，无法进行测试")