sentiment_analyzer.py 15.9 KB

Raw Blame History Permalink

"""
多语言情感分析工具
基于WeiboMultilingualSentiment模型为InsightEngine提供情感分析功能
"""

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import os
import sys
from typing import List, Dict, Any, Optional, Union
from dataclasses import dataclass
import re

# 添加项目根目录到路径，以便导入WeiboMultilingualSentiment
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
weibo_sentiment_path = os.path.join(project_root, "SentimentAnalysisModel", "WeiboMultilingualSentiment")
sys.path.append(weibo_sentiment_path)


@dataclass
class SentimentResult:
    """情感分析结果数据类"""
    text: str
    sentiment_label: str
    confidence: float
    probability_distribution: Dict[str, float]
    success: bool = True
    error_message: Optional[str] = None


@dataclass
class BatchSentimentResult:
    """批量情感分析结果数据类"""
    results: List[SentimentResult]
    total_processed: int
    success_count: int
    failed_count: int
    average_confidence: float


class WeiboMultilingualSentimentAnalyzer:
    """
    多语言情感分析器
    封装WeiboMultilingualSentiment模型，为AI Agent提供情感分析功能
    """

    def __init__(self):
        """初始化情感分析器"""
        self.model = None
        self.tokenizer = None
        self.device = None
        self.is_initialized = False

        # 情感标签映射（5级分类）
        self.sentiment_map = {
            0: "非常负面",
            1: "负面",
            2: "中性",
            3: "正面",
            4: "非常正面"
        }

        print("WeiboMultilingualSentimentAnalyzer 已创建，调用 initialize() 来加载模型")

    def initialize(self) -> bool:
        """
        初始化模型和分词器

        Returns:
            是否初始化成功
        """
        if self.is_initialized:
            print("模型已经初始化，无需重复加载")
            return True

        try:
            print("正在加载多语言情感分析模型...")

            # 使用多语言情感分析模型
            model_name = "tabularisai/multilingual-sentiment-analysis"
            local_model_path = os.path.join(weibo_sentiment_path, "model")

            # 检查本地是否已有模型
            if os.path.exists(local_model_path):
                print("从本地加载模型...")
                self.tokenizer = AutoTokenizer.from_pretrained(local_model_path)
                self.model = AutoModelForSequenceClassification.from_pretrained(local_model_path)
            else:
                print("首次使用，正在下载模型到本地...")
                # 下载并保存到本地
                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
                self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

                # 保存到本地
                os.makedirs(local_model_path, exist_ok=True)
                self.tokenizer.save_pretrained(local_model_path)
                self.model.save_pretrained(local_model_path)
                print(f"模型已保存到: {local_model_path}")

            # 设置设备
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            self.model.to(self.device)
            self.model.eval()
            self.is_initialized = True

            print(f"模型加载成功! 使用设备: {self.device}")
            print("支持语言: 中文、英文、西班牙文、阿拉伯文、日文、韩文等22种语言")
            print("情感等级: 非常负面、负面、中性、正面、非常正面")

            return True

        except Exception as e:
            print(f"模型加载失败: {e}")
            print("请检查网络连接或模型文件")
            self.is_initialized = False
            return False

    def _preprocess_text(self, text: str) -> str:
        """
        文本预处理

        Args:
            text: 输入文本

        Returns:
            处理后的文本
        """
        # 基本文本清理
        if not text or not text.strip():
            return ""

        # 去除多余空格
        text = re.sub(r'\s+', ' ', text.strip())

        return text

    def analyze_single_text(self, text: str) -> SentimentResult:
        """
        对单个文本进行情感分析

        Args:
            text: 要分析的文本

        Returns:
            SentimentResult对象
        """
        if not self.is_initialized:
            return SentimentResult(
                text=text,
                sentiment_label="未初始化",
                confidence=0.0,
                probability_distribution={},
                success=False,
                error_message="模型未初始化，请先调用 initialize() 方法"
            )

        try:
            # 预处理文本
            processed_text = self._preprocess_text(text)

            if not processed_text:
                return SentimentResult(
                    text=text,
                    sentiment_label="输入错误",
                    confidence=0.0,
                    probability_distribution={},
                    success=False,
                    error_message="输入文本为空或无效"
                )

            # 分词编码
            inputs = self.tokenizer(
                processed_text,
                max_length=512,
                padding=True,
                truncation=True,
                return_tensors='pt'
            )

            # 转移到设备
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # 预测
            with torch.no_grad():
                outputs = self.model(**inputs)
                logits = outputs.logits
                probabilities = torch.softmax(logits, dim=1)
                prediction = torch.argmax(probabilities, dim=1).item()

            # 构建结果
            confidence = probabilities[0][prediction].item()
            label = self.sentiment_map[prediction]

            # 构建概率分布字典
            prob_dist = {}
            for i, (label_name, prob) in enumerate(zip(self.sentiment_map.values(), probabilities[0])):
                prob_dist[label_name] = prob.item()

            return SentimentResult(
                text=text,
                sentiment_label=label,
                confidence=confidence,
                probability_distribution=prob_dist,
                success=True
            )

        except Exception as e:
            return SentimentResult(
                text=text,
                sentiment_label="分析失败",
                confidence=0.0,
                probability_distribution={},
                success=False,
                error_message=f"预测时发生错误: {str(e)}"
            )

    def analyze_batch(self, texts: List[str], show_progress: bool = True) -> BatchSentimentResult:
        """
        批量情感分析

        Args:
            texts: 文本列表
            show_progress: 是否显示进度

        Returns:
            BatchSentimentResult对象
        """
        if not texts:
            return BatchSentimentResult(
                results=[],
                total_processed=0,
                success_count=0,
                failed_count=0,
                average_confidence=0.0
            )

        results = []
        success_count = 0
        total_confidence = 0.0

        for i, text in enumerate(texts):
            if show_progress and len(texts) > 1:
                print(f"处理进度: {i+1}/{len(texts)}")

            result = self.analyze_single_text(text)
            results.append(result)

            if result.success:
                success_count += 1
                total_confidence += result.confidence

        average_confidence = total_confidence / success_count if success_count > 0 else 0.0
        failed_count = len(texts) - success_count

        return BatchSentimentResult(
            results=results,
            total_processed=len(texts),
            success_count=success_count,
            failed_count=failed_count,
            average_confidence=average_confidence
        )

    def analyze_query_results(self, query_results: List[Dict[str, Any]],
                            text_field: str = "content",
                            min_confidence: float = 0.5) -> Dict[str, Any]:
        """
        对查询结果进行情感分析
        专门用于分析从MediaCrawlerDB返回的查询结果

        Args:
            query_results: 查询结果列表，每个元素包含文本内容
            text_field: 文本内容字段名，默认为"content"
            min_confidence: 最小置信度阈值

        Returns:
            包含情感分析结果的字典
        """
        if not query_results:
            return {
                "sentiment_analysis": {
                    "total_analyzed": 0,
                    "sentiment_distribution": {},
                    "high_confidence_results": [],
                    "summary": "没有内容需要分析"
                }
            }

        # 提取文本内容
        texts_to_analyze = []
        original_data = []

        for item in query_results:
            # 尝试多个可能的文本字段
            text_content = ""
            for field in [text_field, "title_or_content", "content", "title", "text"]:
                if field in item and item[field]:
                    text_content = str(item[field])
                    break

            if text_content.strip():
                texts_to_analyze.append(text_content)
                original_data.append(item)

        if not texts_to_analyze:
            return {
                "sentiment_analysis": {
                    "total_analyzed": 0,
                    "sentiment_distribution": {},
                    "high_confidence_results": [],
                    "summary": "查询结果中没有找到可分析的文本内容"
                }
            }

        # 执行批量情感分析
        print(f"正在对{len(texts_to_analyze)}条内容进行情感分析...")
        batch_result = self.analyze_batch(texts_to_analyze, show_progress=True)

        # 统计情感分布
        sentiment_distribution = {}
        high_confidence_results = []

        for result, original_item in zip(batch_result.results, original_data):
            if result.success:
                # 统计情感分布
                sentiment = result.sentiment_label
                if sentiment not in sentiment_distribution:
                    sentiment_distribution[sentiment] = 0
                sentiment_distribution[sentiment] += 1

                # 收集高置信度结果
                if result.confidence >= min_confidence:
                    high_confidence_results.append({
                        "original_data": original_item,
                        "sentiment": result.sentiment_label,
                        "confidence": result.confidence,
                        "text_preview": result.text[:100] + "..." if len(result.text) > 100 else result.text
                    })

        # 生成情感分析摘要
        total_analyzed = batch_result.success_count
        if total_analyzed > 0:
            dominant_sentiment = max(sentiment_distribution.items(), key=lambda x: x[1])
            sentiment_summary = f"共分析{total_analyzed}条内容，主要情感倾向为'{dominant_sentiment[0]}'({dominant_sentiment[1]}条，占{dominant_sentiment[1]/total_analyzed*100:.1f}%)"
        else:
            sentiment_summary = "情感分析失败"

        return {
            "sentiment_analysis": {
                "total_analyzed": total_analyzed,
                "success_rate": f"{batch_result.success_count}/{batch_result.total_processed}",
                "average_confidence": round(batch_result.average_confidence, 4),
                "sentiment_distribution": sentiment_distribution,
                "high_confidence_results": high_confidence_results,  # 返回所有高置信度结果，不做限制
                "summary": sentiment_summary
            }
        }

    def get_model_info(self) -> Dict[str, Any]:
        """
        获取模型信息

        Returns:
            模型信息字典
        """
        return {
            "model_name": "tabularisai/multilingual-sentiment-analysis",
            "supported_languages": [
                "中文", "英文", "西班牙文", "阿拉伯文", "日文", "韩文",
                "德文", "法文", "意大利文", "葡萄牙文", "俄文", "荷兰文",
                "波兰文", "土耳其文", "丹麦文", "希腊文", "芬兰文",
                "瑞典文", "挪威文", "匈牙利文", "捷克文", "保加利亚文"
            ],
            "sentiment_levels": list(self.sentiment_map.values()),
            "is_initialized": self.is_initialized,
            "device": str(self.device) if self.device else "未设置"
        }


# 创建全局实例（延迟初始化）
multilingual_sentiment_analyzer = WeiboMultilingualSentimentAnalyzer()


def analyze_sentiment(text_or_texts: Union[str, List[str]],
                     initialize_if_needed: bool = True) -> Union[SentimentResult, BatchSentimentResult]:
    """
    便捷的情感分析函数

    Args:
        text_or_texts: 单个文本或文本列表
        initialize_if_needed: 如果模型未初始化，是否自动初始化

    Returns:
        SentimentResult或BatchSentimentResult
    """
    if initialize_if_needed and not multilingual_sentiment_analyzer.is_initialized:
        if not multilingual_sentiment_analyzer.initialize():
            # 如果初始化失败，返回失败结果
            if isinstance(text_or_texts, str):
                return SentimentResult(
                    text=text_or_texts,
                    sentiment_label="初始化失败",
                    confidence=0.0,
                    probability_distribution={},
                    success=False,
                    error_message="模型初始化失败"
                )
            else:
                return BatchSentimentResult(
                    results=[],
                    total_processed=0,
                    success_count=0,
                    failed_count=len(text_or_texts),
                    average_confidence=0.0
                )

    if isinstance(text_or_texts, str):
        return multilingual_sentiment_analyzer.analyze_single_text(text_or_texts)
    else:
        return multilingual_sentiment_analyzer.analyze_batch(text_or_texts)


if __name__ == "__main__":
    # 测试代码
    analyzer = WeiboMultilingualSentimentAnalyzer()

    if analyzer.initialize():
        # 测试单个文本
        result = analyzer.analyze_single_text("今天天气真好，心情特别棒！")
        print(f"单个文本分析: {result.sentiment_label} (置信度: {result.confidence:.4f})")

        # 测试批量文本
        test_texts = [
            "这家餐厅的菜味道非常棒！",
            "服务态度太差了，很失望",
            "I absolutely love this product!",
            "The customer service was disappointing."
        ]

        batch_result = analyzer.analyze_batch(test_texts)
        print(f"\n批量分析: 成功 {batch_result.success_count}/{batch_result.total_processed}")

        for result in batch_result.results:
            print(f"'{result.text[:30]}...' -> {result.sentiment_label} ({result.confidence:.4f})")
    else:
        print("模型初始化失败，无法进行测试")