Showing 7 changed files with 329 additions and 2 deletions
| @@ -182,6 +182,7 @@ WeiboSentiment_Finetuned/GPT2-AdapterTuning/models/ | @@ -182,6 +182,7 @@ WeiboSentiment_Finetuned/GPT2-AdapterTuning/models/ | ||
| 182 | WeiboSentiment_Finetuned/BertChinese-Lora/models/ | 182 | WeiboSentiment_Finetuned/BertChinese-Lora/models/ |
| 183 | WeiboSentiment_LLM/models/ | 183 | WeiboSentiment_LLM/models/ |
| 184 | WeiboSentiment_Finetuned/BertChinese-Lora/model/ | 184 | WeiboSentiment_Finetuned/BertChinese-Lora/model/ |
| 185 | +WeiboMultilingualSentiment/model/ | ||
| 185 | 186 | ||
| 186 | # LoRA 和 Adapter 权重 | 187 | # LoRA 和 Adapter 权重 |
| 187 | */adapter_model.safetensors | 188 | */adapter_model.safetensors |
WeiboMultilingualSentiment/README.md
0 → 100644
| 1 | +# 多语言情感分析 - Multilingual Sentiment Analysis | ||
| 2 | + | ||
| 3 | +本模块使用HuggingFace上的多语言情感分析模型进行情感分析,支持22种语言。 | ||
| 4 | + | ||
| 5 | +## 模型信息 | ||
| 6 | + | ||
| 7 | +- **模型名称**: tabularisai/multilingual-sentiment-analysis | ||
| 8 | +- **基础模型**: distilbert-base-multilingual-cased | ||
| 9 | +- **支持语言**: 22种语言,包括: | ||
| 10 | + - 中文 (汉语) | ||
| 11 | + - English (英语) | ||
| 12 | + - Español (西班牙语) | ||
| 13 | + - 日本語 (日语) | ||
| 14 | + - 한국어 (韩语) | ||
| 15 | + - Français (法语) | ||
| 16 | + - Deutsch (德语) | ||
| 17 | + - Русский (俄语) | ||
| 18 | + - العربية (阿拉伯语) | ||
| 19 | + - हिन्दी (印地语) | ||
| 20 | + - Português (葡萄牙语) | ||
| 21 | + - Italiano (意大利语) | ||
| 22 | + - 等等... | ||
| 23 | + | ||
| 24 | +- **输出类别**: 5级情感分类 | ||
| 25 | + - 非常负面 (Very Negative) | ||
| 26 | + - 负面 (Negative) | ||
| 27 | + - 中性 (Neutral) | ||
| 28 | + - 正面 (Positive) | ||
| 29 | + - 非常正面 (Very Positive) | ||
| 30 | + | ||
| 31 | +## 快速开始 | ||
| 32 | + | ||
| 33 | +1. 确保已安装依赖: | ||
| 34 | +```bash | ||
| 35 | +pip install transformers torch | ||
| 36 | +``` | ||
| 37 | + | ||
| 38 | +2. 运行预测程序: | ||
| 39 | +```bash | ||
| 40 | +python predict.py | ||
| 41 | +``` | ||
| 42 | + | ||
| 43 | +3. 输入任意语言的文本进行分析: | ||
| 44 | +``` | ||
| 45 | +请输入文本: I love this product! | ||
| 46 | +预测结果: 非常正面 (置信度: 0.9456) | ||
| 47 | +``` | ||
| 48 | + | ||
| 49 | +4. 查看多语言示例: | ||
| 50 | +``` | ||
| 51 | +请输入文本: demo | ||
| 52 | +``` | ||
| 53 | + | ||
| 54 | +## 代码示例 | ||
| 55 | + | ||
| 56 | +```python | ||
| 57 | +from transformers import AutoTokenizer, AutoModelForSequenceClassification | ||
| 58 | +import torch | ||
| 59 | + | ||
| 60 | +# 加载模型 | ||
| 61 | +model_name = "tabularisai/multilingual-sentiment-analysis" | ||
| 62 | +tokenizer = AutoTokenizer.from_pretrained(model_name) | ||
| 63 | +model = AutoModelForSequenceClassification.from_pretrained(model_name) | ||
| 64 | + | ||
| 65 | +# 预测 | ||
| 66 | +texts = [ | ||
| 67 | + "今天心情很好", # 中文 | ||
| 68 | + "I love this!", # 英文 | ||
| 69 | + "¡Me encanta!" # 西班牙文 | ||
| 70 | +] | ||
| 71 | + | ||
| 72 | +for text in texts: | ||
| 73 | + inputs = tokenizer(text, return_tensors="pt") | ||
| 74 | + outputs = model(**inputs) | ||
| 75 | + prediction = torch.argmax(outputs.logits, dim=1).item() | ||
| 76 | + sentiment_map = {0: "非常负面", 1: "负面", 2: "中性", 3: "正面", 4: "非常正面"} | ||
| 77 | + print(f"{text} -> {sentiment_map[prediction]}") | ||
| 78 | +``` | ||
| 79 | + | ||
| 80 | +## 特色功能 | ||
| 81 | + | ||
| 82 | +- **多语言支持**: 无需指定语言,自动识别22种语言 | ||
| 83 | +- **5级精细分类**: 比传统二分类更细致的情感分析 | ||
| 84 | +- **高精度**: 基于DistilBERT的先进架构 | ||
| 85 | +- **本地缓存**: 首次下载后保存到本地,加快后续使用 | ||
| 86 | + | ||
| 87 | +## 应用场景 | ||
| 88 | + | ||
| 89 | +- 国际社交媒体监控 | ||
| 90 | +- 多语言客户反馈分析 | ||
| 91 | +- 全球产品评论情感分类 | ||
| 92 | +- 跨语言品牌情感追踪 | ||
| 93 | +- 多语言客服优化 | ||
| 94 | +- 国际市场研究 | ||
| 95 | + | ||
| 96 | +## 模型存储 | ||
| 97 | + | ||
| 98 | +- 首次运行时会自动下载模型到当前目录的 `model` 文件夹 | ||
| 99 | +- 后续运行会直接从本地加载,无需重复下载 | ||
| 100 | +- 模型大小约135MB,首次下载需要网络连接 | ||
| 101 | + | ||
| 102 | +## 文件说明 | ||
| 103 | + | ||
| 104 | +- `predict.py`: 主预测程序,使用直接模型调用 | ||
| 105 | +- `README.md`: 使用说明 | ||
| 106 | + | ||
| 107 | +## 注意事项 | ||
| 108 | + | ||
| 109 | +- 首次运行时会自动下载模型,需要网络连接 | ||
| 110 | +- 模型会保存到当前目录,方便后续使用 | ||
| 111 | +- 支持GPU加速,会自动检测可用设备 | ||
| 112 | +- 如需清理模型文件,删除 `model` 文件夹即可 | ||
| 113 | +- 该模型基于合成数据训练,在实际应用中建议进行验证 | ||
| 114 | + | ||
| 115 | +## 参考信息 | ||
| 116 | + | ||
| 117 | +- 模型链接: https://huggingface.co/tabularisai/multilingual-sentiment-analysis | ||
| 118 | +- 许可证: CC-BY-NC-4.0 (非商业使用) | ||
| 119 | +- 商业使用需联系: info@tabularis.ai |
WeiboMultilingualSentiment/predict.py
0 → 100644
| 1 | +import torch | ||
| 2 | +from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline | ||
| 3 | +import re | ||
| 4 | + | ||
| 5 | +def preprocess_text(text): | ||
| 6 | + """简单的文本预处理,适用于多语言文本""" | ||
| 7 | + text = re.sub(r"\{%.+?%\}", " ", text) # 去除 {%xxx%} | ||
| 8 | + text = re.sub(r"@.+?( |$)", " ", text) # 去除 @xxx | ||
| 9 | + text = re.sub(r"【.+?】", " ", text) # 去除 【xx】 | ||
| 10 | + text = re.sub(r"\u200b", " ", text) # 去除特殊字符 | ||
| 11 | + text = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", "", text) # 去除URL | ||
| 12 | + # 删除表情符号 | ||
| 13 | + text = re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF\U00002600-\U000027BF\U0001f900-\U0001f9ff\U0001f018-\U0001f270\U0000231a-\U0000231b\U0000238d-\U0000238d\U000024c2-\U0001f251]+', '', text) | ||
| 14 | + text = re.sub(r"\s+", " ", text) # 多个空格合并 | ||
| 15 | + return text.strip() | ||
| 16 | + | ||
def main():
    """Interactive multilingual sentiment analysis CLI.

    Loads the tabularisai multilingual sentiment checkpoint — downloading
    it into ``./model`` on first use, loading from that local cache on
    later runs — then reads text from stdin and prints a 5-level sentiment
    prediction until 'q' is entered. Typing 'demo' runs a canned
    multilingual showcase instead of a prediction.
    """
    import os  # function-scope import keeps the module's top-level imports unchanged

    print("正在加载多语言情感分析模型...")

    # Multilingual 5-class sentiment model from the HuggingFace Hub.
    model_name = "tabularisai/multilingual-sentiment-analysis"
    local_model_path = "./model"

    try:
        if os.path.exists(local_model_path):
            print("从本地加载模型...")
            tokenizer = AutoTokenizer.from_pretrained(local_model_path)
            model = AutoModelForSequenceClassification.from_pretrained(local_model_path)
        else:
            print("首次使用,正在下载模型到本地...")
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForSequenceClassification.from_pretrained(model_name)

            # Cache to disk so later runs need no network connection.
            tokenizer.save_pretrained(local_model_path)
            model.save_pretrained(local_model_path)
            print(f"模型已保存到: {local_model_path}")

        # Prefer GPU when available; inference only, so switch to eval mode.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model.to(device)
        model.eval()
        print(f"模型加载成功! 使用设备: {device}")

        # Class-index -> human-readable label mapping for this checkpoint.
        sentiment_map = {
            0: "非常负面", 1: "负面", 2: "中性", 3: "正面", 4: "非常正面"
        }

    except Exception as e:
        print(f"模型加载失败: {e}")
        print("请检查网络连接")
        return

    print("\n============= 多语言情感分析 =============")
    print("支持语言: 中文、英文、西班牙文、阿拉伯文、日文、韩文等22种语言")
    print("情感等级: 非常负面、负面、中性、正面、非常正面")
    print("输入文本进行分析 (输入 'q' 退出):")
    print("输入 'demo' 查看多语言示例")

    while True:
        text = input("\n请输入文本: ")
        if text.lower() == 'q':
            break

        if text.lower() == 'demo':
            show_multilingual_demo(tokenizer, model, device, sentiment_map)
            continue

        if not text.strip():
            print("输入不能为空,请重新输入")
            continue

        try:
            processed_text = preprocess_text(text)
            if not processed_text:
                # Preprocessing removed everything (e.g. emoji-only input);
                # feeding an empty string to the model would be meaningless.
                print("预处理后文本为空,请输入包含文字的内容")
                continue

            # Tokenize, truncating to the model's 512-token window.
            inputs = tokenizer(
                processed_text,
                max_length=512,
                padding=True,
                truncation=True,
                return_tensors='pt'
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}

            # Forward pass without autograd bookkeeping.
            with torch.no_grad():
                outputs = model(**inputs)
                probabilities = torch.softmax(outputs.logits, dim=1)
                prediction = torch.argmax(probabilities, dim=1).item()

            confidence = probabilities[0][prediction].item()
            label = sentiment_map[prediction]

            print(f"预测结果: {label} (置信度: {confidence:.4f})")

            # Full per-class distribution; zip pairs labels with their
            # probabilities, no positional index needed.
            print("详细概率分布:")
            for label_name, prob in zip(sentiment_map.values(), probabilities[0]):
                print(f"  {label_name}: {prob:.4f}")

        except Exception as e:
            print(f"预测时发生错误: {e}")
            continue
| 114 | + | ||
def show_multilingual_demo(tokenizer, model, device, sentiment_map):
    """Classify a fixed set of sample sentences in several languages and
    print each prediction with its confidence.

    Args:
        tokenizer: HuggingFace tokenizer paired with ``model``.
        model: sequence-classification model already moved to ``device``.
        device: torch device to which encoded inputs are sent.
        sentiment_map: dict mapping class index -> human-readable label.
    """
    print("\n=== 多语言情感分析示例 ===")

    # (sentence, language tag) pairs covering six languages.
    samples = (
        ("今天天气真好,心情特别棒!", "中文"),
        ("这家餐厅的菜味道非常棒!", "中文"),
        ("服务态度太差了,很失望", "中文"),
        ("I absolutely love this product!", "英文"),
        ("The customer service was disappointing.", "英文"),
        ("The weather is fine, nothing special.", "英文"),
        ("このレストランの料理は本当に美味しいです!", "日文"),
        ("このホテルのサービスはがっかりしました。", "日文"),
        ("이 가게의 케이크는 정말 맛있어요!", "韩文"),
        ("서비스가 너무 별로였어요。", "韩文"),
        ("¡Me encanta cómo quedó la decoración!", "西班牙文"),
        ("El servicio fue terrible y muy lento.", "西班牙文"),
    )

    for sample_text, language in samples:
        try:
            encoded = tokenizer(
                sample_text,
                max_length=512,
                padding=True,
                truncation=True,
                return_tensors='pt'
            )
            encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

            # Inference only — no autograd bookkeeping needed.
            with torch.no_grad():
                logits = model(**encoded).logits
                probs = torch.softmax(logits, dim=1)
                top_class = torch.argmax(probs, dim=1).item()

            confidence = probs[0][top_class].item()
            label = sentiment_map[top_class]

            print(f"\n{language}: {sample_text}")
            print(f"结果: {label} (置信度: {confidence:.4f})")

        except Exception as e:
            print(f"处理 {sample_text} 时出错: {e}")

    print("\n=== 示例结束 ===")
| 171 | + | ||
# Run the interactive CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
| @@ -8,6 +8,8 @@ def preprocess_text(text): | @@ -8,6 +8,8 @@ def preprocess_text(text): | ||
| 8 | text = re.sub(r"@.+?( |$)", " ", text) # 去除 @xxx | 8 | text = re.sub(r"@.+?( |$)", " ", text) # 去除 @xxx |
| 9 | text = re.sub(r"【.+?】", " ", text) # 去除 【xx】 | 9 | text = re.sub(r"【.+?】", " ", text) # 去除 【xx】 |
| 10 | text = re.sub(r"\u200b", " ", text) # 去除特殊字符 | 10 | text = re.sub(r"\u200b", " ", text) # 去除特殊字符 |
| 11 | + # 删除表情符号 | ||
| 12 | + text = re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF\U00002600-\U000027BF\U0001f900-\U0001f9ff\U0001f018-\U0001f270\U0000231a-\U0000231b\U0000238d-\U0000238d\U000024c2-\U0001f251]+', '', text) | ||
| 11 | text = re.sub(r"\s+", " ", text) # 多个空格合并 | 13 | text = re.sub(r"\s+", " ", text) # 多个空格合并 |
| 12 | return text.strip() | 14 | return text.strip() |
| 13 | 15 |
| @@ -7,6 +7,8 @@ def preprocess_text(text): | @@ -7,6 +7,8 @@ def preprocess_text(text): | ||
| 7 | text = re.sub(r"@.+?( |$)", " ", text) # 去除 @xxx | 7 | text = re.sub(r"@.+?( |$)", " ", text) # 去除 @xxx |
| 8 | text = re.sub(r"【.+?】", " ", text) # 去除 【xx】 | 8 | text = re.sub(r"【.+?】", " ", text) # 去除 【xx】 |
| 9 | text = re.sub(r"\u200b", " ", text) # 去除特殊字符 | 9 | text = re.sub(r"\u200b", " ", text) # 去除特殊字符 |
| 10 | + # 删除表情符号 | ||
| 11 | + text = re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF\U00002600-\U000027BF\U0001f900-\U0001f9ff\U0001f018-\U0001f270\U0000231a-\U0000231b\U0000238d-\U0000238d\U000024c2-\U0001f251]+', '', text) | ||
| 10 | text = re.sub(r"\s+", " ", text) # 多个空格合并 | 12 | text = re.sub(r"\s+", " ", text) # 多个空格合并 |
| 11 | return text.strip() | 13 | return text.strip() |
| 12 | 14 |
| 1 | import torch | 1 | import torch |
| 2 | from transformers import BertTokenizer | 2 | from transformers import BertTokenizer |
| 3 | from train import GPT2ClassifierWithAdapter | 3 | from train import GPT2ClassifierWithAdapter |
| 4 | +import re | ||
| 5 | + | ||
| 6 | +def preprocess_text(text): | ||
| 7 | + """简单的文本预处理""" | ||
| 8 | + text = re.sub(r"\{%.+?%\}", " ", text) # 去除 {%xxx%} | ||
| 9 | + text = re.sub(r"@.+?( |$)", " ", text) # 去除 @xxx | ||
| 10 | + text = re.sub(r"【.+?】", " ", text) # 去除 【xx】 | ||
| 11 | + text = re.sub(r"\u200b", " ", text) # 去除特殊字符 | ||
| 12 | + # 删除表情符号 | ||
| 13 | + text = re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF\U00002600-\U000027BF\U0001f900-\U0001f9ff\U0001f018-\U0001f270\U0000231a-\U0000231b\U0000238d-\U0000238d\U000024c2-\U0001f251]+', '', text) | ||
| 14 | + text = re.sub(r"\s+", " ", text) # 多个空格合并 | ||
| 15 | + return text.strip() | ||
| 4 | 16 | ||
| 5 | def main(): | 17 | def main(): |
| 6 | # 设置设备 | 18 | # 设置设备 |
| @@ -31,9 +43,12 @@ def main(): | @@ -31,9 +43,12 @@ def main(): | ||
| 31 | if text.lower() == 'q': | 43 | if text.lower() == 'q': |
| 32 | break | 44 | break |
| 33 | 45 | ||
| 46 | + # 预处理文本 | ||
| 47 | + processed_text = preprocess_text(text) | ||
| 48 | + | ||
| 34 | # 对文本进行编码 | 49 | # 对文本进行编码 |
| 35 | encoding = tokenizer( | 50 | encoding = tokenizer( |
| 36 | - text, | 51 | + processed_text, |
| 37 | max_length=128, | 52 | max_length=128, |
| 38 | padding='max_length', | 53 | padding='max_length', |
| 39 | truncation=True, | 54 | truncation=True, |
| @@ -2,6 +2,18 @@ import torch | @@ -2,6 +2,18 @@ import torch | ||
| 2 | from transformers import GPT2ForSequenceClassification, BertTokenizer | 2 | from transformers import GPT2ForSequenceClassification, BertTokenizer |
| 3 | from peft import PeftModel | 3 | from peft import PeftModel |
| 4 | import os | 4 | import os |
| 5 | +import re | ||
| 6 | + | ||
| 7 | +def preprocess_text(text): | ||
| 8 | + """简单的文本预处理""" | ||
| 9 | + text = re.sub(r"\{%.+?%\}", " ", text) # 去除 {%xxx%} | ||
| 10 | + text = re.sub(r"@.+?( |$)", " ", text) # 去除 @xxx | ||
| 11 | + text = re.sub(r"【.+?】", " ", text) # 去除 【xx】 | ||
| 12 | + text = re.sub(r"\u200b", " ", text) # 去除特殊字符 | ||
| 13 | + # 删除表情符号 | ||
| 14 | + text = re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF\U00002600-\U000027BF\U0001f900-\U0001f9ff\U0001f018-\U0001f270\U0000231a-\U0000231b\U0000238d-\U0000238d\U000024c2-\U0001f251]+', '', text) | ||
| 15 | + text = re.sub(r"\s+", " ", text) # 多个空格合并 | ||
| 16 | + return text.strip() | ||
| 5 | 17 | ||
| 6 | def main(): | 18 | def main(): |
| 7 | # 设置设备 | 19 | # 设置设备 |
| @@ -66,9 +78,12 @@ def main(): | @@ -66,9 +78,12 @@ def main(): | ||
| 66 | continue | 78 | continue |
| 67 | 79 | ||
| 68 | try: | 80 | try: |
| 81 | + # 预处理文本 | ||
| 82 | + processed_text = preprocess_text(text) | ||
| 83 | + | ||
| 69 | # 对文本进行编码 | 84 | # 对文本进行编码 |
| 70 | encoding = tokenizer( | 85 | encoding = tokenizer( |
| 71 | - text, | 86 | + processed_text, |
| 72 | max_length=128, | 87 | max_length=128, |
| 73 | padding='max_length', | 88 | padding='max_length', |
| 74 | truncation=True, | 89 | truncation=True, |
-
Please register or login to post a comment