# train.py 10.2 KB
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Config, GPT2ForSequenceClassification, BertTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

from adapter import AdapterLayer
from gpt2_adapter import GPT2BlockWithAdapter

# Seed the random number generators
def set_seed(seed):
    """Seed Python's, NumPy's and PyTorch's random number generators.

    Args:
        seed: Value passed to ``random.seed``, ``np.random.seed`` and
            ``torch.manual_seed``. When CUDA is available it is also passed
            to ``torch.cuda.manual_seed_all``.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Nothing more to do on machines without CUDA.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed the random number generators once, at module import time.
set_seed(42)

# Dataset for Weibo sentiment analysis
class WeiboSentimentDataset(Dataset):
    """Dataset that tokenizes one review per item and pairs it with its label.

    Args:
        reviews: Indexable collection of review texts; each item is passed
            through ``str()`` before tokenization.
        labels: Indexable collection of labels, indexed like ``reviews``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``.
            Its result is indexed by ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length handed to the tokenizer for padding/truncation.
    """

    def __init__(self, reviews, labels, tokenizer, max_length=128):
        self.reviews, self.labels = reviews, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``labels``.

        The two token tensors are flattened to one dimension; the label is
        returned as a ``torch.long`` tensor.
        """
        text = str(self.reviews[idx])
        target = self.labels[idx]

        tokenizer_kwargs = {
            'max_length': self.max_length,
            'padding': 'max_length',
            'truncation': True,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer(text, **tokenizer_kwargs)

        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# GPT-2 classification model with Adapter layers
class GPT2ClassifierWithAdapter(nn.Module):
    """GPT-2 sequence classifier whose transformer blocks are adapter blocks.

    On construction every block in ``gpt2.transformer.h`` is replaced by a
    ``GPT2BlockWithAdapter`` that receives the original block's weights.
    Afterwards all parameters are frozen except those of the ``score`` head
    and of each block's ``adapter``.

    Args:
        pretrained_model_name: Name or path forwarded to
            ``GPT2ForSequenceClassification.from_pretrained``.
        num_labels: Number of output classes forwarded to ``from_pretrained``.
    """

    def __init__(self, pretrained_model_name, num_labels=2):
        super().__init__()

        # Load the pretrained classifier.
        self.gpt2 = GPT2ForSequenceClassification.from_pretrained(
            pretrained_model_name,
            num_labels=num_labels,
        )
        gpt2_config = self.gpt2.config

        # Make sure the model config has a pad_token_id (reuse the EOS id).
        gpt2_config.pad_token_id = gpt2_config.eos_token_id

        # Swap every original block for its adapter-equipped counterpart.
        blocks = self.gpt2.transformer.h
        for position, original_block in enumerate(list(blocks)):
            adapted_block = GPT2BlockWithAdapter(gpt2_config)
            # strict=False tolerates key mismatches between the two state
            # dicts (presumably the adapter weights the original block lacks).
            adapted_block.load_state_dict(original_block.state_dict(), strict=False)
            blocks[position] = adapted_block

        # Freeze everything first ...
        for weight in self.gpt2.parameters():
            weight.requires_grad = False

        # ... then re-enable training for the classification head ...
        for weight in self.gpt2.score.parameters():
            weight.requires_grad = True

        # ... and for the adapter of every block.
        for block in self.gpt2.transformer.h:
            for weight in block.adapter.parameters():
                weight.requires_grad = True

    def forward(self, input_ids, attention_mask, labels=None):
        """Forward the arguments to the wrapped model and return its output."""
        model_inputs = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }
        return self.gpt2(**model_inputs)

# Training function
def train_model(model, train_dataloader, val_dataloader, optimizer, scheduler, device, epochs=3):
    """Train ``model`` and checkpoint it whenever the validation F1 improves.

    Each epoch runs one pass over ``train_dataloader`` (forward, backward,
    gradient clipping, optimizer step, scheduler step), prints the mean
    training loss, evaluates on ``val_dataloader`` with ``evaluate_model`` and
    saves ``model.state_dict()`` to ``best_weibo_sentiment_model.pth`` when the
    validation F1 exceeds the best value seen so far (initially 0.0).

    Args:
        model: Callable as ``model(input_ids=..., attention_mask=...,
            labels=...)`` returning an object with a ``loss`` attribute.
        train_dataloader: Iterable of dict batches with the keys
            ``input_ids``, ``attention_mask`` and ``labels``.
        val_dataloader: Batches handed to ``evaluate_model``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device every batch value is moved to.
        epochs: Number of passes over ``train_dataloader``.
    """
    best_f1 = 0.0

    for epoch in range(epochs):
        print(f"======== Epoch {epoch+1} / {epochs} ========")
        model.train()
        running_loss = 0

        # Training pass
        batches = tqdm(train_dataloader, desc="Training", position=0, leave=True)
        for raw_batch in batches:
            # Move the batch to the target device.
            inputs = {name: value.to(device) for name, value in raw_batch.items()}

            optimizer.zero_grad()

            # Forward pass
            result = model(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                labels=inputs['labels'],
            )
            batch_loss = result.loss
            loss_value = batch_loss.item()
            running_loss += loss_value

            # Backward pass, with gradient clipping against exploding gradients.
            batch_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Parameter and learning-rate updates
            optimizer.step()
            scheduler.step()

            batches.set_postfix({"loss": loss_value})

        # Mean training loss over the epoch
        avg_train_loss = running_loss / len(train_dataloader)
        print(f"Average training loss: {avg_train_loss:.4f}")

        # Validation
        val_metrics = evaluate_model(model, val_dataloader, device)
        print(f"Validation Loss: {val_metrics['loss']:.4f}")
        print(f"Validation Accuracy: {val_metrics['accuracy']:.4f}")
        print(f"Validation F1 Score: {val_metrics['f1']:.4f}")

        # Keep the checkpoint with the best validation F1.
        current_f1 = val_metrics['f1']
        if current_f1 > best_f1:
            best_f1 = current_f1
            torch.save(model.state_dict(), "best_weibo_sentiment_model.pth")
            print("Saved best model!")

# Evaluation function
def evaluate_model(model, dataloader, device):
    """Evaluate ``model`` on ``dataloader`` without computing gradients.

    Args:
        model: Callable as ``model(input_ids=..., attention_mask=...,
            labels=...)`` returning an object with ``loss`` and ``logits``.
            It is switched to eval mode and left in that mode.
        dataloader: Sized iterable of dict batches with the keys
            ``input_ids``, ``attention_mask`` and ``labels``.
        device: Device every batch value is moved to.

    Returns:
        Dict with ``'loss'`` (mean batch loss), ``'accuracy'`` and ``'f1'``
        (macro-averaged F1) computed over all batches.
    """
    model.eval()
    loss_sum = 0
    predictions, targets = [], []

    with torch.no_grad():
        for raw_batch in tqdm(dataloader, desc="Evaluating"):
            inputs = {name: value.to(device) for name, value in raw_batch.items()}

            result = model(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                labels=inputs['labels'],
            )
            loss_sum += result.loss.item()

            # Predicted class = index of the largest logit along dim 1.
            batch_predictions = result.logits.argmax(dim=1).cpu().numpy()
            batch_targets = inputs['labels'].cpu().numpy()

            predictions.extend(batch_predictions)
            targets.extend(batch_targets)

    # Metrics over the whole dataloader
    accuracy = accuracy_score(targets, predictions)
    f1 = f1_score(targets, predictions, average='macro')
    avg_loss = loss_sum / len(dataloader)

    return {'loss': avg_loss, 'accuracy': accuracy, 'f1': f1}

def main():
    """Run the full fine-tuning pipeline for the Weibo sentiment classifier.

    Loads ``dataset/weibo_senti_100k.csv``, makes a stratified 90/10
    train/validation split, loads the tokenizer and model from
    ``./models/gpt2-chinese`` (downloading and saving them there first when
    they are not present), builds the adapter model, and trains it for two
    epochs with AdamW and a linear warmup schedule.
    """
    # Model identifier and local storage location
    model_name = 'uer/gpt2-chinese-cluecorpussmall'
    local_model_path = './models/gpt2-chinese'
    # Shared by the scheduler's step count and the training loop.
    num_epochs = 2
    batch_size = 16

    # Make sure the local directory exists.
    os.makedirs(local_model_path, exist_ok=True)

    # Load the dataset and split it, stratified by label.
    print("加载微博情感数据集...")
    df = pd.read_csv('dataset/weibo_senti_100k.csv')
    train_df, val_df = train_test_split(
        df,
        test_size=0.1,
        random_state=42,
        stratify=df['label'],
    )

    # Load the tokenizer; config.json marks an already-populated local directory.
    print("加载预训练模型和tokenizer...")
    has_local_config = os.path.exists(os.path.join(local_model_path, 'config.json'))
    if not has_local_config:
        print(f"从Hugging Face下载模型到: {local_model_path}")
        tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir=local_model_path)
        # Keep a local copy of the tokenizer.
        tokenizer.save_pretrained(local_model_path)
    else:
        print(f"从本地路径加载模型: {local_model_path}")
        tokenizer = BertTokenizer.from_pretrained(local_model_path)

    # BertTokenizer normally already defines [PAD]; set it explicitly if not.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = '[PAD]'

    # Remember the pad id so the model config can be aligned with the tokenizer.
    pad_token_id = tokenizer.pad_token_id

    # Datasets (train first, then validation)
    train_dataset, val_dataset = (
        WeiboSentimentDataset(split['review'].values, split['label'].values, tokenizer)
        for split in (train_df, val_df)
    )

    # Data loaders; only the training data is shuffled.
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

    # Device selection
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"使用设备: {device}")

    # Make sure the weights exist locally, then build the adapter model from them.
    weight_files = ('pytorch_model.bin', 'model.safetensors')
    has_local_weights = any(
        os.path.exists(os.path.join(local_model_path, file_name))
        for file_name in weight_files
    )
    if has_local_weights:
        print(f"从本地路径加载模型权重: {local_model_path}")
    else:
        print(f"从Hugging Face下载模型权重到: {local_model_path}")
        # Download the full model from Hugging Face and save it locally.
        temp_model = GPT2ForSequenceClassification.from_pretrained(model_name)
        temp_model.save_pretrained(local_model_path)
    model = GPT2ClassifierWithAdapter(local_model_path)

    # Use the tokenizer's pad id in the model config.
    model.gpt2.config.pad_token_id = pad_token_id
    model.to(device)

    # Parameter statistics
    total_params = sum(p.numel() for p in model.parameters())
    trainable_weights = [p for p in model.parameters() if p.requires_grad]
    trainable_params = sum(p.numel() for p in trainable_weights)

    print(f"模型总参数量: {total_params}")
    print(f"需要训练的参数量: {trainable_params} ({trainable_params/total_params*100:.2f}%)")

    # Optimizer over the trainable parameters only
    optimizer = AdamW(trainable_weights, lr=5e-5, eps=1e-8)

    # Linear schedule with the first 10% of all steps used as warmup.
    total_steps = len(train_dataloader) * num_epochs
    warmup_steps = int(total_steps * 0.1)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps,
    )

    # Train
    print("开始训练...")
    train_model(
        model=model,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
        epochs=num_epochs,
    )

    print("训练完成!")

# Run the training pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()