
Computes multi-head scaled dot-product attention weights with PyTorch.
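Concretely, for each head the layer below computes the standard scaled dot-product attention weights, attention = softmax(Q · Kᵀ / sqrt(head_dim)), and returns those weights directly (it does not yet multiply them by V).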

import torch
import torch.nn as nn


class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, embed_size, num_heads):
        super().__init__()
        assert embed_size % num_heads == 0, "embed_size must be divisible by num_heads"
        self.num_heads = num_heads
        self.head_dim = embed_size // num_heads

        # Linear projections for Q, K, V
        self.q_linear = nn.Linear(embed_size, embed_size)
        self.k_linear = nn.Linear(embed_size, embed_size)
        self.v_linear = nn.Linear(embed_size, embed_size)

    def forward(self, values, keys, query):
        N = query.shape[0]  # batch_size

        # Linear transformations for Q, K, V
        Q = self.q_linear(query)   # shape: (N, seq_len, embed_size)
        K = self.k_linear(keys)    # shape: (N, seq_len, embed_size)
        V = self.v_linear(values)  # shape: (N, seq_len, embed_size)

        # Reshape Q, K, V into multiple heads
        Q = Q.reshape(N, -1, self.num_heads, self.head_dim)
        K = K.reshape(N, -1, self.num_heads, self.head_dim)
        V = V.reshape(N, -1, self.num_heads, self.head_dim)

        # Compute scaled dot-product attention scores
        attention_scores = torch.einsum("nqhd,nkhd->nhqk", [Q, K])
        attention_scores = attention_scores / (self.head_dim ** 0.5)
        attention = torch.softmax(attention_scores, dim=-1)  # normalize over the key positions
        return attention
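
# Shape note on the einsum above (a sketch, not executed here): "nqhd,nkhd->nhqk"
# is equivalent to bringing the head dimension forward and doing a batched matmul:
#   Q_heads = Q.permute(0, 2, 1, 3)                # (N, num_heads, q_len, head_dim)
#   K_heads = K.permute(0, 2, 1, 3)                # (N, num_heads, k_len, head_dim)
#   scores  = Q_heads @ K_heads.transpose(-2, -1)  # (N, num_heads, q_len, k_len)
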
if __name__ == "__main__":
    embed_size = 256  # example embedding size (must be divisible by num_heads)
    num_heads = 8
    mha_layer = MultiHeadAttentionLayer(embed_size, num_heads)

    # Dummy data
    values = torch.randn(2, 10, embed_size)
    keys = torch.randn(2, 10, embed_size)
    query = torch.randn(2, 10, embed_size)

    attention = mha_layer(values, keys, query)
    print(f"Attention shape: {attention.shape}")  # torch.Size([2, 8, 10, 10])
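
The layer above returns only the attention weights. A typical next step, shown here as a minimal standalone sketch rather than as part of MultiHeadAttentionLayer, is to apply those weights to the value vectors and merge the heads back into embed_size; the shapes mirror the demo above (N=2, seq_len=10, num_heads=8, head_dim = embed_size // num_heads), and the tensors are random stand-ins for what forward() would produce.

import torch

N, seq_len, num_heads, head_dim = 2, 10, 8, 32  # 32 = 256 // 8 in the example above

# Stand-ins for the attention weights and the per-head values computed in forward()
attention = torch.softmax(torch.randn(N, num_heads, seq_len, seq_len), dim=-1)
V = torch.randn(N, seq_len, num_heads, head_dim)

# Weighted sum over the key positions, then merge heads back into embed_size
out = torch.einsum("nhqk,nkhd->nqhd", [attention, V])   # (N, seq_len, num_heads, head_dim)
out = out.reshape(N, seq_len, num_heads * head_dim)     # (N, seq_len, embed_size)
print(out.shape)  # torch.Size([2, 10, 256])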