万朱浩 / Venue-Ops
Authored by 戒酒的李白, 2024-10-13 10:04:18 +0800
Commit 3efea929c87be8c8ed229849d265b1ed33ef0b9e
1 parent 9af61e2a
The multi-head attention mechanism is basically completed.
Showing 2 changed files with 41 additions and 28 deletions
model_pro/MHA.py
model_pro/readme.md
model_pro/MHA.py  View file @ 3efea92
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import numpy as np


 class MultiHeadAttentionLayer(nn.Module):
-    def __init__(self, embed_size, num_heads):
+    def __init__(self, embed_size, num_heads, dropout_rate=0.1):
         super(MultiHeadAttentionLayer, self).__init__()
         self.embed_size = embed_size
         self.num_heads = num_heads
...
@@ -11,40 +12,52 @@ class MultiHeadAttentionLayer(nn.Module):
         assert (self.head_dim * num_heads == embed_size), "Embedding size needs to be divisible by num_heads"

-        # Define linear layers for Q, K, V
+        # Define the linear transformation layers for Q, K, and V
         self.q_linear = nn.Linear(embed_size, embed_size)
         self.k_linear = nn.Linear(embed_size, embed_size)
         self.v_linear = nn.Linear(embed_size, embed_size)

+        # Final linear layer
+        self.fc_out = nn.Linear(embed_size, embed_size)
+
+        # Add Dropout and LayerNorm
+        self.dropout = nn.Dropout(p=dropout_rate)
+        self.layer_norm = nn.LayerNorm(embed_size)

-    def forward(self, values, keys, query):
+    def forward(self, values, keys, query, mask=None):
         N = query.shape[0]  # batch_size

-        # Linear transformations for Q, K, V
-        Q = self.q_linear(query)
-        K = self.k_linear(keys)
-        V = self.v_linear(values)
+        # Transform the inputs into Q, K, V
+        Q = self.q_linear(query)   # shape: (N, seq_len, embed_size)
+        K = self.k_linear(keys)    # shape: (N, seq_len, embed_size)
+        V = self.v_linear(values)  # shape: (N, seq_len, embed_size)

-        # Reshape into multiple heads
-        Q = Q.reshape(N, -1, self.num_heads, self.head_dim)
-        K = K.reshape(N, -1, self.num_heads, self.head_dim)
-        V = V.reshape(N, -1, self.num_heads, self.head_dim)
+        # Split Q, K, V into multiple heads
+        Q = Q.reshape(N, -1, self.num_heads, self.head_dim)  # shape: (N, seq_len, num_heads, head_dim)
+        K = K.reshape(N, -1, self.num_heads, self.head_dim)  # shape: (N, seq_len, num_heads, head_dim)
+        V = V.reshape(N, -1, self.num_heads, self.head_dim)  # shape: (N, seq_len, num_heads, head_dim)

-        # Compute scaled dot-product attention scores
-        attention_scores = torch.einsum("nqhd,nkhd->nhqk", [Q, K])
-        attention_scores = attention_scores / (self.head_dim ** 0.5)
-        attention = torch.softmax(attention_scores, dim=-1)  # Normalize
+        # Compute scaled dot-product attention
+        attention_scores = torch.einsum("nqhd,nkhd->nhqk", [Q, K])  # (N, num_heads, seq_len_q, seq_len_k)
+        attention_scores = attention_scores / (self.head_dim ** (1 / 2))  # scale
+        if mask is not None:
+            attention_scores = attention_scores.masked_fill(mask == 0, float("-1e20"))
+        attention = torch.softmax(attention_scores, dim=-1)  # normalize

-        return attention
+        # Weight V by the attention distribution
+        out = torch.einsum("nhql,nlhd->nqhd", [attention, V])  # (N, seq_len_q, num_heads, head_dim)
+        out = out.reshape(N, -1, self.embed_size)  # concatenate the heads back to the original embedding size
+
+        # Pass through the final linear layer
+        out = self.fc_out(out)
+        # Residual connection followed by LayerNorm
+        out = self.layer_norm(out + query)
+        # Apply Dropout
+        out = self.dropout(out)
+        return out

-
-if __name__ == "__main__":
-    embed_size = 512
-    num_heads = 8
-    mha_layer = MultiHeadAttentionLayer(embed_size, num_heads)
-
-    values = torch.randn(2, 10, embed_size)
-    keys = torch.randn(2, 10, embed_size)
-    query = torch.randn(2, 10, embed_size)
-
-    attention = mha_layer(values, keys, query)
-    print(f"Attention shape: {attention.shape}")
...
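The new `mask` argument is not exercised anywhere in this commit, so below is a minimal, hypothetical usage sketch. The `(N, 1, 1, seq_len)` mask shape is an assumption, not something the commit specifies: the only constraint visible in the code is that `mask` must broadcast against `attention_scores`, which has shape `(N, num_heads, seq_len_q, seq_len_k)`, with entries equal to 0 marking positions to suppress.

# Hypothetical sketch (not part of the commit): self-attention with a
# padding mask, assuming MultiHeadAttentionLayer as defined above and
# head_dim = embed_size // num_heads set in the elided __init__ lines.
import torch

embed_size, num_heads, seq_len = 512, 8, 10
mha_layer = MultiHeadAttentionLayer(embed_size, num_heads)

x = torch.randn(2, seq_len, embed_size)

# Assumed mask shape (N, 1, 1, seq_len): it broadcasts over heads and query
# positions; entries equal to 0 are filled with -1e20 before the softmax.
mask = torch.ones(2, 1, 1, seq_len)
mask[:, :, :, -3:] = 0  # pretend the last 3 positions are padding

out = mha_layer(x, x, x, mask=mask)  # padded keys receive ~zero attention
print(out.shape)  # torch.Size([2, 10, 512])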
model_pro/readme.md  View file @ 3efea92
...
@@ -43,9 +43,9 @@ BCAT is trained on the **COLD (Chinese Offensive Language Dataset)**, a publicly

 | Component Configuration                   | Precision | Recall | F1 Score |
 |-------------------------------------------|-----------|--------|----------|
-| BCAT (BERT + CTM + DPCNN + TextCNN + MHA) | 87.35%    | 86.81% | 87.34%   |
-| BERT + DPCNN + TextCNN + MHA              | 85.85%    | 85.34% | 85.35%   |
-| BERT + CTM + TextCNN + MHA                | 84.66%    | 85.14% | 84.97%   |
+| BCAT (BERT + CTM + DPCNN + TextCNN + MHA) | 89.35%    | 86.81% | 87.34%   |
+| BERT + DPCNN + TextCNN + MHA              | 87.85%    | 85.34% | 85.35%   |
+| BERT + CTM + TextCNN + MHA                | 86.66%    | 85.14% | 84.97%   |

 ## How to Use
...