万朱浩 / Venue-Ops
Authored by 戒酒的李白 on 2024-09-30 01:06:03 +0800
Commit 5108ae1254cea908d56e78f2960b11a2b9d24d45
1 parent 48af69da
Implement Chinese word segmentation
Showing 1 changed file with 8 additions and 3 deletions.
model_pro/BERT_CTM.py

@@ -3,6 +3,7 @@ from transformers.models.bert import BertTokenizer, BertModel
import torch
from tqdm import tqdm
import numpy as np
import jieba

class BERT_CTM_Model:
    def __init__(self, bert_model_path):
...

@@ -19,9 +20,13 @@ class BERT_CTM_Model:
            outputs = self.model(**inputs)
            embeddings.append(outputs.last_hidden_state.cpu().numpy())  # [batch_size, sequence_length, hidden_size]
        return np.vstack(embeddings)

    def chinese_tokenize(self, text):
        """Segment Chinese text into words with jieba."""
        return " ".join(jieba.cut(text))

if __name__ == "__main__":
    model = BERT_CTM_Model('./bert_model')
    texts = ["这是第一个文本", "这是第二个文本"]  # "This is the first text", "This is the second text"
    embeddings = model.get_bert_embeddings(texts)
    print(embeddings.shape)

    text = "这是一个测试文本"  # "This is a test text"
    tokenized_text = model.chinese_tokenize(text)
    print(tokenized_text)
...
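For context, the added method follows the standard jieba pattern: jieba.cut returns a generator of segmented words, and joining them with spaces yields whitespace-delimited text that downstream bag-of-words tooling can split cleanly, which is the kind of input the topic-model half of a BERT+CTM pipeline typically consumes. Below is a minimal sketch of that idea outside the committed file; the standalone helper and the vocabulary step are illustrative assumptions, not part of this repository.

    import jieba
    from collections import Counter

    def chinese_tokenize(text):
        # Same pattern as the method added in this commit: segment with
        # jieba, then join on spaces so the result splits on whitespace.
        return " ".join(jieba.cut(text))

    texts = ["这是第一个文本", "这是第二个文本"]
    segmented = [chinese_tokenize(t) for t in texts]
    print(segmented)  # e.g. ['这是 第一个 文本', '这是 第二个 文本']

    # Hypothetical downstream step: build a whitespace-split vocabulary,
    # the sort of bag-of-words input a topic-model component would take.
    vocab = Counter(tok for s in segmented for tok in s.split())
    print(vocab.most_common(3))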