Toggle navigation
Toggle navigation
This project
Loading...
Sign in
万朱浩
/
Venue-Ops
Go to a project
Toggle navigation
Projects
Groups
Snippets
Help
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
juanboy
2024-07-02 17:49:46 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
a94d2cdf80a55d3bc50f52af3a0c4a86c0bbff7f
a94d2cdf
1 parent
50781187
【cutComments.py】分词统计词频函数定义
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
60 additions
and
0 deletions
utils/cipingTotal.py
utils/cutComments.py
utils/cipingTotal.py
0 → 100644
View file @
a94d2cd
import
jieba
import
re
def main():
    """Write a word-frequency report for the segmented comment text.

    Reads ``./cutComments.txt``, segments it with jieba in full mode
    (``cut_all=True``), keeps only tokens longer than one character that
    contain neither a digit nor a non-word character, counts them, and
    writes the most frequent entries to ``./cipingTotal.csv``.

    Each output line is the ``print`` form of a ``(word, count)`` tuple.
    At most 100 entries are written; fewer if the text has fewer distinct
    words.
    """
    # Function-scope import keeps this block self-contained.
    from collections import Counter

    with open('./cutComments.txt', 'r', encoding='utf8') as reader:
        strs = reader.read()

    # Segment the text, then filter the tokens. Raw strings avoid the
    # invalid-escape warning that "\d" / "\W" trigger in normal literals.
    new_words = [
        word
        for word in jieba.cut(strs, cut_all=True)
        if len(word) > 1
        and not re.search(r"\d+", word)
        and not re.search(r"\W+", word)
    ]

    # Count word frequencies in a single pass; most_common() sorts by
    # descending count and simply returns fewer items when fewer exist.
    list_count = Counter(new_words).most_common(100)

    # Write the report, one (word, count) tuple per line.
    with open('./cipingTotal.csv', 'w', encoding='utf8') as result:
        for entry in list_count:
            print(entry, file=result)
# Run the word-frequency report only when this file is executed as a script.
if
__name__
==
'__main__'
:
main
()
\ No newline at end of file
...
...
utils/cutComments.py
0 → 100644
View file @
a94d2cd
from
utils.getPublicData
import
getAllCommentsData
import
jieba
# Path of the text file that writer_comments_cuts() appends segmented output to.
targetTxt
=
'cutComments.txt'
def stopWordList():
    """Load the stop words from ``./stopWords.txt``.

    Returns:
        A list with one entry per line of the file, each stripped of
        leading and trailing whitespace. Blank lines yield empty strings.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # The context manager closes the file deterministically instead of
    # leaving the handle to the garbage collector.
    with open('./stopWords.txt', encoding='utf8') as stop_file:
        return [line.strip() for line in stop_file]
def seg_depart(sentence):
    """Segment the given rows' text with jieba and drop stop words.

    Args:
        sentence: Iterable of indexable rows. Element 4 of every row is
            used as the text to segment (presumably the comment body;
            confirm against getAllCommentsData).

    Returns:
        The tokens that are neither stop words nor a tab character,
        concatenated into one string without any separator.
    """
    text = " ".join([x[4] for x in sentence]).strip()

    # A set gives constant-time membership tests; the stop-word list would
    # otherwise be scanned linearly for every token.
    stop_words = set(stopWordList())

    # join() builds the result in one pass instead of repeated `+=`.
    return ''.join(
        word
        for word in jieba.cut(text)
        if word != '\t' and word not in stop_words
    )
def writer_comments_cuts():
    """Append the full-mode segmentation of all comments to ``targetTxt``.

    The comment rows from ``getAllCommentsData()`` are cleaned by
    ``seg_depart``, segmented again with jieba (``cut_all=True``), joined
    with single spaces and written as one line. A confirmation message is
    printed afterwards.
    """
    with open(targetTxt, 'a+', encoding='utf-8') as out_file:
        cleaned_text = seg_depart(getAllCommentsData())
        tokens = jieba.cut(cleaned_text, cut_all=True)
        # One space-separated line per invocation.
        out_file.write(' '.join(tokens))
        out_file.write('\n')
        print('写入成功')
# Segment and append the comments only when this file is executed as a script.
if
__name__
==
'__main__'
:
writer_comments_cuts
()
\ No newline at end of file
...
...
Please
register
or
login
to post a comment