万朱浩 / Venue-Ops
Commit a7472b4287a2301192c0b32b4941909bc74bbcd4 (a7472b42), 1 parent babb9d54
Authored by lintsinghua, 2024-12-14 20:06:39 +0800
Committed by GitHub, 2024-12-14 20:06:39 +0800

Update spiderComments.py

Showing 1 changed file with 43 additions and 55 deletions

spider/spiderDataPackage/spiderComments.py
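In short, the commit removes the single hard-coded account from fetchData(): the function now takes a headers_list, picks one header set per request with random.choice(), sends the request with a 10-second timeout, and returns the JSON 'data' payload only on HTTP 200, returning None otherwise or when a RequestException is raised. readJson() has its misspelled artileId parameter renamed to articleId and replaces the bare try/except around comment['source'] with dict.get() and a default. start() now accepts headers_list and a delay argument, sleeps a random interval between requests instead of a fixed 2 seconds, and only parses a response when fetchData() actually returned one; the __main__ block builds a headers_list with placeholder cookies for several accounts. The diff follows.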
@@ -2,99 +2,87 @@ import time
 import requests
 import csv
 import os
+import random
 from datetime import datetime
 from .settings import articleAddr, commentsAddr
+from requests.exceptions import RequestException

+# Initialization: create the comment data file
 def init():
     if not os.path.exists(commentsAddr):
         with open(commentsAddr, 'w', encoding='utf-8', newline='') as csvFile:
             writer = csv.writer(csvFile)
             writer.writerow(['articleId', 'created_at', 'likes_counts', 'region', 'content',
                              'authorName', 'authorGender', 'authorAddress', 'authorAvatar'])

+# Write one comment row to the CSV file
 def write(row):
     with open(commentsAddr, 'a', encoding='utf-8', newline='') as csvFile:
         writer = csv.writer(csvFile)
         writer.writerow(row)

-def fetchData(url, params):
-    headers = {
-        'Cookie': 'SINAGLOBAL=2555941826014.1074.1676801766625; ULV=1719829459275:6:1:2:4660996305989.918.1719827559898:1719743122299; UOR=,,www.baidu.com; XSRF-TOKEN=VtLXviYSIs8lor7sz4iGyigL; SUB=_2A25LhvU9DeRhGeFH6FIX-S3MyD2IHXVo-gj1rDV8PUJbkNAGLRXMkW1Ne2nhI3Gle25QJK0Z99J3trq_NZn6YKJ-; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WW3Mv8V5EupQbbKh.vaZIwU5JpX5KzhUgL.FoM4e05c1Ke7e022dJLoIp7LxKML1KBLBKnLxKqL1hnLBoM41hz41hqReKqN; WBPSESS=Dt2hbAUaXfkVprjyrAZT_LRaDLsnxG-kIbeYwnBb5OUKZiwfVr_UrcYfWuqG-4ZVDM5HeU3HXkDNK_thfRfdS9Ao6ezT30jDksv-CpaVmlTAqGUHjJ7PYkH5aCK4HLxmRq14ZalmQNwzfWMPa4y0VNRLuYdg7L1s49ymNq_5v5vusoz0r4ki6u-MHGraF0fbUTgX14x0kHayEwOoxfLI-w==; SCF=AqmJWo31oFV5itnRgWNU1-wHQTL6PmkBLf3gDuqpdqAIfaWguDTMre6Oxjf5Uzs74JAh2r0DdV1sJ1g6m-wJ5NQ.; _s_tentry=-; Apache=4660996305989.918.1719827559898; PC_TOKEN=7955a7ab1f; appkey=; geetest_token=602cd4e3a7ed1898808f8adfe1a2048b; ALF=1722421868',
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
-    }
-    response = requests.get(url, headers=headers, params=params)
+# Fetch data, with random switching between multiple accounts
+def fetchData(url, params, headers_list):
+    headers = random.choice(headers_list)
+    try:
+        response = requests.get(url, headers=headers, params=params, timeout=10)
+        if response.status_code == 200:
+            return response.json()['data']
+        else:
+            return None
+    except RequestException as e:
+        print(f"请求失败:{e}")  # "request failed: {e}"
+        return None

+# Read the article list
 def getArticleList():
     articleList = []
     with open(articleAddr, 'r', encoding='utf-8') as reader:
         readerCsv = csv.reader(reader)
         next(reader)
         for nav in readerCsv:
             articleList.append(nav)
     return articleList

-def readJson(response, artileId):
+# Parse comment data
+def readJson(response, articleId):
     for comment in response:
         created_at = datetime.strptime(comment['created_at'], '%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d')
         likes_counts = comment['like_counts']
-        try:
-            region = comment['source'].replace('来自', '')
-        except:
-            region = '无'
+        region = comment.get('source', '无').replace('来自', '')  # strip the "来自" ("from") prefix; default is '无' ("none")
         content = comment['text_raw']
         authorName = comment['user']['screen_name']
         authorGender = comment['user']['gender']
         authorAddress = comment['user']['location']
         authorAvatar = comment['user']['avatar_large']
-        write([artileId, created_at, likes_counts, region, content,
-               authorName, authorGender, authorAddress, authorAvatar])
+        write([articleId, created_at, likes_counts, region, content,
+               authorName, authorGender, authorAddress, authorAvatar])

-def start():
+# Start the crawler
+def start(headers_list, delay=2):
     commentUrl = 'https://weibo.com/ajax/statuses/buildComments'
     init()
     articleList = getArticleList()
     for article in articleList:
         articleId = article[0]
-        print('正在爬取id值为%s的文章评论' % articleId)
-        time.sleep(2)
-        params = {'id': int(articleId), 'is_show_bulletin': 2}
-        response = fetchData(commentUrl, params)
-        readJson(response, articleId)
+        print(f'正在爬取id值为{articleId}的文章评论')  # "crawling comments for article id {articleId}"
+        time.sleep(random.uniform(1, delay))  # random delay to avoid requesting too frequently
+        params = {'id': int(articleId), 'is_show_bulletin': 2}
+        response = fetchData(commentUrl, params, headers_list)
+        if response:
+            readJson(response, articleId)

 if __name__ == '__main__':
-    start()
+    # headers_list should contain the cookies of several different accounts
+    headers_list = [
+        {'Cookie': 'your_cookie_here',
+         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'},
+        {'Cookie': 'another_cookie_here',
+         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'}
+    ]
+    start(headers_list)
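For context, spiderComments.py imports articleAddr and commentsAddr from a settings module in the same package (spider/spiderDataPackage/settings.py) that this commit does not touch. The sketch below shows one way that module could be laid out; the directory and file names are assumptions for illustration, not taken from this repository. The only contract the code above relies on is that commentsAddr is the path of the comments CSV created by init(), and that the CSV behind articleAddr has a header row followed by rows whose first column is a numeric Weibo article id (start() reads article[0] and passes int(articleId) to the API).

# settings.py -- hypothetical sketch; the real module in this repo may differ.
import os

# Directory for the crawler's CSV data (assumed layout).
dataDir = os.path.join(os.path.dirname(__file__), 'data')
os.makedirs(dataDir, exist_ok=True)

# Article list produced by an earlier crawling step:
# header row, then one row per article with the article id in column 0.
articleAddr = os.path.join(dataDir, 'articleList.csv')

# Comments CSV that init() creates and write() appends to.
commentsAddr = os.path.join(dataDir, 'comments.csv')

Usage then matches the __main__ block in the diff: fill headers_list with real cookies from several logged-in accounts and call start(headers_list). Because of the relative import of .settings, the script must be run as a module from the package root (e.g. with python -m) rather than as a standalone file.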