Toggle navigation
Toggle navigation
This project
Loading...
Sign in
万朱浩
/
Venue-Ops
Go to a project
Toggle navigation
Projects
Groups
Snippets
Help
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
lintsinghua
2024-12-14 20:05:58 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Committed by
GitHub
2024-12-14 20:05:58 +0800
Commit
babb9d54a956841b07411faad3ea45dd64fe3cf2
babb9d54
1 parent
82be6f86
Update spiderNav.py
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
41 additions
and
31 deletions
spider/spiderDataPackage/spiderNav.py
spider/spiderDataPackage/spiderNav.py
View file @
babb9d5
...
...
@@ -2,54 +2,64 @@ import requests
import
csv
import
numpy
as
np
import
os
import
random
from
.settings
import
navAddr
from
requests.exceptions
import
RequestException
# Initialise the navigation data file.
def init():
    """Create the navigation CSV with its header row if it is missing.

    The file location is taken from ``navAddr``. When the file already
    exists it is left untouched, so rows written earlier are preserved.
    """
    if not os.path.exists(navAddr):
        # Open the file exactly once and write the header exactly once.
        # newline='' leaves line-ending handling to the csv module.
        with open(navAddr, 'w', encoding='utf-8', newline='') as csvFile:
            writer = csv.writer(csvFile)
            writer.writerow(['typeName', 'gid', 'containerid'])
# Append one navigation record to the data file.
def write(row):
    """Append ``row`` as a single CSV line to the file at ``navAddr``.

    Args:
        row: Iterable of field values forming one CSV record.
    """
    with open(navAddr, mode='a', newline='', encoding='utf-8') as handle:
        csv.writer(handle).writerow(row)
# Fetch data; supports multiple accounts.
def fetchData(url, headers_list):
    """Request ``url`` and return the ``data.modules`` part of its JSON body.

    Args:
        url: Address to request.
        headers_list: Sequence of request-header dicts (one per account).
            One entry is picked at random for each call. When it is empty
            or ``None`` the request is sent without custom headers.

    Returns:
        The value stored under ``['data']['modules']`` in the decoded JSON
        body, or ``None`` when the request fails, the status code is not
        200, or the body is not JSON of the expected shape.
    """
    # random.choice raises IndexError on an empty sequence, so guard it.
    headers = random.choice(headers_list) if headers_list else None
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            return None
        return response.json()['data']['modules']
    except (RequestException, ValueError, KeyError, TypeError) as e:
        # ValueError covers an undecodable body; KeyError/TypeError cover a
        # decoded body that lacks the data -> modules structure.
        print(f"请求失败:{e}")
        return None
# Parse the navigation data.
def readJson(response):
    """Write one CSV row per navigation entry found in ``response``.

    Args:
        response: Iterable of module dicts. Only modules that have both a
            ``'type'`` and a ``'typeName'`` key are processed; within each,
            only sub-modules that have both ``'id'`` and ``'containerid'``
            produce a row.

    Each row is ``[typeName, id, containerid]`` and is handed to ``write``.
    """
    for module in response:
        if 'type' not in module or 'typeName' not in module:
            continue
        typeName = module['typeName']
        # A module without a 'modules' list contributes no rows.
        for submodule in module.get('modules') or []:
            if 'id' in submodule and 'containerid' in submodule:
                gid = submodule['id']
                containerid = submodule['containerid']
                write([typeName, gid, containerid])
# Start the crawler.
def start(headers_list):
    """Prepare the CSV file, fetch the navigation data and store it.

    Args:
        headers_list: Request-header dicts, passed through to ``fetchData``.
    """
    navUrl = 'https://weibo.com/ajax/side/hot'
    init()
    response = fetchData(navUrl, headers_list)
    # Nothing is parsed when the fetch yields None or an empty result.
    if response:
        readJson(response)
if __name__ == '__main__':
    # Placeholder credentials: replace each 'Cookie' value with a real
    # session cookie, one dict per account.
    headers_list = [
        {
            'Cookie': 'your_cookie_here',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
        },
        {
            'Cookie': 'another_cookie_here',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
        }
    ]
    start(headers_list)
...
...
Please
register
or
log in
to post a comment