万朱浩 / Venue-Ops
Authored by 戒酒的李白 on 2024-07-03 22:34:34 +0800
Commit 987fdfbb39e36f131ef9d98e9cbde7c74f1ffe75
2 parents: d7da8b90 and c0686430
Merge branch 'main' of https://github.com/666ghj/Weibo_PublicOpinion_AnalysisSystem
Showing 7 changed files with 60 additions and 44 deletions
app.py
spider/article.csv
spider/main.py
spider/saveData.py
spider/spiderComments.py
spider/spiderContent.py
spider/spiderNav.py
app.py
...
 from flask import Flask, session, request, redirect, render_template
 import re
+from apscheduler.schedulers.background import BackgroundScheduler
+import subprocess
+import os
+from pytz import utc
 app = Flask(__name__)
 app.secret_key = 'this is secret_key you know ?'
...
@@ -24,5 +29,17 @@ def before_reuqest():
 def catch_all(path):
     return render_template('404.html')
+def run_spider_script():
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    spider_script = os.path.join(current_dir, 'spider', 'main.py')
+    subprocess.run(['python', spider_script])
+if __name__ == '__main__':
+    scheduler = BackgroundScheduler(timezone=utc)
+    scheduler.add_job(run_spider_script, 'interval', hours=5)
+    scheduler.start()
+    try:
+        app.run()
+    finally:
+        scheduler.shutdown()
\ No newline at end of file
...
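The merge wires a recurring job next to the Flask app: spider/main.py is re-run every five hours in a background thread. Below is a minimal, self-contained sketch of the same BackgroundScheduler pattern; the job body and the sleep (standing in for the blocking app.run()) are illustrative, not part of the commit.

    # Sketch of the interval-job pattern added above (job body is a stand-in).
    import time
    from apscheduler.schedulers.background import BackgroundScheduler
    from pytz import utc

    def job():
        print('job fired')

    if __name__ == '__main__':
        scheduler = BackgroundScheduler(timezone=utc)
        scheduler.add_job(job, 'interval', hours=5)
        scheduler.start()      # jobs run on a daemon thread
        try:
            time.sleep(30)     # stand-in for the blocking app.run()
        finally:
            scheduler.shutdown()  # stop the scheduler thread cleanly on exit

One caveat with this pattern: if Flask's debug reloader were enabled, the module would be imported twice and two schedulers would start; the commit avoids that only because app.run() defaults to debug off.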
spider/article.csv
deleted 100644 → 0
-id,likeNum,commentsLen,reposts_count,region,content,contentLen,created_at,type,detailUrl,authorAvatar,authorName,authorDetail,isVip
spider/main.py
...
-from spiderContent import start as spiderContentStart
-from spiderComments import start as spiderCommentsStart
+from spiderContent import start as spiderContent
+from spiderComments import start as spiderComments
 from saveData import save_to_sql as saveData
 def main():
     print('正在爬取文章数据')  # "Crawling article data"
-    spiderContentStart(1,1)
+    spiderContent(1,1)
     print('正在爬取文章评论数据')  # "Crawling article comment data"
-    spiderCommentsStart()
+    spiderComments()
     print('正在存储数据')  # "Saving data"
     saveData()
     print("爬取数据更新")  # "Crawled data updated"
...
spider/saveData.py
...
@@ -6,24 +6,24 @@ engine = create_engine('mysql+pymysql://XiaoXueQi:XiaoXueQi@10.92.35.13/Weibo_Pu
 def save_to_sql():
     try:
-        artileOldPd = pd.read_sql('select * from article', engine)
-        articleNewPd = pd.read_csv('article.csv')
-        commentOldPd = pd.read_sql('select * from comments', engine)
-        commentNewPd = pd.read_csv('comments.csv')
+        oldArticle = pd.read_sql('select * from article', engine)
+        newArticle = pd.read_csv('article.csv')
+        oldComment = pd.read_sql('select * from comments', engine)
+        newComment = pd.read_csv('comments.csv')
-        concatArticlePd = pd.concat([articleNewPd, artileOldPd], join='inner')
-        concatCommentsPd = pd.concat([commentNewPd, commentOldPd], join='inner')
+        mergeArticle = pd.concat([newArticle, oldArticle], join='inner')
+        mergeComment = pd.concat([newComment, oldComment], join='inner')
-        concatArticlePd.drop_duplicates(subset='id', keep='last', inplace=True)
-        concatCommentsPd.drop_duplicates(subset='content', keep='last', inplace=True)
+        mergeArticle.drop_duplicates(subset='id', keep='last', inplace=True)
+        mergeComment.drop_duplicates(subset='content', keep='last', inplace=True)
-        concatArticlePd.to_sql('article', con=engine, if_exists='replace', index=False)
-        concatCommentsPd.to_sql('comments', con=engine, if_exists='replace', index=False)
+        mergeArticle.to_sql('article', con=engine, if_exists='replace', index=False)
+        mergeComment.to_sql('comments', con=engine, if_exists='replace', index=False)
     except:
-        articleNewPd = pd.read_csv('article.csv')
-        commentNewPd = pd.read_csv('comments.csv')
-        articleNewPd.to_sql('article', con=engine, if_exists='replace', index=False)
-        commentNewPd.to_sql('comments', con=engine, if_exists='replace', index=False)
+        newArticle = pd.read_csv('article.csv')
+        newComment = pd.read_csv('comments.csv')
+        newArticle.to_sql('article', con=engine, if_exists='replace', index=False)
+        newComment.to_sql('comments', con=engine, if_exists='replace', index=False)
     os.remove('./article.csv')
     os.remove('./comments.csv')
...
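saveData.py's strategy is: read the existing table, stack the freshly crawled CSV rows on top, drop duplicates, then rewrite the whole table with if_exists='replace'. A runnable sketch of that merge-and-dedupe pattern, using an in-memory SQLite engine and toy rows instead of the project's MySQL database:

    import pandas as pd
    from sqlalchemy import create_engine

    engine = create_engine('sqlite://')  # stand-in for the MySQL engine
    pd.DataFrame({'id': [1, 2], 'content': ['a', 'b']}).to_sql('article', engine, index=False)

    old = pd.read_sql('select * from article', engine)
    new = pd.DataFrame({'id': [2, 3], 'content': ['b-updated', 'c']})
    merged = pd.concat([new, old], join='inner')                    # new rows first, old rows last
    merged.drop_duplicates(subset='id', keep='last', inplace=True)  # so the OLD row wins an id clash
    merged.to_sql('article', con=engine, if_exists='replace', index=False)
    print(pd.read_sql('select * from article', engine))

Note the consequence of keep='last' with the new frame concatenated first: on a duplicate id the existing database row is kept and the freshly crawled one is discarded, which may or may not be what the authors intended.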
spider/spiderComments.py
...
@@ -20,12 +20,12 @@ def init():
         'authorAvatar'
     ])
-def writerRow(row):
+def write(row):
     with open('./comments.csv', 'a', encoding='utf-8', newline='') as csvFile:
         writer = csv.writer(csvFile)
         writer.writerow(row)
-def get_data(url,params):
+def fetchData(url,params):
     headers = {
         'Cookie': 'SINAGLOBAL=2555941826014.1074.1676801766625; ULV=1719829459275:6:1:2:4660996305989.918.1719827559898:1719743122299; UOR=,,www.baidu.com; XSRF-TOKEN=VtLXviYSIs8lor7sz4iGyigL; SUB=_2A25LhvU9DeRhGeFH6FIX-S3MyD2IHXVo-gj1rDV8PUJbkNAGLRXMkW1Ne2nhI3Gle25QJK0Z99J3trq_NZn6YKJ-; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WW3Mv8V5EupQbbKh.vaZIwU5JpX5KzhUgL.FoM4e05c1Ke7e022dJLoIp7LxKML1KBLBKnLxKqL1hnLBoM41hz41hqReKqN; WBPSESS=Dt2hbAUaXfkVprjyrAZT_LRaDLsnxG-kIbeYwnBb5OUKZiwfVr_UrcYfWuqG-4ZVDM5HeU3HXkDNK_thfRfdS9Ao6ezT30jDksv-CpaVmlTAqGUHjJ7PYkH5aCK4HLxmRq14ZalmQNwzfWMPa4y0VNRLuYdg7L1s49ymNq_5v5vusoz0r4ki6u-MHGraF0fbUTgX14x0kHayEwOoxfLI-w==; SCF=AqmJWo31oFV5itnRgWNU1-wHQTL6PmkBLf3gDuqpdqAIfaWguDTMre6Oxjf5Uzs74JAh2r0DdV1sJ1g6m-wJ5NQ.; _s_tentry=-; Apache=4660996305989.918.1719827559898; PC_TOKEN=7955a7ab1f; appkey=; geetest_token=602cd4e3a7ed1898808f8adfe1a2048b; ALF=1722421868',
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
...
@@ -36,16 +36,16 @@ def get_data(url,params):
     else:
         return None
-def getAllArticleList():
-    artileList = []
+def getArticleList():
+    articleList = []
     with open('./article.csv', 'r', encoding='utf-8') as reader:
         readerCsv = csv.reader(reader)
         next(reader)
         for nav in readerCsv:
-            artileList.append(nav)
-    return artileList
+            articleList.append(nav)
+    return articleList
-def parse_json(response,artileId):
+def readJson(response,artileId):
     for comment in response:
         created_at = datetime.strptime(comment['created_at'], '%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d')
         likes_counts = comment['like_counts']
...
@@ -58,7 +58,7 @@ def parse_json(response,artileId):
         authorGender = comment['user']['gender']
         authorAddress = comment['user']['location']
         authorAvatar = comment['user']['avatar_large']
-        writerRow([
+        write([
             artileId,
             created_at,
             likes_counts,
...
@@ -73,7 +73,7 @@ def parse_json(response,artileId):
 def start():
     commentUrl = 'https://weibo.com/ajax/statuses/buildComments'
     init()
-    articleList = getAllArticleList()
+    articleList = getArticleList()
     for article in articleList:
         articleId = article[0]
         print('正在爬取id值为%s的文章评论' % articleId)  # "Crawling comments for article id %s"
...
@@ -82,8 +82,8 @@ def start():
             'id': int(articleId),
             'is_show_bulletin': 2
         }
-        response = get_data(commentUrl, params)
-        parse_json(response, articleId)
+        response = fetchData(commentUrl, params)
+        readJson(response, articleId)
...
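readJson normalizes Weibo's created_at field to a plain date with the format string reconstructed above. A quick check of that parsing, with an illustrative sample value (Weibo returns English weekday/month names and a numeric UTC offset):

    from datetime import datetime

    raw = 'Wed Jul 03 22:34:34 +0800 2024'  # illustrative sample, not crawled data
    day = datetime.strptime(raw, '%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d')
    print(day)  # 2024-07-03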
spider/spiderContent.py
...
@@ -25,12 +25,12 @@ def init():
         'isVip'  # v_plus
     ])
-def writerRow(row):
+def write(row):
     with open('./article.csv', 'a', encoding='utf-8', newline='') as csvFile:
         writer = csv.writer(csvFile)
         writer.writerow(row)
-def get_data(url,params):
+def fetchData(url,params):
     headers = {
         'Cookie': 'SINAGLOBAL=2555941826014.1074.1676801766625; ULV=1719829459275:6:1:2:4660996305989.918.1719827559898:1719743122299; UOR=,,www.baidu.com; XSRF-TOKEN=VtLXviYSIs8lor7sz4iGyigL; SUB=_2A25LhvU9DeRhGeFH6FIX-S3MyD2IHXVo-gj1rDV8PUJbkNAGLRXMkW1Ne2nhI3Gle25QJK0Z99J3trq_NZn6YKJ-; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WW3Mv8V5EupQbbKh.vaZIwU5JpX5KzhUgL.FoM4e05c1Ke7e022dJLoIp7LxKML1KBLBKnLxKqL1hnLBoM41hz41hqReKqN; WBPSESS=Dt2hbAUaXfkVprjyrAZT_LRaDLsnxG-kIbeYwnBb5OUKZiwfVr_UrcYfWuqG-4ZVDM5HeU3HXkDNK_thfRfdS9Ao6ezT30jDksv-CpaVmlTAqGUHjJ7PYkH5aCK4HLxmRq14ZalmQNwzfWMPa4y0VNRLuYdg7L1s49ymNq_5v5vusoz0r4ki6u-MHGraF0fbUTgX14x0kHayEwOoxfLI-w==; SCF=AqmJWo31oFV5itnRgWNU1-wHQTL6PmkBLf3gDuqpdqAIfaWguDTMre6Oxjf5Uzs74JAh2r0DdV1sJ1g6m-wJ5NQ.; _s_tentry=-; Apache=4660996305989.918.1719827559898; PC_TOKEN=7955a7ab1f; appkey=; geetest_token=602cd4e3a7ed1898808f8adfe1a2048b; ALF=1722421868',
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
...
@@ -41,7 +41,7 @@ def get_data(url,params):
     else:
         return None
-def getAllTypeList():
+def getTypeList():
     typeList = []
     with open('./nav.csv', 'r', encoding='utf-8') as reader:
         readerCsv = csv.reader(reader)
...
@@ -50,7 +50,7 @@ def getAllTypeList():
             typeList.append(nav)
     return typeList
-def parse_json(response,type):
+def readJson(response,type):
     for artice in response:
         id = artice['id']
         likeNum = artice['attitudes_count']
...
@@ -72,7 +72,7 @@ def parse_json(response,type):
         authorName = artice['user']['screen_name']
         authorDetail = 'https://weibo.com/u/' + str(artice['user']['id'])
         isVip = artice['user']['v_plus']
-        writerRow([
+        write([
             id,
             likeNum,
             commentsLen,
...
@@ -92,7 +92,7 @@ def parse_json(response,type):
 def start(typeNum=1,pageNum=1):
     articleUrl = 'https://weibo.com/ajax/feed/hottimeline'
     init()
-    typeList = getAllTypeList()
+    typeList = getTypeList()
     typeNumCount = 0
     for type in typeList:
         if typeNumCount > typeNum:
             return
...
@@ -107,8 +107,8 @@ def start(typeNum=1,pageNum=1):
             'count': 10,
             'extparam': 'discover|new_feed'
         }
-        response = get_data(articleUrl, parmas)
-        parse_json(response, type[0])
+        response = fetchData(articleUrl, parmas)
+        readJson(response, type[0])
         typeNumCount += 1
 if __name__ == '__main__':
...
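The middle of fetchData is collapsed in every file of this diff; only the headers dict and an "else: return None" tail are visible. Under the assumption that it wraps requests.get and returns parsed JSON on HTTP 200, a plausible reconstruction looks like the sketch below. This is not the committed code: the payload key is an assumption, and headers is taken as a parameter here for brevity, whereas the diff defines it inside the function.

    import requests

    def fetchData(url, params, headers):
        # Assumed body: issue the request with the spider's Cookie/User-Agent headers
        response = requests.get(url, headers=headers, params=params)
        if response.status_code == 200:
            return response.json().get('data')  # payload shape is an assumption
        else:
            return None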
spider/spiderNav.py
...
@@ -13,12 +13,12 @@ def init():
         'containerid'
     ])
-def writerRow(row):
+def write(row):
     with open('./nav.csv', 'a', encoding='utf-8', newline='') as csvFile:
         writer = csv.writer(csvFile)
         writer.writerow(row)
-def get_data(url):
+def fetchData(url):
     headers = {
         'Cookie': 'SINAGLOBAL=2555941826014.1074.1676801766625; ULV=1719829459275:6:1:2:4660996305989.918.1719827559898:1719743122299; UOR=,,www.baidu.com; XSRF-TOKEN=VtLXviYSIs8lor7sz4iGyigL; SUB=_2A25LhvU9DeRhGeFH6FIX-S3MyD2IHXVo-gj1rDV8PUJbkNAGLRXMkW1Ne2nhI3Gle25QJK0Z99J3trq_NZn6YKJ-; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WW3Mv8V5EupQbbKh.vaZIwU5JpX5KzhUgL.FoM4e05c1Ke7e022dJLoIp7LxKML1KBLBKnLxKqL1hnLBoM41hz41hqReKqN; WBPSESS=Dt2hbAUaXfkVprjyrAZT_LRaDLsnxG-kIbeYwnBb5OUKZiwfVr_UrcYfWuqG-4ZVDM5HeU3HXkDNK_thfRfdS9Ao6ezT30jDksv-CpaVmlTAqGUHjJ7PYkH5aCK4HLxmRq14ZalmQNwzfWMPa4y0VNRLuYdg7L1s49ymNq_5v5vusoz0r4ki6u-MHGraF0fbUTgX14x0kHayEwOoxfLI-w==; SCF=AqmJWo31oFV5itnRgWNU1-wHQTL6PmkBLf3gDuqpdqAIfaWguDTMre6Oxjf5Uzs74JAh2r0DdV1sJ1g6m-wJ5NQ.; _s_tentry=-; Apache=4660996305989.918.1719827559898; PC_TOKEN=7955a7ab1f; appkey=; geetest_token=602cd4e3a7ed1898808f8adfe1a2048b; ALF=1722421868',
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
...
@@ -33,13 +33,13 @@ def get_data(url):
     else:
         return None
-def parse_json(response):
+def readJson(response):
     navList = np.append(response['groups'][3]['group'], response['groups'][4]['group'])
     for nav in navList:
         navName = nav['title']
         gid = nav['gid']
         containerid = nav['containerid']
-        writerRow([
+        write([
             navName,
             gid,
             containerid
...
@@ -49,5 +49,5 @@ def parse_json(response):
 if __name__ == '__main__':
     init()
     url = 'https://weibo.com/ajax/feed/allGroups'
-    response = get_data(url)
-    parse_json(response)
\ No newline at end of file
+    response = fetchData(url)
+    readJson(response)
\ No newline at end of file
...
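All three spiders share the renamed write(row) helper, which appends one CSV row per call. A self-contained sketch of that pattern (the file name and rows here are illustrative, not from the commit):

    import csv

    def write(row, path='./nav.csv'):
        # newline='' keeps the csv module from emitting blank lines on Windows
        with open(path, 'a', encoding='utf-8', newline='') as csvFile:
            writer = csv.writer(csvFile)
            writer.writerow(row)

    write(['navName', 'gid', 'containerid'])  # header row, as init() writes it
    write(['热门', '102803', '102803'])        # one nav entry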