Toggle navigation
Toggle navigation
This project
Loading...
Sign in
万朱浩
/
Venue-Ops
Go to a project
Toggle navigation
Projects
Groups
Snippets
Help
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
YYL469
2024-07-02 21:55:04 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
24a81848f476758464d057cf4c0747a4ef93b637
24a81848
1 parent
28e04026
【main.py】实现完整的爬取数据过程,并将爬取的数据存储到数据库中
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
43 additions
and
0 deletions
spider/main.py
spider/main.py
0 → 100644
View file @
24a8184
from
spiderContent
import
start
as
spiderContentStart
from
spiderComments
import
start
as
spiderCommentsStart
import
os
from
sqlalchemy
import
create_engine
import
pandas
as
pd
# Shared SQLAlchemy engine used by save_to_sql().
# The connection URL may be supplied through the WEIBO_DB_URL environment
# variable so that credentials do not have to live in source control; the
# previously hard-coded URL is kept as the default for backward compatibility.
engine = create_engine(
    os.environ.get(
        'WEIBO_DB_URL',
        'mysql+pymysql://XiaoXueQi:XiaoXueQi@10.92.35.13/Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4',
    )
)
def save_to_sql():
    """Merge the freshly crawled CSV data into the database, then delete the CSVs.

    Reads ``articleData.csv`` and ``articleComments.csv`` from the current
    working directory, combines each with the rows already stored in the
    ``article`` / ``comments`` tables, de-duplicates them (articles by ``id``,
    comments by ``content``) and rewrites both tables.

    If merging with the stored data fails for any reason (for example the
    tables do not exist yet), the reason is printed and both tables are
    rewritten from the CSV data alone.

    The CSV files are removed only after the tables have been written, so a
    failed run leaves the crawled data on disk.

    Raises:
        Exception: errors from ``pd.read_csv`` (e.g. a missing CSV file) and
            errors from the fallback write propagate to the caller.
    """
    # Load the new data once, up front: a missing or unreadable CSV now fails
    # immediately instead of being raised a second time inside the fallback.
    new_articles = pd.read_csv('articleData.csv')
    new_comments = pd.read_csv('articleComments.csv')

    try:
        old_articles = pd.read_sql('select * from article', engine)
        old_comments = pd.read_sql('select * from comments', engine)

        # join='inner' keeps only the columns present in both frames.
        # New rows come first and keep='last' is used, so on a duplicate key
        # the row that was already stored is the one retained.
        merged_articles = pd.concat(
            [new_articles, old_articles], join='inner'
        ).drop_duplicates(subset='id', keep='last')
        merged_comments = pd.concat(
            [new_comments, old_comments], join='inner'
        ).drop_duplicates(subset='content', keep='last')

        merged_articles.to_sql('article', con=engine, if_exists='replace', index=False)
        merged_comments.to_sql('comments', con=engine, if_exists='replace', index=False)
    except Exception as exc:
        # Deliberate best-effort fallback (kept from the original), but no
        # longer a bare ``except``: KeyboardInterrupt/SystemExit pass through
        # and the cause is reported instead of being silently discarded.
        print(f'Merging with existing tables failed ({exc!r}); writing new data only')
        new_articles.to_sql('article', con=engine, if_exists='replace', index=False)
        new_comments.to_sql('comments', con=engine, if_exists='replace', index=False)

    # Reached only when one of the write paths above succeeded.
    os.remove('./articleData.csv')
    os.remove('./articleComments.csv')
def main():
    """Run the whole pipeline: crawl articles, crawl comments, store the data.

    Each stage prints its progress message and then runs. The article crawler
    is called with the fixed arguments ``(1, 1)``; their meaning is defined by
    ``spiderContent.start``.
    """
    stages = (
        ('正在爬取文章数据', lambda: spiderContentStart(1, 1)),
        ('正在爬取文章评论数据', spiderCommentsStart),
        ('正在存储数据', save_to_sql),
    )
    for message, run_stage in stages:
        print(message)
        run_stage()


# Start the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()
\ No newline at end of file
...
...
Please
register
or
login
to post a comment