Toggle navigation
Toggle navigation
This project
Loading...
Sign in
万朱浩
/
Venue-Ops
Go to a project
Toggle navigation
Projects
Groups
Snippets
Help
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
YYL469
2024-07-03 13:30:50 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
f3b64ee2d955d1f07d24310f4516172e2dbd4890
f3b64ee2
1 parent
5468d64e
修改【main.py】,增加【saveData.py】,将数据存储模块与调度模块分离
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
34 additions
and
31 deletions
spider/main.py
spider/saveData.py
spider/main.py
View file @
f3b64ee
from
spiderContent
import
start
as
spiderContentStart
from
spiderComments
import
start
as
spiderCommentsStart
import
os
from
sqlalchemy
import
create_engine
import
pandas
as
pd
# Database engine shared by the storage code in this module.
# The connection URL can be overridden with the WEIBO_DB_URL environment
# variable so that credentials do not have to live in source control.  The
# original hard-coded URL is kept as the default, so existing deployments
# that do not set the variable behave exactly as before.
# NOTE(review): the default still embeds a user name and password; consider
# removing it once every deployment sets WEIBO_DB_URL.
engine = create_engine(
    os.environ.get(
        'WEIBO_DB_URL',
        'mysql+pymysql://XiaoXueQi:XiaoXueQi@10.92.35.13/Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4',
    )
)
def _merge_csv_into_table(csv_path, table, dedup_column):
    """Merge the rows of one CSV file into one database table.

    Reads ``csv_path``, combines it with the current contents of ``table``
    (keeping only the columns both have in common), drops rows that repeat
    the same ``dedup_column`` value and rewrites the table with the result.
    """
    # Imported here so the helper is self-contained; sqlalchemy is already a
    # dependency of this module (create_engine).
    from sqlalchemy.exc import SQLAlchemyError

    new_rows = pd.read_csv(csv_path)
    try:
        # Only the read of the existing table is guarded: a failure here
        # (presumably the table not existing yet on the first run) means
        # there is nothing to merge with.  Errors anywhere else must NOT
        # fall through to "replace the table with the new rows only", because
        # that would silently discard the stored history.
        old_rows = pd.read_sql('select * from ' + table, engine)
    except SQLAlchemyError:
        merged = new_rows
    else:
        merged = pd.concat([new_rows, old_rows], join='inner')
        # NOTE(review): new rows come first and keep='last' is used, so when
        # a key exists in both, the row already stored in the database wins.
        # Confirm this is intended (fresh crawl data never updates old rows).
        merged = merged.drop_duplicates(subset=dedup_column, keep='last')
    merged.to_sql(table, con=engine, if_exists='replace', index=False)


def save_to_sql():
    """Store the crawled CSV data in the database and delete the CSV files.

    ``articleData.csv`` is merged into the ``article`` table (de-duplicated
    on ``id``) and ``articleComments.csv`` into the ``comments`` table
    (de-duplicated on ``content``).  Both files are read from the current
    working directory and are removed once both tables have been written.

    Any error while reading a CSV file or writing to the database
    propagates to the caller; in that case the CSV files are left in place
    so the run can be retried.
    """
    _merge_csv_into_table('articleData.csv', 'article', 'id')
    _merge_csv_into_table('articleComments.csv', 'comments', 'content')
    # Clean up only after both tables were stored successfully.
    os.remove('./articleData.csv')
    os.remove('./articleComments.csv')
from
saveData
import
save_to_sql
as
saveData
def
main
():
print
(
'正在爬取文章数据'
)
...
...
@@ -36,8 +8,7 @@ def main():
print
(
'正在爬取文章评论数据'
)
spiderCommentsStart
()
print
(
'正在存储数据'
)
save_to_sql
()
saveData
()
if
__name__
==
'__main__'
:
main
()
\ No newline at end of file
...
...
spider/saveData.py
0 → 100644
View file @
f3b64ee
import
os
from
sqlalchemy
import
create_engine
import
pandas
as
pd
# Database engine shared by the storage code in this module.
# The connection URL can be overridden with the WEIBO_DB_URL environment
# variable so that credentials do not have to live in source control.  The
# original hard-coded URL is kept as the default, so existing deployments
# that do not set the variable behave exactly as before.
# NOTE(review): the default still embeds a user name and password; consider
# removing it once every deployment sets WEIBO_DB_URL.
engine = create_engine(
    os.environ.get(
        'WEIBO_DB_URL',
        'mysql+pymysql://XiaoXueQi:XiaoXueQi@10.92.35.13/Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4',
    )
)
def _merge_csv_into_table(csv_path, table, dedup_column):
    """Merge the rows of one CSV file into one database table.

    Reads ``csv_path``, combines it with the current contents of ``table``
    (keeping only the columns both have in common), drops rows that repeat
    the same ``dedup_column`` value and rewrites the table with the result.
    """
    # Imported here so the helper is self-contained; sqlalchemy is already a
    # dependency of this module (create_engine).
    from sqlalchemy.exc import SQLAlchemyError

    new_rows = pd.read_csv(csv_path)
    try:
        # Only the read of the existing table is guarded: a failure here
        # (presumably the table not existing yet on the first run) means
        # there is nothing to merge with.  Errors anywhere else must NOT
        # fall through to "replace the table with the new rows only", because
        # that would silently discard the stored history.
        old_rows = pd.read_sql('select * from ' + table, engine)
    except SQLAlchemyError:
        merged = new_rows
    else:
        merged = pd.concat([new_rows, old_rows], join='inner')
        # NOTE(review): new rows come first and keep='last' is used, so when
        # a key exists in both, the row already stored in the database wins.
        # Confirm this is intended (fresh crawl data never updates old rows).
        merged = merged.drop_duplicates(subset=dedup_column, keep='last')
    merged.to_sql(table, con=engine, if_exists='replace', index=False)


def save_to_sql():
    """Store the crawled CSV data in the database and delete the CSV files.

    ``articleData.csv`` is merged into the ``article`` table (de-duplicated
    on ``id``) and ``articleComments.csv`` into the ``comments`` table
    (de-duplicated on ``content``).  Both files are read from the current
    working directory and are removed once both tables have been written.

    Any error while reading a CSV file or writing to the database
    propagates to the caller; in that case the CSV files are left in place
    so the run can be retried.
    """
    _merge_csv_into_table('articleData.csv', 'article', 'id')
    _merge_csv_into_table('articleComments.csv', 'comments', 'content')
    # Clean up only after both tables were stored successfully.
    os.remove('./articleData.csv')
    os.remove('./articleComments.csv')
if
__name__
==
'__main__'
:
save_to_sql
()
\ No newline at end of file
...
...
Please
register
or
login
to post a comment