YYL469

修改【main.py】,增加【saveData.py】,将数据存储模块与调度模块分离

from spiderContent import start as spiderContentStart
from spiderComments import start as spiderCommentsStart
import os
from sqlalchemy import create_engine
import pandas as pd
# Database connection used by the storage code in this module.
# The URL can be overridden with the WEIBO_DB_URL environment variable; when it
# is unset, the original hard-coded MySQL/pymysql URL is used, so existing
# deployments behave exactly as before.
# NOTE(review): the default still embeds a username and password in source --
# consider moving the credentials out of the repository.
engine = create_engine(os.environ.get(
    'WEIBO_DB_URL',
    'mysql+pymysql://XiaoXueQi:XiaoXueQi@10.92.35.13/Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4',
))
def save_to_sql():
    """Merge the freshly crawled CSV files into the database, then delete them.

    Reads ``articleData.csv`` and ``articleComments.csv`` from the current
    working directory, combines them with the rows already stored in the
    ``article`` and ``comments`` tables, de-duplicates (articles by ``id``,
    comments by ``content``) and rewrites both tables.  If the merge fails,
    both tables are replaced with the new CSV data only.  Both CSV files are
    removed afterwards.

    Raises:
        FileNotFoundError: if either CSV file is missing; this is raised
            before any table is read or replaced.
    """
    import logging

    # Load the crawl output once, up front: a missing CSV fails before the
    # database is touched, and the fallback below reuses the same frames.
    articleNewPd = pd.read_csv('articleData.csv')
    commentNewPd = pd.read_csv('articleComments.csv')

    try:
        articleOldPd = pd.read_sql('select * from article', engine)
        commentOldPd = pd.read_sql('select * from comments', engine)

        # join='inner' keeps only the columns present in both old and new data.
        concatArticlePd = pd.concat([articleNewPd, articleOldPd], join='inner')
        concatCommentsPd = pd.concat([commentNewPd, commentOldPd], join='inner')

        # New rows are listed first, so keep='last' retains the stored copy
        # whenever the same key appears in both.
        concatArticlePd.drop_duplicates(subset='id', keep='last', inplace=True)
        concatCommentsPd.drop_duplicates(subset='content', keep='last', inplace=True)

        concatArticlePd.to_sql('article', con=engine, if_exists='replace', index=False)
        concatCommentsPd.to_sql('comments', con=engine, if_exists='replace', index=False)
    except Exception:
        # Previously a bare `except:`, which also swallowed KeyboardInterrupt /
        # SystemExit and then overwrote both tables.  Only ordinary errors
        # (e.g. the tables not existing yet) should reach this fallback, and
        # the reason is logged because the fallback discards existing rows.
        logging.getLogger(__name__).exception(
            "Merging with the existing tables failed; replacing 'article' and "
            "'comments' with the new CSV data only"
        )
        articleNewPd.to_sql('article', con=engine, if_exists='replace', index=False)
        commentNewPd.to_sql('comments', con=engine, if_exists='replace', index=False)

    # The CSV files are intermediate crawl output; remove them once stored.
    os.remove('./articleData.csv')
    os.remove('./articleComments.csv')
from saveData import save_to_sql as saveData
def main():
print('正在爬取文章数据')
... ... @@ -36,8 +8,7 @@ def main():
print('正在爬取文章评论数据')
spiderCommentsStart()
print('正在存储数据')
save_to_sql()
saveData()
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
\ No newline at end of file
... ...
import os
from sqlalchemy import create_engine
import pandas as pd
# Database connection used by the storage code in this module.
# The URL can be overridden with the WEIBO_DB_URL environment variable; when it
# is unset, the original hard-coded MySQL/pymysql URL is used, so existing
# deployments behave exactly as before.
# NOTE(review): the default still embeds a username and password in source --
# consider moving the credentials out of the repository.
engine = create_engine(os.environ.get(
    'WEIBO_DB_URL',
    'mysql+pymysql://XiaoXueQi:XiaoXueQi@10.92.35.13/Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4',
))
def save_to_sql():
    """Merge the freshly crawled CSV files into the database, then delete them.

    Reads ``articleData.csv`` and ``articleComments.csv`` from the current
    working directory, combines them with the rows already stored in the
    ``article`` and ``comments`` tables, de-duplicates (articles by ``id``,
    comments by ``content``) and rewrites both tables.  If the merge fails,
    both tables are replaced with the new CSV data only.  Both CSV files are
    removed afterwards.

    Raises:
        FileNotFoundError: if either CSV file is missing; this is raised
            before any table is read or replaced.
    """
    import logging

    # Load the crawl output once, up front: a missing CSV fails before the
    # database is touched, and the fallback below reuses the same frames.
    articleNewPd = pd.read_csv('articleData.csv')
    commentNewPd = pd.read_csv('articleComments.csv')

    try:
        articleOldPd = pd.read_sql('select * from article', engine)
        commentOldPd = pd.read_sql('select * from comments', engine)

        # join='inner' keeps only the columns present in both old and new data.
        concatArticlePd = pd.concat([articleNewPd, articleOldPd], join='inner')
        concatCommentsPd = pd.concat([commentNewPd, commentOldPd], join='inner')

        # New rows are listed first, so keep='last' retains the stored copy
        # whenever the same key appears in both.
        concatArticlePd.drop_duplicates(subset='id', keep='last', inplace=True)
        concatCommentsPd.drop_duplicates(subset='content', keep='last', inplace=True)

        concatArticlePd.to_sql('article', con=engine, if_exists='replace', index=False)
        concatCommentsPd.to_sql('comments', con=engine, if_exists='replace', index=False)
    except Exception:
        # Previously a bare `except:`, which also swallowed KeyboardInterrupt /
        # SystemExit and then overwrote both tables.  Only ordinary errors
        # (e.g. the tables not existing yet) should reach this fallback, and
        # the reason is logged because the fallback discards existing rows.
        logging.getLogger(__name__).exception(
            "Merging with the existing tables failed; replacing 'article' and "
            "'comments' with the new CSV data only"
        )
        articleNewPd.to_sql('article', con=engine, if_exists='replace', index=False)
        commentNewPd.to_sql('comments', con=engine, if_exists='replace', index=False)

    # The CSV files are intermediate crawl output; remove them once stored.
    os.remove('./articleData.csv')
    os.remove('./articleComments.csv')
# Allow the storage step to be run on its own: store the data only when this
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    save_to_sql()
\ No newline at end of file
... ...