YYL469

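优化爬虫代码 (Optimize the crawler code: read the CSV paths from spiderDataPackage.settings instead of hard-coding them)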

1 import os 1 import os
2 from sqlalchemy import create_engine 2 from sqlalchemy import create_engine
3 import pandas as pd 3 import pandas as pd
  4 +from spiderDataPackage.settings import articleAddr,commentsAddr
4 5
5 engine = create_engine('mysql+pymysql://XiaoXueQi:XiaoXueQi@47.92.235.6/Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4') 6 engine = create_engine('mysql+pymysql://XiaoXueQi:XiaoXueQi@47.92.235.6/Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4')
6 7
7 def saveData(): 8 def saveData():
8 try: 9 try:
9 oldArticle = pd.read_sql('select * from article',engine) 10 oldArticle = pd.read_sql('select * from article',engine)
10 - newArticle = pd.read_csv('article.csv') 11 + newArticle = pd.read_csv(articleAddr)
11 oldComment = pd.read_sql('select * from comments',engine) 12 oldComment = pd.read_sql('select * from comments',engine)
12 - newComment = pd.read_csv('comments.csv') 13 + newComment = pd.read_csv(commentsAddr)
13 14
14 mergeArticle = pd.concat([newArticle,oldArticle],join='inner') 15 mergeArticle = pd.concat([newArticle,oldArticle],join='inner')
15 mergeComment = pd.concat([newComment,oldComment],join='inner') 16 mergeComment = pd.concat([newComment,oldComment],join='inner')
@@ -20,13 +21,13 @@ def saveData(): @@ -20,13 +21,13 @@ def saveData():
20 mergeArticle.to_sql('article', con=engine, if_exists='replace', index=False) 21 mergeArticle.to_sql('article', con=engine, if_exists='replace', index=False)
21 mergeComment.to_sql('comments', con=engine, if_exists='replace', index=False) 22 mergeComment.to_sql('comments', con=engine, if_exists='replace', index=False)
22 except: 23 except:
23 - newArticle = pd.read_csv('article.csv')  
24 - newComment = pd.read_csv('comments.csv') 24 + newArticle = pd.read_csv(articleAddr)
  25 + newComment = pd.read_csv(commentsAddr)
25 newArticle.to_sql('article',con=engine,if_exists='replace',index=False) 26 newArticle.to_sql('article',con=engine,if_exists='replace',index=False)
26 newComment.to_sql('comments',con=engine,if_exists='replace',index=False) 27 newComment.to_sql('comments',con=engine,if_exists='replace',index=False)
27 28
28 - os.remove('./article.csv')  
29 - os.remove('./comments.csv') 29 + os.remove(articleAddr)
  30 + os.remove(commentsAddr)
30 31
31 if __name__ == '__main__': 32 if __name__ == '__main__':
32 saveData() 33 saveData()