YYL469

【main.py】实现完整的爬取数据过程,并将爬取的数据存储到数据库中

  1 +from spiderContent import start as spiderContentStart
  2 +from spiderComments import start as spiderCommentsStart
  3 +import os
  4 +from sqlalchemy import create_engine
  5 +import pandas as pd
  6 +
# SQLAlchemy engine shared by the whole module for reading and writing the
# crawl results in MySQL (pymysql driver, utf8mb4 charset).
# NOTE(review): the default URL embeds credentials and a host address in
# source control. Set the WEIBO_DB_URL environment variable to point the
# script at another database without editing this file; when it is unset or
# empty the original URL is used, so existing deployments behave as before.
DEFAULT_DB_URL = 'mysql+pymysql://XiaoXueQi:XiaoXueQi@10.92.35.13/Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4'
engine = create_engine(os.environ.get('WEIBO_DB_URL') or DEFAULT_DB_URL)
  8 +
def _merge_into_table(new_rows, table, dedup_column):
    """Merge *new_rows* with the rows already stored in *table* and rewrite it.

    The existing table is read in full, concatenated after the new rows
    (keeping only the columns both frames share), de-duplicated on
    *dedup_column*, and written back with ``if_exists='replace'``.

    Only a failure to *read* the existing table (e.g. it does not exist yet on
    the first run) triggers the fallback of storing the new rows alone. Errors
    raised while merging or writing propagate to the caller instead of
    silently overwriting the stored data with the new rows only.
    """
    import logging

    try:
        old_rows = pd.read_sql(f'select * from {table}', engine)
    except Exception:
        # Best-effort fallback kept from the original behaviour: with nothing
        # readable to merge against, store just the freshly crawled rows.
        logging.getLogger(__name__).warning(
            'could not read existing table %r; storing new rows only',
            table,
            exc_info=True,
        )
        merged = new_rows
    else:
        merged = pd.concat([new_rows, old_rows], join='inner')
        # New rows come first and keep='last' is used, so on a duplicate key
        # the row that was already in the database is the one retained.
        merged = merged.drop_duplicates(subset=dedup_column, keep='last')

    merged.to_sql(table, con=engine, if_exists='replace', index=False)


def save_to_sql():
    """Store the crawled CSV data in the database, then delete the CSV files.

    Reads ``articleData.csv`` and ``articleComments.csv`` from the current
    working directory, merges them into the ``article`` table (de-duplicated
    on ``id``) and the ``comments`` table (de-duplicated on ``content``), and
    removes both CSV files once both tables have been written.

    Each table is handled independently, so a missing ``comments`` table no
    longer causes the existing ``article`` rows to be discarded.

    Raises:
        Whatever ``pandas.read_csv``, ``DataFrame.to_sql`` or ``os.remove``
        raise; in that case the CSV files are left in place.
    """
    # Load both CSV files before touching the database so that a missing file
    # aborts the run without any table having been rewritten.
    article_rows = pd.read_csv('articleData.csv')
    comment_rows = pd.read_csv('articleComments.csv')

    _merge_into_table(article_rows, 'article', 'id')
    # NOTE(review): comments are de-duplicated on their text alone, so equal
    # comments from different sources collapse into one row -- confirm intended.
    _merge_into_table(comment_rows, 'comments', 'content')

    os.remove('./articleData.csv')
    os.remove('./articleComments.csv')
  32 +
def main():
    """Run the pipeline: crawl articles, crawl their comments, store both.

    Each stage announces itself on stdout before it runs; the stages execute
    strictly in order and any exception aborts the remaining ones.
    """
    stages = (
        # NOTE(review): the meaning of the two arguments is defined by
        # spiderContent.start, which is not visible here -- confirm.
        ('正在爬取文章数据', lambda: spiderContentStart(1, 1)),
        ('正在爬取文章评论数据', lambda: spiderCommentsStart()),
        ('正在存储数据', lambda: save_to_sql()),
    )
    for banner, run_stage in stages:
        print(banner)
        run_stage()
  40 +
  41 +
# Start the crawl-and-store pipeline only when this file is executed as a
# script; importing the module must not trigger a crawl.
if __name__ == '__main__':
    main()