Showing
1 changed file
with
43 additions
and
0 deletions
spider/main.py
0 → 100644
| 1 | +from spiderContent import start as spiderContentStart | ||
| 2 | +from spiderComments import start as spiderCommentsStart | ||
| 3 | +import os | ||
| 4 | +from sqlalchemy import create_engine | ||
| 5 | +import pandas as pd | ||
| 6 | + | ||
# Database engine shared by every function in this module.
# The connection URL may be overridden with the WEIBO_DB_URL environment
# variable so a deployment does not have to edit this file; the literal below
# is kept as the fallback so existing setups keep working unchanged.
# NOTE(review): the fallback URL embeds a username and password in source
# control -- consider rotating it and supplying the URL only via the
# environment.
engine = create_engine(
    os.environ.get(
        'WEIBO_DB_URL',
        'mysql+pymysql://XiaoXueQi:XiaoXueQi@10.92.35.13/Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4',
    )
)
| 8 | + | ||
def _merge_into_table(table, fresh_rows, dedupe_key):
    """Merge *fresh_rows* with the rows already stored in *table* and rewrite it.

    Args:
        table: Name of the SQL table to read from and replace.
        fresh_rows: DataFrame holding the newly crawled rows.
        dedupe_key: Column name used to detect duplicate rows.

    The fresh rows are concatenated in front of the stored rows with an inner
    join on columns, so only columns present in both frames survive. Because
    duplicates are dropped with ``keep='last'``, a row that is already stored
    wins over a freshly crawled row carrying the same key.

    Errors raised while writing the table propagate to the caller.
    """
    import logging  # function-scope import: the module header is left untouched

    try:
        stored_rows = pd.read_sql(f'select * from {table}', engine)
    except Exception as exc:
        # Best-effort bootstrap: when the existing table cannot be read
        # (presumably because it does not exist yet on a first run), fall
        # back to storing only the fresh rows. Only this read is guarded, so
        # CSV and write errors are no longer masked, and KeyboardInterrupt /
        # SystemExit are no longer swallowed.
        logging.getLogger(__name__).warning(
            'Could not read existing table %r (%s); storing new rows only',
            table,
            exc,
        )
        merged = fresh_rows
    else:
        merged = pd.concat([fresh_rows, stored_rows], join='inner')
        merged = merged.drop_duplicates(subset=dedupe_key, keep='last')

    merged.to_sql(table, con=engine, if_exists='replace', index=False)


def save_to_sql():
    """Persist the crawled CSV files into the database, then delete them.

    ``articleData.csv`` is merged into the ``article`` table (deduplicated on
    ``id``) and ``articleComments.csv`` into the ``comments`` table
    (deduplicated on ``content``). Each table is handled independently, so a
    problem reading one existing table no longer discards the stored rows of
    the other.

    Raises:
        Any exception raised while reading the CSV files or writing a table.
        In that case the CSV files are left on disk so the data is not lost.
    """
    # Load both CSV files before touching the database so that a missing or
    # malformed file aborts the run before any table has been rewritten.
    fresh_articles = pd.read_csv('articleData.csv')
    fresh_comments = pd.read_csv('articleComments.csv')

    _merge_into_table('article', fresh_articles, 'id')
    _merge_into_table('comments', fresh_comments, 'content')

    # Reached only after both tables were written successfully.
    os.remove('./articleData.csv')
    os.remove('./articleComments.csv')
| 32 | + | ||
def main():
    """Run the crawl pipeline in order: articles, comments, then storage.

    A progress message is printed before each stage starts. The meaning of
    the ``(1, 1)`` arguments passed to the article crawler is defined in
    ``spiderContent`` -- NOTE(review): confirm there before changing them.
    """
    stages = (
        ('正在爬取文章数据', lambda: spiderContentStart(1, 1)),
        ('正在爬取文章评论数据', spiderCommentsStart),
        ('正在存储数据', save_to_sql),
    )
    for banner, run_stage in stages:
        print(banner)
        run_stage()


# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()
-
Please register or login to post a comment