Showing 2 changed files with 34 additions and 31 deletions.
| 1 | from spiderContent import start as spiderContentStart | 1 | from spiderContent import start as spiderContentStart |
| 2 | from spiderComments import start as spiderCommentsStart | 2 | from spiderComments import start as spiderCommentsStart |
| 3 | -import os | ||
| 4 | -from sqlalchemy import create_engine | ||
| 5 | -import pandas as pd | ||
| 6 | - | ||
| 7 | -engine = create_engine('mysql+pymysql://XiaoXueQi:XiaoXueQi@10.92.35.13/Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4') | ||
| 8 | - | ||
| 9 | -def save_to_sql(): | ||
| 10 | - try: | ||
| 11 | - artileOldPd = pd.read_sql('select * from article',engine) | ||
| 12 | - articleNewPd = pd.read_csv('articleData.csv') | ||
| 13 | - commentOldPd = pd.read_sql('select * from comments',engine) | ||
| 14 | - commentNewPd = pd.read_csv('articleComments.csv') | ||
| 15 | - | ||
| 16 | - concatArticlePd = pd.concat([articleNewPd,artileOldPd],join='inner') | ||
| 17 | - concatCommentsPd = pd.concat([commentNewPd,commentOldPd],join='inner') | ||
| 18 | - | ||
| 19 | - concatArticlePd.drop_duplicates(subset='id',keep='last',inplace=True) | ||
| 20 | - concatCommentsPd.drop_duplicates(subset='content',keep='last',inplace=True) | ||
| 21 | - | ||
| 22 | - concatArticlePd.to_sql('article', con=engine, if_exists='replace', index=False) | ||
| 23 | - concatCommentsPd.to_sql('comments', con=engine, if_exists='replace', index=False) | ||
| 24 | - except: | ||
| 25 | - articleNewPd = pd.read_csv('articleData.csv') | ||
| 26 | - commentNewPd = pd.read_csv('articleComments.csv') | ||
| 27 | - articleNewPd.to_sql('article',con=engine,if_exists='replace',index=False) | ||
| 28 | - commentNewPd.to_sql('comments',con=engine,if_exists='replace',index=False) | ||
| 29 | - | ||
| 30 | - os.remove('./articleData.csv') | ||
| 31 | - os.remove('./articleComments.csv') | 3 | +from saveData import save_to_sql as saveData |
| 32 | 4 | ||
def main():
    """Run the full crawl pipeline: fetch articles, fetch their comments, persist both.

    Each step announces itself on stdout before running, so progress is
    visible while the spiders work.
    """
    pipeline = (
        ('正在爬取文章数据', spiderContentStart),
        ('正在爬取文章评论数据', spiderCommentsStart),
        ('正在存储数据', saveData),
    )
    for announcement, step in pipeline:
        print(announcement)
        step()


if __name__ == '__main__':
    main()
spider/saveData.py
0 → 100644
import os

import pandas as pd
from sqlalchemy import create_engine

# NOTE(review): database credentials are hard-coded here; they should be
# moved to an environment variable or config file before this is shared.
engine = create_engine(
    'mysql+pymysql://XiaoXueQi:XiaoXueQi@10.92.35.13/'
    'Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4'
)


def save_to_sql():
    """Merge freshly crawled CSV data into the MySQL article/comments tables.

    Reads ``articleData.csv`` and ``articleComments.csv``, de-duplicates them
    against any rows already stored in the database (articles by ``id``,
    comments by ``content``, newest row wins), rewrites both tables, and
    finally deletes the two CSV files.

    On the first run the DB tables do not exist yet; in that case the CSV
    data is loaded as-is. Any other error (bad CSV, failed write) propagates
    to the caller instead of being silently swallowed.
    """
    # Read the new crawl results first; if the CSVs are missing or broken
    # we want to fail loudly before touching the database.
    article_new = pd.read_csv('articleData.csv')
    comment_new = pd.read_csv('articleComments.csv')

    # Only the "tables don't exist yet" case is recoverable, so keep the
    # try block to exactly the two reads. The original bare `except:` also
    # caught failures from concat/dedup/to_sql and then REPLACED both tables
    # with only the new CSV rows, destroying previously stored data.
    try:
        article_old = pd.read_sql('select * from article', engine)
        comment_old = pd.read_sql('select * from comments', engine)
    except Exception:
        # First run: nothing in the DB to merge with.
        article_old = None
        comment_old = None

    if article_old is not None:
        # New rows first + keep='last' means an existing DB row wins over a
        # re-crawled duplicate, matching the original merge semantics.
        article_new = pd.concat([article_new, article_old], join='inner')
        article_new = article_new.drop_duplicates(subset='id', keep='last')
    if comment_old is not None:
        comment_new = pd.concat([comment_new, comment_old], join='inner')
        comment_new = comment_new.drop_duplicates(subset='content', keep='last')

    article_new.to_sql('article', con=engine, if_exists='replace', index=False)
    comment_new.to_sql('comments', con=engine, if_exists='replace', index=False)

    # Remove the CSVs only after both writes succeeded, so a failed run can
    # be retried without re-crawling.
    os.remove('./articleData.csv')
    os.remove('./articleComments.csv')


if __name__ == '__main__':
    save_to_sql()
-
Please register or login to post a comment