Showing 11 changed files with 34 additions and 80 deletions
| 1 | -from spiderContent import start as spiderContentStart | ||
| 2 | -from spiderComments import start as spiderCommentsStart | 1 | +from spiderData import spiderData |
| 3 | from saveData import save_to_sql as saveData | 2 | from saveData import save_to_sql as saveData |
| 4 | 3 | ||
| 5 | def main(): | 4 | def main(): |
| 6 | - print('正在爬取文章数据') | ||
| 7 | - spiderContentStart(1,1) | ||
| 8 | - print('正在爬取文章评论数据') | ||
| 9 | - spiderCommentsStart() | ||
| 10 | - print('正在存储数据') | 5 | + try: |
| 6 | + spiderData() | ||
| 11 | saveData() | 7 | saveData() |
| 8 | + print("爬取数据更新") | ||
| 9 | + except: | ||
| 10 | + print("爬取数据失败") | ||
| 12 | 11 | ||
| 13 | if __name__ == '__main__': | 12 | if __name__ == '__main__': |
| 14 | main() | 13 | main() |
spider/navData.csv
deleted
100644 → 0
| 1 | -typeName,gid,containerid | ||
| 2 | -热门,102803,102803 | ||
| 3 | -同城,1028032222,102803_2222 | ||
| 4 | -榜单,102803600169,102803_ctg1_600169_-_ctg1_600169 | ||
| 5 | -男篮,102803600279,102803_ctg1_600279_-_ctg1_600279 | ||
| 6 | -明星,1028034288,102803_ctg1_4288_-_ctg1_4288 | ||
| 7 | -车展,1028035188,102803_ctg1_5188_-_ctg1_5188 | ||
| 8 | -搞笑,1028034388,102803_ctg1_4388_-_ctg1_4388 | ||
| 9 | -情感,1028031988,102803_ctg1_1988_-_ctg1_1988 | ||
| 10 | -周末,102803600195,102803_ctg1_600195_-_ctg1_600195 | ||
| 11 | -电影,1028033288,102803_ctg1_3288_-_ctg1_3288 | ||
| 12 | -社会,1028034188,102803_ctg1_4188_-_ctg1_4188 | ||
| 13 | -电视剧,1028032488,102803_ctg1_2488_-_ctg1_2488 | ||
| 14 | -美食,1028032688,102803_ctg1_2688_-_ctg1_2688 | ||
| 15 | -俄乌局势,102803600267,102803_ctg1_600267_-_ctg1_600267 | ||
| 16 | -国际,1028036288,102803_ctg1_6288_-_ctg1_6288 | ||
| 17 | -深度,102803600155,102803_ctg1_600155_-_ctg1_600155 | ||
| 18 | -财经,1028036388,102803_ctg1_6388_-_ctg1_6388 | ||
| 19 | -读书,1028034588,102803_ctg1_4588_-_ctg1_4588 | ||
| 20 | -摄影,1028034988,102803_ctg1_4988_-_ctg1_4988 | ||
| 21 | -颜值,102803600165,102803_ctg1_600165_-_ctg1_600165 | ||
| 22 | -体育,1028031388,102803_ctg1_1388_-_ctg1_1388 | ||
| 23 | -数码,1028035088,102803_ctg1_5088_-_ctg1_5088 | ||
| 24 | -综艺,1028034688,102803_ctg1_4688_-_ctg1_4688 | ||
| 25 | -时尚,1028034488,102803_ctg1_4488_-_ctg1_4488 | ||
| 26 | -星座,1028031688,102803_ctg1_1688_-_ctg1_1688 | ||
| 27 | -军事,1028036688,102803_ctg1_6688_-_ctg1_6688 | ||
| 28 | -股市,1028031288,102803_ctg1_1288_-_ctg1_1288 | ||
| 29 | -房产,1028035588,102803_ctg1_5588_-_ctg1_5588 | ||
| 30 | -家居,1028035888,102803_ctg1_5888_-_ctg1_5888 | ||
| 31 | -萌宠,1028032788,102803_ctg1_2788_-_ctg1_2788 | ||
| 32 | -科技,1028032088,102803_ctg1_2088_-_ctg1_2088 | ||
| 33 | -科普,1028035988,102803_ctg1_5988_-_ctg1_5988 | ||
| 34 | -动漫,1028032388,102803_ctg1_2388_-_ctg1_2388 | ||
| 35 | -运动健身,1028034788,102803_ctg1_4788_-_ctg1_4788 | ||
| 36 | -旅游,1028032588,102803_ctg1_2588_-_ctg1_2588 | ||
| 37 | -瘦身,1028036488,102803_ctg1_6488_-_ctg1_6488 | ||
| 38 | -好物,102803600094,102803_ctg1_600094_-_ctg1_600094 | ||
| 39 | -历史,1028036788,102803_ctg1_6788_-_ctg1_6788 | ||
| 40 | -艺术,1028035488,102803_ctg1_5488_-_ctg1_5488 | ||
| 41 | -美妆,1028031588,102803_ctg1_1588_-_ctg1_1588 | ||
| 42 | -法律,1028037388,102803_ctg1_7388_-_ctg1_7388 | ||
| 43 | -设计,1028035388,102803_ctg1_5388_-_ctg1_5388 | ||
| 44 | -健康,1028032188,102803_ctg1_2188_-_ctg1_2188 | ||
| 45 | -音乐,1028035288,102803_ctg1_5288_-_ctg1_5288 | ||
| 46 | -游戏,1028034888,102803_ctg1_4888_-_ctg1_4888 | ||
| 47 | -新时代,1028037968,102803_ctg1_7968_-_ctg1_7968 | ||
| 48 | -校园,102803600177,102803_ctg1_600177_-_ctg1_600177 | ||
| 49 | -收藏,1028038189,102803_ctg1_8189_-_ctg1_8189 | ||
| 50 | -政务,1028035788,102803_ctg1_5788_-_ctg1_5788 | ||
| 51 | -养生,1028036588,102803_ctg1_6588_-_ctg1_6588 | ||
| 52 | -育儿,1028033188,102803_ctg1_3188_-_ctg1_3188 | ||
| 53 | -抽奖,102803600037,102803_ctg1_600037_-_ctg1_600037 | ||
| 54 | -教育,102803600080,102803_ctg1_600080_-_ctg1_600080 | ||
| 55 | -婚恋,1028031788,102803_ctg1_1788_-_ctg1_1788 | ||
| 56 | -舞蹈,1028038788,102803_ctg1_8788_-_ctg1_8788 | ||
| 57 | -辟谣,1028036988,102803_ctg1_6988_-_ctg1_6988 | ||
| 58 | -公益,102803600057,102803_ctg1_600057_-_ctg1_600057 | ||
| 59 | -问答,1028037977,102803_ctg1_7977_-_ctg1_7977 | ||
| 60 | -三农,1028037188,102803_ctg1_7188_-_ctg1_7188 |
spider/spiderData.py
0 → 100644
| 1 | +from spiderDataPack.spiderNav import start as spiderNavStart | ||
| 2 | +from spiderDataPack.spiderContent import start as spiderContentStart | ||
| 3 | +from spiderDataPack.spiderComments import start as spiderCommentsStart | ||
| 4 | +import os | ||
| 5 | + | ||
| 6 | +def spiderData(): | ||
| 7 | + if not os.path.exists('./nav.csv'): | ||
| 8 | + spiderNavStart() | ||
| 9 | + spiderContentStart(1,1) | ||
| 10 | + spiderCommentsStart() | ||
| 11 | + | ||
| 12 | +if __name__ == '__main__': | ||
| 13 | + spiderData() |
spider/spiderDataPack/__init__.py
0 → 100644
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
| @@ -5,8 +5,8 @@ import os | @@ -5,8 +5,8 @@ import os | ||
| 5 | from datetime import datetime | 5 | from datetime import datetime |
| 6 | 6 | ||
| 7 | def init(): | 7 | def init(): |
| 8 | - if not os.path.exists('./articleComments.csv'): | ||
| 9 | - with open('./articleComments.csv','w',encoding='utf-8',newline='') as csvFile: | 8 | + if not os.path.exists('./comments.csv'): |
| 9 | + with open('./comments.csv','w',encoding='utf-8',newline='') as csvFile: | ||
| 10 | writer = csv.writer(csvFile) | 10 | writer = csv.writer(csvFile) |
| 11 | writer.writerow([ | 11 | writer.writerow([ |
| 12 | 'articleId', | 12 | 'articleId', |
| @@ -21,7 +21,7 @@ def init(): | @@ -21,7 +21,7 @@ def init(): | ||
| 21 | ]) | 21 | ]) |
| 22 | 22 | ||
| 23 | def writerRow(row): | 23 | def writerRow(row): |
| 24 | - with open('./articleComments.csv', 'a', encoding='utf-8', newline='') as csvFile: | 24 | + with open('./comments.csv', 'a', encoding='utf-8', newline='') as csvFile: |
| 25 | writer = csv.writer(csvFile) | 25 | writer = csv.writer(csvFile) |
| 26 | writer.writerow(row) | 26 | writer.writerow(row) |
| 27 | 27 | ||
| @@ -38,7 +38,7 @@ def get_data(url,params): | @@ -38,7 +38,7 @@ def get_data(url,params): | ||
| 38 | 38 | ||
| 39 | def getAllArticleList(): | 39 | def getAllArticleList(): |
| 40 | artileList = [] | 40 | artileList = [] |
| 41 | - with open('./articleData.csv','r',encoding='utf-8') as reader: | 41 | + with open('./article.csv','r',encoding='utf-8') as reader: |
| 42 | readerCsv = csv.reader(reader) | 42 | readerCsv = csv.reader(reader) |
| 43 | next(reader) | 43 | next(reader) |
| 44 | for nav in readerCsv: | 44 | for nav in readerCsv: |
| @@ -5,8 +5,8 @@ import os | @@ -5,8 +5,8 @@ import os | ||
| 5 | from datetime import datetime | 5 | from datetime import datetime |
| 6 | 6 | ||
| 7 | def init(): | 7 | def init(): |
| 8 | - if not os.path.exists('./articleData.csv'): | ||
| 9 | - with open('./articleData.csv','w',encoding='utf-8',newline='') as csvFile: | 8 | + if not os.path.exists('./article.csv'): |
| 9 | + with open('./article.csv','w',encoding='utf-8',newline='') as csvFile: | ||
| 10 | writer = csv.writer(csvFile) | 10 | writer = csv.writer(csvFile) |
| 11 | writer.writerow([ | 11 | writer.writerow([ |
| 12 | 'id', | 12 | 'id', |
| @@ -26,7 +26,7 @@ def init(): | @@ -26,7 +26,7 @@ def init(): | ||
| 26 | ]) | 26 | ]) |
| 27 | 27 | ||
| 28 | def writerRow(row): | 28 | def writerRow(row): |
| 29 | - with open('./articleData.csv', 'a', encoding='utf-8', newline='') as csvFile: | 29 | + with open('./article.csv', 'a', encoding='utf-8', newline='') as csvFile: |
| 30 | writer = csv.writer(csvFile) | 30 | writer = csv.writer(csvFile) |
| 31 | writer.writerow(row) | 31 | writer.writerow(row) |
| 32 | 32 | ||
| @@ -43,7 +43,7 @@ def get_data(url,params): | @@ -43,7 +43,7 @@ def get_data(url,params): | ||
| 43 | 43 | ||
| 44 | def getAllTypeList(): | 44 | def getAllTypeList(): |
| 45 | typeList = [] | 45 | typeList = [] |
| 46 | - with open('./navData.csv','r',encoding='utf-8') as reader: | 46 | + with open('./nav.csv','r',encoding='utf-8') as reader: |
| 47 | readerCsv = csv.reader(reader) | 47 | readerCsv = csv.reader(reader) |
| 48 | next(reader) | 48 | next(reader) |
| 49 | for nav in readerCsv: | 49 | for nav in readerCsv: |
| @@ -4,8 +4,8 @@ import numpy as np | @@ -4,8 +4,8 @@ import numpy as np | ||
| 4 | import os | 4 | import os |
| 5 | 5 | ||
| 6 | def init(): | 6 | def init(): |
| 7 | - if not os.path.exists('./navData.csv'): | ||
| 8 | - with open('./navData.csv','w',encoding='utf-8',newline='') as csvFile: | 7 | + if not os.path.exists('./nav.csv'): |
| 8 | + with open('./nav.csv','w',encoding='utf-8',newline='') as csvFile: | ||
| 9 | writer = csv.writer(csvFile) | 9 | writer = csv.writer(csvFile) |
| 10 | writer.writerow([ | 10 | writer.writerow([ |
| 11 | 'typeName', | 11 | 'typeName', |
| @@ -14,7 +14,7 @@ def init(): | @@ -14,7 +14,7 @@ def init(): | ||
| 14 | ]) | 14 | ]) |
| 15 | 15 | ||
| 16 | def writerRow(row): | 16 | def writerRow(row): |
| 17 | - with open('./navData.csv', 'a', encoding='utf-8', newline='') as csvFile: | 17 | + with open('./nav.csv', 'a', encoding='utf-8', newline='') as csvFile: |
| 18 | writer = csv.writer(csvFile) | 18 | writer = csv.writer(csvFile) |
| 19 | writer.writerow(row) | 19 | writer.writerow(row) |
| 20 | 20 | ||
| @@ -45,9 +45,11 @@ def parse_json(response): | @@ -45,9 +45,11 @@ def parse_json(response): | ||
| 45 | containerid | 45 | containerid |
| 46 | ]) | 46 | ]) |
| 47 | 47 | ||
| 48 | - | ||
| 49 | -if __name__ == '__main__': | 48 | +def start(): |
| 50 | init() | 49 | init() |
| 51 | url = 'https://weibo.com/ajax/feed/allGroups' | 50 | url = 'https://weibo.com/ajax/feed/allGroups' |
| 52 | response = get_data(url) | 51 | response = get_data(url) |
| 53 | parse_json(response) | 52 | parse_json(response) |
| 53 | + | ||
| 54 | +if __name__ == '__main__': | ||
| 55 | + start() |
-
Please register or login to post a comment