YYL469

修复爬虫bug

No preview for this file type
No preview for this file type
  1 +id,likeNum,commentsLen,reposts_count,region,content,contentLen,created_at,type,detailUrl,authorAvatar,authorName,authorDetail,isVip
1 -from spiderData import spiderData 1 +from spiderContent import start as spiderContentStart
  2 +from spiderComments import start as spiderCommentsStart
2 from saveData import save_to_sql as saveData 3 from saveData import save_to_sql as saveData
3 4
4 def main(): 5 def main():
5 - try:  
6 - spiderData() 6 + print('正在爬取文章数据')
  7 + spiderContentStart(1,1)
  8 + print('正在爬取文章评论数据')
  9 + spiderCommentsStart()
  10 + print('正在存储数据')
7 saveData() 11 saveData()
8 print("爬取数据更新") 12 print("爬取数据更新")
9 - except:  
10 - print("爬取数据失败")  
11 13
12 if __name__ == '__main__': 14 if __name__ == '__main__':
13 main() 15 main()
  1 +typeName,gid,containerid
  2 +热门,102803,102803
  3 +同城,1028032222,102803_2222
  4 +榜单,102803600169,102803_ctg1_600169_-_ctg1_600169
  5 +男篮,102803600279,102803_ctg1_600279_-_ctg1_600279
  6 +明星,1028034288,102803_ctg1_4288_-_ctg1_4288
  7 +车展,1028035188,102803_ctg1_5188_-_ctg1_5188
  8 +搞笑,1028034388,102803_ctg1_4388_-_ctg1_4388
  9 +情感,1028031988,102803_ctg1_1988_-_ctg1_1988
  10 +周末,102803600195,102803_ctg1_600195_-_ctg1_600195
  11 +电影,1028033288,102803_ctg1_3288_-_ctg1_3288
  12 +社会,1028034188,102803_ctg1_4188_-_ctg1_4188
  13 +电视剧,1028032488,102803_ctg1_2488_-_ctg1_2488
  14 +美食,1028032688,102803_ctg1_2688_-_ctg1_2688
  15 +俄乌局势,102803600267,102803_ctg1_600267_-_ctg1_600267
  16 +国际,1028036288,102803_ctg1_6288_-_ctg1_6288
  17 +深度,102803600155,102803_ctg1_600155_-_ctg1_600155
  18 +财经,1028036388,102803_ctg1_6388_-_ctg1_6388
  19 +读书,1028034588,102803_ctg1_4588_-_ctg1_4588
  20 +摄影,1028034988,102803_ctg1_4988_-_ctg1_4988
  21 +颜值,102803600165,102803_ctg1_600165_-_ctg1_600165
  22 +体育,1028031388,102803_ctg1_1388_-_ctg1_1388
  23 +数码,1028035088,102803_ctg1_5088_-_ctg1_5088
  24 +综艺,1028034688,102803_ctg1_4688_-_ctg1_4688
  25 +时尚,1028034488,102803_ctg1_4488_-_ctg1_4488
  26 +星座,1028031688,102803_ctg1_1688_-_ctg1_1688
  27 +军事,1028036688,102803_ctg1_6688_-_ctg1_6688
  28 +股市,1028031288,102803_ctg1_1288_-_ctg1_1288
  29 +房产,1028035588,102803_ctg1_5588_-_ctg1_5588
  30 +家居,1028035888,102803_ctg1_5888_-_ctg1_5888
  31 +萌宠,1028032788,102803_ctg1_2788_-_ctg1_2788
  32 +科技,1028032088,102803_ctg1_2088_-_ctg1_2088
  33 +科普,1028035988,102803_ctg1_5988_-_ctg1_5988
  34 +动漫,1028032388,102803_ctg1_2388_-_ctg1_2388
  35 +运动健身,1028034788,102803_ctg1_4788_-_ctg1_4788
  36 +旅游,1028032588,102803_ctg1_2588_-_ctg1_2588
  37 +瘦身,1028036488,102803_ctg1_6488_-_ctg1_6488
  38 +好物,102803600094,102803_ctg1_600094_-_ctg1_600094
  39 +历史,1028036788,102803_ctg1_6788_-_ctg1_6788
  40 +艺术,1028035488,102803_ctg1_5488_-_ctg1_5488
  41 +美妆,1028031588,102803_ctg1_1588_-_ctg1_1588
  42 +法律,1028037388,102803_ctg1_7388_-_ctg1_7388
  43 +设计,1028035388,102803_ctg1_5388_-_ctg1_5388
  44 +健康,1028032188,102803_ctg1_2188_-_ctg1_2188
  45 +音乐,1028035288,102803_ctg1_5288_-_ctg1_5288
  46 +游戏,1028034888,102803_ctg1_4888_-_ctg1_4888
  47 +新时代,1028037968,102803_ctg1_7968_-_ctg1_7968
  48 +校园,102803600177,102803_ctg1_600177_-_ctg1_600177
  49 +收藏,1028038189,102803_ctg1_8189_-_ctg1_8189
  50 +政务,1028035788,102803_ctg1_5788_-_ctg1_5788
  51 +养生,1028036588,102803_ctg1_6588_-_ctg1_6588
  52 +育儿,1028033188,102803_ctg1_3188_-_ctg1_3188
  53 +抽奖,102803600037,102803_ctg1_600037_-_ctg1_600037
  54 +教育,102803600080,102803_ctg1_600080_-_ctg1_600080
  55 +婚恋,1028031788,102803_ctg1_1788_-_ctg1_1788
  56 +舞蹈,1028038788,102803_ctg1_8788_-_ctg1_8788
  57 +辟谣,1028036988,102803_ctg1_6988_-_ctg1_6988
  58 +公益,102803600057,102803_ctg1_600057_-_ctg1_600057
  59 +问答,1028037977,102803_ctg1_7977_-_ctg1_7977
  60 +三农,1028037188,102803_ctg1_7188_-_ctg1_7188
1 -from spiderDataPack.spiderNav import start as spiderNavStart  
2 -from spiderDataPack.spiderContent import start as spiderContentStart  
3 -from spiderDataPack.spiderComments import start as spiderCommentsStart  
4 -import os  
5 -  
6 -def spiderData():  
7 - if not os.path.exists('./nav.csv'):  
8 - spiderNavStart()  
9 - spiderContentStart(1,1)  
10 - spiderCommentsStart()  
11 -  
12 -if __name__ == '__main__':  
13 - spiderData()  
@@ -45,11 +45,9 @@ def parse_json(response): @@ -45,11 +45,9 @@ def parse_json(response):
45 containerid 45 containerid
46 ]) 46 ])
47 47
48 -def start(): 48 +
  49 +if __name__ == '__main__':
49 init() 50 init()
50 url = 'https://weibo.com/ajax/feed/allGroups' 51 url = 'https://weibo.com/ajax/feed/allGroups'
51 response = get_data(url) 52 response = get_data(url)
52 parse_json(response) 53 parse_json(response)
53 -  
54 -if __name__ == '__main__':  
55 - start()