Showing
14 changed files
with
70 additions
and
22 deletions
spider/__pycache__/saveData.cpython-38.pyc
0 → 100644
No preview for this file type
No preview for this file type
No preview for this file type
spider/__pycache__/spiderData.cpython-38.pyc
0 → 100644
No preview for this file type
spider/article.csv
0 → 100644
| 1 | +id,likeNum,commentsLen,reposts_count,region,content,contentLen,created_at,type,detailUrl,authorAvatar,authorName,authorDetail,isVip |
| 1 | -from spiderData import spiderData | 1 | +from spiderContent import start as spiderContentStart |
| 2 | +from spiderComments import start as spiderCommentsStart | ||
| 2 | from saveData import save_to_sql as saveData | 3 | from saveData import save_to_sql as saveData |
| 3 | 4 | ||
| 4 | def main(): | 5 | def main(): |
| 5 | - try: | ||
| 6 | - spiderData() | 6 | + print('正在爬取文章数据') |
| 7 | + spiderContentStart(1,1) | ||
| 8 | + print('正在爬取文章评论数据') | ||
| 9 | + spiderCommentsStart() | ||
| 10 | + print('正在存储数据') | ||
| 7 | saveData() | 11 | saveData() |
| 8 | print("爬取数据更新") | 12 | print("爬取数据更新") |
| 9 | - except: | ||
| 10 | - print("爬取数据失败") | ||
| 11 | 13 | ||
| 12 | if __name__ == '__main__': | 14 | if __name__ == '__main__': |
| 13 | main() | 15 | main() |
spider/nav.csv
0 → 100644
| 1 | +typeName,gid,containerid | ||
| 2 | +热门,102803,102803 | ||
| 3 | +同城,1028032222,102803_2222 | ||
| 4 | +榜单,102803600169,102803_ctg1_600169_-_ctg1_600169 | ||
| 5 | +男篮,102803600279,102803_ctg1_600279_-_ctg1_600279 | ||
| 6 | +明星,1028034288,102803_ctg1_4288_-_ctg1_4288 | ||
| 7 | +车展,1028035188,102803_ctg1_5188_-_ctg1_5188 | ||
| 8 | +搞笑,1028034388,102803_ctg1_4388_-_ctg1_4388 | ||
| 9 | +情感,1028031988,102803_ctg1_1988_-_ctg1_1988 | ||
| 10 | +周末,102803600195,102803_ctg1_600195_-_ctg1_600195 | ||
| 11 | +电影,1028033288,102803_ctg1_3288_-_ctg1_3288 | ||
| 12 | +社会,1028034188,102803_ctg1_4188_-_ctg1_4188 | ||
| 13 | +电视剧,1028032488,102803_ctg1_2488_-_ctg1_2488 | ||
| 14 | +美食,1028032688,102803_ctg1_2688_-_ctg1_2688 | ||
| 15 | +俄乌局势,102803600267,102803_ctg1_600267_-_ctg1_600267 | ||
| 16 | +国际,1028036288,102803_ctg1_6288_-_ctg1_6288 | ||
| 17 | +深度,102803600155,102803_ctg1_600155_-_ctg1_600155 | ||
| 18 | +财经,1028036388,102803_ctg1_6388_-_ctg1_6388 | ||
| 19 | +读书,1028034588,102803_ctg1_4588_-_ctg1_4588 | ||
| 20 | +摄影,1028034988,102803_ctg1_4988_-_ctg1_4988 | ||
| 21 | +颜值,102803600165,102803_ctg1_600165_-_ctg1_600165 | ||
| 22 | +体育,1028031388,102803_ctg1_1388_-_ctg1_1388 | ||
| 23 | +数码,1028035088,102803_ctg1_5088_-_ctg1_5088 | ||
| 24 | +综艺,1028034688,102803_ctg1_4688_-_ctg1_4688 | ||
| 25 | +时尚,1028034488,102803_ctg1_4488_-_ctg1_4488 | ||
| 26 | +星座,1028031688,102803_ctg1_1688_-_ctg1_1688 | ||
| 27 | +军事,1028036688,102803_ctg1_6688_-_ctg1_6688 | ||
| 28 | +股市,1028031288,102803_ctg1_1288_-_ctg1_1288 | ||
| 29 | +房产,1028035588,102803_ctg1_5588_-_ctg1_5588 | ||
| 30 | +家居,1028035888,102803_ctg1_5888_-_ctg1_5888 | ||
| 31 | +萌宠,1028032788,102803_ctg1_2788_-_ctg1_2788 | ||
| 32 | +科技,1028032088,102803_ctg1_2088_-_ctg1_2088 | ||
| 33 | +科普,1028035988,102803_ctg1_5988_-_ctg1_5988 | ||
| 34 | +动漫,1028032388,102803_ctg1_2388_-_ctg1_2388 | ||
| 35 | +运动健身,1028034788,102803_ctg1_4788_-_ctg1_4788 | ||
| 36 | +旅游,1028032588,102803_ctg1_2588_-_ctg1_2588 | ||
| 37 | +瘦身,1028036488,102803_ctg1_6488_-_ctg1_6488 | ||
| 38 | +好物,102803600094,102803_ctg1_600094_-_ctg1_600094 | ||
| 39 | +历史,1028036788,102803_ctg1_6788_-_ctg1_6788 | ||
| 40 | +艺术,1028035488,102803_ctg1_5488_-_ctg1_5488 | ||
| 41 | +美妆,1028031588,102803_ctg1_1588_-_ctg1_1588 | ||
| 42 | +法律,1028037388,102803_ctg1_7388_-_ctg1_7388 | ||
| 43 | +设计,1028035388,102803_ctg1_5388_-_ctg1_5388 | ||
| 44 | +健康,1028032188,102803_ctg1_2188_-_ctg1_2188 | ||
| 45 | +音乐,1028035288,102803_ctg1_5288_-_ctg1_5288 | ||
| 46 | +游戏,1028034888,102803_ctg1_4888_-_ctg1_4888 | ||
| 47 | +新时代,1028037968,102803_ctg1_7968_-_ctg1_7968 | ||
| 48 | +校园,102803600177,102803_ctg1_600177_-_ctg1_600177 | ||
| 49 | +收藏,1028038189,102803_ctg1_8189_-_ctg1_8189 | ||
| 50 | +政务,1028035788,102803_ctg1_5788_-_ctg1_5788 | ||
| 51 | +养生,1028036588,102803_ctg1_6588_-_ctg1_6588 | ||
| 52 | +育儿,1028033188,102803_ctg1_3188_-_ctg1_3188 | ||
| 53 | +抽奖,102803600037,102803_ctg1_600037_-_ctg1_600037 | ||
| 54 | +教育,102803600080,102803_ctg1_600080_-_ctg1_600080 | ||
| 55 | +婚恋,1028031788,102803_ctg1_1788_-_ctg1_1788 | ||
| 56 | +舞蹈,1028038788,102803_ctg1_8788_-_ctg1_8788 | ||
| 57 | +辟谣,1028036988,102803_ctg1_6988_-_ctg1_6988 | ||
| 58 | +公益,102803600057,102803_ctg1_600057_-_ctg1_600057 | ||
| 59 | +问答,1028037977,102803_ctg1_7977_-_ctg1_7977 | ||
| 60 | +三农,1028037188,102803_ctg1_7188_-_ctg1_7188 |
spider/spiderData.py
deleted
100644 → 0
| 1 | -from spiderDataPack.spiderNav import start as spiderNavStart | ||
| 2 | -from spiderDataPack.spiderContent import start as spiderContentStart | ||
| 3 | -from spiderDataPack.spiderComments import start as spiderCommentsStart | ||
| 4 | -import os | ||
| 5 | - | ||
| 6 | -def spiderData(): | ||
| 7 | - if not os.path.exists('./nav.csv'): | ||
| 8 | - spiderNavStart() | ||
| 9 | - spiderContentStart(1,1) | ||
| 10 | - spiderCommentsStart() | ||
| 11 | - | ||
| 12 | -if __name__ == '__main__': | ||
| 13 | - spiderData() |
spider/spiderDataPack/__init__.py
deleted
100644 → 0
No preview for this file type
No preview for this file type
| @@ -45,11 +45,9 @@ def parse_json(response): | @@ -45,11 +45,9 @@ def parse_json(response): | ||
| 45 | containerid | 45 | containerid |
| 46 | ]) | 46 | ]) |
| 47 | 47 | ||
| 48 | -def start(): | 48 | + |
| 49 | +if __name__ == '__main__': | ||
| 49 | init() | 50 | init() |
| 50 | url = 'https://weibo.com/ajax/feed/allGroups' | 51 | url = 'https://weibo.com/ajax/feed/allGroups' |
| 51 | response = get_data(url) | 52 | response = get_data(url) |
| 52 | parse_json(response) | 53 | parse_json(response) |
| 53 | - | ||
| 54 | -if __name__ == '__main__': | ||
| 55 | - start() |
-
Please register or login to post a comment