Merge pull request #12 from lintsinghua/main (committed by GitHub)

Optimized the web scraping functionality to support multi-account crawling and enhance performance.

Showing 3 changed files with 154 additions and 182 deletions.
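
All three changed files apply the same core change: the single hard-coded Cookie header is removed from fetchData, and each request instead draws one account's headers at random from a caller-supplied headers_list, with a 10-second timeout and soft failure on network errors. A minimal sketch of that shared pattern, assembled from the diffs below (each module then indexes the payload key it needs: 'data', 'statuses', or ['data']['modules']):

import random
import requests
from requests.exceptions import RequestException

def fetchData(url, params, headers_list):
    # Rotate across accounts: pick one account's headers per request
    headers = random.choice(headers_list)
    try:
        response = requests.get(url, headers=headers, params=params, timeout=10)
        if response.status_code == 200:
            return response.json()  # callers index into the key they need
        return None
    except RequestException as e:
        print(f"Request failed: {e}")
        return None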
File 1 of 3: the comment crawler.

@@ -2,99 +2,87 @@ import time
 import requests
 import csv
 import os
+import random
 from datetime import datetime
-from .settings import articleAddr,commentsAddr
+from .settings import articleAddr, commentsAddr
+from requests.exceptions import RequestException
 
+# Initialize: create the comments data file
 def init():
     if not os.path.exists(commentsAddr):
-        with open(commentsAddr,'w',encoding='utf-8',newline='') as csvFile:
+        with open(commentsAddr, 'w', encoding='utf-8', newline='') as csvFile:
             writer = csv.writer(csvFile)
             writer.writerow([
-                'articleId',
-                'created_at',
-                'likes_counts',
-                'region',
-                'content',
-                'authorName',
-                'authorGender',
-                'authorAddress',
-                'authorAvatar'
+                'articleId', 'created_at', 'likes_counts', 'region', 'content',
+                'authorName', 'authorGender', 'authorAddress', 'authorAvatar'
             ])
 
+# Append one comment row to the CSV
 def write(row):
     with open(commentsAddr, 'a', encoding='utf-8', newline='') as csvFile:
         writer = csv.writer(csvFile)
         writer.writerow(row)
 
-def fetchData(url,params):
-    headers = {
-        'Cookie':'SINAGLOBAL=2555941826014.1074.1676801766625; ULV=1719829459275:6:1:2:4660996305989.918.1719827559898:1719743122299; UOR=,,www.baidu.com; XSRF-TOKEN=VtLXviYSIs8lor7sz4iGyigL; SUB=_2A25LhvU9DeRhGeFH6FIX-S3MyD2IHXVo-gj1rDV8PUJbkNAGLRXMkW1Ne2nhI3Gle25QJK0Z99J3trq_NZn6YKJ-; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WW3Mv8V5EupQbbKh.vaZIwU5JpX5KzhUgL.FoM4e05c1Ke7e022dJLoIp7LxKML1KBLBKnLxKqL1hnLBoM41hz41hqReKqN; WBPSESS=Dt2hbAUaXfkVprjyrAZT_LRaDLsnxG-kIbeYwnBb5OUKZiwfVr_UrcYfWuqG-4ZVDM5HeU3HXkDNK_thfRfdS9Ao6ezT30jDksv-CpaVmlTAqGUHjJ7PYkH5aCK4HLxmRq14ZalmQNwzfWMPa4y0VNRLuYdg7L1s49ymNq_5v5vusoz0r4ki6u-MHGraF0fbUTgX14x0kHayEwOoxfLI-w==; SCF=AqmJWo31oFV5itnRgWNU1-wHQTL6PmkBLf3gDuqpdqAIfaWguDTMre6Oxjf5Uzs74JAh2r0DdV1sJ1g6m-wJ5NQ.; _s_tentry=-; Apache=4660996305989.918.1719827559898; PC_TOKEN=7955a7ab1f; appkey=; geetest_token=602cd4e3a7ed1898808f8adfe1a2048b; ALF=1722421868',
-        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
-    }
-    response = requests.get(url,headers=headers,params=params)
-    if response.status_code == 200:
-        return response.json()['data']
-    else:
+# Fetch data, rotating randomly across multiple accounts
+def fetchData(url, params, headers_list):
+    headers = random.choice(headers_list)
+    try:
+        response = requests.get(url, headers=headers, params=params, timeout=10)
+        if response.status_code == 200:
+            return response.json()['data']
+        else:
+            return None
+    except RequestException as e:
+        print(f"Request failed: {e}")
         return None
 
+# Load the article list
 def getArticleList():
     articleList = []
-    with open(articleAddr,'r',encoding='utf-8') as reader:
+    with open(articleAddr, 'r', encoding='utf-8') as reader:
         readerCsv = csv.reader(reader)
         next(reader)
         for nav in readerCsv:
             articleList.append(nav)
     return articleList
 
-def readJson(response,artileId):
+# Parse comment data
+def readJson(response, articleId):
     for comment in response:
-        created_at = datetime.strptime(comment['created_at'],'%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d')
+        created_at = datetime.strptime(comment['created_at'], '%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d')
         likes_counts = comment['like_counts']
-        try:
-            region = comment['source'].replace('来自', '')
-        except:
-            region = '无'
+        region = comment.get('source', '无').replace('来自', '')
         content = comment['text_raw']
         authorName = comment['user']['screen_name']
         authorGender = comment['user']['gender']
         authorAddress = comment['user']['location']
         authorAvatar = comment['user']['avatar_large']
-        write([
-            artileId,
-            created_at,
-            likes_counts,
-            region,
-            content,
-            authorName,
-            authorGender,
-            authorAddress,
-            authorAvatar
-        ])
+        write([articleId, created_at, likes_counts, region, content, authorName, authorGender, authorAddress, authorAvatar])
 
-def start():
+# Start the crawler
+def start(headers_list, delay=2):
     commentUrl = 'https://weibo.com/ajax/statuses/buildComments'
     init()
     articleList = getArticleList()
     for article in articleList:
         articleId = article[0]
-        print('Crawling comments for article id %s' % articleId)
-        time.sleep(2)
-        params = {
-            'id':int(articleId),
-            'is_show_bulletin':2
-        }
-        response = fetchData(commentUrl,params)
-        readJson(response,articleId)
-
-
+        print(f'Crawling comments for article id {articleId}')
+        time.sleep(random.uniform(1, delay))  # random delay to avoid hammering the endpoint
+        params = {'id': int(articleId), 'is_show_bulletin': 2}
+        response = fetchData(commentUrl, params, headers_list)
+        if response:
+            readJson(response, articleId)
 
 if __name__ == '__main__':
-    start()
-
-
-
-
-
-
-
-
-
+    # headers_list here should contain cookies for multiple accounts
+    headers_list = [
+        {
+            'Cookie': 'your_cookie_here',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
+        },
+        {
+            'Cookie': 'another_cookie_here',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
+        }
+    ]
+    start(headers_list)
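
With the shared cookie removed from the source, callers must now assemble headers_list themselves. A hypothetical helper (not part of this PR; the cookies.json filename and its format are assumptions) that keeps per-account cookies out of source control:

import json

def load_headers_list(path='cookies.json'):
    # Assumed format: a JSON array of raw cookie strings, one per account
    with open(path, 'r', encoding='utf-8') as f:
        cookies = json.load(f)
    ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
    return [{'Cookie': cookie, 'User-Agent': ua} for cookie in cookies]

# usage: start(load_headers_list())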
File 2 of 3: the article crawler.

@@ -2,123 +2,97 @@ import time
 import requests
 import csv
 import os
+import random
 from datetime import datetime
-from .settings import navAddr,articleAddr
+from .settings import navAddr, articleAddr
+from requests.exceptions import RequestException
 
+# Initialize the article data file
 def init():
     if not os.path.exists(articleAddr):
-        with open(articleAddr,'w',encoding='utf-8',newline='') as csvFile:
+        with open(articleAddr, 'w', encoding='utf-8', newline='') as csvFile:
             writer = csv.writer(csvFile)
             writer.writerow([
-                'id',
-                'likeNum',
-                'commentsLen',
-                'reposts_count',
-                'region',
-                'content',
-                'contentLen',
-                'created_at',
-                'type',
-                'detailUrl',# followBtnCode>uid + mblogid
-                'authorAvatar',
-                'authorName',
-                'authorDetail',
-                'isVip' # v_plus
+                'id', 'likeNum', 'commentsLen', 'reposts_count', 'region', 'content', 'contentLen',
+                'created_at', 'type', 'detailUrl', 'authorAvatar', 'authorName', 'authorDetail', 'isVip'
             ])
 
+# Append one article row to the CSV
 def write(row):
     with open(articleAddr, 'a', encoding='utf-8', newline='') as csvFile:
         writer = csv.writer(csvFile)
         writer.writerow(row)
 
-def fetchData(url,params):
-    headers = {
-        'Cookie':'SINAGLOBAL=2555941826014.1074.1676801766625; ULV=1719829459275:6:1:2:4660996305989.918.1719827559898:1719743122299; UOR=,,www.baidu.com; XSRF-TOKEN=VtLXviYSIs8lor7sz4iGyigL; SUB=_2A25LhvU9DeRhGeFH6FIX-S3MyD2IHXVo-gj1rDV8PUJbkNAGLRXMkW1Ne2nhI3Gle25QJK0Z99J3trq_NZn6YKJ-; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WW3Mv8V5EupQbbKh.vaZIwU5JpX5KzhUgL.FoM4e05c1Ke7e022dJLoIp7LxKML1KBLBKnLxKqL1hnLBoM41hz41hqReKqN; WBPSESS=Dt2hbAUaXfkVprjyrAZT_LRaDLsnxG-kIbeYwnBb5OUKZiwfVr_UrcYfWuqG-4ZVDM5HeU3HXkDNK_thfRfdS9Ao6ezT30jDksv-CpaVmlTAqGUHjJ7PYkH5aCK4HLxmRq14ZalmQNwzfWMPa4y0VNRLuYdg7L1s49ymNq_5v5vusoz0r4ki6u-MHGraF0fbUTgX14x0kHayEwOoxfLI-w==; SCF=AqmJWo31oFV5itnRgWNU1-wHQTL6PmkBLf3gDuqpdqAIfaWguDTMre6Oxjf5Uzs74JAh2r0DdV1sJ1g6m-wJ5NQ.; _s_tentry=-; Apache=4660996305989.918.1719827559898; PC_TOKEN=7955a7ab1f; appkey=; geetest_token=602cd4e3a7ed1898808f8adfe1a2048b; ALF=1722421868',
-        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
-    }
-    response = requests.get(url,headers=headers,params=params)
-    if response.status_code == 200:
-        return response.json()['statuses']
-    else:
+# Fetch data with multi-account support
+def fetchData(url, params, headers_list):
+    headers = random.choice(headers_list)
+    try:
+        response = requests.get(url, headers=headers, params=params, timeout=10)
+        if response.status_code == 200:
+            return response.json()['statuses']
+        else:
+            return None
+    except RequestException as e:
+        print(f"Request failed: {e}")
         return None
 
+# Load the type (channel) list
 def getTypeList():
     typeList = []
-    with open(navAddr,'r',encoding='utf-8') as reader:
+    with open(navAddr, 'r', encoding='utf-8') as reader:
         readerCsv = csv.reader(reader)
         next(reader)
         for nav in readerCsv:
             typeList.append(nav)
     return typeList
 
-def readJson(response,type):
-    for artice in response:
-        id = artice['id']
-        likeNum = artice['attitudes_count']
-        commentsLen = artice['comments_count']
-        reposts_count = artice['reposts_count']
-        try:
-            region = artice['region_name'].replace('发布于 ', '')
-        except:
-            region = '无'
-        content = artice['text_raw']
-        contentLen = artice['textLength']
-        created_at = datetime.strptime(artice['created_at'],'%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d')
-        type = type
-        try:
-            detailUrl = 'https://weibo.com/' + str(artice['id']) + '/' + str(artice['mblogid'])
-        except:
-            detailUrl = '无'
-        authorAvatar = artice['user']['avatar_large']
-        authorName = artice['user']['screen_name']
-        authorDetail = 'https://weibo.com/u/' + str(artice['user']['id'])
-        isVip = artice['user']['v_plus']
-        write([
-            id,
-            likeNum,
-            commentsLen,
-            reposts_count,
-            region,
-            content,
-            contentLen,
-            created_at,
-            type,
-            detailUrl,
-            authorAvatar,
-            authorName,
-            authorDetail,
-            isVip
-        ])
+# Parse article data
+def readJson(response, type):
+    for article in response:
+        id = article['id']
+        likeNum = article['attitudes_count']
+        commentsLen = article['comments_count']
+        reposts_count = article['reposts_count']
+        region = article.get('region_name', '无').replace('发布于 ', '')
+        content = article['text_raw']
+        contentLen = article['textLength']
+        created_at = datetime.strptime(article['created_at'], '%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d')
+        detailUrl = f"https://weibo.com/{article['id']}/{article['mblogid']}" if 'mblogid' in article else '无'
+        authorAvatar = article['user']['avatar_large']
+        authorName = article['user']['screen_name']
+        authorDetail = f"https://weibo.com/u/{article['user']['id']}"
+        isVip = article['user']['v_plus']
+        write([id, likeNum, commentsLen, reposts_count, region, content, contentLen, created_at, type, detailUrl, authorAvatar, authorName, authorDetail, isVip])
 
-def start(typeNum=14,pageNum=3):
+# Start the crawler
+def start(headers_list, typeNum=14, pageNum=3, delay=2):
     articleUrl = 'https://weibo.com/ajax/feed/hottimeline'
     init()
     typeList = getTypeList()
-    typeNumCount = 0
-    for type in typeList:
-        if typeNumCount > typeNum:return
-        time.sleep(2)
-        for page in range(0,pageNum):
-            print('Crawling type %s, page %s' % (type[0],page + 1))
-            time.sleep(1)
-            parmas = {
-                'group_id':type[1],
-                'containerid':type[2],
-                'max_id':page,
-                'count':10,
-                'extparam':'discover|new_feed'
+    for type in typeList[:typeNum]:
+        for page in range(pageNum):
+            print(f'Crawling type {type[0]}, page {page + 1}')
+            time.sleep(random.uniform(1, delay))  # random delay
+            params = {
+                'group_id': type[1],
+                'containerid': type[2],
+                'max_id': page,
+                'count': 10,
+                'extparam': 'discover|new_feed'
             }
-            response = fetchData(articleUrl,parmas)
-            readJson(response,type[0])
-        typeNumCount += 1
+            response = fetchData(articleUrl, params, headers_list)
+            if response:
+                readJson(response, type[0])
 
 if __name__ == '__main__':
-    start()
-
-
-
-
-
-
-
-
-
+    headers_list = [
+        {
+            'Cookie': 'your_cookie_here',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
+        },
+        {
+            'Cookie': 'another_cookie_here',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
+        }
+    ]
+    start(headers_list)
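
The article crawler's start now takes headers_list first and exposes paging and throttling: start(headers_list, typeNum=14, pageNum=3, delay=2). Note the slice typeList[:typeNum] crawls exactly typeNum channels, whereas the old counter-based loop appears to have run one extra (it returned only once typeNumCount exceeded typeNum). An illustrative invocation (the values are arbitrary):

# Crawl the first 5 channels, 2 pages each, sleeping 1-3 seconds between requests
start(headers_list, typeNum=5, pageNum=2, delay=3)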
File 3 of 3: the navigation crawler.

@@ -2,54 +2,64 @@ import requests
 import csv
 import numpy as np
 import os
+import random
 from .settings import navAddr
+from requests.exceptions import RequestException
+
+# Initialize the navigation data file
 def init():
     if not os.path.exists(navAddr):
-        with open(navAddr,'w',encoding='utf-8',newline='') as csvFile:
+        with open(navAddr, 'w', encoding='utf-8', newline='') as csvFile:
             writer = csv.writer(csvFile)
-            writer.writerow([
-                'typeName',
-                'gid',
-                'containerid'
-            ])
+            writer.writerow(['typeName', 'gid', 'containerid'])
 
+# Append one navigation row to the CSV
 def write(row):
     with open(navAddr, 'a', encoding='utf-8', newline='') as csvFile:
         writer = csv.writer(csvFile)
         writer.writerow(row)
 
-def fetchData(url):
-    headers = {
-        'Cookie':'SINAGLOBAL=2555941826014.1074.1676801766625; ULV=1719829459275:6:1:2:4660996305989.918.1719827559898:1719743122299; UOR=,,www.baidu.com; XSRF-TOKEN=VtLXviYSIs8lor7sz4iGyigL; SUB=_2A25LhvU9DeRhGeFH6FIX-S3MyD2IHXVo-gj1rDV8PUJbkNAGLRXMkW1Ne2nhI3Gle25QJK0Z99J3trq_NZn6YKJ-; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WW3Mv8V5EupQbbKh.vaZIwU5JpX5KzhUgL.FoM4e05c1Ke7e022dJLoIp7LxKML1KBLBKnLxKqL1hnLBoM41hz41hqReKqN; WBPSESS=Dt2hbAUaXfkVprjyrAZT_LRaDLsnxG-kIbeYwnBb5OUKZiwfVr_UrcYfWuqG-4ZVDM5HeU3HXkDNK_thfRfdS9Ao6ezT30jDksv-CpaVmlTAqGUHjJ7PYkH5aCK4HLxmRq14ZalmQNwzfWMPa4y0VNRLuYdg7L1s49ymNq_5v5vusoz0r4ki6u-MHGraF0fbUTgX14x0kHayEwOoxfLI-w==; SCF=AqmJWo31oFV5itnRgWNU1-wHQTL6PmkBLf3gDuqpdqAIfaWguDTMre6Oxjf5Uzs74JAh2r0DdV1sJ1g6m-wJ5NQ.; _s_tentry=-; Apache=4660996305989.918.1719827559898; PC_TOKEN=7955a7ab1f; appkey=; geetest_token=602cd4e3a7ed1898808f8adfe1a2048b; ALF=1722421868',
-        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
-    }
-    params = {
-        'is_new_segment':1,
-        'fetch_hot':1
-    }
-    response = requests.get(url,headers=headers,params=params)
-    if response.status_code == 200:
-        return response.json()
-    else:
+# Fetch data with multi-account support
+def fetchData(url, headers_list):
+    headers = random.choice(headers_list)
+    try:
+        response = requests.get(url, headers=headers, timeout=10)
+        if response.status_code == 200:
+            return response.json()['data']['modules']
+        else:
+            return None
+    except RequestException as e:
+        print(f"Request failed: {e}")
         return None
 
+# Parse navigation data
 def readJson(response):
-    navList = np.append(response['groups'][3]['group'],response['groups'][4]['group'])
-    for nav in navList:
-        navName = nav['title']
-        gid = nav['gid']
-        containerid = nav['containerid']
-        write([
-            navName,
-            gid,
-            containerid
-        ])
+    for module in response:
+        if 'type' in module and 'typeName' in module:
+            typeName = module['typeName']
+            for submodule in module['modules']:
+                if 'id' in submodule and 'containerid' in submodule:
+                    gid = submodule['id']
+                    containerid = submodule['containerid']
+                    write([typeName, gid, containerid])
 
-def start():
+# Start the crawler
+def start(headers_list):
+    navUrl = 'https://weibo.com/ajax/side/hot'
     init()
-    url = 'https://weibo.com/ajax/feed/allGroups'
-    response = fetchData(url)
-    readJson(response)
+    response = fetchData(navUrl, headers_list)
+    if response:
+        readJson(response)
 
 if __name__ == '__main__':
-    start()
+    headers_list = [
+        {
+            'Cookie': 'your_cookie_here',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
+        },
+        {
+            'Cookie': 'another_cookie_here',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
+        }
+    ]
+    start(headers_list)
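
The navigation crawler now targets https://weibo.com/ajax/side/hot instead of https://weibo.com/ajax/feed/allGroups and walks data.modules rather than indexing fixed offsets in groups. The shape readJson expects can be read off its guards; an illustrative input (every value below is a placeholder inferred from the parser, not taken from the live API):

sample_modules = [
    {
        'type': 'group',                # placeholder value
        'typeName': 'example_channel',  # placeholder value
        'modules': [
            {'id': 'example_gid', 'containerid': 'example_containerid'},
        ],
    },
]
readJson(sample_modules)  # appends ['example_channel', 'example_gid', 'example_containerid'] to the nav CSV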