Showing
1 changed file
with
44 additions
and
34 deletions
| @@ -2,54 +2,64 @@ import requests | @@ -2,54 +2,64 @@ import requests | ||
| 2 | import csv | 2 | import csv |
| 3 | import numpy as np | 3 | import numpy as np |
| 4 | import os | 4 | import os |
| 5 | +import random | ||
| 5 | from .settings import navAddr | 6 | from .settings import navAddr |
| 7 | +from requests.exceptions import RequestException | ||
| 8 | + | ||
def init():
    """Create the navigation CSV file with its header row if it is missing.

    An existing file at ``navAddr`` is left untouched, so rows appended by
    earlier runs are preserved.
    """
    # Make sure the target directory exists; open() would otherwise fail
    # with FileNotFoundError when the directory has not been created yet.
    parent = os.path.dirname(navAddr)
    if parent:
        os.makedirs(parent, exist_ok=True)

    try:
        # Mode 'x' creates the file only when it does not exist yet, which
        # removes the gap between a separate exists() check and the open().
        with open(navAddr, 'x', encoding='utf-8', newline='') as csvFile:
            csv.writer(csvFile).writerow(['typeName', 'gid', 'containerid'])
    except FileExistsError:
        # Already initialised: keep the existing contents.
        pass
| 15 | 15 | ||
def write(row):
    """Append one record to the navigation CSV file at ``navAddr``.

    Args:
        row: Sequence of cell values written as a single CSV line.
    """
    # newline='' leaves line-ending handling to the csv module.
    with open(navAddr, mode='a', newline='', encoding='utf-8') as out:
        csv.writer(out).writerow(row)
| 20 | 21 | ||
def fetchData(url, headers_list=None):
    """Fetch the navigation modules from ``url``.

    Args:
        url: Endpoint to request.
        headers_list: Optional sequence of header dicts (for example one per
            account). One entry is picked at random for the request. When it
            is ``None`` or empty, the request is sent with the default
            headers of ``requests``.

    Returns:
        The value stored under ``['data']['modules']`` in the JSON response,
        or ``None`` when the request fails, the status code is not 200, or
        the body does not have that shape.
    """
    # random.choice() raises on an empty sequence, so only draw when there is
    # something to draw from.
    headers = random.choice(headers_list) if headers_list else None

    try:
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException as e:
        print(f"请求失败:{e}")
        return None

    if response.status_code != 200:
        return None

    try:
        return response.json()['data']['modules']
    except (ValueError, KeyError, TypeError) as e:
        # ValueError: the body is not valid JSON.
        # KeyError / TypeError: the JSON does not contain data -> modules.
        print(f"Unexpected response format: {e!r}")
        return None
| 35 | 34 | ||
def readJson(response):
    """Write one CSV row per navigation entry found in ``response``.

    Args:
        response: Iterable of module dicts, or ``None``. A module is used
            only when it has both a ``'type'`` and a ``'typeName'`` key; its
            ``'modules'`` list (if any) is then scanned, and every entry
            that has both ``'id'`` and ``'containerid'`` is passed to
            ``write`` as ``[typeName, id, containerid]``.

    Entries that are not dicts, and modules without a ``'modules'`` list,
    are skipped instead of raising.
    """
    for module in response or ():
        if not isinstance(module, dict):
            continue
        if 'type' not in module or 'typeName' not in module:
            continue

        typeName = module['typeName']
        # 'modules' may be absent or null; treat both as "no sub-entries".
        for submodule in module.get('modules') or ():
            if not isinstance(submodule, dict):
                continue
            if 'id' in submodule and 'containerid' in submodule:
                write([typeName, submodule['id'], submodule['containerid']])
| 47 | 45 | ||
def start(headers_list):
    """Run the navigation crawl: prepare the CSV, download, then persist.

    Args:
        headers_list: Header dicts forwarded to ``fetchData``.
    """
    init()
    modules = fetchData('https://weibo.com/ajax/side/hot', headers_list)
    if not modules:
        # Nothing was fetched (request failed or the list is empty).
        return
    readJson(modules)
| 53 | 53 | ||
if __name__ == '__main__':
    # Placeholder credentials: one cookie per account, same User-Agent for all.
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
    cookies = ('your_cookie_here', 'another_cookie_here')
    headers_list = [
        {'Cookie': cookie, 'User-Agent': user_agent}
        for cookie in cookies
    ]
    start(headers_list)
-
Please register or login to post a comment