Showing 1 changed file with 61 additions and 87 deletions.
| @@ -2,123 +2,97 @@ import time | @@ -2,123 +2,97 @@ import time | ||
| 2 | import requests | 2 | import requests |
| 3 | import csv | 3 | import csv |
| 4 | import os | 4 | import os |
| 5 | +import random | ||
| 5 | from datetime import datetime | 6 | from datetime import datetime |
| 6 | -from .settings import navAddr,articleAddr | 7 | +from .settings import navAddr, articleAddr |
| 8 | +from requests.exceptions import RequestException | ||
| 7 | 9 | ||
| 10 | +# 初始化文章数据文件 | ||
| 8 | def init(): | 11 | def init(): |
| 9 | if not os.path.exists(articleAddr): | 12 | if not os.path.exists(articleAddr): |
| 10 | - with open(articleAddr,'w',encoding='utf-8',newline='') as csvFile: | 13 | + with open(articleAddr, 'w', encoding='utf-8', newline='') as csvFile: |
| 11 | writer = csv.writer(csvFile) | 14 | writer = csv.writer(csvFile) |
| 12 | writer.writerow([ | 15 | writer.writerow([ |
| 13 | - 'id', | ||
| 14 | - 'likeNum', | ||
| 15 | - 'commentsLen', | ||
| 16 | - 'reposts_count', | ||
| 17 | - 'region', | ||
| 18 | - 'content', | ||
| 19 | - 'contentLen', | ||
| 20 | - 'created_at', | ||
| 21 | - 'type', | ||
| 22 | - 'detailUrl',# followBtnCode>uid + mblogid | ||
| 23 | - 'authorAvatar', | ||
| 24 | - 'authorName', | ||
| 25 | - 'authorDetail', | ||
| 26 | - 'isVip' # v_plus | 16 | + 'id', 'likeNum', 'commentsLen', 'reposts_count', 'region', 'content', 'contentLen', |
| 17 | + 'created_at', 'type', 'detailUrl', 'authorAvatar', 'authorName', 'authorDetail', 'isVip' | ||
| 27 | ]) | 18 | ]) |
| 28 | 19 | ||
| 20 | +# 写入数据到CSV | ||
| 29 | def write(row): | 21 | def write(row): |
| 30 | with open(articleAddr, 'a', encoding='utf-8', newline='') as csvFile: | 22 | with open(articleAddr, 'a', encoding='utf-8', newline='') as csvFile: |
| 31 | writer = csv.writer(csvFile) | 23 | writer = csv.writer(csvFile) |
| 32 | writer.writerow(row) | 24 | writer.writerow(row) |
| 33 | 25 | ||
| 34 | -def fetchData(url,params): | ||
| 35 | - headers = { | ||
| 36 | - 'Cookie':'SINAGLOBAL=2555941826014.1074.1676801766625; ULV=1719829459275:6:1:2:4660996305989.918.1719827559898:1719743122299; UOR=,,www.baidu.com; XSRF-TOKEN=VtLXviYSIs8lor7sz4iGyigL; SUB=_2A25LhvU9DeRhGeFH6FIX-S3MyD2IHXVo-gj1rDV8PUJbkNAGLRXMkW1Ne2nhI3Gle25QJK0Z99J3trq_NZn6YKJ-; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WW3Mv8V5EupQbbKh.vaZIwU5JpX5KzhUgL.FoM4e05c1Ke7e022dJLoIp7LxKML1KBLBKnLxKqL1hnLBoM41hz41hqReKqN; WBPSESS=Dt2hbAUaXfkVprjyrAZT_LRaDLsnxG-kIbeYwnBb5OUKZiwfVr_UrcYfWuqG-4ZVDM5HeU3HXkDNK_thfRfdS9Ao6ezT30jDksv-CpaVmlTAqGUHjJ7PYkH5aCK4HLxmRq14ZalmQNwzfWMPa4y0VNRLuYdg7L1s49ymNq_5v5vusoz0r4ki6u-MHGraF0fbUTgX14x0kHayEwOoxfLI-w==; SCF=AqmJWo31oFV5itnRgWNU1-wHQTL6PmkBLf3gDuqpdqAIfaWguDTMre6Oxjf5Uzs74JAh2r0DdV1sJ1g6m-wJ5NQ.; _s_tentry=-; Apache=4660996305989.918.1719827559898; PC_TOKEN=7955a7ab1f; appkey=; geetest_token=602cd4e3a7ed1898808f8adfe1a2048b; ALF=1722421868', | ||
| 37 | - 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0' | ||
| 38 | - } | ||
| 39 | - response = requests.get(url,headers=headers,params=params) | 26 | +# 获取数据,支持多账号 |
| 27 | +def fetchData(url, params, headers_list): | ||
| 28 | + headers = random.choice(headers_list) | ||
| 29 | + try: | ||
| 30 | + response = requests.get(url, headers=headers, params=params, timeout=10) | ||
| 40 | if response.status_code == 200: | 31 | if response.status_code == 200: |
| 41 | return response.json()['statuses'] | 32 | return response.json()['statuses'] |
| 42 | else: | 33 | else: |
| 43 | return None | 34 | return None |
| 35 | + except RequestException as e: | ||
| 36 | + print(f"请求失败:{e}") | ||
| 37 | + return None | ||
| 44 | 38 | ||
| 39 | +# 获取类型列表 | ||
| 45 | def getTypeList(): | 40 | def getTypeList(): |
| 46 | typeList = [] | 41 | typeList = [] |
| 47 | - with open(navAddr,'r',encoding='utf-8') as reader: | 42 | + with open(navAddr, 'r', encoding='utf-8') as reader: |
| 48 | readerCsv = csv.reader(reader) | 43 | readerCsv = csv.reader(reader) |
| 49 | next(reader) | 44 | next(reader) |
| 50 | for nav in readerCsv: | 45 | for nav in readerCsv: |
| 51 | typeList.append(nav) | 46 | typeList.append(nav) |
| 52 | return typeList | 47 | return typeList |
| 53 | 48 | ||
| 54 | -def readJson(response,type): | ||
| 55 | - for artice in response: | ||
| 56 | - id = artice['id'] | ||
| 57 | - likeNum = artice['attitudes_count'] | ||
| 58 | - commentsLen = artice['comments_count'] | ||
| 59 | - reposts_count = artice['reposts_count'] | ||
| 60 | - try: | ||
| 61 | - region = artice['region_name'].replace('发布于 ', '') | ||
| 62 | - except: | ||
| 63 | - region = '无' | ||
| 64 | - content = artice['text_raw'] | ||
| 65 | - contentLen = artice['textLength'] | ||
| 66 | - created_at = datetime.strptime(artice['created_at'],'%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d') | ||
| 67 | - type = type | ||
| 68 | - try: | ||
| 69 | - detailUrl = 'https://weibo.com/' + str(artice['id']) + '/' + str(artice['mblogid']) | ||
| 70 | - except: | ||
| 71 | - detailUrl = '无' | ||
| 72 | - authorAvatar = artice['user']['avatar_large'] | ||
| 73 | - authorName = artice['user']['screen_name'] | ||
| 74 | - authorDetail = 'https://weibo.com/u/' + str(artice['user']['id']) | ||
| 75 | - isVip = artice['user']['v_plus'] | ||
| 76 | - write([ | ||
| 77 | - id, | ||
| 78 | - likeNum, | ||
| 79 | - commentsLen, | ||
| 80 | - reposts_count, | ||
| 81 | - region, | ||
| 82 | - content, | ||
| 83 | - contentLen, | ||
| 84 | - created_at, | ||
| 85 | - type, | ||
| 86 | - detailUrl, | ||
| 87 | - authorAvatar, | ||
| 88 | - authorName, | ||
| 89 | - authorDetail, | ||
| 90 | - isVip | ||
| 91 | - ]) | 49 | +# 解析文章数据 |
| 50 | +def readJson(response, type): | ||
| 51 | + for article in response: | ||
| 52 | + id = article['id'] | ||
| 53 | + likeNum = article['attitudes_count'] | ||
| 54 | + commentsLen = article['comments_count'] | ||
| 55 | + reposts_count = article['reposts_count'] | ||
| 56 | + region = article.get('region_name', '无').replace('发布于 ', '') | ||
| 57 | + content = article['text_raw'] | ||
| 58 | + contentLen = article['textLength'] | ||
| 59 | + created_at = datetime.strptime(article['created_at'], '%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d') | ||
| 60 | + detailUrl = f"https://weibo.com/{article['id']}/{article['mblogid']}" if 'mblogid' in article else '无' | ||
| 61 | + authorAvatar = article['user']['avatar_large'] | ||
| 62 | + authorName = article['user']['screen_name'] | ||
| 63 | + authorDetail = f"https://weibo.com/u/{article['user']['id']}" | ||
| 64 | + isVip = article['user']['v_plus'] | ||
| 65 | + write([id, likeNum, commentsLen, reposts_count, region, content, contentLen, created_at, type, detailUrl, authorAvatar, authorName, authorDetail, isVip]) | ||
| 92 | 66 | ||
| 93 | -def start(typeNum=14,pageNum=3): | 67 | +# 启动爬虫 |
| 68 | +def start(headers_list, typeNum=14, pageNum=3, delay=2): | ||
| 94 | articleUrl = 'https://weibo.com/ajax/feed/hottimeline' | 69 | articleUrl = 'https://weibo.com/ajax/feed/hottimeline' |
| 95 | init() | 70 | init() |
| 96 | typeList = getTypeList() | 71 | typeList = getTypeList() |
| 97 | - typeNumCount = 0 | ||
| 98 | - for type in typeList: | ||
| 99 | - if typeNumCount > typeNum:return | ||
| 100 | - time.sleep(2) | ||
| 101 | - for page in range(0,pageNum): | ||
| 102 | - print('正在爬取的类型:%s 中的第%s页文章数据' % (type[0],page + 1)) | ||
| 103 | - time.sleep(1) | ||
| 104 | - parmas = { | ||
| 105 | - 'group_id':type[1], | ||
| 106 | - 'containerid':type[2], | ||
| 107 | - 'max_id':page, | ||
| 108 | - 'count':10, | ||
| 109 | - 'extparam':'discover|new_feed' | 72 | + for type in typeList[:typeNum]: |
| 73 | + for page in range(pageNum): | ||
| 74 | + print(f'正在爬取的类型:{type[0]} 中的第{page + 1}页文章数据') | ||
| 75 | + time.sleep(random.uniform(1, delay)) # 随机延时 | ||
| 76 | + params = { | ||
| 77 | + 'group_id': type[1], | ||
| 78 | + 'containerid': type[2], | ||
| 79 | + 'max_id': page, | ||
| 80 | + 'count': 10, | ||
| 81 | + 'extparam': 'discover|new_feed' | ||
| 110 | } | 82 | } |
| 111 | - response = fetchData(articleUrl,parmas) | ||
| 112 | - readJson(response,type[0]) | ||
| 113 | - typeNumCount += 1 | 83 | + response = fetchData(articleUrl, params, headers_list) |
| 84 | + if response: | ||
| 85 | + readJson(response, type[0]) | ||
| 114 | 86 | ||
| 115 | if __name__ == '__main__': | 87 | if __name__ == '__main__': |
| 116 | - start() | ||
| 117 | - | ||
| 118 | - | ||
| 119 | - | ||
| 120 | - | ||
| 121 | - | ||
| 122 | - | ||
| 123 | - | ||
| 124 | - | 88 | + headers_list = [ |
| 89 | + { | ||
| 90 | + 'Cookie': 'your_cookie_here', | ||
| 91 | + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0' | ||
| 92 | + }, | ||
| 93 | + { | ||
| 94 | + 'Cookie': 'another_cookie_here', | ||
| 95 | + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0' | ||
| 96 | + } | ||
| 97 | + ] | ||
| 98 | + start(headers_list) |
Please register or log in to post a comment.