Showing 1 changed file with 43 additions and 55 deletions
| @@ -2,99 +2,87 @@ import time | @@ -2,99 +2,87 @@ import time | ||
| 2 | import requests | 2 | import requests |
| 3 | import csv | 3 | import csv |
| 4 | import os | 4 | import os |
| 5 | +import random | ||
| 5 | from datetime import datetime | 6 | from datetime import datetime |
| 6 | -from .settings import articleAddr,commentsAddr | 7 | +from .settings import articleAddr, commentsAddr |
| 8 | +from requests.exceptions import RequestException | ||
| 7 | 9 | ||
| 10 | +# 初始化,创建评论数据文件 | ||
| 8 | def init(): | 11 | def init(): |
| 9 | if not os.path.exists(commentsAddr): | 12 | if not os.path.exists(commentsAddr): |
| 10 | - with open(commentsAddr,'w',encoding='utf-8',newline='') as csvFile: | 13 | + with open(commentsAddr, 'w', encoding='utf-8', newline='') as csvFile: |
| 11 | writer = csv.writer(csvFile) | 14 | writer = csv.writer(csvFile) |
| 12 | writer.writerow([ | 15 | writer.writerow([ |
| 13 | - 'articleId', | ||
| 14 | - 'created_at', | ||
| 15 | - 'likes_counts', | ||
| 16 | - 'region', | ||
| 17 | - 'content', | ||
| 18 | - 'authorName', | ||
| 19 | - 'authorGender', | ||
| 20 | - 'authorAddress', | ||
| 21 | - 'authorAvatar' | 16 | + 'articleId', 'created_at', 'likes_counts', 'region', 'content', |
| 17 | + 'authorName', 'authorGender', 'authorAddress', 'authorAvatar' | ||
| 22 | ]) | 18 | ]) |
| 23 | 19 | ||
| 20 | +# 写入评论数据到CSV | ||
| 24 | def write(row): | 21 | def write(row): |
| 25 | with open(commentsAddr, 'a', encoding='utf-8', newline='') as csvFile: | 22 | with open(commentsAddr, 'a', encoding='utf-8', newline='') as csvFile: |
| 26 | writer = csv.writer(csvFile) | 23 | writer = csv.writer(csvFile) |
| 27 | writer.writerow(row) | 24 | writer.writerow(row) |
| 28 | 25 | ||
| 29 | -def fetchData(url,params): | ||
| 30 | - headers = { | ||
| 31 | - 'Cookie':'SINAGLOBAL=2555941826014.1074.1676801766625; ULV=1719829459275:6:1:2:4660996305989.918.1719827559898:1719743122299; UOR=,,www.baidu.com; XSRF-TOKEN=VtLXviYSIs8lor7sz4iGyigL; SUB=_2A25LhvU9DeRhGeFH6FIX-S3MyD2IHXVo-gj1rDV8PUJbkNAGLRXMkW1Ne2nhI3Gle25QJK0Z99J3trq_NZn6YKJ-; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WW3Mv8V5EupQbbKh.vaZIwU5JpX5KzhUgL.FoM4e05c1Ke7e022dJLoIp7LxKML1KBLBKnLxKqL1hnLBoM41hz41hqReKqN; WBPSESS=Dt2hbAUaXfkVprjyrAZT_LRaDLsnxG-kIbeYwnBb5OUKZiwfVr_UrcYfWuqG-4ZVDM5HeU3HXkDNK_thfRfdS9Ao6ezT30jDksv-CpaVmlTAqGUHjJ7PYkH5aCK4HLxmRq14ZalmQNwzfWMPa4y0VNRLuYdg7L1s49ymNq_5v5vusoz0r4ki6u-MHGraF0fbUTgX14x0kHayEwOoxfLI-w==; SCF=AqmJWo31oFV5itnRgWNU1-wHQTL6PmkBLf3gDuqpdqAIfaWguDTMre6Oxjf5Uzs74JAh2r0DdV1sJ1g6m-wJ5NQ.; _s_tentry=-; Apache=4660996305989.918.1719827559898; PC_TOKEN=7955a7ab1f; appkey=; geetest_token=602cd4e3a7ed1898808f8adfe1a2048b; ALF=1722421868', | ||
| 32 | - 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0' | ||
| 33 | - } | ||
| 34 | - response = requests.get(url,headers=headers,params=params) | 26 | +# 获取数据,支持多账号随机切换 |
| 27 | +def fetchData(url, params, headers_list): | ||
| 28 | + headers = random.choice(headers_list) | ||
| 29 | + try: | ||
| 30 | + response = requests.get(url, headers=headers, params=params, timeout=10) | ||
| 35 | if response.status_code == 200: | 31 | if response.status_code == 200: |
| 36 | return response.json()['data'] | 32 | return response.json()['data'] |
| 37 | else: | 33 | else: |
| 38 | return None | 34 | return None |
| 35 | + except RequestException as e: | ||
| 36 | + print(f"请求失败:{e}") | ||
| 37 | + return None | ||
| 39 | 38 | ||
| 39 | +# 获取文章列表 | ||
| 40 | def getArticleList(): | 40 | def getArticleList(): |
| 41 | articleList = [] | 41 | articleList = [] |
| 42 | - with open(articleAddr,'r',encoding='utf-8') as reader: | 42 | + with open(articleAddr, 'r', encoding='utf-8') as reader: |
| 43 | readerCsv = csv.reader(reader) | 43 | readerCsv = csv.reader(reader) |
| 44 | next(reader) | 44 | next(reader) |
| 45 | for nav in readerCsv: | 45 | for nav in readerCsv: |
| 46 | articleList.append(nav) | 46 | articleList.append(nav) |
| 47 | return articleList | 47 | return articleList |
| 48 | 48 | ||
| 49 | -def readJson(response,artileId): | 49 | +# 解析评论数据 |
| 50 | +def readJson(response, articleId): | ||
| 50 | for comment in response: | 51 | for comment in response: |
| 51 | - created_at = datetime.strptime(comment['created_at'],'%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d') | 52 | + created_at = datetime.strptime(comment['created_at'], '%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d') |
| 52 | likes_counts = comment['like_counts'] | 53 | likes_counts = comment['like_counts'] |
| 53 | - try: | ||
| 54 | - region = comment['source'].replace('来自', '') | ||
| 55 | - except: | ||
| 56 | - region = '无' | 54 | + region = comment.get('source', '无').replace('来自', '') |
| 57 | content = comment['text_raw'] | 55 | content = comment['text_raw'] |
| 58 | authorName = comment['user']['screen_name'] | 56 | authorName = comment['user']['screen_name'] |
| 59 | authorGender = comment['user']['gender'] | 57 | authorGender = comment['user']['gender'] |
| 60 | authorAddress = comment['user']['location'] | 58 | authorAddress = comment['user']['location'] |
| 61 | authorAvatar = comment['user']['avatar_large'] | 59 | authorAvatar = comment['user']['avatar_large'] |
| 62 | - write([ | ||
| 63 | - artileId, | ||
| 64 | - created_at, | ||
| 65 | - likes_counts, | ||
| 66 | - region, | ||
| 67 | - content, | ||
| 68 | - authorName, | ||
| 69 | - authorGender, | ||
| 70 | - authorAddress, | ||
| 71 | - authorAvatar | ||
| 72 | - ]) | 60 | + write([articleId, created_at, likes_counts, region, content, authorName, authorGender, authorAddress, authorAvatar]) |
| 73 | 61 | ||
| 74 | -def start(): | 62 | +# 启动爬虫 |
| 63 | +def start(headers_list, delay=2): | ||
| 75 | commentUrl = 'https://weibo.com/ajax/statuses/buildComments' | 64 | commentUrl = 'https://weibo.com/ajax/statuses/buildComments' |
| 76 | init() | 65 | init() |
| 77 | articleList = getArticleList() | 66 | articleList = getArticleList() |
| 78 | for article in articleList: | 67 | for article in articleList: |
| 79 | articleId = article[0] | 68 | articleId = article[0] |
| 80 | - print('正在爬取id值为%s的文章评论' % articleId) | ||
| 81 | - time.sleep(2) | ||
| 82 | - params = { | ||
| 83 | - 'id':int(articleId), | ||
| 84 | - 'is_show_bulletin':2 | ||
| 85 | - } | ||
| 86 | - response = fetchData(commentUrl,params) | ||
| 87 | - readJson(response,articleId) | ||
| 88 | - | ||
| 89 | - | 69 | + print(f'正在爬取id值为{articleId}的文章评论') |
| 70 | + time.sleep(random.uniform(1, delay)) # 随机延时,避免频繁访问 | ||
| 71 | + params = {'id': int(articleId), 'is_show_bulletin': 2} | ||
| 72 | + response = fetchData(commentUrl, params, headers_list) | ||
| 73 | + if response: | ||
| 74 | + readJson(response, articleId) | ||
| 90 | 75 | ||
| 91 | if __name__ == '__main__': | 76 | if __name__ == '__main__': |
| 92 | - start() | ||
| 93 | - | ||
| 94 | - | ||
| 95 | - | ||
| 96 | - | ||
| 97 | - | ||
| 98 | - | ||
| 99 | - | ||
| 100 | - | 77 | + # 这里的headers_list应该包含多个账号的cookie |
| 78 | + headers_list = [ | ||
| 79 | + { | ||
| 80 | + 'Cookie': 'your_cookie_here', | ||
| 81 | + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0' | ||
| 82 | + }, | ||
| 83 | + { | ||
| 84 | + 'Cookie': 'another_cookie_here', | ||
| 85 | + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0' | ||
| 86 | + } | ||
| 87 | + ] | ||
| 88 | + start(headers_list) |
-
Please register or login to post a comment