lintsinghua
Committed by GitHub

Update spiderComments.py

@@ -2,99 +2,87 @@ import time
2 import requests 2 import requests
3 import csv 3 import csv
4 import os 4 import os
  5 +import random
5 from datetime import datetime 6 from datetime import datetime
6 -from .settings import articleAddr,commentsAddr 7 +from .settings import articleAddr, commentsAddr
  8 +from requests.exceptions import RequestException
7 9
def init():
    """Create the comments CSV with its header row, if the file does not exist yet."""
    if os.path.exists(commentsAddr):
        return
    header = (
        'articleId', 'created_at', 'likes_counts', 'region', 'content',
        'authorName', 'authorGender', 'authorAddress', 'authorAvatar',
    )
    with open(commentsAddr, 'w', encoding='utf-8', newline='') as csvFile:
        csv.writer(csvFile).writerow(header)
23 19
def write(row):
    """Append one comment record (a sequence of fields) to the comments CSV."""
    with open(commentsAddr, 'a', encoding='utf-8', newline='') as out:
        csv.writer(out).writerow(row)
28 25
# 获取数据,支持多账号随机切换
def fetchData(url, params, headers_list):
    """GET *url* with one randomly chosen header set and return the JSON 'data' field.

    Args:
        url: endpoint to request.
        params: query-string parameters for the request.
        headers_list: list of header dicts (one per account); one is picked at
            random per call to rotate identities.

    Returns:
        The parsed ``data`` payload, or None on any network error, non-200
        status, non-JSON body, or a response without a ``data`` key.
    """
    headers = random.choice(headers_list)
    try:
        response = requests.get(url, headers=headers, params=params, timeout=10)
    except RequestException as e:
        print(f"请求失败:{e}")
        return None
    if response.status_code != 200:
        return None
    try:
        # .get() instead of ['data']: a well-formed error payload without
        # 'data' should read as "no data", not crash with KeyError.
        return response.json().get('data')
    except ValueError:
        # body was not valid JSON
        return None
39 38
# 获取文章列表
def getArticleList():
    """Load all scraped article rows from the articles CSV.

    Returns:
        list: one list of string fields per article, header row excluded.
    """
    with open(articleAddr, 'r', encoding='utf-8') as reader:
        readerCsv = csv.reader(reader)
        # Skip the header via the csv reader (not the raw file handle), so a
        # quoted header containing newlines is still skipped as one record;
        # the None default avoids StopIteration on an empty file.
        next(readerCsv, None)
        return list(readerCsv)
48 48
# 解析评论数据
def readJson(response, articleId):
    """Extract the fields of each comment dict in *response* and append them to the CSV.

    Args:
        response: iterable of Weibo comment dicts (the API's ``data`` list).
        articleId: id of the article these comments belong to.
    """
    for comment in response:
        user = comment['user']
        posted = datetime.strptime(
            comment['created_at'], '%a %b %d %H:%M:%S %z %Y'
        ).strftime('%Y-%m-%d')
        record = [
            articleId,
            posted,
            comment['like_counts'],
            # missing 'source' falls back to '无'; strip the '来自' prefix
            comment.get('source', '无').replace('来自', ''),
            comment['text_raw'],
            user['screen_name'],
            user['gender'],
            user['location'],
            user['avatar_large'],
        ]
        write(record)
73 61
# 启动爬虫
def start(headers_list, delay=2):
    """Crawl the comments of every article listed in the articles CSV.

    Args:
        headers_list: header dicts (cookies/user-agents) rotated per request.
        delay: upper bound in seconds for the random pause between articles.
    """
    commentUrl = 'https://weibo.com/ajax/statuses/buildComments'
    init()
    for article in getArticleList():
        articleId = article[0]
        print(f'正在爬取id值为{articleId}的文章评论')
        # random pause between requests to avoid hammering the endpoint
        time.sleep(random.uniform(1, delay))
        params = {'id': int(articleId), 'is_show_bulletin': 2}
        data = fetchData(commentUrl, params, headers_list)
        if data:
            readJson(data, articleId)
90 75
if __name__ == '__main__':
    # 这里的headers_list应该包含多个账号的cookie
    # Both entries share one browser UA; only the Cookie differs per account.
    firefox_ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
    headers_list = [
        {'Cookie': 'your_cookie_here', 'User-Agent': firefox_ua},
        {'Cookie': 'another_cookie_here', 'User-Agent': firefox_ua},
    ]
    start(headers_list)