lintsinghua
Committed by GitHub

Update spiderContent.py
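
This commit reworks the spider: the Cookie/User-Agent pair hardcoded in fetchData is replaced by a headers_list parameter from which one account's headers are picked at random per request, network errors are caught via RequestException with a 10-second timeout, readJson uses dict.get and a conditional expression instead of bare try/except blocks, and start gains a configurable random delay and only parses pages whose fetch actually returned data.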

@@ -2,123 +2,97 @@ import time
 import requests
 import csv
 import os
+import random
 from datetime import datetime
-from .settings import navAddr,articleAddr
+from .settings import navAddr, articleAddr
+from requests.exceptions import RequestException

+# Initialize the article data file
 def init():
     if not os.path.exists(articleAddr):
-        with open(articleAddr,'w',encoding='utf-8',newline='') as csvFile:
+        with open(articleAddr, 'w', encoding='utf-8', newline='') as csvFile:
             writer = csv.writer(csvFile)
             writer.writerow([
-                'id',
-                'likeNum',
-                'commentsLen',
-                'reposts_count',
-                'region',
-                'content',
-                'contentLen',
-                'created_at',
-                'type',
-                'detailUrl',  # followBtnCode>uid + mblogid
-                'authorAvatar',
-                'authorName',
-                'authorDetail',
-                'isVip'  # v_plus
+                'id', 'likeNum', 'commentsLen', 'reposts_count', 'region', 'content', 'contentLen',
+                'created_at', 'type', 'detailUrl', 'authorAvatar', 'authorName', 'authorDetail', 'isVip'
             ])

+# Append a row of data to the CSV
 def write(row):
     with open(articleAddr, 'a', encoding='utf-8', newline='') as csvFile:
         writer = csv.writer(csvFile)
         writer.writerow(row)

-def fetchData(url,params):
-    headers = {
-        'Cookie':'SINAGLOBAL=2555941826014.1074.1676801766625; ULV=1719829459275:6:1:2:4660996305989.918.1719827559898:1719743122299; UOR=,,www.baidu.com; XSRF-TOKEN=VtLXviYSIs8lor7sz4iGyigL; SUB=_2A25LhvU9DeRhGeFH6FIX-S3MyD2IHXVo-gj1rDV8PUJbkNAGLRXMkW1Ne2nhI3Gle25QJK0Z99J3trq_NZn6YKJ-; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WW3Mv8V5EupQbbKh.vaZIwU5JpX5KzhUgL.FoM4e05c1Ke7e022dJLoIp7LxKML1KBLBKnLxKqL1hnLBoM41hz41hqReKqN; WBPSESS=Dt2hbAUaXfkVprjyrAZT_LRaDLsnxG-kIbeYwnBb5OUKZiwfVr_UrcYfWuqG-4ZVDM5HeU3HXkDNK_thfRfdS9Ao6ezT30jDksv-CpaVmlTAqGUHjJ7PYkH5aCK4HLxmRq14ZalmQNwzfWMPa4y0VNRLuYdg7L1s49ymNq_5v5vusoz0r4ki6u-MHGraF0fbUTgX14x0kHayEwOoxfLI-w==; SCF=AqmJWo31oFV5itnRgWNU1-wHQTL6PmkBLf3gDuqpdqAIfaWguDTMre6Oxjf5Uzs74JAh2r0DdV1sJ1g6m-wJ5NQ.; _s_tentry=-; Apache=4660996305989.918.1719827559898; PC_TOKEN=7955a7ab1f; appkey=; geetest_token=602cd4e3a7ed1898808f8adfe1a2048b; ALF=1722421868',
-        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
-    }
-    response = requests.get(url,headers=headers,params=params)
+# Fetch data, with multi-account support
+def fetchData(url, params, headers_list):
+    headers = random.choice(headers_list)
+    try:
+        response = requests.get(url, headers=headers, params=params, timeout=10)
         if response.status_code == 200:
             return response.json()['statuses']
         else:
             return None
+    except RequestException as e:
+        print(f"Request failed: {e}")
+        return None

+# Get the category list
 def getTypeList():
     typeList = []
-    with open(navAddr,'r',encoding='utf-8') as reader:
+    with open(navAddr, 'r', encoding='utf-8') as reader:
         readerCsv = csv.reader(reader)
         next(reader)
         for nav in readerCsv:
             typeList.append(nav)
     return typeList

-def readJson(response,type):
-    for artice in response:
-        id = artice['id']
-        likeNum = artice['attitudes_count']
-        commentsLen = artice['comments_count']
-        reposts_count = artice['reposts_count']
-        try:
-            region = artice['region_name'].replace('发布于 ', '')
-        except:
-            region = '无'
-        content = artice['text_raw']
-        contentLen = artice['textLength']
-        created_at = datetime.strptime(artice['created_at'],'%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d')
-        type = type
-        try:
-            detailUrl = 'https://weibo.com/' + str(artice['id']) + '/' + str(artice['mblogid'])
-        except:
-            detailUrl = '无'
-        authorAvatar = artice['user']['avatar_large']
-        authorName = artice['user']['screen_name']
-        authorDetail = 'https://weibo.com/u/' + str(artice['user']['id'])
-        isVip = artice['user']['v_plus']
-        write([
-            id,
-            likeNum,
-            commentsLen,
-            reposts_count,
-            region,
-            content,
-            contentLen,
-            created_at,
-            type,
-            detailUrl,
-            authorAvatar,
-            authorName,
-            authorDetail,
-            isVip
-        ])
+# Parse article data
+def readJson(response, type):
+    for article in response:
+        id = article['id']
+        likeNum = article['attitudes_count']
+        commentsLen = article['comments_count']
+        reposts_count = article['reposts_count']
+        region = article.get('region_name', '无').replace('发布于 ', '')
+        content = article['text_raw']
+        contentLen = article['textLength']
+        created_at = datetime.strptime(article['created_at'], '%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d')
+        detailUrl = f"https://weibo.com/{article['id']}/{article['mblogid']}" if 'mblogid' in article else '无'
+        authorAvatar = article['user']['avatar_large']
+        authorName = article['user']['screen_name']
+        authorDetail = f"https://weibo.com/u/{article['user']['id']}"
+        isVip = article['user']['v_plus']
+        write([id, likeNum, commentsLen, reposts_count, region, content, contentLen, created_at, type, detailUrl, authorAvatar, authorName, authorDetail, isVip])

-def start(typeNum=14,pageNum=3):
+# Start the spider
+def start(headers_list, typeNum=14, pageNum=3, delay=2):
     articleUrl = 'https://weibo.com/ajax/feed/hottimeline'
     init()
     typeList = getTypeList()
-    typeNumCount = 0
-    for type in typeList:
-        if typeNumCount > typeNum:return
-        time.sleep(2)
-        for page in range(0,pageNum):
-            print('Crawling articles for category %s, page %s' % (type[0],page + 1))
-            time.sleep(1)
-            parmas = {
-                'group_id':type[1],
-                'containerid':type[2],
-                'max_id':page,
-                'count':10,
-                'extparam':'discover|new_feed'
+    for type in typeList[:typeNum]:
+        for page in range(pageNum):
+            print(f'Crawling articles for category {type[0]}, page {page + 1}')
+            time.sleep(random.uniform(1, delay))  # random delay
+            params = {
+                'group_id': type[1],
+                'containerid': type[2],
+                'max_id': page,
+                'count': 10,
+                'extparam': 'discover|new_feed'
             }
-            response = fetchData(articleUrl,parmas)
-            readJson(response,type[0])
-            typeNumCount += 1
+            response = fetchData(articleUrl, params, headers_list)
+            if response:
+                readJson(response, type[0])

 if __name__ == '__main__':
-    start()
+    headers_list = [
+        {
+            'Cookie': 'your_cookie_here',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
+        },
+        {
+            'Cookie': 'another_cookie_here',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
+        }
+    ]
+    start(headers_list)
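
For context, the module imports navAddr and articleAddr from a sibling settings module that this diff does not touch. A minimal sketch of what that file could look like, assuming CSV paths relative to the package (the repo's actual paths and values may differ):

# settings.py — hypothetical sketch; the real repo defines its own paths
import os

_BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# CSV of hot-timeline categories consumed by getTypeList(): name, group_id, containerid per row
navAddr = os.path.join(_BASE_DIR, 'data', 'nav.csv')

# CSV that init() creates and write() appends scraped articles to
articleAddr = os.path.join(_BASE_DIR, 'data', 'article.csv')

With a settings module like this in place, the spider needs to run as part of its package (for example, python -m somepackage.spiderContent, package name assumed here) so the relative import from .settings resolves.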