戒酒的李白
Committed by GitHub

Merge pull request #12 from lintsinghua/main

Optimized the web scraping functionality to support multi-account crawling and improve performance.
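Each crawler's start() now takes a headers_list of per-account request headers and picks one at random for every request. A minimal sketch (not part of the diff) of how such a list might be assembled; the WEIBO_COOKIE_* environment-variable names and the helper itself are illustrative assumptions:

import os

# Hypothetical helper (not in this PR): build a headers_list from any
# WEIBO_COOKIE_* environment variables, one variable per logged-in account.
# The resulting dicts match the shape start() expects below.
def build_headers_list():
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
    cookies = [value for name, value in os.environ.items() if name.startswith('WEIBO_COOKIE_')]
    return [{'Cookie': cookie, 'User-Agent': user_agent} for cookie in cookies]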
@@ -2,99 +2,87 @@ import time
 import requests
 import csv
 import os
+import random
 from datetime import datetime
-from .settings import articleAddr,commentsAddr
+from .settings import articleAddr, commentsAddr
+from requests.exceptions import RequestException
 
+# Initialize: create the comments data file
 def init():
     if not os.path.exists(commentsAddr):
-        with open(commentsAddr,'w',encoding='utf-8',newline='') as csvFile:
+        with open(commentsAddr, 'w', encoding='utf-8', newline='') as csvFile:
             writer = csv.writer(csvFile)
             writer.writerow([
-                'articleId',
-                'created_at',
-                'likes_counts',
-                'region',
-                'content',
-                'authorName',
-                'authorGender',
-                'authorAddress',
-                'authorAvatar'
+                'articleId', 'created_at', 'likes_counts', 'region', 'content',
+                'authorName', 'authorGender', 'authorAddress', 'authorAvatar'
             ])
 
+# Write a comment row to the CSV
 def write(row):
     with open(commentsAddr, 'a', encoding='utf-8', newline='') as csvFile:
         writer = csv.writer(csvFile)
         writer.writerow(row)
 
-def fetchData(url,params):
-    headers = {
-        'Cookie':'SINAGLOBAL=2555941826014.1074.1676801766625; ULV=1719829459275:6:1:2:4660996305989.918.1719827559898:1719743122299; UOR=,,www.baidu.com; XSRF-TOKEN=VtLXviYSIs8lor7sz4iGyigL; SUB=_2A25LhvU9DeRhGeFH6FIX-S3MyD2IHXVo-gj1rDV8PUJbkNAGLRXMkW1Ne2nhI3Gle25QJK0Z99J3trq_NZn6YKJ-; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WW3Mv8V5EupQbbKh.vaZIwU5JpX5KzhUgL.FoM4e05c1Ke7e022dJLoIp7LxKML1KBLBKnLxKqL1hnLBoM41hz41hqReKqN; WBPSESS=Dt2hbAUaXfkVprjyrAZT_LRaDLsnxG-kIbeYwnBb5OUKZiwfVr_UrcYfWuqG-4ZVDM5HeU3HXkDNK_thfRfdS9Ao6ezT30jDksv-CpaVmlTAqGUHjJ7PYkH5aCK4HLxmRq14ZalmQNwzfWMPa4y0VNRLuYdg7L1s49ymNq_5v5vusoz0r4ki6u-MHGraF0fbUTgX14x0kHayEwOoxfLI-w==; SCF=AqmJWo31oFV5itnRgWNU1-wHQTL6PmkBLf3gDuqpdqAIfaWguDTMre6Oxjf5Uzs74JAh2r0DdV1sJ1g6m-wJ5NQ.; _s_tentry=-; Apache=4660996305989.918.1719827559898; PC_TOKEN=7955a7ab1f; appkey=; geetest_token=602cd4e3a7ed1898808f8adfe1a2048b; ALF=1722421868',
-        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
-    }
-    response = requests.get(url,headers=headers,params=params)
-    if response.status_code == 200:
-        return response.json()['data']
-    else:
+# Fetch data, rotating randomly between accounts
+def fetchData(url, params, headers_list):
+    headers = random.choice(headers_list)
+    try:
+        response = requests.get(url, headers=headers, params=params, timeout=10)
+        if response.status_code == 200:
+            return response.json()['data']
+        else:
+            return None
+    except RequestException as e:
+        print(f"请求失败:{e}")
         return None
 
+# Load the article list
 def getArticleList():
     articleList = []
-    with open(articleAddr,'r',encoding='utf-8') as reader:
+    with open(articleAddr, 'r', encoding='utf-8') as reader:
        readerCsv = csv.reader(reader)
        next(reader)
        for nav in readerCsv:
            articleList.append(nav)
     return articleList
 
-def readJson(response,artileId):
+# Parse comment data
+def readJson(response, articleId):
     for comment in response:
-        created_at = datetime.strptime(comment['created_at'],'%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d')
+        created_at = datetime.strptime(comment['created_at'], '%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d')
         likes_counts = comment['like_counts']
-        try:
-            region = comment['source'].replace('来自', '')
-        except:
-            region = '无'
+        region = comment.get('source', '无').replace('来自', '')
         content = comment['text_raw']
         authorName = comment['user']['screen_name']
         authorGender = comment['user']['gender']
         authorAddress = comment['user']['location']
         authorAvatar = comment['user']['avatar_large']
-        write([
-            artileId,
-            created_at,
-            likes_counts,
-            region,
-            content,
-            authorName,
-            authorGender,
-            authorAddress,
-            authorAvatar
-        ])
+        write([articleId, created_at, likes_counts, region, content, authorName, authorGender, authorAddress, authorAvatar])
 
-def start():
+# Start the crawler
+def start(headers_list, delay=2):
     commentUrl = 'https://weibo.com/ajax/statuses/buildComments'
     init()
     articleList = getArticleList()
     for article in articleList:
         articleId = article[0]
-        print('正在爬取id值为%s的文章评论' % articleId)
-        time.sleep(2)
-        params = {
-            'id':int(articleId),
-            'is_show_bulletin':2
-        }
-        response = fetchData(commentUrl,params)
-        readJson(response,articleId)
-
-
+        print(f'正在爬取id值为{articleId}的文章评论')
+        time.sleep(random.uniform(1, delay))  # random delay to avoid hammering the server
+        params = {'id': int(articleId), 'is_show_bulletin': 2}
+        response = fetchData(commentUrl, params, headers_list)
+        if response:
+            readJson(response, articleId)
 
 if __name__ == '__main__':
-    start()
-
-
-
-
-
-
-
-
+    # headers_list should contain cookies for multiple accounts
+    headers_list = [
+        {
+            'Cookie': 'your_cookie_here',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
+        },
+        {
+            'Cookie': 'another_cookie_here',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
+        }
+    ]
+    start(headers_list)
@@ -2,123 +2,97 @@ import time
 import requests
 import csv
 import os
+import random
 from datetime import datetime
-from .settings import navAddr,articleAddr
+from .settings import navAddr, articleAddr
+from requests.exceptions import RequestException
 
+# Initialize the article data file
 def init():
     if not os.path.exists(articleAddr):
-        with open(articleAddr,'w',encoding='utf-8',newline='') as csvFile:
+        with open(articleAddr, 'w', encoding='utf-8', newline='') as csvFile:
             writer = csv.writer(csvFile)
             writer.writerow([
-                'id',
-                'likeNum',
-                'commentsLen',
-                'reposts_count',
-                'region',
-                'content',
-                'contentLen',
-                'created_at',
-                'type',
-                'detailUrl',# followBtnCode>uid + mblogid
-                'authorAvatar',
-                'authorName',
-                'authorDetail',
-                'isVip' # v_plus
+                'id', 'likeNum', 'commentsLen', 'reposts_count', 'region', 'content', 'contentLen',
+                'created_at', 'type', 'detailUrl', 'authorAvatar', 'authorName', 'authorDetail', 'isVip'
             ])
 
+# Write a row to the CSV
 def write(row):
     with open(articleAddr, 'a', encoding='utf-8', newline='') as csvFile:
         writer = csv.writer(csvFile)
         writer.writerow(row)
 
-def fetchData(url,params):
-    headers = {
-        'Cookie':'SINAGLOBAL=2555941826014.1074.1676801766625; ULV=1719829459275:6:1:2:4660996305989.918.1719827559898:1719743122299; UOR=,,www.baidu.com; XSRF-TOKEN=VtLXviYSIs8lor7sz4iGyigL; SUB=_2A25LhvU9DeRhGeFH6FIX-S3MyD2IHXVo-gj1rDV8PUJbkNAGLRXMkW1Ne2nhI3Gle25QJK0Z99J3trq_NZn6YKJ-; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WW3Mv8V5EupQbbKh.vaZIwU5JpX5KzhUgL.FoM4e05c1Ke7e022dJLoIp7LxKML1KBLBKnLxKqL1hnLBoM41hz41hqReKqN; WBPSESS=Dt2hbAUaXfkVprjyrAZT_LRaDLsnxG-kIbeYwnBb5OUKZiwfVr_UrcYfWuqG-4ZVDM5HeU3HXkDNK_thfRfdS9Ao6ezT30jDksv-CpaVmlTAqGUHjJ7PYkH5aCK4HLxmRq14ZalmQNwzfWMPa4y0VNRLuYdg7L1s49ymNq_5v5vusoz0r4ki6u-MHGraF0fbUTgX14x0kHayEwOoxfLI-w==; SCF=AqmJWo31oFV5itnRgWNU1-wHQTL6PmkBLf3gDuqpdqAIfaWguDTMre6Oxjf5Uzs74JAh2r0DdV1sJ1g6m-wJ5NQ.; _s_tentry=-; Apache=4660996305989.918.1719827559898; PC_TOKEN=7955a7ab1f; appkey=; geetest_token=602cd4e3a7ed1898808f8adfe1a2048b; ALF=1722421868',
-        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
-    }
-    response = requests.get(url,headers=headers,params=params)
-    if response.status_code == 200:
-        return response.json()['statuses']
-    else:
+# Fetch data, with multi-account support
+def fetchData(url, params, headers_list):
+    headers = random.choice(headers_list)
+    try:
+        response = requests.get(url, headers=headers, params=params, timeout=10)
+        if response.status_code == 200:
+            return response.json()['statuses']
+        else:
+            return None
+    except RequestException as e:
+        print(f"请求失败:{e}")
         return None
 
+# Load the type (channel) list
 def getTypeList():
     typeList = []
-    with open(navAddr,'r',encoding='utf-8') as reader:
+    with open(navAddr, 'r', encoding='utf-8') as reader:
         readerCsv = csv.reader(reader)
         next(reader)
         for nav in readerCsv:
             typeList.append(nav)
     return typeList
 
-def readJson(response,type):
-    for artice in response:
-        id = artice['id']
-        likeNum = artice['attitudes_count']
-        commentsLen = artice['comments_count']
-        reposts_count = artice['reposts_count']
-        try:
-            region = artice['region_name'].replace('发布于 ', '')
-        except:
-            region = '无'
-        content = artice['text_raw']
-        contentLen = artice['textLength']
-        created_at = datetime.strptime(artice['created_at'],'%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d')
-        type = type
-        try:
-            detailUrl = 'https://weibo.com/' + str(artice['id']) + '/' + str(artice['mblogid'])
-        except:
-            detailUrl = '无'
-        authorAvatar = artice['user']['avatar_large']
-        authorName = artice['user']['screen_name']
-        authorDetail = 'https://weibo.com/u/' + str(artice['user']['id'])
-        isVip = artice['user']['v_plus']
-        write([
-            id,
-            likeNum,
-            commentsLen,
-            reposts_count,
-            region,
-            content,
-            contentLen,
-            created_at,
-            type,
-            detailUrl,
-            authorAvatar,
-            authorName,
-            authorDetail,
-            isVip
-        ])
+# Parse article data
+def readJson(response, type):
+    for article in response:
+        id = article['id']
+        likeNum = article['attitudes_count']
+        commentsLen = article['comments_count']
+        reposts_count = article['reposts_count']
+        region = article.get('region_name', '无').replace('发布于 ', '')
+        content = article['text_raw']
+        contentLen = article['textLength']
+        created_at = datetime.strptime(article['created_at'], '%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d')
+        detailUrl = f"https://weibo.com/{article['id']}/{article['mblogid']}" if 'mblogid' in article else '无'
+        authorAvatar = article['user']['avatar_large']
+        authorName = article['user']['screen_name']
+        authorDetail = f"https://weibo.com/u/{article['user']['id']}"
+        isVip = article['user']['v_plus']
+        write([id, likeNum, commentsLen, reposts_count, region, content, contentLen, created_at, type, detailUrl, authorAvatar, authorName, authorDetail, isVip])
 
-def start(typeNum=14,pageNum=3):
+# Start the crawler
+def start(headers_list, typeNum=14, pageNum=3, delay=2):
     articleUrl = 'https://weibo.com/ajax/feed/hottimeline'
     init()
     typeList = getTypeList()
-    typeNumCount = 0
-    for type in typeList:
-        if typeNumCount > typeNum:return
-        time.sleep(2)
-        for page in range(0,pageNum):
-            print('正在爬取的类型:%s 中的第%s页文章数据' % (type[0],page + 1))
-            time.sleep(1)
-            parmas = {
-                'group_id':type[1],
-                'containerid':type[2],
-                'max_id':page,
-                'count':10,
-                'extparam':'discover|new_feed'
+    for type in typeList[:typeNum]:
+        for page in range(pageNum):
+            print(f'正在爬取的类型:{type[0]} 中的第{page + 1}页文章数据')
+            time.sleep(random.uniform(1, delay))  # random delay
+            params = {
+                'group_id': type[1],
+                'containerid': type[2],
+                'max_id': page,
+                'count': 10,
+                'extparam': 'discover|new_feed'
             }
-            response = fetchData(articleUrl,parmas)
-            readJson(response,type[0])
-            typeNumCount += 1
+            response = fetchData(articleUrl, params, headers_list)
+            if response:
+                readJson(response, type[0])
 
 if __name__ == '__main__':
-    start()
-
-
-
-
-
-
-
-
+    headers_list = [
+        {
+            'Cookie': 'your_cookie_here',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
+        },
+        {
+            'Cookie': 'another_cookie_here',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
+        }
+    ]
+    start(headers_list)
@@ -2,54 +2,64 @@ import requests
 import csv
 import numpy as np
 import os
+import random
 from .settings import navAddr
+from requests.exceptions import RequestException
+
+# Initialize the navigation data file
 def init():
     if not os.path.exists(navAddr):
-        with open(navAddr,'w',encoding='utf-8',newline='') as csvFile:
+        with open(navAddr, 'w', encoding='utf-8', newline='') as csvFile:
             writer = csv.writer(csvFile)
-            writer.writerow([
-                'typeName',
-                'gid',
-                'containerid'
-            ])
+            writer.writerow(['typeName', 'gid', 'containerid'])
 
+# Write navigation data
 def write(row):
     with open(navAddr, 'a', encoding='utf-8', newline='') as csvFile:
         writer = csv.writer(csvFile)
         writer.writerow(row)
 
-def fetchData(url):
-    headers = {
-        'Cookie':'SINAGLOBAL=2555941826014.1074.1676801766625; ULV=1719829459275:6:1:2:4660996305989.918.1719827559898:1719743122299; UOR=,,www.baidu.com; XSRF-TOKEN=VtLXviYSIs8lor7sz4iGyigL; SUB=_2A25LhvU9DeRhGeFH6FIX-S3MyD2IHXVo-gj1rDV8PUJbkNAGLRXMkW1Ne2nhI3Gle25QJK0Z99J3trq_NZn6YKJ-; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WW3Mv8V5EupQbbKh.vaZIwU5JpX5KzhUgL.FoM4e05c1Ke7e022dJLoIp7LxKML1KBLBKnLxKqL1hnLBoM41hz41hqReKqN; WBPSESS=Dt2hbAUaXfkVprjyrAZT_LRaDLsnxG-kIbeYwnBb5OUKZiwfVr_UrcYfWuqG-4ZVDM5HeU3HXkDNK_thfRfdS9Ao6ezT30jDksv-CpaVmlTAqGUHjJ7PYkH5aCK4HLxmRq14ZalmQNwzfWMPa4y0VNRLuYdg7L1s49ymNq_5v5vusoz0r4ki6u-MHGraF0fbUTgX14x0kHayEwOoxfLI-w==; SCF=AqmJWo31oFV5itnRgWNU1-wHQTL6PmkBLf3gDuqpdqAIfaWguDTMre6Oxjf5Uzs74JAh2r0DdV1sJ1g6m-wJ5NQ.; _s_tentry=-; Apache=4660996305989.918.1719827559898; PC_TOKEN=7955a7ab1f; appkey=; geetest_token=602cd4e3a7ed1898808f8adfe1a2048b; ALF=1722421868',
-        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
-    }
-    params = {
-        'is_new_segment':1,
-        'fetch_hot':1
-    }
-    response = requests.get(url,headers=headers,params=params)
-    if response.status_code == 200:
-        return response.json()
-    else:
+# Fetch data, with multi-account support
+def fetchData(url, headers_list):
+    headers = random.choice(headers_list)
+    try:
+        response = requests.get(url, headers=headers, timeout=10)
+        if response.status_code == 200:
+            return response.json()['data']['modules']
+        else:
+            return None
+    except RequestException as e:
+        print(f"请求失败:{e}")
        return None
 
+# Parse navigation data
 def readJson(response):
-    navList = np.append(response['groups'][3]['group'],response['groups'][4]['group'])
-    for nav in navList:
-        navName = nav['title']
-        gid = nav['gid']
-        containerid = nav['containerid']
-        write([
-            navName,
-            gid,
-            containerid
-        ])
+    for module in response:
+        if 'type' in module and 'typeName' in module:
+            typeName = module['typeName']
+            for submodule in module['modules']:
+                if 'id' in submodule and 'containerid' in submodule:
+                    gid = submodule['id']
+                    containerid = submodule['containerid']
+                    write([typeName, gid, containerid])
 
-def start():
+# Start the crawler
+def start(headers_list):
+    navUrl = 'https://weibo.com/ajax/side/hot'
     init()
-    url = 'https://weibo.com/ajax/feed/allGroups'
-    response = fetchData(url)
-    readJson(response)
+    response = fetchData(navUrl, headers_list)
+    if response:
+        readJson(response)
 
 if __name__ == '__main__':
-    start()
+    headers_list = [
+        {
+            'Cookie': 'your_cookie_here',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
+        },
+        {
+            'Cookie': 'another_cookie_here',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
+        }
+    ]
+    start(headers_list)
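
Taken together, the three changed crawlers form a pipeline: the navigation crawler writes navAddr, the article crawler reads navAddr and writes articleAddr, and the comment crawler reads articleAddr and writes commentsAddr. A possible driver is sketched below; the package and module names (spider, nav, article, comments) are assumptions, since the diff does not show the file paths:

# Illustrative driver; module names are assumed, not shown in this diff.
from spider import nav, article, comments

headers_list = build_headers_list()                 # e.g. the sketch near the top
nav.start(headers_list)                             # writes navAddr: typeName, gid, containerid
article.start(headers_list, typeNum=14, pageNum=3)  # reads navAddr, writes articleAddr
comments.start(headers_list, delay=2)               # reads articleAddr, writes commentsAddr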