core.py
15.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Time : 2023/12/23 15:41
# @Desc : 微博爬虫主流程代码
import asyncio
import os
import random
from asyncio import Task
from typing import Dict, List, Optional, Tuple
from playwright.async_api import (
BrowserContext,
BrowserType,
Page,
Playwright,
async_playwright,
)
import config
from base.base_crawler import AbstractCrawler
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
from store import weibo as weibo_store
from tools import utils
from tools.cdp_browser import CDPBrowserManager
from var import crawler_type_var, source_keyword_var
from .client import WeiboClient
from .exception import DataFetchError
from .field import SearchType
from .help import filter_search_result_card
from .login import WeiboLogin
class WeiboCrawler(AbstractCrawler):
    """Weibo crawler driven by a Playwright browser context and a ``WeiboClient``.

    ``start()`` is the entry point: it launches a browser, makes sure the
    session is logged in, and then dispatches on ``config.CRAWLER_TYPE`` to one
    of ``search()``, ``get_specified_notes()`` or ``get_creators_and_notes()``.
    """

    # The attributes below are assigned in start(), except cdp_manager which is
    # initialised to None in __init__ and set by launch_browser_with_cdp().
    context_page: Page
    wb_client: WeiboClient
    browser_context: BrowserContext
    cdp_manager: Optional[CDPBrowserManager]

    def __init__(self):
        """Set the site URLs and user agents; no browser is started here."""
        self.index_url = "https://www.weibo.com"
        self.mobile_index_url = "https://m.weibo.cn"
        self.user_agent = utils.get_user_agent()
        self.mobile_user_agent = utils.get_mobile_user_agent()
        self.cdp_manager = None

    async def start(self):
        """Launch the browser, log in if needed, and run the configured crawl.

        Steps, in order:
          1. Optionally fetch one proxy from the IP pool (``config.ENABLE_IP_PROXY``).
          2. Launch a browser context (CDP mode or standard mode).
          3. Open the mobile index page and build the API client.
          4. If ``wb_client.pong()`` is falsy, run the login flow and refresh
             the client's cookies from the browser context.
          5. Dispatch on ``config.CRAWLER_TYPE`` ("search", "detail", "creator");
             any other value runs nothing.
        """
        playwright_proxy_format, httpx_proxy_format = None, None
        if config.ENABLE_IP_PROXY:
            ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
            ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
            playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)

        async with async_playwright() as playwright:
            # Choose the launch mode from the configuration.
            if config.ENABLE_CDP_MODE:
                utils.logger.info("[WeiboCrawler] 使用CDP模式启动浏览器")
                self.browser_context = await self.launch_browser_with_cdp(
                    playwright,
                    playwright_proxy_format,
                    self.mobile_user_agent,
                    headless=config.CDP_HEADLESS,
                )
            else:
                utils.logger.info("[WeiboCrawler] 使用标准模式启动浏览器")
                # NOTE(review): the standard launch passes None as the proxy even
                # when ENABLE_IP_PROXY produced playwright_proxy_format, unlike
                # the CDP branch above — confirm this is intentional.
                chromium = playwright.chromium
                self.browser_context = await self.launch_browser(
                    chromium, None, self.mobile_user_agent, headless=config.HEADLESS
                )

            # stealth.min.js is a js script to prevent the website from detecting the crawler.
            # NOTE(review): applied after either launch mode here (the CDP path
            # can itself fall back to a standard launch) — confirm against the
            # intended nesting.
            await self.browser_context.add_init_script(path="libs/stealth.min.js")
            self.context_page = await self.browser_context.new_page()
            await self.context_page.goto(self.mobile_index_url)

            # Create a client to interact with the Weibo mobile website.
            self.wb_client = await self.create_weibo_client(httpx_proxy_format)
            if not await self.wb_client.pong():
                login_obj = WeiboLogin(
                    login_type=config.LOGIN_TYPE,
                    login_phone="",  # your phone number
                    browser_context=self.browser_context,
                    context_page=self.context_page,
                    cookie_str=config.COOKIES,
                )
                await login_obj.begin()

                # After a successful login, go back to the mobile site and then
                # refresh the client's cookies with the mobile-side login cookies.
                utils.logger.info("[WeiboCrawler.start] redirect weibo mobile homepage and update cookies on mobile platform")
                await self.context_page.goto(self.mobile_index_url)
                await asyncio.sleep(2)
                await self.wb_client.update_cookies(browser_context=self.browser_context)

            crawler_type_var.set(config.CRAWLER_TYPE)
            if config.CRAWLER_TYPE == "search":
                # Search by keyword and retrieve the matching notes and their comments.
                await self.search()
            elif config.CRAWLER_TYPE == "detail":
                # Get the information and comments of the specified posts.
                await self.get_specified_notes()
            elif config.CRAWLER_TYPE == "creator":
                # Get creators' information and their notes and comments.
                await self.get_creators_and_notes()
            # Any other CRAWLER_TYPE value intentionally runs no crawl.

            utils.logger.info("[WeiboCrawler.start] Weibo Crawler finished ...")

    async def search(self):
        """Search Weibo notes for every keyword in ``config.KEYWORDS``.

        ``config.KEYWORDS`` is split on commas. For each keyword, result pages
        are fetched starting at ``config.START_PAGE`` until the page budget
        derived from ``config.CRAWLER_MAX_NOTES_COUNT`` is used up. Every note
        with an ``mblog`` payload is stored, its images are fetched, and the
        comments for the notes of each page are crawled after that page.

        Side effect: ``config.CRAWLER_MAX_NOTES_COUNT`` is raised to the
        per-page size (10) when it is configured lower than that.

        :return: None. Logs an error and returns early when
            ``config.WEIBO_SEARCH_TYPE`` is not a recognised value.
        """
        utils.logger.info("[WeiboCrawler.search] Begin search weibo keywords")
        weibo_limit_count = 10  # weibo limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < weibo_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = weibo_limit_count
        start_page = config.START_PAGE

        # Map the configured search type name onto the SearchType enum.
        search_type_by_name = {
            "default": SearchType.DEFAULT,
            "real_time": SearchType.REAL_TIME,
            "popular": SearchType.POPULAR,
            "video": SearchType.VIDEO,
        }
        search_type = search_type_by_name.get(config.WEIBO_SEARCH_TYPE)
        if search_type is None:
            utils.logger.error(f"[WeiboCrawler.search] Invalid WEIBO_SEARCH_TYPE: {config.WEIBO_SEARCH_TYPE}")
            return

        for keyword in config.KEYWORDS.split(","):
            source_keyword_var.set(keyword)
            utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}")
            page = 1
            # Pages below start_page are counted but skipped, so the budget is
            # measured relative to start_page.
            while (page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                if page < start_page:
                    utils.logger.info(f"[WeiboCrawler.search] Skip page: {page}")
                    page += 1
                    continue

                utils.logger.info(f"[WeiboCrawler.search] search weibo keyword: {keyword}, page: {page}")
                search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type)
                note_id_list: List[str] = []
                note_list = filter_search_result_card(search_res.get("cards"))
                for note_item in note_list:
                    if not note_item:
                        continue
                    mblog: Optional[Dict] = note_item.get("mblog")
                    if not mblog:
                        continue
                    note_id_list.append(mblog.get("id"))
                    await weibo_store.update_weibo_note(note_item)
                    await self.get_note_images(mblog)

                page += 1
                await self.batch_get_notes_comments(note_id_list)

    async def get_specified_notes(self):
        """Fetch, store and crawl comments for ``config.WEIBO_SPECIFIED_ID_LIST``.

        Note details are fetched concurrently, bounded by
        ``config.MAX_CONCURRENCY_NUM``. Notes whose fetch returned a falsy
        result are not stored, but comments are still requested for every
        configured id.

        :return: None
        """
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list = [
            self.get_note_info_task(note_id=note_id, semaphore=semaphore)
            for note_id in config.WEIBO_SPECIFIED_ID_LIST
        ]
        note_details = await asyncio.gather(*task_list)
        for note_item in note_details:
            if note_item:
                await weibo_store.update_weibo_note(note_item)
        await self.batch_get_notes_comments(config.WEIBO_SPECIFIED_ID_LIST)

    async def get_note_info_task(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
        """Fetch one note's detail while holding ``semaphore``.

        :param note_id: id of the note to fetch
        :param semaphore: limits how many fetches run at the same time
        :return: the client's result, or None when the client raised
            ``DataFetchError`` or ``KeyError`` (both are logged, not re-raised)
        """
        async with semaphore:
            try:
                return await self.wb_client.get_note_info_by_id(note_id)
            except DataFetchError as ex:
                utils.logger.error(f"[WeiboCrawler.get_note_info_task] Get note detail error: {ex}")
                return None
            except KeyError as ex:
                utils.logger.error(f"[WeiboCrawler.get_note_info_task] have not fund note detail note_id:{note_id}, err: {ex}")
                return None

    async def batch_get_notes_comments(self, note_id_list: List[str]):
        """Crawl comments for several notes concurrently.

        Does nothing except log when ``config.ENABLE_GET_COMMENTS`` is falsy.
        Otherwise one task per note id is created and awaited together;
        concurrency is bounded by ``config.MAX_CONCURRENCY_NUM``.

        :param note_id_list: ids of the notes whose comments should be crawled
        :return: None
        """
        if not config.ENABLE_GET_COMMENTS:
            utils.logger.info("[WeiboCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
            return

        utils.logger.info(f"[WeiboCrawler.batch_get_notes_comments] note ids:{note_id_list}")
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list: List[Task] = [
            asyncio.create_task(self.get_note_comments(note_id, semaphore), name=note_id)
            for note_id in note_id_list
        ]
        await asyncio.gather(*task_list)

    async def get_note_comments(self, note_id: str, semaphore: asyncio.Semaphore):
        """Crawl all comments of one note while holding ``semaphore``.

        Comments are handed to ``weibo_store.batch_update_weibo_note_comments``
        via the client's callback, capped at
        ``config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES``. Every exception is
        logged and swallowed, so one failing note does not abort the batch.

        :param note_id: id of the note whose comments are crawled
        :param semaphore: limits how many notes are crawled at the same time
        :return: None
        """
        async with semaphore:
            try:
                utils.logger.info(f"[WeiboCrawler.get_note_comments] begin get note_id: {note_id} comments ...")
                await self.wb_client.get_note_all_comments(
                    note_id=note_id,
                    # Weibo rate-limits its API fairly heavily, so use a somewhat longer delay.
                    crawl_interval=random.randint(1, 3),
                    callback=weibo_store.batch_update_weibo_note_comments,
                    max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
                )
            except DataFetchError as ex:
                utils.logger.error(f"[WeiboCrawler.get_note_comments] get note_id: {note_id} comment error: {ex}")
            except Exception as e:
                utils.logger.error(f"[WeiboCrawler.get_note_comments] may be been blocked, err:{e}")

    async def get_note_images(self, mblog: Dict):
        """Download and store the images listed under ``mblog["pics"]``.

        Does nothing except log when ``config.ENABLE_GET_MEIDAS`` is falsy.
        Pictures without a ``url`` and downloads that return None are skipped.
        The stored file extension is the text after the last "." in the URL.

        :param mblog: the note payload; its ``pics`` entry is iterated as a
            sequence of dicts that carry ``url`` and ``pid``
        :return: None
        """
        if not config.ENABLE_GET_MEIDAS:
            utils.logger.info("[WeiboCrawler.get_note_images] Crawling image mode is not enabled")
            return

        pics: Optional[List[Dict]] = mblog.get("pics")
        if not pics:
            return

        for pic in pics:
            url = pic.get("url")
            if not url:
                continue
            content = await self.wb_client.get_note_image(url)
            if content is not None:
                extension_file_name = url.split(".")[-1]
                await weibo_store.update_weibo_note_image(pic["pid"], content, extension_file_name)

    async def get_creators_and_notes(self) -> None:
        """Crawl creator profiles, their notes, and the notes' comments.

        For each id in ``config.WEIBO_CREATOR_ID_LIST`` the creator info is
        fetched and saved, all of the creator's notes are fetched (stored via
        the ``weibo_store.batch_update_weibo_notes`` callback), and comments
        are crawled for every note that has an ``mblog.id``.

        Raises:
            DataFetchError: when the creator response is truthy but carries no
                ``userInfo``. A falsy response is only logged and the loop
                moves on to the next creator.
        """
        utils.logger.info("[WeiboCrawler.get_creators_and_notes] Begin get weibo creators")
        for user_id in config.WEIBO_CREATOR_ID_LIST:
            creator_info_res: Dict = await self.wb_client.get_creator_info_by_id(creator_id=user_id)
            if not creator_info_res:
                utils.logger.error(f"[WeiboCrawler.get_creators_and_notes] get creator info error, creator_id:{user_id}")
                continue

            creator_info: Dict = creator_info_res.get("userInfo", {})
            utils.logger.info(f"[WeiboCrawler.get_creators_and_notes] creator info: {creator_info}")
            if not creator_info:
                raise DataFetchError("Get creator info error")
            await weibo_store.save_creator(user_id, user_info=creator_info)

            # Get all note information of the creator.
            all_notes_list = await self.wb_client.get_all_notes_by_creator_id(
                creator_id=user_id,
                container_id=creator_info_res.get("lfid_container_id"),
                crawl_interval=0,
                callback=weibo_store.batch_update_weibo_notes,
            )

            note_ids = [
                note_item.get("mblog", {}).get("id")
                for note_item in all_notes_list
                if note_item.get("mblog", {}).get("id")
            ]
            await self.batch_get_notes_comments(note_ids)

    async def create_weibo_client(self, httpx_proxy: Optional[str]) -> WeiboClient:
        """Build a ``WeiboClient`` seeded with the browser context's cookies.

        :param httpx_proxy: proxy passed through to the client, or None
        :return: the configured client
        """
        utils.logger.info("[WeiboCrawler.create_weibo_client] Begin create weibo API client ...")
        cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
        weibo_client_obj = WeiboClient(
            proxy=httpx_proxy,
            headers={
                # NOTE(review): this asks utils for a mobile user agent again
                # instead of reusing self.mobile_user_agent, which the browser
                # was launched with — confirm the two are meant to be independent.
                "User-Agent": utils.get_mobile_user_agent(),
                "Cookie": cookie_str,
                "Origin": "https://m.weibo.cn",
                "Referer": "https://m.weibo.cn",
                "Content-Type": "application/json;charset=UTF-8",
            },
            playwright_page=self.context_page,
            cookie_dict=cookie_dict,
        )
        return weibo_client_obj

    async def launch_browser(
        self,
        chromium: BrowserType,
        playwright_proxy: Optional[Dict],
        user_agent: Optional[str],
        headless: bool = True,
    ) -> BrowserContext:
        """Launch Chromium and return a browser context.

        When ``config.SAVE_LOGIN_STATE`` is truthy a persistent context is
        created under ``<cwd>/browser_data/<USER_DATA_DIR % PLATFORM>``;
        otherwise a fresh browser and a new, non-persistent context are used.
        Both variants use a 1920x1080 viewport.

        :param chromium: Playwright browser type used for the launch
        :param playwright_proxy: proxy settings handed to Playwright, or None
        :param user_agent: user agent applied to the context
        :param headless: whether to launch without a visible window
        :return: the created browser context
        """
        utils.logger.info("[WeiboCrawler.launch_browser] Begin create browser context ...")
        if config.SAVE_LOGIN_STATE:
            user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
            browser_context = await chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                accept_downloads=True,
                headless=headless,
                proxy=playwright_proxy,  # type: ignore
                viewport={
                    "width": 1920,
                    "height": 1080,
                },
                user_agent=user_agent,
            )
            return browser_context

        browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
        browser_context = await browser.new_context(viewport={"width": 1920, "height": 1080}, user_agent=user_agent)
        return browser_context

    async def launch_browser_with_cdp(
        self,
        playwright: Playwright,
        playwright_proxy: Optional[Dict],
        user_agent: Optional[str],
        headless: bool = True,
    ) -> BrowserContext:
        """Launch the browser through ``CDPBrowserManager``.

        On any exception the failure is logged and the method falls back to
        ``launch_browser()`` with the same proxy, user agent and headless flag.

        :param playwright: the active Playwright instance
        :param playwright_proxy: proxy settings handed to the launcher, or None
        :param user_agent: user agent applied to the context
        :param headless: whether to launch without a visible window
        :return: the created browser context
        """
        try:
            self.cdp_manager = CDPBrowserManager()
            browser_context = await self.cdp_manager.launch_and_connect(
                playwright=playwright,
                playwright_proxy=playwright_proxy,
                user_agent=user_agent,
                headless=headless,
            )

            # Log the browser information.
            browser_info = await self.cdp_manager.get_browser_info()
            utils.logger.info(f"[WeiboCrawler] CDP浏览器信息: {browser_info}")

            return browser_context

        except Exception as e:
            utils.logger.error(f"[WeiboCrawler] CDP模式启动失败,回退到标准模式: {e}")
            # Fall back to the standard launch mode.
            # NOTE(review): self.cdp_manager stays set after a failed CDP
            # launch, so close() will call its cleanup() rather than closing
            # the fallback context — confirm that is acceptable.
            chromium = playwright.chromium
            return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)

    async def close(self):
        """Release the browser: CDP cleanup if a manager exists, else close the context."""
        # CDP mode needs its own teardown through the manager.
        if self.cdp_manager:
            await self.cdp_manager.cleanup()
            self.cdp_manager = None
        else:
            await self.browser_context.close()
        utils.logger.info("[WeiboCrawler.close] Browser context closed ...")