core.py
17.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
import asyncio
import os
import random
from asyncio import Task
from typing import Any, Dict, List, Optional, Tuple
from playwright.async_api import (
BrowserContext,
BrowserType,
Page,
Playwright,
async_playwright,
)
import config
from base.base_crawler import AbstractCrawler
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
from store import douyin as douyin_store
from tools import utils
from tools.cdp_browser import CDPBrowserManager
from var import crawler_type_var, source_keyword_var
from .client import DouYinClient
from .exception import DataFetchError
from .field import PublishTimeType
from .login import DouYinLogin
class DouYinCrawler(AbstractCrawler):
context_page: Page
dy_client: DouYinClient
browser_context: BrowserContext
cdp_manager: Optional[CDPBrowserManager]
def __init__(self) -> None:
self.index_url = "https://www.douyin.com"
self.cdp_manager = None
async def start(self) -> None:
playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY:
ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
async with async_playwright() as playwright:
# 根据配置选择启动模式
if config.ENABLE_CDP_MODE:
utils.logger.info("[DouYinCrawler] 使用CDP模式启动浏览器")
self.browser_context = await self.launch_browser_with_cdp(
playwright,
playwright_proxy_format,
None,
headless=config.CDP_HEADLESS,
)
else:
utils.logger.info("[DouYinCrawler] 使用标准模式启动浏览器")
# Launch a browser context.
chromium = playwright.chromium
self.browser_context = await self.launch_browser(
chromium,
playwright_proxy_format,
user_agent=None,
headless=config.HEADLESS,
)
# stealth.min.js is a js script to prevent the website from detecting the crawler.
await self.browser_context.add_init_script(path="libs/stealth.min.js")
self.context_page = await self.browser_context.new_page()
await self.context_page.goto(self.index_url)
self.dy_client = await self.create_douyin_client(httpx_proxy_format)
if not await self.dy_client.pong(browser_context=self.browser_context):
login_obj = DouYinLogin(
login_type=config.LOGIN_TYPE,
login_phone="", # you phone number
browser_context=self.browser_context,
context_page=self.context_page,
cookie_str=config.COOKIES,
)
await login_obj.begin()
await self.dy_client.update_cookies(browser_context=self.browser_context)
crawler_type_var.set(config.CRAWLER_TYPE)
if config.CRAWLER_TYPE == "search":
# Search for notes and retrieve their comment information.
await self.search()
elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post
await self.get_specified_awemes()
elif config.CRAWLER_TYPE == "creator":
# Get the information and comments of the specified creator
await self.get_creators_and_videos()
utils.logger.info("[DouYinCrawler.start] Douyin Crawler finished ...")
async def search(self) -> None:
utils.logger.info("[DouYinCrawler.search] Begin search douyin keywords")
dy_limit_count = 10 # douyin limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < dy_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count
start_page = config.START_PAGE # start page number
for keyword in config.KEYWORDS.split(","):
source_keyword_var.set(keyword)
utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
aweme_list: List[str] = []
page = 0
dy_search_id = ""
while (page - start_page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
if page < start_page:
utils.logger.info(f"[DouYinCrawler.search] Skip {page}")
page += 1
continue
try:
utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page}")
posts_res = await self.dy_client.search_info_by_keyword(
keyword=keyword,
offset=page * dy_limit_count - dy_limit_count,
publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE),
search_id=dy_search_id,
)
if posts_res.get("data") is None or posts_res.get("data") == []:
utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page} is empty,{posts_res.get('data')}`")
break
except DataFetchError:
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed")
break
page += 1
if "data" not in posts_res:
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed,账号也许被风控了。")
break
dy_search_id = posts_res.get("extra", {}).get("logid", "")
for post_item in posts_res.get("data"):
try:
aweme_info: Dict = (post_item.get("aweme_info") or post_item.get("aweme_mix_info", {}).get("mix_items")[0])
except TypeError:
continue
aweme_list.append(aweme_info.get("aweme_id", ""))
await douyin_store.update_douyin_aweme(aweme_item=aweme_info)
await self.get_aweme_media(aweme_item=aweme_info)
utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
await self.batch_get_note_comments(aweme_list)
async def get_specified_awemes(self):
"""Get the information and comments of the specified post"""
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [self.get_aweme_detail(aweme_id=aweme_id, semaphore=semaphore) for aweme_id in config.DY_SPECIFIED_ID_LIST]
aweme_details = await asyncio.gather(*task_list)
for aweme_detail in aweme_details:
if aweme_detail is not None:
await douyin_store.update_douyin_aweme(aweme_item=aweme_detail)
await self.get_aweme_media(aweme_item=aweme_detail)
await self.batch_get_note_comments(config.DY_SPECIFIED_ID_LIST)
async def get_aweme_detail(self, aweme_id: str, semaphore: asyncio.Semaphore) -> Any:
"""Get note detail"""
async with semaphore:
try:
return await self.dy_client.get_video_by_id(aweme_id)
except DataFetchError as ex:
utils.logger.error(f"[DouYinCrawler.get_aweme_detail] Get aweme detail error: {ex}")
return None
except KeyError as ex:
utils.logger.error(f"[DouYinCrawler.get_aweme_detail] have not fund note detail aweme_id:{aweme_id}, err: {ex}")
return None
async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
"""
Batch get note comments
"""
if not config.ENABLE_GET_COMMENTS:
utils.logger.info(f"[DouYinCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
return
task_list: List[Task] = []
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
for aweme_id in aweme_list:
task = asyncio.create_task(self.get_comments(aweme_id, semaphore), name=aweme_id)
task_list.append(task)
if len(task_list) > 0:
await asyncio.wait(task_list)
async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None:
async with semaphore:
try:
# 将关键词列表传递给 get_aweme_all_comments 方法
await self.dy_client.get_aweme_all_comments(
aweme_id=aweme_id,
crawl_interval=random.random(),
is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
callback=douyin_store.batch_update_dy_aweme_comments,
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
)
utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
except DataFetchError as e:
utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}")
async def get_creators_and_videos(self) -> None:
"""
Get the information and videos of the specified creator
"""
utils.logger.info("[DouYinCrawler.get_creators_and_videos] Begin get douyin creators")
for user_id in config.DY_CREATOR_ID_LIST:
creator_info: Dict = await self.dy_client.get_user_info(user_id)
if creator_info:
await douyin_store.save_creator(user_id, creator=creator_info)
# Get all video information of the creator
all_video_list = await self.dy_client.get_all_user_aweme_posts(sec_user_id=user_id, callback=self.fetch_creator_video_detail)
video_ids = [video_item.get("aweme_id") for video_item in all_video_list]
await self.batch_get_note_comments(video_ids)
async def fetch_creator_video_detail(self, video_list: List[Dict]):
"""
Concurrently obtain the specified post list and save the data
"""
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [self.get_aweme_detail(post_item.get("aweme_id"), semaphore) for post_item in video_list]
note_details = await asyncio.gather(*task_list)
for aweme_item in note_details:
if aweme_item is not None:
await douyin_store.update_douyin_aweme(aweme_item=aweme_item)
await self.get_aweme_media(aweme_item=aweme_item)
async def create_douyin_client(self, httpx_proxy: Optional[str]) -> DouYinClient:
"""Create douyin client"""
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) # type: ignore
douyin_client = DouYinClient(
proxy=httpx_proxy,
headers={
"User-Agent": await self.context_page.evaluate("() => navigator.userAgent"),
"Cookie": cookie_str,
"Host": "www.douyin.com",
"Origin": "https://www.douyin.com/",
"Referer": "https://www.douyin.com/",
"Content-Type": "application/json;charset=UTF-8",
},
playwright_page=self.context_page,
cookie_dict=cookie_dict,
)
return douyin_client
async def launch_browser(
self,
chromium: BrowserType,
playwright_proxy: Optional[Dict],
user_agent: Optional[str],
headless: bool = True,
) -> BrowserContext:
"""Launch browser and create browser context"""
if config.SAVE_LOGIN_STATE:
user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM) # type: ignore
browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir,
accept_downloads=True,
headless=headless,
proxy=playwright_proxy, # type: ignore
viewport={
"width": 1920,
"height": 1080
},
user_agent=user_agent,
) # type: ignore
return browser_context
else:
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
browser_context = await browser.new_context(viewport={"width": 1920, "height": 1080}, user_agent=user_agent)
return browser_context
async def launch_browser_with_cdp(
self,
playwright: Playwright,
playwright_proxy: Optional[Dict],
user_agent: Optional[str],
headless: bool = True,
) -> BrowserContext:
"""
使用CDP模式启动浏览器
"""
try:
self.cdp_manager = CDPBrowserManager()
browser_context = await self.cdp_manager.launch_and_connect(
playwright=playwright,
playwright_proxy=playwright_proxy,
user_agent=user_agent,
headless=headless,
)
# 添加反检测脚本
await self.cdp_manager.add_stealth_script()
# 显示浏览器信息
browser_info = await self.cdp_manager.get_browser_info()
utils.logger.info(f"[DouYinCrawler] CDP浏览器信息: {browser_info}")
return browser_context
except Exception as e:
utils.logger.error(f"[DouYinCrawler] CDP模式启动失败,回退到标准模式: {e}")
# 回退到标准模式
chromium = playwright.chromium
return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
async def close(self) -> None:
"""Close browser context"""
# 如果使用CDP模式,需要特殊处理
if self.cdp_manager:
await self.cdp_manager.cleanup()
self.cdp_manager = None
else:
await self.browser_context.close()
utils.logger.info("[DouYinCrawler.close] Browser context closed ...")
async def get_aweme_media(self, aweme_item: Dict):
"""
获取抖音媒体,自动判断媒体类型是短视频还是帖子图片并下载
Args:
aweme_item (Dict): 抖音作品详情
"""
if not config.ENABLE_GET_MEIDAS:
utils.logger.info(f"[DouYinCrawler.get_aweme_media] Crawling image mode is not enabled")
return
# 笔记 urls 列表,若为短视频类型则返回为空列表
note_download_url: List[str] = douyin_store._extract_note_image_list(aweme_item)
# 视频 url,永远存在,但为短视频类型时的文件其实是音频文件
video_download_url: str = douyin_store._extract_video_download_url(aweme_item)
# TODO: 抖音并没采用音视频分离的策略,故音频可从原视频中分离,暂不提取
if note_download_url:
await self.get_aweme_images(aweme_item)
else:
await self.get_aweme_video(aweme_item)
async def get_aweme_images(self, aweme_item: Dict):
"""
get aweme images. please use get_aweme_media
Args:
aweme_item (Dict): 抖音作品详情
"""
if not config.ENABLE_GET_MEIDAS:
return
aweme_id = aweme_item.get("aweme_id")
# 笔记 urls 列表,若为短视频类型则返回为空列表
note_download_url: List[str] = douyin_store._extract_note_image_list(aweme_item)
if not note_download_url:
return
picNum = 0
for url in note_download_url:
if not url:
continue
content = await self.dy_client.get_aweme_media(url)
if content is None:
continue
extension_file_name = f"{picNum:>03d}.jpeg"
picNum += 1
await douyin_store.update_dy_aweme_image(aweme_id, content, extension_file_name)
async def get_aweme_video(self, aweme_item: Dict):
"""
get aweme videos. please use get_aweme_media
Args:
aweme_item (Dict): 抖音作品详情
"""
if not config.ENABLE_GET_MEIDAS:
return
aweme_id = aweme_item.get("aweme_id")
# 视频 url,永远存在,但为短视频类型时的文件其实是音频文件
video_download_url: str = douyin_store._extract_video_download_url(aweme_item)
if not video_download_url:
return
content = await self.dy_client.get_aweme_media(video_download_url)
if content is None:
return
extension_file_name = f"video.mp4"
await douyin_store.update_dy_aweme_video(aweme_id, content, extension_file_name)