Committed by
GitHub
feat(xhs): 集成xhshow库优化签名生成与请求参数 (#330)
* feat(xhs): 集成xhshow库优化签名生成与请求参数 - 引入xhshow库用于小红书API签名生成 - 替换原有的seccore_signv2_playwright签名校验方式 - 支持GET和POST请求的差异化签名处理 - 增加对b1值从localStorage获取的容错处理 - 更新x-t时间戳为毫秒级精度 - 在获取博主笔记接口中增加xsec_token和xsec_source参数 - 支持通过配置传递验证token和渠道来源 - 更新依赖文件引入xhshow库 - 调整配置示例适配新的token参数要求 * Delete MindSpider/DeepSentimentCrawling/MediaCrawler/config/xhs_config.py 移除配置文件 * Add xhs_config.py for Xiaohongshu platform settings 恢复错误删除的文件 --------- Co-authored-by: gehongbin <gehongbin@autohome.com.cn> Co-authored-by: Doiiars <doiiars@qq.com>
Showing
5 changed files
with
64 additions
and
22 deletions
| @@ -17,6 +17,7 @@ from urllib.parse import urlencode | @@ -17,6 +17,7 @@ from urllib.parse import urlencode | ||
| 17 | import httpx | 17 | import httpx |
| 18 | from playwright.async_api import BrowserContext, Page | 18 | from playwright.async_api import BrowserContext, Page |
| 19 | from tenacity import retry, stop_after_attempt, wait_fixed | 19 | from tenacity import retry, stop_after_attempt, wait_fixed |
| 20 | +from xhshow import Xhshow | ||
| 20 | 21 | ||
| 21 | import config | 22 | import config |
| 22 | from base.base_crawler import AbstractApiClient | 23 | from base.base_crawler import AbstractApiClient |
| @@ -27,7 +28,6 @@ from .exception import DataFetchError, IPBlockError | @@ -27,7 +28,6 @@ from .exception import DataFetchError, IPBlockError | ||
| 27 | from .field import SearchNoteType, SearchSortType | 28 | from .field import SearchNoteType, SearchSortType |
| 28 | from .help import get_search_id, sign | 29 | from .help import get_search_id, sign |
| 29 | from .extractor import XiaoHongShuExtractor | 30 | from .extractor import XiaoHongShuExtractor |
| 30 | -from .secsign import seccore_signv2_playwright | ||
| 31 | 31 | ||
| 32 | 32 | ||
| 33 | class XiaoHongShuClient(AbstractApiClient): | 33 | class XiaoHongShuClient(AbstractApiClient): |
| @@ -53,24 +53,51 @@ class XiaoHongShuClient(AbstractApiClient): | @@ -53,24 +53,51 @@ class XiaoHongShuClient(AbstractApiClient): | ||
| 53 | self.playwright_page = playwright_page | 53 | self.playwright_page = playwright_page |
| 54 | self.cookie_dict = cookie_dict | 54 | self.cookie_dict = cookie_dict |
| 55 | self._extractor = XiaoHongShuExtractor() | 55 | self._extractor = XiaoHongShuExtractor() |
| 56 | + # 初始化 xhshow 客户端用于签名生成 | ||
| 57 | + self._xhshow_client = Xhshow() | ||
| 56 | 58 | ||
| 57 | async def _pre_headers(self, url: str, data=None) -> Dict: | 59 | async def _pre_headers(self, url: str, data=None) -> Dict: |
| 58 | """ | 60 | """ |
| 59 | - 请求头参数签名 | 61 | + 请求头参数签名,使用 xhshow 库生成签名 |
| 60 | Args: | 62 | Args: |
| 61 | - url: | ||
| 62 | - data: | 63 | + url: 完整的 URI(GET 请求包含查询参数) |
| 64 | + data: POST 请求的请求体数据 | ||
| 63 | 65 | ||
| 64 | Returns: | 66 | Returns: |
| 65 | 67 | ||
| 66 | """ | 68 | """ |
| 67 | - x_s = await seccore_signv2_playwright(self.playwright_page, url, data) | ||
| 68 | - local_storage = await self.playwright_page.evaluate("() => window.localStorage") | 69 | + # 获取 a1 cookie 值 |
| 70 | + a1_value = self.cookie_dict.get("a1", "") | ||
| 71 | + | ||
| 72 | + # 根据请求类型使用不同的签名方法 | ||
| 73 | + if data is None: | ||
| 74 | + # GET 请求:从 url 中提取参数 | ||
| 75 | + from urllib.parse import urlparse, parse_qs | ||
| 76 | + parsed = urlparse(url) | ||
| 77 | + params = {k: v[0] if len(v) == 1 else v for k, v in parse_qs(parsed.query).items()} | ||
| 78 | + # 使用完整的 URL(包含 host) | ||
| 79 | + full_url = f"{self._host}{url}" | ||
| 80 | + x_s = self._xhshow_client.sign_xs_get(uri=full_url, a1_value=a1_value, params=params) | ||
| 81 | + else: | ||
| 82 | + # POST 请求:使用 data 作为 payload | ||
| 83 | + full_url = f"{self._host}{url}" | ||
| 84 | + x_s = self._xhshow_client.sign_xs_post(uri=full_url, a1_value=a1_value, payload=data) | ||
| 85 | + | ||
| 86 | + # 尝试获取 b1 值(从 localStorage),如果获取失败则使用空字符串 | ||
| 87 | + b1_value = "" | ||
| 88 | + try: | ||
| 89 | + if self.playwright_page: | ||
| 90 | + local_storage = await self.playwright_page.evaluate("() => window.localStorage") | ||
| 91 | + b1_value = local_storage.get("b1", "") | ||
| 92 | + except Exception as e: | ||
| 93 | + utils.logger.warning(f"[XiaoHongShuClient._pre_headers] Failed to get b1 from localStorage: {e}, using empty string") | ||
| 94 | + | ||
| 95 | + # 使用 sign 函数生成其他签名头 | ||
| 69 | signs = sign( | 96 | signs = sign( |
| 70 | - a1=self.cookie_dict.get("a1", ""), | ||
| 71 | - b1=local_storage.get("b1", ""), | 97 | + a1=a1_value, |
| 98 | + b1=b1_value, | ||
| 72 | x_s=x_s, | 99 | x_s=x_s, |
| 73 | - x_t=str(int(time.time())), | 100 | + x_t=str(int(time.time() * 1000)), # x-t 使用毫秒时间戳 |
| 74 | ) | 101 | ) |
| 75 | 102 | ||
| 76 | headers = { | 103 | headers = { |
| @@ -115,7 +142,8 @@ class XiaoHongShuClient(AbstractApiClient): | @@ -115,7 +142,8 @@ class XiaoHongShuClient(AbstractApiClient): | ||
| 115 | elif data["code"] == self.IP_ERROR_CODE: | 142 | elif data["code"] == self.IP_ERROR_CODE: |
| 116 | raise IPBlockError(self.IP_ERROR_STR) | 143 | raise IPBlockError(self.IP_ERROR_STR) |
| 117 | else: | 144 | else: |
| 118 | - raise DataFetchError(data.get("msg", None)) | 145 | + err_msg = data.get("msg", None) or f"{response.text}" |
| 146 | + raise DataFetchError(err_msg) | ||
| 119 | 147 | ||
| 120 | async def get(self, uri: str, params=None) -> Dict: | 148 | async def get(self, uri: str, params=None) -> Dict: |
| 121 | """ | 149 | """ |
| @@ -480,6 +508,8 @@ class XiaoHongShuClient(AbstractApiClient): | @@ -480,6 +508,8 @@ class XiaoHongShuClient(AbstractApiClient): | ||
| 480 | creator: str, | 508 | creator: str, |
| 481 | cursor: str, | 509 | cursor: str, |
| 482 | page_size: int = 30, | 510 | page_size: int = 30, |
| 511 | + xsec_token: str = "", | ||
| 512 | + xsec_source: str = "pc_feed", | ||
| 483 | ) -> Dict: | 513 | ) -> Dict: |
| 484 | """ | 514 | """ |
| 485 | 获取博主的笔记 | 515 | 获取博主的笔记 |
| @@ -487,24 +517,22 @@ class XiaoHongShuClient(AbstractApiClient): | @@ -487,24 +517,22 @@ class XiaoHongShuClient(AbstractApiClient): | ||
| 487 | creator: 博主ID | 517 | creator: 博主ID |
| 488 | cursor: 上一页最后一条笔记的ID | 518 | cursor: 上一页最后一条笔记的ID |
| 489 | page_size: 分页数据长度 | 519 | page_size: 分页数据长度 |
| 520 | + xsec_token: 验证token | ||
| 521 | + xsec_source: 渠道来源 | ||
| 490 | 522 | ||
| 491 | Returns: | 523 | Returns: |
| 492 | 524 | ||
| 493 | """ | 525 | """ |
| 494 | - uri = "/api/sns/web/v1/user_posted" | ||
| 495 | - data = { | ||
| 496 | - "user_id": creator, | ||
| 497 | - "cursor": cursor, | ||
| 498 | - "num": page_size, | ||
| 499 | - "image_formats": "jpg,webp,avif", | ||
| 500 | - } | ||
| 501 | - return await self.get(uri, data) | 526 | + uri = f"/api/sns/web/v1/user_posted?num={page_size}&cursor={cursor}&user_id={creator}&xsec_token={xsec_token}&xsec_source={xsec_source}" |
| 527 | + return await self.get(uri) | ||
| 502 | 528 | ||
| 503 | async def get_all_notes_by_creator( | 529 | async def get_all_notes_by_creator( |
| 504 | self, | 530 | self, |
| 505 | user_id: str, | 531 | user_id: str, |
| 506 | crawl_interval: float = 1.0, | 532 | crawl_interval: float = 1.0, |
| 507 | callback: Optional[Callable] = None, | 533 | callback: Optional[Callable] = None, |
| 534 | + xsec_token: str = "", | ||
| 535 | + xsec_source: str = "pc_feed", | ||
| 508 | ) -> List[Dict]: | 536 | ) -> List[Dict]: |
| 509 | """ | 537 | """ |
| 510 | 获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息 | 538 | 获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息 |
| @@ -512,6 +540,8 @@ class XiaoHongShuClient(AbstractApiClient): | @@ -512,6 +540,8 @@ class XiaoHongShuClient(AbstractApiClient): | ||
| 512 | user_id: 用户ID | 540 | user_id: 用户ID |
| 513 | crawl_interval: 爬取一次的延迟单位(秒) | 541 | crawl_interval: 爬取一次的延迟单位(秒) |
| 514 | callback: 一次分页爬取结束后的更新回调函数 | 542 | callback: 一次分页爬取结束后的更新回调函数 |
| 543 | + xsec_token: 验证token | ||
| 544 | + xsec_source: 渠道来源 | ||
| 515 | 545 | ||
| 516 | Returns: | 546 | Returns: |
| 517 | 547 | ||
| @@ -520,7 +550,7 @@ class XiaoHongShuClient(AbstractApiClient): | @@ -520,7 +550,7 @@ class XiaoHongShuClient(AbstractApiClient): | ||
| 520 | notes_has_more = True | 550 | notes_has_more = True |
| 521 | notes_cursor = "" | 551 | notes_cursor = "" |
| 522 | while notes_has_more and len(result) < config.CRAWLER_MAX_NOTES_COUNT: | 552 | while notes_has_more and len(result) < config.CRAWLER_MAX_NOTES_COUNT: |
| 523 | - notes_res = await self.get_notes_by_creator(user_id, notes_cursor) | 553 | + notes_res = await self.get_notes_by_creator(user_id, notes_cursor, xsec_token=xsec_token, xsec_source=xsec_source) |
| 524 | if not notes_res: | 554 | if not notes_res: |
| 525 | utils.logger.error( | 555 | utils.logger.error( |
| 526 | f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data." | 556 | f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data." |
| @@ -201,6 +201,8 @@ class XiaoHongShuCrawler(AbstractCrawler): | @@ -201,6 +201,8 @@ class XiaoHongShuCrawler(AbstractCrawler): | ||
| 201 | user_id=user_id, | 201 | user_id=user_id, |
| 202 | crawl_interval=crawl_interval, | 202 | crawl_interval=crawl_interval, |
| 203 | callback=self.fetch_creator_notes_detail, | 203 | callback=self.fetch_creator_notes_detail, |
| 204 | + xsec_token=creator_info.xsec_token, | ||
| 205 | + xsec_source=creator_info.xsec_source, | ||
| 204 | ) | 206 | ) |
| 205 | 207 | ||
| 206 | note_ids = [] | 208 | note_ids = [] |
| @@ -279,12 +281,19 @@ class XiaoHongShuCrawler(AbstractCrawler): | @@ -279,12 +281,19 @@ class XiaoHongShuCrawler(AbstractCrawler): | ||
| 279 | Dict: note detail | 281 | Dict: note detail |
| 280 | """ | 282 | """ |
| 281 | note_detail = None | 283 | note_detail = None |
| 284 | + utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}") | ||
| 282 | async with semaphore: | 285 | async with semaphore: |
| 283 | try: | 286 | try: |
| 284 | - utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}") | ||
| 285 | - note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, enable_cookie=True) | 287 | + try: |
| 288 | + note_detail = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token) | ||
| 289 | + except RetryError: | ||
| 290 | + pass | ||
| 291 | + | ||
| 286 | if not note_detail: | 292 | if not note_detail: |
| 287 | - raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}") | 293 | + note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, |
| 294 | + enable_cookie=True) | ||
| 295 | + if not note_detail: | ||
| 296 | + raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}") | ||
| 288 | 297 | ||
| 289 | note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source}) | 298 | note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source}) |
| 290 | 299 |
-
Please register or login to post a comment