老葛
Committed by GitHub

feat(xhs): 集成xhshow库优化签名生成与请求参数 (#330)

* feat(xhs): 集成xhshow库优化签名生成与请求参数

- 引入xhshow库用于小红书API签名生成
- 替换原有的seccore_signv2_playwright签名校验方式
- 支持GET和POST请求的差异化签名处理
- 增加对b1值从localStorage获取的容错处理
- 更新x-t时间戳为毫秒级精度
- 在获取博主笔记接口中增加xsec_token和xsec_source参数
- 支持通过配置传递验证token和渠道来源
- 更新依赖文件引入xhshow库
- 调整配置示例适配新的token参数要求

* Delete MindSpider/DeepSentimentCrawling/MediaCrawler/config/xhs_config.py

移除配置文件

* Add xhs_config.py for Xiaohongshu platform settings

恢复错误删除的文件

---------

Co-authored-by: gehongbin <gehongbin@autohome.com.cn>
Co-authored-by: Doiiars <doiiars@qq.com>
@@ -17,6 +17,7 @@ from urllib.parse import urlencode
17 import httpx 17 import httpx
18 from playwright.async_api import BrowserContext, Page 18 from playwright.async_api import BrowserContext, Page
19 from tenacity import retry, stop_after_attempt, wait_fixed 19 from tenacity import retry, stop_after_attempt, wait_fixed
  20 +from xhshow import Xhshow
20 21
21 import config 22 import config
22 from base.base_crawler import AbstractApiClient 23 from base.base_crawler import AbstractApiClient
@@ -27,7 +28,6 @@ from .exception import DataFetchError, IPBlockError
27 from .field import SearchNoteType, SearchSortType 28 from .field import SearchNoteType, SearchSortType
28 from .help import get_search_id, sign 29 from .help import get_search_id, sign
29 from .extractor import XiaoHongShuExtractor 30 from .extractor import XiaoHongShuExtractor
30 -from .secsign import seccore_signv2_playwright  
31 31
32 32
33 class XiaoHongShuClient(AbstractApiClient): 33 class XiaoHongShuClient(AbstractApiClient):
@@ -53,24 +53,51 @@ class XiaoHongShuClient(AbstractApiClient):
53 self.playwright_page = playwright_page 53 self.playwright_page = playwright_page
54 self.cookie_dict = cookie_dict 54 self.cookie_dict = cookie_dict
55 self._extractor = XiaoHongShuExtractor() 55 self._extractor = XiaoHongShuExtractor()
  56 + # 初始化 xhshow 客户端用于签名生成
  57 + self._xhshow_client = Xhshow()
56 58
57 async def _pre_headers(self, url: str, data=None) -> Dict: 59 async def _pre_headers(self, url: str, data=None) -> Dict:
58 """ 60 """
59 - 请求头参数签名 61 + 请求头参数签名,使用 xhshow 库生成签名
60 Args: 62 Args:
61 - url:  
62 - data: 63 + url: 完整的 URI(GET 请求包含查询参数)
  64 + data: POST 请求的请求体数据
63 65
64 Returns: 66 Returns:
65 67
66 """ 68 """
67 - x_s = await seccore_signv2_playwright(self.playwright_page, url, data)  
68 - local_storage = await self.playwright_page.evaluate("() => window.localStorage") 69 + # 获取 a1 cookie 值
  70 + a1_value = self.cookie_dict.get("a1", "")
  71 +
  72 + # 根据请求类型使用不同的签名方法
  73 + if data is None:
  74 + # GET 请求:从 url 中提取参数
  75 + from urllib.parse import urlparse, parse_qs
  76 + parsed = urlparse(url)
  77 + params = {k: v[0] if len(v) == 1 else v for k, v in parse_qs(parsed.query).items()}
  78 + # 使用完整的 URL(包含 host)
  79 + full_url = f"{self._host}{url}"
  80 + x_s = self._xhshow_client.sign_xs_get(uri=full_url, a1_value=a1_value, params=params)
  81 + else:
  82 + # POST 请求:使用 data 作为 payload
  83 + full_url = f"{self._host}{url}"
  84 + x_s = self._xhshow_client.sign_xs_post(uri=full_url, a1_value=a1_value, payload=data)
  85 +
  86 + # 尝试获取 b1 值(从 localStorage),如果获取失败则使用空字符串
  87 + b1_value = ""
  88 + try:
  89 + if self.playwright_page:
  90 + local_storage = await self.playwright_page.evaluate("() => window.localStorage")
  91 + b1_value = local_storage.get("b1", "")
  92 + except Exception as e:
  93 + utils.logger.warning(f"[XiaoHongShuClient._pre_headers] Failed to get b1 from localStorage: {e}, using empty string")
  94 +
  95 + # 使用 sign 函数生成其他签名头
69 signs = sign( 96 signs = sign(
70 - a1=self.cookie_dict.get("a1", ""),  
71 - b1=local_storage.get("b1", ""), 97 + a1=a1_value,
  98 + b1=b1_value,
72 x_s=x_s, 99 x_s=x_s,
73 - x_t=str(int(time.time())), 100 + x_t=str(int(time.time() * 1000)), # x-t 使用毫秒时间戳
74 ) 101 )
75 102
76 headers = { 103 headers = {
@@ -115,7 +142,8 @@ class XiaoHongShuClient(AbstractApiClient):
115 elif data["code"] == self.IP_ERROR_CODE: 142 elif data["code"] == self.IP_ERROR_CODE:
116 raise IPBlockError(self.IP_ERROR_STR) 143 raise IPBlockError(self.IP_ERROR_STR)
117 else: 144 else:
118 - raise DataFetchError(data.get("msg", None)) 145 + err_msg = data.get("msg", None) or f"{response.text}"
  146 + raise DataFetchError(err_msg)
119 147
120 async def get(self, uri: str, params=None) -> Dict: 148 async def get(self, uri: str, params=None) -> Dict:
121 """ 149 """
@@ -480,6 +508,8 @@ class XiaoHongShuClient(AbstractApiClient):
480 creator: str, 508 creator: str,
481 cursor: str, 509 cursor: str,
482 page_size: int = 30, 510 page_size: int = 30,
  511 + xsec_token: str = "",
  512 + xsec_source: str = "pc_feed",
483 ) -> Dict: 513 ) -> Dict:
484 """ 514 """
485 获取博主的笔记 515 获取博主的笔记
@@ -487,24 +517,22 @@ class XiaoHongShuClient(AbstractApiClient):
487 creator: 博主ID 517 creator: 博主ID
488 cursor: 上一页最后一条笔记的ID 518 cursor: 上一页最后一条笔记的ID
489 page_size: 分页数据长度 519 page_size: 分页数据长度
  520 + xsec_token: 验证token
  521 + xsec_source: 渠道来源
490 522
491 Returns: 523 Returns:
492 524
493 """ 525 """
494 - uri = "/api/sns/web/v1/user_posted"  
495 - data = {  
496 - "user_id": creator,  
497 - "cursor": cursor,  
498 - "num": page_size,  
499 - "image_formats": "jpg,webp,avif",  
500 - }  
501 - return await self.get(uri, data) 526 + uri = f"/api/sns/web/v1/user_posted?num={page_size}&cursor={cursor}&user_id={creator}&xsec_token={xsec_token}&xsec_source={xsec_source}"
  527 + return await self.get(uri)
502 528
503 async def get_all_notes_by_creator( 529 async def get_all_notes_by_creator(
504 self, 530 self,
505 user_id: str, 531 user_id: str,
506 crawl_interval: float = 1.0, 532 crawl_interval: float = 1.0,
507 callback: Optional[Callable] = None, 533 callback: Optional[Callable] = None,
  534 + xsec_token: str = "",
  535 + xsec_source: str = "pc_feed",
508 ) -> List[Dict]: 536 ) -> List[Dict]:
509 """ 537 """
510 获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息 538 获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息
@@ -512,6 +540,8 @@ class XiaoHongShuClient(AbstractApiClient):
512 user_id: 用户ID 540 user_id: 用户ID
513 crawl_interval: 爬取一次的延迟单位(秒) 541 crawl_interval: 爬取一次的延迟单位(秒)
514 callback: 一次分页爬取结束后的更新回调函数 542 callback: 一次分页爬取结束后的更新回调函数
  543 + xsec_token: 验证token
  544 + xsec_source: 渠道来源
515 545
516 Returns: 546 Returns:
517 547
@@ -520,7 +550,7 @@ class XiaoHongShuClient(AbstractApiClient):
520 notes_has_more = True 550 notes_has_more = True
521 notes_cursor = "" 551 notes_cursor = ""
522 while notes_has_more and len(result) < config.CRAWLER_MAX_NOTES_COUNT: 552 while notes_has_more and len(result) < config.CRAWLER_MAX_NOTES_COUNT:
523 - notes_res = await self.get_notes_by_creator(user_id, notes_cursor) 553 + notes_res = await self.get_notes_by_creator(user_id, notes_cursor, xsec_token=xsec_token, xsec_source=xsec_source)
524 if not notes_res: 554 if not notes_res:
525 utils.logger.error( 555 utils.logger.error(
526 f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data." 556 f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data."
@@ -201,6 +201,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
201 user_id=user_id, 201 user_id=user_id,
202 crawl_interval=crawl_interval, 202 crawl_interval=crawl_interval,
203 callback=self.fetch_creator_notes_detail, 203 callback=self.fetch_creator_notes_detail,
  204 + xsec_token=creator_info.xsec_token,
  205 + xsec_source=creator_info.xsec_source,
204 ) 206 )
205 207
206 note_ids = [] 208 note_ids = []
@@ -279,12 +281,19 @@ class XiaoHongShuCrawler(AbstractCrawler):
279 Dict: note detail 281 Dict: note detail
280 """ 282 """
281 note_detail = None 283 note_detail = None
  284 + utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}")
282 async with semaphore: 285 async with semaphore:
283 try: 286 try:
284 - utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}")  
285 - note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, enable_cookie=True) 287 + try:
  288 + note_detail = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
  289 + except RetryError:
  290 + pass
  291 +
286 if not note_detail: 292 if not note_detail:
287 - raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}") 293 + note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token,
  294 + enable_cookie=True)
  295 + if not note_detail:
  296 + raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
288 297
289 note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source}) 298 note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source})
290 299
@@ -24,3 +24,4 @@ cryptography>=45.0.7
24 alembic>=1.16.5 24 alembic>=1.16.5
25 asyncmy>=0.2.10 25 asyncmy>=0.2.10
26 sqlalchemy>=2.0.43 26 sqlalchemy>=2.0.43
  27 +xhshow>=0.1.3
@@ -49,6 +49,7 @@ parsel==1.9.1
49 pyexecjs==1.5.1 49 pyexecjs==1.5.1
50 typer>=0.12.3 50 typer>=0.12.3
51 pyhumps==3.8.0 51 pyhumps==3.8.0
  52 +xhshow>=0.1.3
52 53
53 # =============================== 54 # ===============================
54 # 工具包 55 # 工具包
@@ -48,6 +48,7 @@ beautifulsoup4>=4.12.0
48 lxml>=4.9.0 48 lxml>=4.9.0
49 parsel==1.9.1 49 parsel==1.9.1
50 pyexecjs==1.5.1 50 pyexecjs==1.5.1
  51 +xhshow>=0.1.3
51 52
52 # ===== 可视化 ===== 53 # ===== 可视化 =====
53 plotly>=5.17.0 54 plotly>=5.17.0