Committed by
GitHub
Merge pull request #3 from Angiin/feat/anti-detection
feat: 增强反检测能力 — JS 伪装、CDP 真实交互、随机延迟
Showing
13 changed files
with
536 additions
and
148 deletions
| @@ -2,14 +2,17 @@ | @@ -2,14 +2,17 @@ | ||
| 2 | 2 | ||
| 3 | from __future__ import annotations | 3 | from __future__ import annotations |
| 4 | 4 | ||
| 5 | +import contextlib | ||
| 5 | import json | 6 | import json |
| 6 | import logging | 7 | import logging |
| 7 | import os | 8 | import os |
| 8 | import platform | 9 | import platform |
| 9 | import shutil | 10 | import shutil |
| 10 | -import signal | 11 | +import socket |
| 11 | import subprocess | 12 | import subprocess |
| 13 | +import sys | ||
| 12 | import time | 14 | import time |
| 15 | +from pathlib import Path | ||
| 13 | 16 | ||
| 14 | from xhs.stealth import STEALTH_ARGS | 17 | from xhs.stealth import STEALTH_ARGS |
| 15 | 18 | ||
| @@ -18,6 +21,9 @@ logger = logging.getLogger(__name__) | @@ -18,6 +21,9 @@ logger = logging.getLogger(__name__) | ||
| 18 | # 默认远程调试端口 | 21 | # 默认远程调试端口 |
| 19 | DEFAULT_PORT = 9222 | 22 | DEFAULT_PORT = 9222 |
| 20 | 23 | ||
| 24 | +# 全局进程追踪 | ||
| 25 | +_chrome_process: subprocess.Popen | None = None | ||
| 26 | + | ||
| 21 | # 各平台 Chrome 默认路径 | 27 | # 各平台 Chrome 默认路径 |
| 22 | _CHROME_PATHS: dict[str, list[str]] = { | 28 | _CHROME_PATHS: dict[str, list[str]] = { |
| 23 | "Darwin": [ | 29 | "Darwin": [ |
| @@ -38,6 +44,22 @@ _CHROME_PATHS: dict[str, list[str]] = { | @@ -38,6 +44,22 @@ _CHROME_PATHS: dict[str, list[str]] = { | ||
| 38 | } | 44 | } |
| 39 | 45 | ||
| 40 | 46 | ||
| 47 | +def _get_default_data_dir() -> str: | ||
| 48 | + """返回默认 Chrome Profile 目录路径。""" | ||
| 49 | + return str(Path.home() / ".xhs" / "chrome-profile") | ||
| 50 | + | ||
| 51 | + | ||
def is_port_open(port: int, host: str = "127.0.0.1", timeout: float = 1.0) -> bool:
    """Report whether a TCP connection to ``host:port`` can be established.

    Args:
        port: TCP port to probe.
        host: Host name or address to connect to.
        timeout: Seconds to wait for the connection attempt.

    Returns:
        True if the connection succeeded; False if it was refused, timed out,
        or failed with any other socket-level error.
    """
    try:
        # create_connection resolves the host (IPv4 or IPv6) and the context
        # manager closes the probe socket as soon as the connect succeeds.
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        # ConnectionRefusedError, TimeoutError / socket.timeout and name
        # resolution failures are all OSError subclasses.
        return False
| 61 | + | ||
| 62 | + | ||
| 41 | def find_chrome() -> str | None: | 63 | def find_chrome() -> str | None: |
| 42 | """查找 Chrome 可执行文件路径。""" | 64 | """查找 Chrome 可执行文件路径。""" |
| 43 | # 环境变量优先 | 65 | # 环境变量优先 |
| @@ -45,13 +67,28 @@ def find_chrome() -> str | None: | @@ -45,13 +67,28 @@ def find_chrome() -> str | None: | ||
| 45 | if env_path and os.path.isfile(env_path): | 67 | if env_path and os.path.isfile(env_path): |
| 46 | return env_path | 68 | return env_path |
| 47 | 69 | ||
| 48 | - # which/where 查找 | ||
| 49 | - chrome = shutil.which("google-chrome") or shutil.which("chromium") | 70 | + # which/where 查找(含 Windows chrome.exe) |
| 71 | + chrome = ( | ||
| 72 | + shutil.which("google-chrome") | ||
| 73 | + or shutil.which("chromium") | ||
| 74 | + or shutil.which("chrome") | ||
| 75 | + or shutil.which("chrome.exe") | ||
| 76 | + ) | ||
| 50 | if chrome: | 77 | if chrome: |
| 51 | return chrome | 78 | return chrome |
| 52 | 79 | ||
| 53 | # 平台默认路径 | 80 | # 平台默认路径 |
| 54 | system = platform.system() | 81 | system = platform.system() |
| 82 | + | ||
| 83 | + # Windows: 额外检查环境变量路径 | ||
| 84 | + if system == "Windows": | ||
| 85 | + for env_var in ("PROGRAMFILES", "PROGRAMFILES(X86)", "LOCALAPPDATA"): | ||
| 86 | + base = os.environ.get(env_var, "") | ||
| 87 | + if base: | ||
| 88 | + candidate = os.path.join(base, "Google", "Chrome", "Application", "chrome.exe") | ||
| 89 | + if os.path.isfile(candidate): | ||
| 90 | + return candidate | ||
| 91 | + | ||
| 55 | for path in _CHROME_PATHS.get(system, []): | 92 | for path in _CHROME_PATHS.get(system, []): |
| 56 | if os.path.isfile(path): | 93 | if os.path.isfile(path): |
| 57 | return path | 94 | return path |
| @@ -59,55 +96,70 @@ def find_chrome() -> str | None: | @@ -59,55 +96,70 @@ def find_chrome() -> str | None: | ||
| 59 | return None | 96 | return None |
| 60 | 97 | ||
| 61 | 98 | ||
def is_chrome_running(port: int = DEFAULT_PORT) -> bool:
    """Return True when something accepts TCP connections on ``port``.

    This is a plain TCP probe delegated to ``is_port_open``; it does not
    inspect what kind of process is listening.
    """
    port_is_listening = is_port_open(port)
    return port_is_listening
| 102 | + | ||
| 103 | + | ||
| 62 | def launch_chrome( | 104 | def launch_chrome( |
| 63 | port: int = DEFAULT_PORT, | 105 | port: int = DEFAULT_PORT, |
| 64 | headless: bool = False, | 106 | headless: bool = False, |
| 65 | user_data_dir: str | None = None, | 107 | user_data_dir: str | None = None, |
| 66 | chrome_bin: str | None = None, | 108 | chrome_bin: str | None = None, |
| 67 | -) -> subprocess.Popen: | 109 | +) -> subprocess.Popen | None: |
| 68 | """启动 Chrome 进程(带远程调试端口)。 | 110 | """启动 Chrome 进程(带远程调试端口)。 |
| 69 | 111 | ||
| 70 | Args: | 112 | Args: |
| 71 | port: 远程调试端口。 | 113 | port: 远程调试端口。 |
| 72 | headless: 是否无头模式。 | 114 | headless: 是否无头模式。 |
| 73 | - user_data_dir: 用户数据目录(Profile 隔离)。 | 115 | + user_data_dir: 用户数据目录(Profile 隔离),默认 ~/.xhs/chrome-profile。 |
| 74 | chrome_bin: Chrome 可执行文件路径。 | 116 | chrome_bin: Chrome 可执行文件路径。 |
| 75 | 117 | ||
| 76 | Returns: | 118 | Returns: |
| 77 | - Chrome 子进程。 | 119 | + Chrome 子进程,若已在运行则返回 None。 |
| 78 | 120 | ||
| 79 | Raises: | 121 | Raises: |
| 80 | FileNotFoundError: 未找到 Chrome。 | 122 | FileNotFoundError: 未找到 Chrome。 |
| 81 | """ | 123 | """ |
| 124 | + global _chrome_process | ||
| 125 | + | ||
| 126 | + # 已在运行则跳过 | ||
| 127 | + if is_port_open(port): | ||
| 128 | + logger.info("Chrome 已在运行 (port=%d),跳过启动", port) | ||
| 129 | + return None | ||
| 130 | + | ||
| 82 | if not chrome_bin: | 131 | if not chrome_bin: |
| 83 | chrome_bin = find_chrome() | 132 | chrome_bin = find_chrome() |
| 84 | if not chrome_bin: | 133 | if not chrome_bin: |
| 85 | raise FileNotFoundError("未找到 Chrome,请设置 CHROME_BIN 环境变量或安装 Chrome") | 134 | raise FileNotFoundError("未找到 Chrome,请设置 CHROME_BIN 环境变量或安装 Chrome") |
| 86 | 135 | ||
| 136 | + # 默认 user-data-dir | ||
| 137 | + if not user_data_dir: | ||
| 138 | + user_data_dir = _get_default_data_dir() | ||
| 139 | + | ||
| 87 | args = [ | 140 | args = [ |
| 88 | chrome_bin, | 141 | chrome_bin, |
| 89 | f"--remote-debugging-port={port}", | 142 | f"--remote-debugging-port={port}", |
| 143 | + f"--user-data-dir={user_data_dir}", | ||
| 90 | *STEALTH_ARGS, | 144 | *STEALTH_ARGS, |
| 91 | ] | 145 | ] |
| 92 | 146 | ||
| 93 | if headless: | 147 | if headless: |
| 94 | args.append("--headless=new") | 148 | args.append("--headless=new") |
| 95 | 149 | ||
| 96 | - if user_data_dir: | ||
| 97 | - args.append(f"--user-data-dir={user_data_dir}") | ||
| 98 | - | ||
| 99 | # 代理 | 150 | # 代理 |
| 100 | proxy = os.getenv("XHS_PROXY") | 151 | proxy = os.getenv("XHS_PROXY") |
| 101 | if proxy: | 152 | if proxy: |
| 102 | args.append(f"--proxy-server={proxy}") | 153 | args.append(f"--proxy-server={proxy}") |
| 103 | logger.info("使用代理: %s", _mask_proxy(proxy)) | 154 | logger.info("使用代理: %s", _mask_proxy(proxy)) |
| 104 | 155 | ||
| 105 | - logger.info("启动 Chrome: port=%d, headless=%s", port, headless) | 156 | + logger.info("启动 Chrome: port=%d, headless=%s, profile=%s", port, headless, user_data_dir) |
| 106 | process = subprocess.Popen( | 157 | process = subprocess.Popen( |
| 107 | args, | 158 | args, |
| 108 | stdout=subprocess.DEVNULL, | 159 | stdout=subprocess.DEVNULL, |
| 109 | stderr=subprocess.DEVNULL, | 160 | stderr=subprocess.DEVNULL, |
| 110 | ) | 161 | ) |
| 162 | + _chrome_process = process | ||
| 111 | 163 | ||
| 112 | # 等待 Chrome 准备就绪 | 164 | # 等待 Chrome 准备就绪 |
| 113 | _wait_for_chrome(port) | 165 | _wait_for_chrome(port) |
| @@ -120,7 +172,7 @@ def close_chrome(process: subprocess.Popen) -> None: | @@ -120,7 +172,7 @@ def close_chrome(process: subprocess.Popen) -> None: | ||
| 120 | return | 172 | return |
| 121 | 173 | ||
| 122 | try: | 174 | try: |
| 123 | - process.send_signal(signal.SIGTERM) | 175 | + process.terminate() |
| 124 | process.wait(timeout=5) | 176 | process.wait(timeout=5) |
| 125 | except (subprocess.TimeoutExpired, OSError): | 177 | except (subprocess.TimeoutExpired, OSError): |
| 126 | process.kill() | 178 | process.kill() |
| @@ -129,29 +181,20 @@ def close_chrome(process: subprocess.Popen) -> None: | @@ -129,29 +181,20 @@ def close_chrome(process: subprocess.Popen) -> None: | ||
| 129 | logger.info("Chrome 进程已关闭") | 181 | logger.info("Chrome 进程已关闭") |
| 130 | 182 | ||
| 131 | 183 | ||
| 132 | -def is_chrome_running(port: int = DEFAULT_PORT) -> bool: | ||
| 133 | - """检查指定端口的 Chrome 是否在运行。""" | ||
| 134 | - import requests | ||
| 135 | - | ||
| 136 | - try: | ||
| 137 | - resp = requests.get(f"http://127.0.0.1:{port}/json/version", timeout=2) | ||
| 138 | - return resp.status_code == 200 | ||
| 139 | - except (requests.ConnectionError, requests.Timeout): | ||
| 140 | - return False | ||
| 141 | - | ||
| 142 | - | ||
| 143 | def kill_chrome(port: int = DEFAULT_PORT) -> None: | 184 | def kill_chrome(port: int = DEFAULT_PORT) -> None: |
| 144 | """关闭指定端口的 Chrome 实例。 | 185 | """关闭指定端口的 Chrome 实例。 |
| 145 | 186 | ||
| 146 | - 尝试通过 CDP Browser.close 命令关闭,失败则使用进程信号。 | 187 | + 策略: CDP Browser.close → terminate 追踪进程 → 端口查找终止进程。 |
| 147 | 188 | ||
| 148 | Args: | 189 | Args: |
| 149 | port: Chrome 调试端口。 | 190 | port: Chrome 调试端口。 |
| 150 | """ | 191 | """ |
| 151 | - import requests | 192 | + global _chrome_process |
| 152 | 193 | ||
| 153 | # 策略1: 通过 CDP 关闭 | 194 | # 策略1: 通过 CDP 关闭 |
| 154 | try: | 195 | try: |
| 196 | + import requests | ||
| 197 | + | ||
| 155 | resp = requests.get(f"http://127.0.0.1:{port}/json/version", timeout=2) | 198 | resp = requests.get(f"http://127.0.0.1:{port}/json/version", timeout=2) |
| 156 | if resp.status_code == 200: | 199 | if resp.status_code == 200: |
| 157 | ws_url = resp.json().get("webSocketDebuggerUrl") | 200 | ws_url = resp.json().get("webSocketDebuggerUrl") |
| @@ -163,32 +206,70 @@ def kill_chrome(port: int = DEFAULT_PORT) -> None: | @@ -163,32 +206,70 @@ def kill_chrome(port: int = DEFAULT_PORT) -> None: | ||
| 163 | ws.close() | 206 | ws.close() |
| 164 | logger.info("通过 CDP Browser.close 关闭 Chrome (port=%d)", port) | 207 | logger.info("通过 CDP Browser.close 关闭 Chrome (port=%d)", port) |
| 165 | time.sleep(1) | 208 | time.sleep(1) |
| 166 | - return | ||
| 167 | except Exception: | 209 | except Exception: |
| 168 | pass | 210 | pass |
| 169 | 211 | ||
| 170 | - # 策略2: 通过 lsof 查找并 kill 进程 | ||
| 171 | - try: | ||
| 172 | - result = subprocess.run( | ||
| 173 | - ["lsof", "-ti", f":{port}"], | ||
| 174 | - capture_output=True, | ||
| 175 | - text=True, | ||
| 176 | - timeout=5, | ||
| 177 | - ) | ||
| 178 | - if result.returncode == 0 and result.stdout.strip(): | ||
| 179 | - import contextlib | ||
| 180 | - | ||
| 181 | - pids = result.stdout.strip().split("\n") | 212 | + # 策略2: terminate 追踪的子进程 |
| 213 | + if _chrome_process and _chrome_process.poll() is None: | ||
| 214 | + try: | ||
| 215 | + _chrome_process.terminate() | ||
| 216 | + _chrome_process.wait(timeout=5) | ||
| 217 | + logger.info("通过 terminate 关闭追踪的 Chrome 进程") | ||
| 218 | + except Exception: | ||
| 219 | + with contextlib.suppress(Exception): | ||
| 220 | + _chrome_process.kill() | ||
| 221 | + _chrome_process = None | ||
| 222 | + | ||
| 223 | + # 策略3: 通过端口查找并终止进程(跨平台) | ||
| 224 | + if is_port_open(port): | ||
| 225 | + pids = _find_pids_by_port(port) | ||
| 226 | + if pids: | ||
| 182 | for pid in pids: | 227 | for pid in pids: |
| 183 | - with contextlib.suppress(OSError, ValueError): | ||
| 184 | - os.kill(int(pid), signal.SIGTERM) | ||
| 185 | - logger.info("通过 SIGTERM 关闭 Chrome 进程 (port=%d)", port) | ||
| 186 | - time.sleep(1) | 228 | + _kill_pid(pid) |
| 229 | + logger.info("通过进程终止关闭 Chrome (port=%d)", port) | ||
| 230 | + | ||
| 231 | + # 等待端口释放(最多 5s) | ||
| 232 | + deadline = time.monotonic() + 5 | ||
| 233 | + while time.monotonic() < deadline: | ||
| 234 | + if not is_port_open(port): | ||
| 187 | return | 235 | return |
| 188 | - except Exception: | ||
| 189 | - pass | 236 | + time.sleep(0.5) |
| 237 | + | ||
| 238 | + if is_port_open(port): | ||
| 239 | + logger.warning("端口 %d 仍被占用,kill 可能未完全生效", port) | ||
| 240 | + | ||
| 190 | 241 | ||
def ensure_chrome(
    port: int = DEFAULT_PORT,
    headless: bool = False,
    user_data_dir: str | None = None,
    chrome_bin: str | None = None,
) -> bool:
    """Make sure Chrome is reachable on the debugging port (one-stop entry point).

    If the port already accepts connections, nothing is launched. Otherwise
    Chrome is started through ``launch_chrome`` and the port is probed again.

    Args:
        port: Remote debugging port.
        headless: Whether to run headless (only applies to a fresh launch).
        user_data_dir: User data directory passed on to ``launch_chrome``.
        chrome_bin: Path to the Chrome executable.

    Returns:
        True if the port is open afterwards, False if Chrome could not be
        started or the port is still closed after the launch attempt.
    """
    if is_port_open(port):
        return True

    try:
        launch_chrome(
            port=port,
            headless=headless,
            user_data_dir=user_data_dir,
            chrome_bin=chrome_bin,
        )
    except OSError as e:
        # FileNotFoundError (no Chrome binary found) is an OSError; catching
        # the base class also covers spawn failures such as PermissionError,
        # so a failed launch is reported as False instead of raising.
        logger.error("启动 Chrome 失败: %s", e)
        return False
    return is_port_open(port)
| 192 | 273 | ||
| 193 | 274 | ||
| 194 | def restart_chrome( | 275 | def restart_chrome( |
| @@ -196,7 +277,7 @@ def restart_chrome( | @@ -196,7 +277,7 @@ def restart_chrome( | ||
| 196 | headless: bool = False, | 277 | headless: bool = False, |
| 197 | user_data_dir: str | None = None, | 278 | user_data_dir: str | None = None, |
| 198 | chrome_bin: str | None = None, | 279 | chrome_bin: str | None = None, |
| 199 | -) -> subprocess.Popen: | 280 | +) -> subprocess.Popen | None: |
| 200 | """重启 Chrome:关闭当前实例后以新模式重新启动。 | 281 | """重启 Chrome:关闭当前实例后以新模式重新启动。 |
| 201 | 282 | ||
| 202 | Args: | 283 | Args: |
| @@ -206,7 +287,7 @@ def restart_chrome( | @@ -206,7 +287,7 @@ def restart_chrome( | ||
| 206 | chrome_bin: Chrome 可执行文件路径。 | 287 | chrome_bin: Chrome 可执行文件路径。 |
| 207 | 288 | ||
| 208 | Returns: | 289 | Returns: |
| 209 | - 新的 Chrome 子进程。 | 290 | + 新的 Chrome 子进程,或 None。 |
| 210 | """ | 291 | """ |
| 211 | logger.info("重启 Chrome: port=%d, headless=%s", port, headless) | 292 | logger.info("重启 Chrome: port=%d, headless=%s", port, headless) |
| 212 | kill_chrome(port) | 293 | kill_chrome(port) |
| @@ -220,16 +301,70 @@ def restart_chrome( | @@ -220,16 +301,70 @@ def restart_chrome( | ||
| 220 | 301 | ||
| 221 | 302 | ||
def _wait_for_chrome(port: int, timeout: float = 15.0) -> None:
    """Poll the debugging port until it accepts TCP connections or time runs out.

    Probes every 0.5 s with ``is_port_open``. Logs at INFO level once the port
    is open and at WARNING level when ``timeout`` seconds pass without success;
    no exception is raised in either case.
    """
    give_up_at = time.monotonic() + timeout
    while time.monotonic() < give_up_at:
        if not is_port_open(port):
            time.sleep(0.5)
            continue
        logger.info("Chrome 已就绪 (port=%d)", port)
        return
    logger.warning("等待 Chrome 就绪超时 (port=%d)", port)
| 231 | 312 | ||
| 232 | 313 | ||
| 314 | +def _find_pids_by_port(port: int) -> list[int]: | ||
| 315 | + """查找占用指定端口的进程 PID(跨平台)。""" | ||
| 316 | + try: | ||
| 317 | + if sys.platform == "win32": | ||
| 318 | + result = subprocess.run( | ||
| 319 | + ["netstat", "-ano", "-p", "TCP"], | ||
| 320 | + capture_output=True, | ||
| 321 | + text=True, | ||
| 322 | + timeout=5, | ||
| 323 | + ) | ||
| 324 | + if result.returncode != 0: | ||
| 325 | + return [] | ||
| 326 | + pids: list[int] = [] | ||
| 327 | + for line in result.stdout.splitlines(): | ||
| 328 | + if f":{port}" in line and "LISTENING" in line: | ||
| 329 | + parts = line.split() | ||
| 330 | + with contextlib.suppress(ValueError, IndexError): | ||
| 331 | + pids.append(int(parts[-1])) | ||
| 332 | + return list(set(pids)) | ||
| 333 | + else: | ||
| 334 | + result = subprocess.run( | ||
| 335 | + ["lsof", "-ti", f":{port}"], | ||
| 336 | + capture_output=True, | ||
| 337 | + text=True, | ||
| 338 | + timeout=5, | ||
| 339 | + ) | ||
| 340 | + if result.returncode != 0 or not result.stdout.strip(): | ||
| 341 | + return [] | ||
| 342 | + pids = [] | ||
| 343 | + for p in result.stdout.strip().split("\n"): | ||
| 344 | + with contextlib.suppress(ValueError): | ||
| 345 | + pids.append(int(p)) | ||
| 346 | + return pids | ||
| 347 | + except Exception: | ||
| 348 | + return [] | ||
| 349 | + | ||
| 350 | + | ||
| 351 | +def _kill_pid(pid: int) -> None: | ||
| 352 | + """终止指定 PID 的进程(跨平台)。""" | ||
| 353 | + try: | ||
| 354 | + if sys.platform == "win32": | ||
| 355 | + subprocess.run( | ||
| 356 | + ["taskkill", "/PID", str(pid), "/F"], | ||
| 357 | + capture_output=True, | ||
| 358 | + timeout=5, | ||
| 359 | + ) | ||
| 360 | + else: | ||
| 361 | + import signal | ||
| 362 | + | ||
| 363 | + os.kill(pid, signal.SIGTERM) | ||
| 364 | + except Exception: | ||
| 365 | + logger.debug("终止进程 %d 失败", pid) | ||
| 366 | + | ||
| 367 | + | ||
| 233 | def _mask_proxy(proxy_url: str) -> str: | 368 | def _mask_proxy(proxy_url: str) -> str: |
| 234 | """隐藏代理 URL 中的敏感信息。""" | 369 | """隐藏代理 URL 中的敏感信息。""" |
| 235 | from urllib.parse import urlparse | 370 | from urllib.parse import urlparse |
| @@ -12,6 +12,12 @@ import json | @@ -12,6 +12,12 @@ import json | ||
| 12 | import logging | 12 | import logging |
| 13 | import sys | 13 | import sys |
| 14 | 14 | ||
| 15 | +# Windows 控制台默认编码(如 cp1252)不支持中文,强制 UTF-8 | ||
| 16 | +if sys.stdout and hasattr(sys.stdout, "reconfigure"): | ||
| 17 | + sys.stdout.reconfigure(encoding="utf-8") | ||
| 18 | +if sys.stderr and hasattr(sys.stderr, "reconfigure"): | ||
| 19 | + sys.stderr.reconfigure(encoding="utf-8") | ||
| 20 | + | ||
| 15 | logging.basicConfig( | 21 | logging.basicConfig( |
| 16 | level=logging.INFO, | 22 | level=logging.INFO, |
| 17 | format="%(asctime)s %(levelname)s %(name)s: %(message)s", | 23 | format="%(asctime)s %(levelname)s %(name)s: %(message)s", |
| @@ -27,14 +33,43 @@ def _output(data: dict, exit_code: int = 0) -> None: | @@ -27,14 +33,43 @@ def _output(data: dict, exit_code: int = 0) -> None: | ||
| 27 | 33 | ||
def _connect(args: argparse.Namespace):
    """Ensure Chrome is available, then connect and open a new page.

    Reads ``args.host`` and ``args.port``. When ``ensure_chrome`` reports
    failure, an error payload is emitted through ``_output`` with exit code 2.

    Returns:
        A ``(browser, page)`` tuple where ``page`` comes from
        ``browser.new_page()``.
    """
    from chrome_launcher import ensure_chrome
    from xhs.cdp import Browser

    chrome_ready = ensure_chrome(port=args.port)
    if not chrome_ready:
        failure = {"success": False, "error": "无法启动 Chrome,请检查 Chrome 是否已安装"}
        _output(failure, exit_code=2)

    browser = Browser(host=args.host, port=args.port)
    browser.connect()
    return browser, browser.new_page()
| 36 | 49 | ||
| 37 | 50 | ||
def _connect_existing(args: argparse.Namespace):
    """Connect to Chrome and reuse an already-open page.

    Intended for the later steps of a multi-step publish flow. Reads
    ``args.host`` and ``args.port``. An error payload is emitted through
    ``_output`` with exit code 2 when ``ensure_chrome`` fails or when
    ``browser.get_existing_page()`` yields no page.

    Returns:
        A ``(browser, page)`` tuple.
    """
    from chrome_launcher import ensure_chrome
    from xhs.cdp import Browser

    chrome_ready = ensure_chrome(port=args.port)
    if not chrome_ready:
        unreachable = {"success": False, "error": "无法连接到 Chrome"}
        _output(unreachable, exit_code=2)

    browser = Browser(host=args.host, port=args.port)
    browser.connect()

    page = browser.get_existing_page()
    if not page:
        no_page = {"success": False, "error": "未找到已打开的页面,请先执行前置步骤"}
        _output(no_page, exit_code=2)
    return browser, page
| 71 | + | ||
| 72 | + | ||
| 38 | def _headless_fallback(port: int) -> None: | 73 | def _headless_fallback(port: int) -> None: |
| 39 | """Headless 模式未登录时自动降级到有窗口模式。""" | 74 | """Headless 模式未登录时自动降级到有窗口模式。""" |
| 40 | from chrome_launcher import restart_chrome | 75 | from chrome_launcher import restart_chrome |
| @@ -332,7 +367,7 @@ def cmd_fill_publish(args: argparse.Namespace) -> None: | @@ -332,7 +367,7 @@ def cmd_fill_publish(args: argparse.Namespace) -> None: | ||
| 332 | } | 367 | } |
| 333 | ) | 368 | ) |
| 334 | finally: | 369 | finally: |
| 335 | - browser.close_page(page) | 370 | + # 不关闭页面,让用户在浏览器中预览 |
| 336 | browser.close() | 371 | browser.close() |
| 337 | 372 | ||
| 338 | 373 | ||
| @@ -368,15 +403,15 @@ def cmd_fill_publish_video(args: argparse.Namespace) -> None: | @@ -368,15 +403,15 @@ def cmd_fill_publish_video(args: argparse.Namespace) -> None: | ||
| 368 | } | 403 | } |
| 369 | ) | 404 | ) |
| 370 | finally: | 405 | finally: |
| 371 | - browser.close_page(page) | 406 | + # 不关闭页面,让用户在浏览器中预览 |
| 372 | browser.close() | 407 | browser.close() |
| 373 | 408 | ||
| 374 | 409 | ||
| 375 | def cmd_click_publish(args: argparse.Namespace) -> None: | 410 | def cmd_click_publish(args: argparse.Namespace) -> None: |
| 376 | - """点击发布按钮(在用户确认后调用)。""" | 411 | + """点击发布按钮(在用户确认后调用)。复用已有的发布页 tab。""" |
| 377 | from xhs.publish import click_publish_button | 412 | from xhs.publish import click_publish_button |
| 378 | 413 | ||
| 379 | - browser, page = _connect(args) | 414 | + browser, page = _connect_existing(args) |
| 380 | try: | 415 | try: |
| 381 | click_publish_button(page) | 416 | click_publish_button(page) |
| 382 | _output({"success": True, "status": "发布完成"}) | 417 | _output({"success": True, "status": "发布完成"}) |
| @@ -410,15 +445,15 @@ def cmd_long_article(args: argparse.Namespace) -> None: | @@ -410,15 +445,15 @@ def cmd_long_article(args: argparse.Namespace) -> None: | ||
| 410 | } | 445 | } |
| 411 | ) | 446 | ) |
| 412 | finally: | 447 | finally: |
| 413 | - browser.close_page(page) | 448 | + # 不关闭页面,后续 select-template / next-step 需要复用 |
| 414 | browser.close() | 449 | browser.close() |
| 415 | 450 | ||
| 416 | 451 | ||
| 417 | def cmd_select_template(args: argparse.Namespace) -> None: | 452 | def cmd_select_template(args: argparse.Namespace) -> None: |
| 418 | - """选择排版模板。""" | 453 | + """选择排版模板。复用已有的长文编辑页 tab。""" |
| 419 | from xhs.publish_long_article import select_template | 454 | from xhs.publish_long_article import select_template |
| 420 | 455 | ||
| 421 | - browser, page = _connect(args) | 456 | + browser, page = _connect_existing(args) |
| 422 | try: | 457 | try: |
| 423 | selected = select_template(page, args.name) | 458 | selected = select_template(page, args.name) |
| 424 | if selected: | 459 | if selected: |
| @@ -429,23 +464,23 @@ def cmd_select_template(args: argparse.Namespace) -> None: | @@ -429,23 +464,23 @@ def cmd_select_template(args: argparse.Namespace) -> None: | ||
| 429 | exit_code=2, | 464 | exit_code=2, |
| 430 | ) | 465 | ) |
| 431 | finally: | 466 | finally: |
| 432 | - browser.close_page(page) | 467 | + # 不关闭页面,后续 next-step 需要复用 |
| 433 | browser.close() | 468 | browser.close() |
| 434 | 469 | ||
| 435 | 470 | ||
| 436 | def cmd_next_step(args: argparse.Namespace) -> None: | 471 | def cmd_next_step(args: argparse.Namespace) -> None: |
| 437 | - """点击下一步 + 填写发布页描述。""" | 472 | + """点击下一步 + 填写发布页描述。复用已有的长文编辑页 tab。""" |
| 438 | from xhs.publish_long_article import click_next_and_fill_description | 473 | from xhs.publish_long_article import click_next_and_fill_description |
| 439 | 474 | ||
| 440 | with open(args.content_file, encoding="utf-8") as f: | 475 | with open(args.content_file, encoding="utf-8") as f: |
| 441 | description = f.read().strip() | 476 | description = f.read().strip() |
| 442 | 477 | ||
| 443 | - browser, page = _connect(args) | 478 | + browser, page = _connect_existing(args) |
| 444 | try: | 479 | try: |
| 445 | click_next_and_fill_description(page, description) | 480 | click_next_and_fill_description(page, description) |
| 446 | _output({"success": True, "status": "已进入发布页,等待确认发布"}) | 481 | _output({"success": True, "status": "已进入发布页,等待确认发布"}) |
| 447 | finally: | 482 | finally: |
| 448 | - browser.close_page(page) | 483 | + # 不关闭页面,等待 click-publish |
| 449 | browser.close() | 484 | browser.close() |
| 450 | 485 | ||
| 451 | 486 |
| @@ -71,7 +71,7 @@ class RunLock: | @@ -71,7 +71,7 @@ class RunLock: | ||
| 71 | # 检查进程是否存在 | 71 | # 检查进程是否存在 |
| 72 | os.kill(pid, 0) | 72 | os.kill(pid, 0) |
| 73 | return False | 73 | return False |
| 74 | - except (FileNotFoundError, ValueError, ProcessLookupError, PermissionError): | 74 | + except (ValueError, OSError): |
| 75 | return True | 75 | return True |
| 76 | 76 | ||
| 77 | def _force_release(self) -> None: | 77 | def _force_release(self) -> None: |
| @@ -7,6 +7,7 @@ from __future__ import annotations | @@ -7,6 +7,7 @@ from __future__ import annotations | ||
| 7 | 7 | ||
| 8 | import json | 8 | import json |
| 9 | import logging | 9 | import logging |
| 10 | +import random | ||
| 10 | import time | 11 | import time |
| 11 | from typing import Any | 12 | from typing import Any |
| 12 | 13 | ||
| @@ -14,7 +15,7 @@ import requests | @@ -14,7 +15,7 @@ import requests | ||
| 14 | import websockets.sync.client as ws_client | 15 | import websockets.sync.client as ws_client |
| 15 | 16 | ||
| 16 | from .errors import CDPError, ElementNotFoundError | 17 | from .errors import CDPError, ElementNotFoundError |
| 17 | -from .stealth import STEALTH_JS | 18 | +from .stealth import REALISTIC_UA, STEALTH_JS |
| 18 | 19 | ||
| 19 | logger = logging.getLogger(__name__) | 20 | logger = logging.getLogger(__name__) |
| 20 | 21 | ||
| @@ -211,15 +212,25 @@ class Page: | @@ -211,15 +212,25 @@ class Page: | ||
| 211 | raise ElementNotFoundError(selector) | 212 | raise ElementNotFoundError(selector) |
| 212 | 213 | ||
def click_element(self, selector: str) -> None:
    """Click the element matching ``selector`` using pointer events.

    The element is scrolled into view and its centre is read from the page;
    the pointer is then moved there via ``self.mouse_move`` and clicked via
    ``self.mouse_click`` with a small random offset and a short random pause.
    The offset is limited to the element's own box so that small targets are
    never missed. If no element matches, nothing happens.

    Args:
        selector: CSS selector of the element to click.
    """
    box = self.evaluate(
        f"""
        (() => {{
            const el = document.querySelector({json.dumps(selector)});
            if (!el) return null;
            el.scrollIntoView({{block: 'center'}});
            const rect = el.getBoundingClientRect();
            return {{
                x: rect.left + rect.width / 2,
                y: rect.top + rect.height / 2,
                width: rect.width,
                height: rect.height,
            }};
        }})()
        """
    )
    if not box:
        return

    def _jitter(extent: float) -> float:
        # Up to 3 px either way, but always at least 1 px inside the edge.
        limit = min(3.0, max(0.0, extent / 2 - 1))
        return random.uniform(-limit, limit)

    # A missing size falls back to 8 px, which yields the full 3 px range.
    x = box["x"] + _jitter(box.get("width", 8.0))
    y = box["y"] + _jitter(box.get("height", 8.0))
    self.mouse_move(x, y)
    time.sleep(random.uniform(0.03, 0.08))
    self.mouse_click(x, y)
| 223 | 234 | ||
| 224 | def input_text(self, selector: str, text: str) -> None: | 235 | def input_text(self, selector: str, text: str) -> None: |
| 225 | """向指定选择器的元素输入文本。""" | 236 | """向指定选择器的元素输入文本。""" |
| @@ -237,18 +248,59 @@ class Page: | @@ -237,18 +248,59 @@ class Page: | ||
| 237 | ) | 248 | ) |
| 238 | 249 | ||
def input_content_editable(self, selector: str, text: str) -> None:
    """Type ``text`` into a contentEditable element through CDP key events.

    Steps: focus the element, clear it with select-all + Backspace, then send
    the text one character at a time with a random 30-80 ms pause after each.
    Newlines are sent through ``self.press_key("Enter")``. If the page reports
    that no element matches ``selector``, nothing is typed, so the clearing
    keystrokes cannot hit whatever else happens to be focused.

    Args:
        selector: CSS selector of the contentEditable element.
        text: Text to type; ``"\\n"`` becomes an Enter key press.
    """
    # 1. Focus the element and learn whether it exists.
    focused = self.evaluate(
        f"""
        (() => {{
            const el = document.querySelector({json.dumps(selector)});
            if (!el) return false;
            el.focus();
            return true;
        }})()
        """
    )
    # Only an explicit False means "not found"; any other result proceeds.
    if focused is False:
        logging.getLogger(__name__).warning(
            "input_content_editable: element not found: %s", selector
        )
        return
    time.sleep(0.1)

    # 2. Select everything and delete it (Ctrl+A, then Backspace).
    # modifiers=2 is Ctrl. The virtual key code lets Chrome recognise the
    # shortcut, and the explicit selectAll editing command covers platforms
    # where Ctrl+A is not the select-all binding.
    select_all = {
        "key": "a",
        "code": "KeyA",
        "modifiers": 2,
        "windowsVirtualKeyCode": 65,
    }
    self._send_session(
        "Input.dispatchKeyEvent",
        {"type": "keyDown", **select_all, "commands": ["selectAll"]},
    )
    self._send_session(
        "Input.dispatchKeyEvent",
        {"type": "keyUp", **select_all},
    )
    backspace = {
        "key": "Backspace",
        "code": "Backspace",
        "windowsVirtualKeyCode": 8,
    }
    self._send_session("Input.dispatchKeyEvent", {"type": "keyDown", **backspace})
    self._send_session("Input.dispatchKeyEvent", {"type": "keyUp", **backspace})
    time.sleep(0.1)

    # 3. Type character by character with a random pause after each one.
    for char in text:
        if char == "\n":
            self.press_key("Enter")
        else:
            self._send_session(
                "Input.dispatchKeyEvent",
                {"type": "keyDown", "text": char},
            )
            self._send_session(
                "Input.dispatchKeyEvent",
                {"type": "keyUp", "text": char},
            )
        time.sleep(random.uniform(0.03, 0.08))
| 252 | 304 | ||
| 253 | def get_element_text(self, selector: str) -> str | None: | 305 | def get_element_text(self, selector: str) -> str | None: |
| 254 | """获取元素文本内容。""" | 306 | """获取元素文本内容。""" |
| @@ -500,14 +552,31 @@ class Browser: | @@ -500,14 +552,31 @@ class Browser: | ||
| 500 | 552 | ||
| 501 | page = Page(self._cdp, target_id, session_id) | 553 | page = Page(self._cdp, target_id, session_id) |
| 502 | 554 | ||
| 555 | + # 注入反检测(必须在 enable domains 之前) | ||
| 556 | + page.inject_stealth() | ||
| 557 | + | ||
| 558 | + # UA 覆盖 | ||
| 559 | + page._send_session( | ||
| 560 | + "Emulation.setUserAgentOverride", | ||
| 561 | + {"userAgent": REALISTIC_UA}, | ||
| 562 | + ) | ||
| 563 | + | ||
| 564 | + # 随机 viewport(模拟真实屏幕尺寸) | ||
| 565 | + page._send_session( | ||
| 566 | + "Emulation.setDeviceMetricsOverride", | ||
| 567 | + { | ||
| 568 | + "width": random.randint(1366, 1920), | ||
| 569 | + "height": random.randint(768, 1080), | ||
| 570 | + "deviceScaleFactor": 1, | ||
| 571 | + "mobile": False, | ||
| 572 | + }, | ||
| 573 | + ) | ||
| 574 | + | ||
| 503 | # 启用必要的 domain | 575 | # 启用必要的 domain |
| 504 | page._send_session("Page.enable") | 576 | page._send_session("Page.enable") |
| 505 | page._send_session("DOM.enable") | 577 | page._send_session("DOM.enable") |
| 506 | page._send_session("Runtime.enable") | 578 | page._send_session("Runtime.enable") |
| 507 | 579 | ||
| 508 | - # 注入反检测 | ||
| 509 | - page.inject_stealth() | ||
| 510 | - | ||
| 511 | return page | 580 | return page |
| 512 | 581 | ||
| 513 | def get_existing_page(self) -> Page | None: | 582 | def get_existing_page(self) -> Page | None: |
| @@ -3,10 +3,10 @@ | @@ -3,10 +3,10 @@ | ||
| 3 | from __future__ import annotations | 3 | from __future__ import annotations |
| 4 | 4 | ||
| 5 | import logging | 5 | import logging |
| 6 | -import time | ||
| 7 | 6 | ||
| 8 | from .cdp import Page | 7 | from .cdp import Page |
| 9 | from .feed_detail import _check_end_container, _check_page_accessible, _get_comment_count | 8 | from .feed_detail import _check_end_container, _check_page_accessible, _get_comment_count |
| 9 | +from .human import sleep_random | ||
| 10 | from .selectors import ( | 10 | from .selectors import ( |
| 11 | COMMENT_INPUT_FIELD, | 11 | COMMENT_INPUT_FIELD, |
| 12 | COMMENT_INPUT_TRIGGER, | 12 | COMMENT_INPUT_TRIGGER, |
| @@ -37,7 +37,7 @@ def post_comment(page: Page, feed_id: str, xsec_token: str, content: str) -> Non | @@ -37,7 +37,7 @@ def post_comment(page: Page, feed_id: str, xsec_token: str, content: str) -> Non | ||
| 37 | page.navigate(url) | 37 | page.navigate(url) |
| 38 | page.wait_for_load() | 38 | page.wait_for_load() |
| 39 | page.wait_dom_stable() | 39 | page.wait_dom_stable() |
| 40 | - time.sleep(1) | 40 | + sleep_random(800, 1500) |
| 41 | 41 | ||
| 42 | _check_page_accessible(page) | 42 | _check_page_accessible(page) |
| 43 | 43 | ||
| @@ -46,27 +46,16 @@ def post_comment(page: Page, feed_id: str, xsec_token: str, content: str) -> Non | @@ -46,27 +46,16 @@ def post_comment(page: Page, feed_id: str, xsec_token: str, content: str) -> Non | ||
| 46 | raise RuntimeError("未找到评论输入框,该帖子可能不支持评论或网页端不可访问") | 46 | raise RuntimeError("未找到评论输入框,该帖子可能不支持评论或网页端不可访问") |
| 47 | 47 | ||
| 48 | page.click_element(COMMENT_INPUT_TRIGGER) | 48 | page.click_element(COMMENT_INPUT_TRIGGER) |
| 49 | - time.sleep(0.5) | 49 | + sleep_random(400, 800) |
| 50 | 50 | ||
| 51 | - # 输入评论内容 | 51 | + # 输入评论内容(CDP 逐字输入) |
| 52 | page.wait_for_element(COMMENT_INPUT_FIELD, timeout=5) | 52 | page.wait_for_element(COMMENT_INPUT_FIELD, timeout=5) |
| 53 | - page.evaluate( | ||
| 54 | - f""" | ||
| 55 | - (() => {{ | ||
| 56 | - const el = document.querySelector({_js_str(COMMENT_INPUT_FIELD)}); | ||
| 57 | - if (el) {{ | ||
| 58 | - el.focus(); | ||
| 59 | - el.textContent = {_js_str(content)}; | ||
| 60 | - el.dispatchEvent(new Event('input', {{bubbles: true}})); | ||
| 61 | - }} | ||
| 62 | - }})() | ||
| 63 | - """ | ||
| 64 | - ) | ||
| 65 | - time.sleep(1) | 53 | + page.input_content_editable(COMMENT_INPUT_FIELD, content) |
| 54 | + sleep_random(600, 1200) | ||
| 66 | 55 | ||
| 67 | # 点击提交 | 56 | # 点击提交 |
| 68 | page.click_element(COMMENT_SUBMIT_BUTTON) | 57 | page.click_element(COMMENT_SUBMIT_BUTTON) |
| 69 | - time.sleep(1) | 58 | + sleep_random(800, 1500) |
| 70 | 59 | ||
| 71 | logger.info("评论发送成功: feed=%s", feed_id) | 60 | logger.info("评论发送成功: feed=%s", feed_id) |
| 72 | 61 | ||
| @@ -103,42 +92,31 @@ def reply_comment( | @@ -103,42 +92,31 @@ def reply_comment( | ||
| 103 | page.navigate(url) | 92 | page.navigate(url) |
| 104 | page.wait_for_load() | 93 | page.wait_for_load() |
| 105 | page.wait_dom_stable() | 94 | page.wait_dom_stable() |
| 106 | - time.sleep(1) | 95 | + sleep_random(800, 1500) |
| 107 | 96 | ||
| 108 | _check_page_accessible(page) | 97 | _check_page_accessible(page) |
| 109 | - time.sleep(2) | 98 | + sleep_random(1500, 2500) |
| 110 | 99 | ||
| 111 | # 查找目标评论 | 100 | # 查找目标评论 |
| 112 | comment_found = _find_and_scroll_to_comment(page, comment_id, user_id) | 101 | comment_found = _find_and_scroll_to_comment(page, comment_id, user_id) |
| 113 | if not comment_found: | 102 | if not comment_found: |
| 114 | raise RuntimeError(f"未找到评论 (commentID: {comment_id}, userID: {user_id})") | 103 | raise RuntimeError(f"未找到评论 (commentID: {comment_id}, userID: {user_id})") |
| 115 | 104 | ||
| 116 | - time.sleep(1) | 105 | + sleep_random(800, 1500) |
| 117 | 106 | ||
| 118 | # 点击回复按钮 | 107 | # 点击回复按钮 |
| 119 | reply_selector = f"#comment-{comment_id} {REPLY_BUTTON}" if comment_id else REPLY_BUTTON | 108 | reply_selector = f"#comment-{comment_id} {REPLY_BUTTON}" if comment_id else REPLY_BUTTON |
| 120 | page.click_element(reply_selector) | 109 | page.click_element(reply_selector) |
| 121 | - time.sleep(1) | 110 | + sleep_random(800, 1500) |
| 122 | 111 | ||
| 123 | - # 输入回复内容 | 112 | + # 输入回复内容(CDP 逐字输入) |
| 124 | page.wait_for_element(COMMENT_INPUT_FIELD, timeout=5) | 113 | page.wait_for_element(COMMENT_INPUT_FIELD, timeout=5) |
| 125 | - page.evaluate( | ||
| 126 | - f""" | ||
| 127 | - (() => {{ | ||
| 128 | - const el = document.querySelector({_js_str(COMMENT_INPUT_FIELD)}); | ||
| 129 | - if (el) {{ | ||
| 130 | - el.focus(); | ||
| 131 | - el.textContent = {_js_str(content)}; | ||
| 132 | - el.dispatchEvent(new Event('input', {{bubbles: true}})); | ||
| 133 | - }} | ||
| 134 | - }})() | ||
| 135 | - """ | ||
| 136 | - ) | ||
| 137 | - time.sleep(0.5) | 114 | + page.input_content_editable(COMMENT_INPUT_FIELD, content) |
| 115 | + sleep_random(600, 1200) | ||
| 138 | 116 | ||
| 139 | # 点击提交 | 117 | # 点击提交 |
| 140 | page.click_element(COMMENT_SUBMIT_BUTTON) | 118 | page.click_element(COMMENT_SUBMIT_BUTTON) |
| 141 | - time.sleep(2) | 119 | + sleep_random(1500, 2500) |
| 142 | 120 | ||
| 143 | logger.info("回复评论成功") | 121 | logger.info("回复评论成功") |
| 144 | 122 | ||
| @@ -154,7 +132,7 @@ def _find_and_scroll_to_comment( | @@ -154,7 +132,7 @@ def _find_and_scroll_to_comment( | ||
| 154 | 132 | ||
| 155 | # 先滚动到评论区 | 133 | # 先滚动到评论区 |
| 156 | page.scroll_element_into_view(".comments-container") | 134 | page.scroll_element_into_view(".comments-container") |
| 157 | - time.sleep(1) | 135 | + sleep_random(800, 1500) |
| 158 | 136 | ||
| 159 | last_count = 0 | 137 | last_count = 0 |
| 160 | stagnant = 0 | 138 | stagnant = 0 |
| @@ -179,11 +157,11 @@ def _find_and_scroll_to_comment( | @@ -179,11 +157,11 @@ def _find_and_scroll_to_comment( | ||
| 179 | # 滚动到最后一条评论 | 157 | # 滚动到最后一条评论 |
| 180 | if current_count > 0: | 158 | if current_count > 0: |
| 181 | page.scroll_nth_element_into_view(PARENT_COMMENT, current_count - 1) | 159 | page.scroll_nth_element_into_view(PARENT_COMMENT, current_count - 1) |
| 182 | - time.sleep(0.3) | 160 | + sleep_random(200, 500) |
| 183 | 161 | ||
| 184 | # 继续滚动 | 162 | # 继续滚动 |
| 185 | page.evaluate("window.scrollBy(0, window.innerHeight * 0.8)") | 163 | page.evaluate("window.scrollBy(0, window.innerHeight * 0.8)") |
| 186 | - time.sleep(0.5) | 164 | + sleep_random(400, 800) |
| 187 | 165 | ||
| 188 | # 通过 commentID 查找 | 166 | # 通过 commentID 查找 |
| 189 | if comment_id: | 167 | if comment_id: |
| @@ -215,7 +193,7 @@ def _find_and_scroll_to_comment( | @@ -215,7 +193,7 @@ def _find_and_scroll_to_comment( | ||
| 215 | logger.info("通过 userID 找到评论 (尝试 %d 次)", attempt + 1) | 193 | logger.info("通过 userID 找到评论 (尝试 %d 次)", attempt + 1) |
| 216 | return True | 194 | return True |
| 217 | 195 | ||
| 218 | - time.sleep(0.8) | 196 | + sleep_random(600, 1200) |
| 219 | 197 | ||
| 220 | return False | 198 | return False |
| 221 | 199 |
| @@ -58,6 +58,15 @@ _INACCESSIBLE_KEYWORDS = [ | @@ -58,6 +58,15 @@ _INACCESSIBLE_KEYWORDS = [ | ||
| 58 | "仅作者可见", | 58 | "仅作者可见", |
| 59 | "因用户设置,你无法查看", | 59 | "因用户设置,你无法查看", |
| 60 | "因违规无法查看", | 60 | "因违规无法查看", |
| 61 | + "Isn't Available", | ||
| 62 | + "isn't available", | ||
| 63 | +] | ||
| 64 | + | ||
| 65 | +# 扫码验证关键词(触发反爬机制) | ||
| 66 | +_SCAN_QRCODE_KEYWORDS = [ | ||
| 67 | + "扫码查看", | ||
| 68 | + "打开小红书App扫码", | ||
| 69 | + "请使用小红书App扫码", | ||
| 61 | ] | 70 | ] |
| 62 | 71 | ||
| 63 | _REPLY_COUNT_RE = re.compile(r"展开\s*(\d+)\s*条回复") | 72 | _REPLY_COUNT_RE = re.compile(r"展开\s*(\d+)\s*条回复") |
| @@ -110,10 +119,10 @@ def get_feed_detail( | @@ -110,10 +119,10 @@ def get_feed_detail( | ||
| 110 | else: | 119 | else: |
| 111 | raise RuntimeError("页面导航失败") | 120 | raise RuntimeError("页面导航失败") |
| 112 | 121 | ||
| 113 | - sleep_random(1000, 1000) | 122 | + sleep_random(800, 1500) |
| 114 | 123 | ||
| 115 | - # 检查页面可访问性 | ||
| 116 | - _check_page_accessible(page) | 124 | + # 检查页面可访问性(扫码验证时自动等待重试) |
| 125 | + _check_page_accessible(page, url) | ||
| 117 | 126 | ||
| 118 | # 加载全部评论 | 127 | # 加载全部评论 |
| 119 | if load_all_comments: | 128 | if load_all_comments: |
| @@ -128,8 +137,11 @@ def get_feed_detail( | @@ -128,8 +137,11 @@ def get_feed_detail( | ||
| 128 | # ========== 页面检查 ========== | 137 | # ========== 页面检查 ========== |
| 129 | 138 | ||
| 130 | 139 | ||
| 131 | -def _check_page_accessible(page: Page) -> None: | ||
| 132 | - """检查页面是否可访问。""" | 140 | +def _check_page_accessible(page: Page, url: str = "") -> None: |
| 141 | + """检查页面是否可访问。 | ||
| 142 | + | ||
| 143 | + 扫码验证场景:等待 10 秒后自动重新访问,验证消失则继续,否则报错。 | ||
| 144 | + """ | ||
| 133 | time.sleep(0.5) | 145 | time.sleep(0.5) |
| 134 | 146 | ||
| 135 | text = page.get_element_text(ACCESS_ERROR_WRAPPER) | 147 | text = page.get_element_text(ACCESS_ERROR_WRAPPER) |
| @@ -137,6 +149,28 @@ def _check_page_accessible(page: Page) -> None: | @@ -137,6 +149,28 @@ def _check_page_accessible(page: Page) -> None: | ||
| 137 | return | 149 | return |
| 138 | 150 | ||
| 139 | text = text.strip() | 151 | text = text.strip() |
| 152 | + | ||
| 153 | + # 检测扫码验证(反爬机制触发)→ 等待后重试 | ||
| 154 | + if _is_scan_qrcode_verification(text) and url: | ||
| 155 | + logger.warning("触发小红书扫码验证,等待 10 秒后重新访问...") | ||
| 156 | + time.sleep(10) | ||
| 157 | + page.navigate(url) | ||
| 158 | + page.wait_for_load() | ||
| 159 | + page.wait_dom_stable() | ||
| 160 | + time.sleep(1) | ||
| 161 | + | ||
| 162 | + retry_text = page.get_element_text(ACCESS_ERROR_WRAPPER) | ||
| 163 | + if retry_text and _is_scan_qrcode_verification(retry_text.strip()): | ||
| 164 | + raise PageNotAccessibleError( | ||
| 165 | + "触发了小红书验证,需要在浏览器中扫码完成验证后重试。" | ||
| 166 | + "这通常是小红书的反爬机制,请稍后再试或在 Chrome 中手动打开该笔记完成验证" | ||
| 167 | + ) | ||
| 168 | + if not retry_text or not retry_text.strip(): | ||
| 169 | + logger.info("验证已消失,继续加载笔记") | ||
| 170 | + return | ||
| 171 | + # 重试后仍有其他错误,继续走下面的关键词检测 | ||
| 172 | + text = retry_text.strip() | ||
| 173 | + | ||
| 140 | for kw in _INACCESSIBLE_KEYWORDS: | 174 | for kw in _INACCESSIBLE_KEYWORDS: |
| 141 | if kw in text: | 175 | if kw in text: |
| 142 | raise PageNotAccessibleError(kw) | 176 | raise PageNotAccessibleError(kw) |
| @@ -145,6 +179,11 @@ def _check_page_accessible(page: Page) -> None: | @@ -145,6 +179,11 @@ def _check_page_accessible(page: Page) -> None: | ||
| 145 | raise PageNotAccessibleError(text) | 179 | raise PageNotAccessibleError(text) |
| 146 | 180 | ||
| 147 | 181 | ||
def _is_scan_qrcode_verification(text: str) -> bool:
    """Return True when *text* contains any of the scan-QR-code verification keywords."""
    for keyword in _SCAN_QRCODE_KEYWORDS:
        if keyword in text:
            return True
    return False
| 185 | + | ||
| 186 | + | ||
| 148 | # ========== 数据提取 ========== | 187 | # ========== 数据提取 ========== |
| 149 | 188 | ||
| 150 | 189 |
| @@ -32,6 +32,11 @@ def sleep_random(min_ms: int, max_ms: int) -> None: | @@ -32,6 +32,11 @@ def sleep_random(min_ms: int, max_ms: int) -> None: | ||
| 32 | time.sleep(delay) | 32 | time.sleep(delay) |
| 33 | 33 | ||
| 34 | 34 | ||
def navigation_delay() -> None:
    """Wait a random interval after a page navigation to mimic a human reading.

    Delegates to ``sleep_random`` with ``min_ms=1000`` and ``max_ms=2500``.
    """
    sleep_random(min_ms=1000, max_ms=2500)
| 38 | + | ||
| 39 | + | ||
| 35 | def get_scroll_interval(speed: str) -> float: | 40 | def get_scroll_interval(speed: str) -> float: |
| 36 | """根据速度获取滚动间隔(秒)。""" | 41 | """根据速度获取滚动间隔(秒)。""" |
| 37 | if speed == "slow": | 42 | if speed == "slow": |
| @@ -9,6 +9,7 @@ import tempfile | @@ -9,6 +9,7 @@ import tempfile | ||
| 9 | import time | 9 | import time |
| 10 | 10 | ||
| 11 | from .cdp import Page | 11 | from .cdp import Page |
| 12 | +from .human import sleep_random | ||
| 12 | from .selectors import LOGIN_STATUS, QRCODE_IMG | 13 | from .selectors import LOGIN_STATUS, QRCODE_IMG |
| 13 | from .urls import EXPLORE_URL | 14 | from .urls import EXPLORE_URL |
| 14 | 15 | ||
| @@ -23,7 +24,7 @@ def check_login_status(page: Page) -> bool: | @@ -23,7 +24,7 @@ def check_login_status(page: Page) -> bool: | ||
| 23 | """ | 24 | """ |
| 24 | page.navigate(EXPLORE_URL) | 25 | page.navigate(EXPLORE_URL) |
| 25 | page.wait_for_load() | 26 | page.wait_for_load() |
| 26 | - time.sleep(1) | 27 | + sleep_random(800, 1500) |
| 27 | 28 | ||
| 28 | return page.has_element(LOGIN_STATUS) | 29 | return page.has_element(LOGIN_STATUS) |
| 29 | 30 | ||
| @@ -38,7 +39,7 @@ def fetch_qrcode(page: Page) -> tuple[str, bool]: | @@ -38,7 +39,7 @@ def fetch_qrcode(page: Page) -> tuple[str, bool]: | ||
| 38 | """ | 39 | """ |
| 39 | page.navigate(EXPLORE_URL) | 40 | page.navigate(EXPLORE_URL) |
| 40 | page.wait_for_load() | 41 | page.wait_for_load() |
| 41 | - time.sleep(2) | 42 | + sleep_random(1500, 2500) |
| 42 | 43 | ||
| 43 | # 检查是否已登录 | 44 | # 检查是否已登录 |
| 44 | if page.has_element(LOGIN_STATUS): | 45 | if page.has_element(LOGIN_STATUS): |
| @@ -5,6 +5,7 @@ from __future__ import annotations | @@ -5,6 +5,7 @@ from __future__ import annotations | ||
| 5 | import json | 5 | import json |
| 6 | import logging | 6 | import logging |
| 7 | import random | 7 | import random |
| 8 | +import re | ||
| 8 | import time | 9 | import time |
| 9 | 10 | ||
| 10 | from .cdp import Page | 11 | from .cdp import Page |
| @@ -127,27 +128,31 @@ def _navigate_to_publish_page(page: Page) -> None: | @@ -127,27 +128,31 @@ def _navigate_to_publish_page(page: Page) -> None: | ||
| 127 | """导航到发布页面。""" | 128 | """导航到发布页面。""" |
| 128 | page.navigate(PUBLISH_URL) | 129 | page.navigate(PUBLISH_URL) |
| 129 | page.wait_for_load(timeout=300) | 130 | page.wait_for_load(timeout=300) |
| 130 | - time.sleep(2) | 131 | + time.sleep(3) |
| 131 | page.wait_dom_stable() | 132 | page.wait_dom_stable() |
| 132 | - time.sleep(1) | 133 | + time.sleep(2) |
| 133 | 134 | ||
| 134 | 135 | ||
| 135 | def _click_publish_tab(page: Page, tab_name: str) -> None: | 136 | def _click_publish_tab(page: Page, tab_name: str) -> None: |
| 136 | """点击发布页 TAB(上传图文/上传视频)。""" | 137 | """点击发布页 TAB(上传图文/上传视频)。""" |
| 137 | - page.wait_for_element(UPLOAD_CONTENT, timeout=15) | ||
| 138 | - | ||
| 139 | deadline = time.monotonic() + 15 | 138 | deadline = time.monotonic() + 15 |
| 140 | while time.monotonic() < deadline: | 139 | while time.monotonic() < deadline: |
| 141 | - # 查找匹配的 TAB | 140 | + # 查找匹配的 TAB(支持多种结构) |
| 142 | found = page.evaluate( | 141 | found = page.evaluate( |
| 143 | f""" | 142 | f""" |
| 144 | (() => {{ | 143 | (() => {{ |
| 145 | - const tabs = document.querySelectorAll({json.dumps(CREATOR_TAB)}); | 144 | + // 策略1: 查找 div.creator-tab(过滤隐藏元素) |
| 145 | + let tabs = document.querySelectorAll({json.dumps(CREATOR_TAB)}); | ||
| 146 | for (const tab of tabs) {{ | 146 | for (const tab of tabs) {{ |
| 147 | - if (tab.textContent.trim() === {json.dumps(tab_name)}) {{ | ||
| 148 | - // 检查是否被遮挡 | 147 | + const titleSpan = tab.querySelector('span.title'); |
| 148 | + const tabText = titleSpan ? titleSpan.textContent.trim() : tab.textContent.trim(); | ||
| 149 | + if (tabText === {json.dumps(tab_name)}) {{ | ||
| 149 | const rect = tab.getBoundingClientRect(); | 150 | const rect = tab.getBoundingClientRect(); |
| 151 | + const style = window.getComputedStyle(tab); | ||
| 152 | + // 跳过隐藏或被移出视口的元素 | ||
| 150 | if (rect.width === 0 || rect.height === 0) continue; | 153 | if (rect.width === 0 || rect.height === 0) continue; |
| 154 | + if (rect.left < 0 || rect.top < 0) continue; | ||
| 155 | + if (style.display === 'none' || style.visibility === 'hidden') continue; | ||
| 151 | const x = rect.left + rect.width / 2; | 156 | const x = rect.left + rect.width / 2; |
| 152 | const y = rect.top + rect.height / 2; | 157 | const y = rect.top + rect.height / 2; |
| 153 | const target = document.elementFromPoint(x, y); | 158 | const target = document.elementFromPoint(x, y); |
| @@ -158,6 +163,21 @@ def _click_publish_tab(page: Page, tab_name: str) -> None: | @@ -158,6 +163,21 @@ def _click_publish_tab(page: Page, tab_name: str) -> None: | ||
| 158 | return 'blocked'; | 163 | return 'blocked'; |
| 159 | }} | 164 | }} |
| 160 | }} | 165 | }} |
| 166 | + | ||
| 167 | + // 策略2: 查找任意包含目标文本的元素 | ||
| 168 | + const allElements = document.querySelectorAll('*'); | ||
| 169 | + for (const el of allElements) {{ | ||
| 170 | + if (el.children.length === 0 && el.textContent.trim() === {json.dumps(tab_name)}) {{ | ||
| 171 | + const rect = el.getBoundingClientRect(); | ||
| 172 | + const style = window.getComputedStyle(el); | ||
| 173 | + if (rect.width === 0 || rect.height === 0) continue; | ||
| 174 | + if (rect.left < 0 || rect.top < 0) continue; | ||
| 175 | + if (style.display === 'none' || style.visibility === 'hidden') continue; | ||
| 176 | + el.click(); | ||
| 177 | + return 'clicked'; | ||
| 178 | + }} | ||
| 179 | + }} | ||
| 180 | + | ||
| 161 | return 'not_found'; | 181 | return 'not_found'; |
| 162 | }})() | 182 | }})() |
| 163 | """ | 183 | """ |
| @@ -172,6 +192,19 @@ def _click_publish_tab(page: Page, tab_name: str) -> None: | @@ -172,6 +192,19 @@ def _click_publish_tab(page: Page, tab_name: str) -> None: | ||
| 172 | 192 | ||
| 173 | time.sleep(0.2) | 193 | time.sleep(0.2) |
| 174 | 194 | ||
| 195 | + # 调试:输出页面信息 | ||
| 196 | + debug_info = page.evaluate(""" | ||
| 197 | + (() => { | ||
| 198 | + const creatorTabs = document.querySelectorAll('div.creator-tab'); | ||
| 199 | + const tabTexts = Array.from(creatorTabs).map(t => ({ | ||
| 200 | + text: t.textContent.trim(), | ||
| 201 | + html: t.outerHTML.substring(0, 200) | ||
| 202 | + })); | ||
| 203 | + const url = window.location.href; | ||
| 204 | + return JSON.stringify({url, tabCount: creatorTabs.length, tabs: tabTexts}); | ||
| 205 | + })() | ||
| 206 | + """) | ||
| 207 | + logger.error("调试信息: %s", debug_info) | ||
| 175 | raise PublishError(f"没有找到发布 TAB - {tab_name}") | 208 | raise PublishError(f"没有找到发布 TAB - {tab_name}") |
| 176 | 209 | ||
| 177 | 210 | ||
| @@ -223,6 +256,34 @@ def _wait_for_upload_complete(page: Page, expected_count: int) -> None: | @@ -223,6 +256,34 @@ def _wait_for_upload_complete(page: Page, expected_count: int) -> None: | ||
| 223 | # ========== 表单提交 ========== | 256 | # ========== 表单提交 ========== |
| 224 | 257 | ||
| 225 | 258 | ||
| 259 | +def _extract_hashtags_from_content(content: str, tags: list[str]) -> tuple[str, list[str]]: | ||
| 260 | + """从正文末尾提取 hashtag 行,合并到 tags 列表。 | ||
| 261 | + | ||
| 262 | + Returns: | ||
| 263 | + (cleaned_content, merged_tags) | ||
| 264 | + """ | ||
| 265 | + lines = content.rstrip().split("\n") | ||
| 266 | + # 检查最后一行是否全是 #tag 格式 | ||
| 267 | + if lines: | ||
| 268 | + last_line = lines[-1].strip() | ||
| 269 | + hashtag_pattern = re.compile(r"^(#\S+\s*)+$") | ||
| 270 | + if hashtag_pattern.match(last_line): | ||
| 271 | + # 提取 hashtag | ||
| 272 | + extracted = re.findall(r"#(\S+)", last_line) | ||
| 273 | + # 合并到 tags(去重) | ||
| 274 | + existing = {t.lstrip("#") for t in tags} | ||
| 275 | + merged = list(tags) | ||
| 276 | + for t in extracted: | ||
| 277 | + if t not in existing: | ||
| 278 | + merged.append(t) | ||
| 279 | + existing.add(t) | ||
| 280 | + # 去掉最后一行 | ||
| 281 | + cleaned = "\n".join(lines[:-1]).rstrip() | ||
| 282 | + logger.info("从正文末尾提取 %d 个标签,合并后共 %d 个", len(extracted), len(merged)) | ||
| 283 | + return cleaned, merged | ||
| 284 | + return content, list(tags) | ||
| 285 | + | ||
| 286 | + | ||
| 226 | def _fill_publish_form( | 287 | def _fill_publish_form( |
| 227 | page: Page, | 288 | page: Page, |
| 228 | title: str, | 289 | title: str, |
| @@ -233,6 +294,9 @@ def _fill_publish_form( | @@ -233,6 +294,9 @@ def _fill_publish_form( | ||
| 233 | visibility: str, | 294 | visibility: str, |
| 234 | ) -> None: | 295 | ) -> None: |
| 235 | """填写表单(不点击发布)。""" | 296 | """填写表单(不点击发布)。""" |
| 297 | + # 从正文末尾提取 hashtag 并合并到 tags | ||
| 298 | + content, tags = _extract_hashtags_from_content(content, tags) | ||
| 299 | + | ||
| 236 | # 标题 | 300 | # 标题 |
| 237 | page.input_text(TITLE_INPUT, title) | 301 | page.input_text(TITLE_INPUT, title) |
| 238 | time.sleep(0.5) | 302 | time.sleep(0.5) |
| @@ -334,6 +398,10 @@ def _input_tags(page: Page, content_selector: str, tags: list[str]) -> None: | @@ -334,6 +398,10 @@ def _input_tags(page: Page, content_selector: str, tags: list[str]) -> None: | ||
| 334 | """输入标签。""" | 398 | """输入标签。""" |
| 335 | time.sleep(1) | 399 | time.sleep(1) |
| 336 | 400 | ||
| 401 | + # 先点击正文编辑器,确保焦点在正文而非标题 | ||
| 402 | + page.click_element(content_selector) | ||
| 403 | + time.sleep(0.3) | ||
| 404 | + | ||
| 337 | # 移动光标到正文末尾(20次 ArrowDown) | 405 | # 移动光标到正文末尾(20次 ArrowDown) |
| 338 | for _ in range(20): | 406 | for _ in range(20): |
| 339 | page.press_key("ArrowDown") | 407 | page.press_key("ArrowDown") |
| @@ -353,27 +421,32 @@ def _input_single_tag(page: Page, content_selector: str, tag: str) -> None: | @@ -353,27 +421,32 @@ def _input_single_tag(page: Page, content_selector: str, tag: str) -> None: | ||
| 353 | """输入单个标签。""" | 421 | """输入单个标签。""" |
| 354 | # 输入 # | 422 | # 输入 # |
| 355 | page.type_text("#", delay_ms=0) | 423 | page.type_text("#", delay_ms=0) |
| 356 | - time.sleep(0.2) | 424 | + time.sleep(0.3) |
| 357 | 425 | ||
| 358 | - # 逐字输入标签 | 426 | + # 逐字输入标签(随机间隔模拟真实输入) |
| 359 | for char in tag: | 427 | for char in tag: |
| 360 | - page.type_text(char, delay_ms=50) | 428 | + page.type_text(char, delay_ms=0) |
| 429 | + time.sleep(random.uniform(0.05, 0.12)) | ||
| 361 | 430 | ||
| 362 | - time.sleep(1) | 431 | + # 等待标签联想出现(最多 3 秒) |
| 432 | + deadline = time.monotonic() + 3.0 | ||
| 433 | + clicked = False | ||
| 434 | + while time.monotonic() < deadline: | ||
| 435 | + time.sleep(0.5) | ||
| 436 | + if page.has_element(TAG_TOPIC_CONTAINER): | ||
| 437 | + item_selector = f"{TAG_TOPIC_CONTAINER} {TAG_FIRST_ITEM}" | ||
| 438 | + if page.has_element(item_selector): | ||
| 439 | + page.click_element(item_selector) | ||
| 440 | + logger.info("点击标签联想: %s", tag) | ||
| 441 | + clicked = True | ||
| 442 | + break | ||
| 363 | 443 | ||
| 364 | - # 尝试点击标签联想 | ||
| 365 | - if page.has_element(TAG_TOPIC_CONTAINER): | ||
| 366 | - item_selector = f"{TAG_TOPIC_CONTAINER} {TAG_FIRST_ITEM}" | ||
| 367 | - if page.has_element(item_selector): | ||
| 368 | - page.click_element(item_selector) | ||
| 369 | - logger.info("点击标签联想: %s", tag) | ||
| 370 | - time.sleep(0.5) | ||
| 371 | - return | 444 | + if not clicked: |
| 445 | + # 没有联想,直接空格 | ||
| 446 | + logger.warning("未找到标签联想,直接输入空格: %s", tag) | ||
| 447 | + page.type_text(" ", delay_ms=0) | ||
| 372 | 448 | ||
| 373 | - # 没有联想,直接空格 | ||
| 374 | - logger.warning("未找到标签联想,直接输入空格: %s", tag) | ||
| 375 | - page.type_text(" ", delay_ms=0) | ||
| 376 | - time.sleep(0.5) | 449 | + time.sleep(0.8) |
| 377 | 450 | ||
| 378 | 451 | ||
| 379 | # ========== 定时发布 ========== | 452 | # ========== 定时发布 ========== |
| @@ -5,6 +5,7 @@ from __future__ import annotations | @@ -5,6 +5,7 @@ from __future__ import annotations | ||
| 5 | import json | 5 | import json |
| 6 | import logging | 6 | import logging |
| 7 | import time | 7 | import time |
| 8 | +from pathlib import Path | ||
| 8 | 9 | ||
| 9 | from .cdp import Page | 10 | from .cdp import Page |
| 10 | from .errors import PublishError | 11 | from .errors import PublishError |
| @@ -217,14 +218,14 @@ def _fill_long_content(page: Page, content: str) -> None: | @@ -217,14 +218,14 @@ def _fill_long_content(page: Page, content: str) -> None: | ||
| 217 | def _insert_images_to_editor(page: Page, image_paths: list[str]) -> None: | 218 | def _insert_images_to_editor(page: Page, image_paths: list[str]) -> None: |
| 218 | """将图片插入到编辑器中。""" | 219 | """将图片插入到编辑器中。""" |
| 219 | for img_path in image_paths: | 220 | for img_path in image_paths: |
| 220 | - normalized = img_path.replace("\\", "/") | 221 | + file_uri = Path(img_path).resolve().as_uri() |
| 221 | page.evaluate( | 222 | page.evaluate( |
| 222 | f""" | 223 | f""" |
| 223 | (() => {{ | 224 | (() => {{ |
| 224 | const editor = document.querySelector({json.dumps(CONTENT_EDITOR)}); | 225 | const editor = document.querySelector({json.dumps(CONTENT_EDITOR)}); |
| 225 | if (!editor) return false; | 226 | if (!editor) return false; |
| 226 | const img = document.createElement('img'); | 227 | const img = document.createElement('img'); |
| 227 | - img.src = 'file:///' + {json.dumps(normalized)}; | 228 | + img.src = {json.dumps(file_uri)}; |
| 228 | editor.appendChild(img); | 229 | editor.appendChild(img); |
| 229 | editor.dispatchEvent(new Event('input', {{ bubbles: true }})); | 230 | editor.dispatchEvent(new Event('input', {{ bubbles: true }})); |
| 230 | return true; | 231 | return true; |
| @@ -8,6 +8,7 @@ import time | @@ -8,6 +8,7 @@ import time | ||
| 8 | 8 | ||
| 9 | from .cdp import Page | 9 | from .cdp import Page |
| 10 | from .errors import NoFeedsError | 10 | from .errors import NoFeedsError |
| 11 | +from .human import sleep_random | ||
| 11 | from .selectors import FILTER_BUTTON, FILTER_PANEL | 12 | from .selectors import FILTER_BUTTON, FILTER_PANEL |
| 12 | from .types import Feed, FilterOption | 13 | from .types import Feed, FilterOption |
| 13 | from .urls import make_search_url | 14 | from .urls import make_search_url |
| @@ -139,7 +140,7 @@ def _apply_filters(page: Page, filters: list[tuple[int, int]]) -> None: | @@ -139,7 +140,7 @@ def _apply_filters(page: Page, filters: list[tuple[int, int]]) -> None: | ||
| 139 | while time.monotonic() < deadline: | 140 | while time.monotonic() < deadline: |
| 140 | if page.has_element(FILTER_PANEL): | 141 | if page.has_element(FILTER_PANEL): |
| 141 | break | 142 | break |
| 142 | - time.sleep(0.3) | 143 | + sleep_random(300, 600) |
| 143 | 144 | ||
| 144 | # 点击各筛选项 | 145 | # 点击各筛选项 |
| 145 | for filters_index, tags_index in filters: | 146 | for filters_index, tags_index in filters: |
| @@ -148,7 +149,7 @@ def _apply_filters(page: Page, filters: list[tuple[int, int]]) -> None: | @@ -148,7 +149,7 @@ def _apply_filters(page: Page, filters: list[tuple[int, int]]) -> None: | ||
| 148 | f"div.tags:nth-child({tags_index})" | 149 | f"div.tags:nth-child({tags_index})" |
| 149 | ) | 150 | ) |
| 150 | page.click_element(selector) | 151 | page.click_element(selector) |
| 151 | - time.sleep(0.3) | 152 | + sleep_random(300, 600) |
| 152 | 153 | ||
| 153 | # 等待页面更新 | 154 | # 等待页面更新 |
| 154 | page.wait_dom_stable() | 155 | page.wait_dom_stable() |
| 1 | """反检测 JS 注入 + Chrome 启动参数,对应 go-rod/stealth。""" | 1 | """反检测 JS 注入 + Chrome 启动参数,对应 go-rod/stealth。""" |
| 2 | 2 | ||
| 3 | +# 真实 Chrome UA(固定版本,避免每次随机导致指纹不一致) | ||
| 4 | +REALISTIC_UA = ( | ||
| 5 | + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " | ||
| 6 | + "AppleWebKit/537.36 (KHTML, like Gecko) " | ||
| 7 | + "Chrome/131.0.0.0 Safari/537.36" | ||
| 8 | +) | ||
| 9 | + | ||
| 3 | # 反检测 JS 脚本:在页面加载时注入 | 10 | # 反检测 JS 脚本:在页面加载时注入 |
| 4 | STEALTH_JS = """ | 11 | STEALTH_JS = """ |
| 5 | (() => { | 12 | (() => { |
| @@ -72,6 +79,45 @@ STEALTH_JS = """ | @@ -72,6 +79,45 @@ STEALTH_JS = """ | ||
| 72 | if (parameter === 37446) return 'Intel Iris OpenGL Engine'; | 79 | if (parameter === 37446) return 'Intel Iris OpenGL Engine'; |
| 73 | return getParameter.call(this, parameter); | 80 | return getParameter.call(this, parameter); |
| 74 | }; | 81 | }; |
| 82 | + | ||
| 83 | + // 7. hardwareConcurrency — 随机 4 或 8 | ||
| 84 | + Object.defineProperty(navigator, 'hardwareConcurrency', { | ||
| 85 | + get: () => [4, 8][Math.floor(Math.random() * 2)], | ||
| 86 | + configurable: true, | ||
| 87 | + }); | ||
| 88 | + | ||
| 89 | + // 8. deviceMemory — 随机 4 或 8 | ||
| 90 | + Object.defineProperty(navigator, 'deviceMemory', { | ||
| 91 | + get: () => [4, 8][Math.floor(Math.random() * 2)], | ||
| 92 | + configurable: true, | ||
| 93 | + }); | ||
| 94 | + | ||
| 95 | + // 9. navigator.connection — 伪造网络信息 | ||
| 96 | + Object.defineProperty(navigator, 'connection', { | ||
| 97 | + get: () => ({ | ||
| 98 | + effectiveType: '4g', | ||
| 99 | + downlink: 10, | ||
| 100 | + rtt: 50, | ||
| 101 | + saveData: false, | ||
| 102 | + }), | ||
| 103 | + configurable: true, | ||
| 104 | + }); | ||
| 105 | + | ||
| 106 | + // 10. chrome.csi / chrome.loadTimes — 空函数伪装 | ||
| 107 | + if (window.chrome) { | ||
| 108 | + window.chrome.csi = function() { return {}; }; | ||
| 109 | + window.chrome.loadTimes = function() { return {}; }; | ||
| 110 | + } | ||
| 111 | + | ||
| 112 | + // 11. outerWidth/outerHeight — 与 innerWidth/innerHeight 对齐 | ||
| 113 | + Object.defineProperty(window, 'outerWidth', { | ||
| 114 | + get: () => window.innerWidth, | ||
| 115 | + configurable: true, | ||
| 116 | + }); | ||
| 117 | + Object.defineProperty(window, 'outerHeight', { | ||
| 118 | + get: () => window.innerHeight, | ||
| 119 | + configurable: true, | ||
| 120 | + }); | ||
| 75 | })(); | 121 | })(); |
| 76 | """ | 122 | """ |
| 77 | 123 | ||
| @@ -85,4 +131,6 @@ STEALTH_ARGS = [ | @@ -85,4 +131,6 @@ STEALTH_ARGS = [ | ||
| 85 | "--disable-backgrounding-occluded-windows", | 131 | "--disable-backgrounding-occluded-windows", |
| 86 | "--disable-renderer-backgrounding", | 132 | "--disable-renderer-backgrounding", |
| 87 | "--disable-component-update", | 133 | "--disable-component-update", |
| 134 | + "--disable-extensions", | ||
| 135 | + "--disable-sync", | ||
| 88 | ] | 136 | ] |
| @@ -159,6 +159,9 @@ class Feed: | @@ -159,6 +159,9 @@ class Feed: | ||
| 159 | "sharedCount": self.note_card.interact_info.shared_count, | 159 | "sharedCount": self.note_card.interact_info.shared_count, |
| 160 | }, | 160 | }, |
| 161 | } | 161 | } |
| 162 | + cover = self.note_card.cover | ||
| 163 | + if cover.url or cover.url_default: | ||
| 164 | + result["cover"] = cover.url or cover.url_default | ||
| 162 | if self.note_card.video: | 165 | if self.note_card.video: |
| 163 | result["video"] = {"duration": self.note_card.video.capa.duration} | 166 | result["video"] = {"duration": self.note_card.video.capa.duration} |
| 164 | return result | 167 | return result |
-
Please register or login to post a comment