Showing
6 changed files
with
581 additions
and
3 deletions
| @@ -38,9 +38,10 @@ metadata: | @@ -38,9 +38,10 @@ metadata: | ||
| 38 | 38 | ||
| 39 | 1. **认证相关**("登录 / 检查登录 / 切换账号")→ 执行 `xhs-auth` 技能。 | 39 | 1. **认证相关**("登录 / 检查登录 / 切换账号")→ 执行 `xhs-auth` 技能。 |
| 40 | 2. **内容发布**("发布 / 发帖 / 上传图文 / 上传视频")→ 执行 `xhs-publish` 技能。 | 40 | 2. **内容发布**("发布 / 发帖 / 上传图文 / 上传视频")→ 执行 `xhs-publish` 技能。 |
| 41 | -3. **搜索发现**("搜索笔记 / 查看详情 / 浏览首页 / 查看用户")→ 执行 `xhs-explore` 技能。 | ||
| 42 | -4. **社交互动**("评论 / 回复 / 点赞 / 收藏")→ 执行 `xhs-interact` 技能。 | ||
| 43 | -5. **复合运营**("竞品分析 / 热点追踪 / 批量互动 / 一键创作")→ 执行 `xhs-content-ops` 技能。 | 41 | +3. **数据导出**("搜索并导出 / 导出评论 / 导出Excel / 批量采集评论")→ 执行 `xhs-export` 技能。 |
| 42 | +4. **搜索发现**("搜索笔记 / 查看详情 / 浏览首页 / 查看用户")→ 执行 `xhs-explore` 技能。 | ||
| 43 | +5. **社交互动**("评论 / 回复 / 点赞 / 收藏")→ 执行 `xhs-interact` 技能。 | ||
| 44 | +6. **复合运营**("竞品分析 / 热点追踪 / 批量互动 / 一键创作")→ 执行 `xhs-content-ops` 技能。 | ||
| 44 | 45 | ||
| 45 | ## 全局约束 | 46 | ## 全局约束 |
| 46 | 47 | ||
| @@ -100,6 +101,51 @@ metadata: | @@ -100,6 +101,51 @@ metadata: | ||
| 100 | 101 | ||
| 101 | 组合多步骤完成运营工作流:竞品分析、热点追踪、内容创作、互动管理。 | 102 | 组合多步骤完成运营工作流:竞品分析、热点追踪、内容创作、互动管理。 |
| 102 | 103 | ||
| 104 | +### xhs-export — 数据导出 | ||
| 105 | + | ||
| 106 | +将小红书搜索结果和评论导出到 Excel 文件,支持断点续传。 | ||
| 107 | + | ||
| 108 | +**使用场景**:当用户要求"搜索并导出"、"导出评论"、"导出Excel"、"批量采集评论"时触发。 | ||
| 109 | + | ||
| 110 | +**命令**: | ||
| 111 | + | ||
| 112 | +| 命令 | 功能 | | ||
| 113 | +|------|------| | ||
| 114 | +| `cli.py export-search` | 搜索文章并导出到 Excel | | ||
| 115 | +| `cli.py export-comments` | 轮询查询评论并导出到 Excel | | ||
| 116 | + | ||
| 117 | +**第一步:搜索文章并导出** | ||
| 118 | + | ||
| 119 | +```bash | ||
| 120 | +python scripts/cli.py export-search \ | ||
| 121 | + --keyword "关键词" \ | ||
| 122 | + --output /abs/path/result.xlsx \ | ||
| 123 | + [--sort-by 综合|最新|最多点赞|最多评论|最多收藏] \ | ||
| 124 | + [--note-type 不限|视频|图文] \ | ||
| 125 | + [--publish-time 不限|一天内|一周内|半年内] | ||
| 126 | +``` | ||
| 127 | + | ||
| 128 | +**第二步:轮询查询评论** | ||
| 129 | + | ||
| 130 | +```bash | ||
| 131 | +# 查询所有未完成的文章评论 | ||
| 132 | +python scripts/cli.py export-comments --input /abs/path/result.xlsx | ||
| 133 | + | ||
| 134 | +# 指定查询某篇文章(序号从1开始) | ||
| 135 | +python scripts/cli.py export-comments --input /abs/path/result.xlsx --feed-index 1 | ||
| 136 | + | ||
| 137 | +# 设置请求间隔(默认1秒) | ||
| 138 | +python scripts/cli.py export-comments --input /abs/path/result.xlsx --delay 2.0 | ||
| 139 | +``` | ||
| 140 | + | ||
| 141 | +**Excel 输出结构**: | ||
| 142 | + | ||
| 143 | +- **Sheet1 (文章列表)**:序号、标题、作者、类型、点赞数、评论数、收藏数、评论查询状态、文章id、xsec_token、链接 | ||
| 144 | + - 评论查询状态:空=未查询,成功=绿色标记,失败=红色标记 | ||
| 145 | +- **Sheet2 (评论列表)**:文章id、评论内容、评论时间、点赞数、用户、IP属地、回复数 | ||
| 146 | + | ||
| 147 | +**断点续传**:第二步中断或失败后,重新运行第二步命令,会自动跳过已成功的文章,只查询未完成和失败的文章。 | ||
| 148 | + | ||
| 103 | ## 快速开始 | 149 | ## 快速开始 |
| 104 | 150 | ||
| 105 | ```bash | 151 | ```bash |
| @@ -8,6 +8,7 @@ requires-python = ">=3.11" | @@ -8,6 +8,7 @@ requires-python = ">=3.11" | ||
| 8 | dependencies = [ | 8 | dependencies = [ |
| 9 | "requests>=2.28.0", | 9 | "requests>=2.28.0", |
| 10 | "websockets>=12.0", | 10 | "websockets>=12.0", |
| 11 | + "openpyxl>=3.1.0", | ||
| 11 | ] | 12 | ] |
| 12 | 13 | ||
| 13 | [project.optional-dependencies] | 14 | [project.optional-dependencies] |
| @@ -669,6 +669,60 @@ def cmd_next_step(args: argparse.Namespace) -> None: | @@ -669,6 +669,60 @@ def cmd_next_step(args: argparse.Namespace) -> None: | ||
| 669 | browser.close() | 669 | browser.close() |
| 670 | 670 | ||
| 671 | 671 | ||
| 672 | +def cmd_export_search(args: argparse.Namespace) -> None: | ||
| 673 | + """搜索文章并导出到 Excel。""" | ||
| 674 | + from xhs.export_excel import search_and_export | ||
| 675 | + from xhs.types import FilterOption | ||
| 676 | + | ||
| 677 | + filter_opt = FilterOption( | ||
| 678 | + sort_by=args.sort_by or "", | ||
| 679 | + note_type=args.note_type or "", | ||
| 680 | + publish_time=args.publish_time or "", | ||
| 681 | + search_scope=args.search_scope or "", | ||
| 682 | + location=args.location or "", | ||
| 683 | + ) | ||
| 684 | + | ||
| 685 | + browser, page = _connect(args) | ||
| 686 | + try: | ||
| 687 | + search_and_export( | ||
| 688 | + page, | ||
| 689 | + args.keyword, | ||
| 690 | + args.output, | ||
| 691 | + filter_opt, | ||
| 692 | + limit=args.limit, | ||
| 693 | + ) | ||
| 694 | + _output({ | ||
| 695 | + "success": True, | ||
| 696 | + "keyword": args.keyword, | ||
| 697 | + "output": args.output, | ||
| 698 | + "message": "搜索完成,文章已保存到 Excel", | ||
| 699 | + }) | ||
| 700 | + finally: | ||
| 701 | + browser.close() | ||
| 702 | + | ||
| 703 | + | ||
| 704 | +def cmd_export_comments(args: argparse.Namespace) -> None: | ||
| 705 | + """轮询查询评论并导出到 Excel。""" | ||
| 706 | + from xhs.export_excel import poll_comments | ||
| 707 | + | ||
| 708 | + browser, page = _connect(args) | ||
| 709 | + try: | ||
| 710 | + stats = poll_comments( | ||
| 711 | + page, | ||
| 712 | + args.input, | ||
| 713 | + feed_index=args.feed_index if args.feed_index > 0 else None, | ||
| 714 | + delay=args.delay, | ||
| 715 | + ) | ||
| 716 | + _output({ | ||
| 717 | + "success": True, | ||
| 718 | + "input": args.input, | ||
| 719 | + "stats": stats, | ||
| 720 | + "message": f"评论查询完成: 成功={stats['success']}, 失败={stats['failed']}", | ||
| 721 | + }) | ||
| 722 | + finally: | ||
| 723 | + browser.close() | ||
| 724 | + | ||
| 725 | + | ||
| 672 | def cmd_publish_video(args: argparse.Namespace) -> None: | 726 | def cmd_publish_video(args: argparse.Namespace) -> None: |
| 673 | """发布视频内容。""" | 727 | """发布视频内容。""" |
| 674 | from xhs.publish_video import publish_video_content | 728 | from xhs.publish_video import publish_video_content |
| @@ -832,6 +886,25 @@ def build_parser() -> argparse.ArgumentParser: | @@ -832,6 +886,25 @@ def build_parser() -> argparse.ArgumentParser: | ||
| 832 | sub.add_argument("--visibility") | 886 | sub.add_argument("--visibility") |
| 833 | sub.set_defaults(func=cmd_publish_video) | 887 | sub.set_defaults(func=cmd_publish_video) |
| 834 | 888 | ||
| 889 | + # export-search | ||
| 890 | + sub = subparsers.add_parser("export-search", help="搜索文章并导出到 Excel") | ||
| 891 | + sub.add_argument("--keyword", required=True, help="搜索关键词") | ||
| 892 | + sub.add_argument("--output", required=True, help="输出 Excel 文件路径") | ||
| 893 | + sub.add_argument("--sort-by", help="排序: 综合、最新、最多点赞、最多评论、最多收藏") | ||
| 894 | + sub.add_argument("--note-type", help="类型: 不限、视频、图文") | ||
| 895 | + sub.add_argument("--publish-time", help="时间: 不限、一天内、一周内、半年内") | ||
| 896 | + sub.add_argument("--search-scope", help="范围: 不限、已看过、未看过、已关注") | ||
| 897 | + sub.add_argument("--location", help="位置: 不限、同城、附近") | ||
| 898 | + sub.add_argument("--limit", type=int, default=0, help="限制搜索文章数量,0表示不限制 (default: 0)") | ||
| 899 | + sub.set_defaults(func=cmd_export_search) | ||
| 900 | + | ||
| 901 | + # export-comments | ||
| 902 | + sub = subparsers.add_parser("export-comments", help="轮询查询评论并导出到 Excel") | ||
| 903 | + sub.add_argument("--input", required=True, help="输入 Excel 文件路径") | ||
| 904 | + sub.add_argument("--feed-index", type=int, default=0, help="指定文章序号(从1开始),0或不指定表示查询所有未完成的") | ||
| 905 | + sub.add_argument("--delay", type=float, default=1.0, help="请求间隔秒数 (default: 1.0)") | ||
| 906 | + sub.set_defaults(func=cmd_export_comments) | ||
| 907 | + | ||
| 835 | # fill-publish | 908 | # fill-publish |
| 836 | sub = subparsers.add_parser("fill-publish", help="填写图文表单(不发布)") | 909 | sub = subparsers.add_parser("fill-publish", help="填写图文表单(不发布)") |
| 837 | sub.add_argument("--title-file", required=True) | 910 | sub.add_argument("--title-file", required=True) |
scripts/xhs/export_excel.py
0 → 100644
| 1 | +"""导出小红书搜索结果和评论到 Excel。""" | ||
| 2 | + | ||
| 3 | +from __future__ import annotations | ||
| 4 | + | ||
| 5 | +import logging | ||
| 6 | +import time | ||
| 7 | +from datetime import datetime | ||
| 8 | +from pathlib import Path | ||
| 9 | + | ||
| 10 | +from openpyxl import load_workbook | ||
| 11 | +from openpyxl.styles import Alignment, Font, PatternFill | ||
| 12 | + | ||
| 13 | +from .errors import NoFeedDetailError, PageNotAccessibleError, XHSError | ||
| 14 | +from .feed_detail import get_feed_detail | ||
| 15 | +from .search import search_feeds | ||
| 16 | +from .types import Comment, Feed, FilterOption | ||
| 17 | + | ||
| 18 | +logger = logging.getLogger(__name__) | ||
| 19 | + | ||
| 20 | +# Sheet 名称 | ||
| 21 | +SHEET_FEEDS = "文章列表" | ||
| 22 | +SHEET_COMMENTS = "评论列表" | ||
| 23 | + | ||
| 24 | +# 评论查询状态 | ||
| 25 | +STATUS_PENDING = "" # 未查询 | ||
| 26 | +STATUS_SUCCESS = "成功" | ||
| 27 | +STATUS_FAILED = "失败" | ||
| 28 | + | ||
| 29 | +# 标题行样式 | ||
| 30 | +HEADER_FONT = Font(bold=True, color="FFFFFF") | ||
| 31 | +HEADER_FILL = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid") | ||
| 32 | +HEADER_ALIGNMENT = Alignment(horizontal="center", vertical="center", wrap_text=True) | ||
| 33 | + | ||
| 34 | + | ||
| 35 | +class ExportExcel: | ||
| 36 | + """导出小红书数据到 Excel。""" | ||
| 37 | + | ||
| 38 | + def __init__(self, file_path: str, keyword: str) -> None: | ||
| 39 | + self.file_path = Path(file_path) | ||
| 40 | + self.keyword = keyword | ||
| 41 | + self.wb = None | ||
| 42 | + self.feeds_sheet = None | ||
| 43 | + self.comments_sheet = None | ||
| 44 | + | ||
| 45 | + # 列定义(从 1 开始) | ||
| 46 | + self.feeds_headers = [ | ||
| 47 | + "序号", "标题", "作者", "类型", "点赞数", "评论数", "收藏数", | ||
| 48 | + "评论查询状态", "文章id", "xsec_token", "链接" | ||
| 49 | + ] | ||
| 50 | + self.comments_headers = [ | ||
| 51 | + "文章id", "评论内容", "评论时间", "点赞数", "用户", "IP属地", "回复数" | ||
| 52 | + ] | ||
| 53 | + | ||
| 54 | + def create(self) -> None: | ||
| 55 | + """创建新的 Excel 文件并初始化表头。""" | ||
| 56 | + from openpyxl import Workbook | ||
| 57 | + | ||
| 58 | + self.wb = Workbook() | ||
| 59 | + self.wb.remove(self.wb.active) # 删除默认 sheet | ||
| 60 | + | ||
| 61 | + # 创建文章列表 sheet | ||
| 62 | + self.feeds_sheet = self.wb.create_sheet(SHEET_FEEDS) | ||
| 63 | + self._write_feeds_headers() | ||
| 64 | + | ||
| 65 | + # 创建评论列表 sheet | ||
| 66 | + self.comments_sheet = self.wb.create_sheet(SHEET_COMMENTS) | ||
| 67 | + self._write_comments_headers() | ||
| 68 | + | ||
| 69 | + self.wb.save(self.file_path) | ||
| 70 | + logger.info("创建 Excel 文件: %s", self.file_path) | ||
| 71 | + | ||
| 72 | + def load(self) -> bool: | ||
| 73 | + """加载已存在的 Excel 文件。""" | ||
| 74 | + if not self.file_path.exists(): | ||
| 75 | + return False | ||
| 76 | + | ||
| 77 | + self.wb = load_workbook(self.file_path) | ||
| 78 | + self.feeds_sheet = self.wb[SHEET_FEEDS] | ||
| 79 | + self.comments_sheet = self.wb[SHEET_COMMENTS] | ||
| 80 | + logger.info("加载已存在的 Excel 文件: %s", self.file_path) | ||
| 81 | + return True | ||
| 82 | + | ||
| 83 | + def _write_feeds_headers(self) -> None: | ||
| 84 | + """写入文章列表表头。""" | ||
| 85 | + for col, header in enumerate(self.feeds_headers, 1): | ||
| 86 | + cell = self.feeds_sheet.cell(row=1, column=col, value=header) | ||
| 87 | + cell.font = HEADER_FONT | ||
| 88 | + cell.fill = HEADER_FILL | ||
| 89 | + cell.alignment = HEADER_ALIGNMENT | ||
| 90 | + | ||
| 91 | + self.feeds_sheet.freeze_panes = "A2" | ||
| 92 | + | ||
| 93 | + # 设置列宽 (A=序号, B=标题, C=作者, D=类型, E=点赞数, F=评论数, G=收藏数, H=状态, I=文章id, J=xsec_token, K=链接) | ||
| 94 | + col_widths = { | ||
| 95 | + "A": 6, # 序号 | ||
| 96 | + "B": 40, # 标题 | ||
| 97 | + "C": 12, # 作者 | ||
| 98 | + "D": 6, # 类型 | ||
| 99 | + "E": 8, # 点赞数 | ||
| 100 | + "F": 8, # 评论数 | ||
| 101 | + "G": 8, # 收藏数 | ||
| 102 | + "H": 12, # 评论查询状态 | ||
| 103 | + "I": 24, # 文章id | ||
| 104 | + "J": 60, # xsec_token | ||
| 105 | + "K": 50, # 链接 | ||
| 106 | + } | ||
| 107 | + for col_letter, width in col_widths.items(): | ||
| 108 | + self.feeds_sheet.column_dimensions[col_letter].width = width | ||
| 109 | + | ||
| 110 | + # 设置行高 | ||
| 111 | + self.feeds_sheet.row_dimensions[1].height = 25 | ||
| 112 | + | ||
| 113 | + def _write_comments_headers(self) -> None: | ||
| 114 | + """写入评论列表表头。""" | ||
| 115 | + for col, header in enumerate(self.comments_headers, 1): | ||
| 116 | + cell = self.comments_sheet.cell(row=1, column=col, value=header) | ||
| 117 | + cell.font = HEADER_FONT | ||
| 118 | + cell.fill = HEADER_FILL | ||
| 119 | + cell.alignment = HEADER_ALIGNMENT | ||
| 120 | + | ||
| 121 | + self.comments_sheet.freeze_panes = "A2" | ||
| 122 | + | ||
| 123 | + # 设置列宽 (A=文章id, B=评论内容, C=评论时间, D=点赞数, E=用户, F=IP属地, G=回复数) | ||
| 124 | + col_widths = { | ||
| 125 | + "A": 24, # 文章id | ||
| 126 | + "B": 60, # 评论内容 | ||
| 127 | + "C": 18, # 评论时间 | ||
| 128 | + "D": 8, # 点赞数 | ||
| 129 | + "E": 12, # 用户 | ||
| 130 | + "F": 12, # IP属地 | ||
| 131 | + "G": 8, # 回复数 | ||
| 132 | + } | ||
| 133 | + for col_letter, width in col_widths.items(): | ||
| 134 | + self.comments_sheet.column_dimensions[col_letter].width = width | ||
| 135 | + | ||
| 136 | + # 设置行高 | ||
| 137 | + self.comments_sheet.row_dimensions[1].height = 25 | ||
| 138 | + | ||
| 139 | + def _get_feeds_next_row(self) -> int: | ||
| 140 | + """获取文章列表下一个空行号。""" | ||
| 141 | + return self.feeds_sheet.max_row + 1 | ||
| 142 | + | ||
| 143 | + def _get_comments_next_row(self) -> int: | ||
| 144 | + """获取评论列表下一个空行号。""" | ||
| 145 | + return self.comments_sheet.max_row + 1 | ||
| 146 | + | ||
| 147 | + def append_feed(self, feed: Feed, row: int | None = None) -> int: | ||
| 148 | + """追加文章数据到 Excel。 | ||
| 149 | + | ||
| 150 | + Returns: | ||
| 151 | + 实际写入的行号 | ||
| 152 | + """ | ||
| 153 | + if row is None: | ||
| 154 | + row = self._get_feeds_next_row() | ||
| 155 | + | ||
| 156 | + interact = feed.note_card.interact_info | ||
| 157 | + user = feed.note_card.user | ||
| 158 | + title = feed.note_card.display_title | ||
| 159 | + | ||
| 160 | + data = [ | ||
| 161 | + row - 1, # 序号(从 1 开始,但Excel行从2开始) | ||
| 162 | + title, | ||
| 163 | + user.nickname or user.nick_name, | ||
| 164 | + "视频" if feed.note_card.type == "video" else "图文", | ||
| 165 | + interact.liked_count, | ||
| 166 | + interact.comment_count, | ||
| 167 | + interact.collected_count, | ||
| 168 | + STATUS_PENDING, # 评论查询状态 | ||
| 169 | + feed.id, | ||
| 170 | + feed.xsec_token, | ||
| 171 | + f"https://www.xiaohongshu.com/explore/{feed.id}?xsec_token={feed.xsec_token}&xsec_source=pc_feed", | ||
| 172 | + ] | ||
| 173 | + | ||
| 174 | + for col, value in enumerate(data, 1): | ||
| 175 | + cell = self.feeds_sheet.cell(row=row, column=col, value=value) | ||
| 176 | + if col == 2: # 标题列 | ||
| 177 | + cell.alignment = Alignment(wrap_text=True) | ||
| 178 | + elif col == 11: # 链接列 | ||
| 179 | + cell.alignment = Alignment(wrap_text=True) | ||
| 180 | + | ||
| 181 | + # 设置数据行行高 | ||
| 182 | + self.feeds_sheet.row_dimensions[row].height = 18 | ||
| 183 | + | ||
| 184 | + self.wb.save(self.file_path) | ||
| 185 | + return row | ||
| 186 | + | ||
| 187 | + def append_comment(self, feed_id: str, comment: Comment) -> int: | ||
| 188 | + """追加评论数据到 Excel。 | ||
| 189 | + | ||
| 190 | + Args: | ||
| 191 | + feed_id: 文章的 feed_id,用于跳转链接 | ||
| 192 | + Returns: | ||
| 193 | + 实际写入的行号 | ||
| 194 | + """ | ||
| 195 | + row = self._get_comments_next_row() | ||
| 196 | + | ||
| 197 | + # 格式化评论时间 | ||
| 198 | + comment_time = "" | ||
| 199 | + if comment.create_time: | ||
| 200 | + try: | ||
| 201 | + dt = datetime.fromtimestamp(comment.create_time) | ||
| 202 | + comment_time = dt.strftime("%Y-%m-%d %H:%M:%S") | ||
| 203 | + except (ValueError, OSError): | ||
| 204 | + comment_time = str(comment.create_time) | ||
| 205 | + | ||
| 206 | + data = [ | ||
| 207 | + feed_id, | ||
| 208 | + comment.content, | ||
| 209 | + comment_time, | ||
| 210 | + comment.like_count, | ||
| 211 | + comment.user_info.nickname or comment.user_info.nick_name, | ||
| 212 | + comment.ip_location, | ||
| 213 | + comment.sub_comment_count, | ||
| 214 | + ] | ||
| 215 | + | ||
| 216 | + for col, value in enumerate(data, 1): | ||
| 217 | + cell = self.comments_sheet.cell(row=row, column=col, value=value) | ||
| 218 | + if col == 2: # 评论内容列 | ||
| 219 | + cell.alignment = Alignment(wrap_text=True) | ||
| 220 | + elif col == 1: # feed_id 列,添加超链接跳转到文章 sheet | ||
| 221 | + # 查找 feed_id 对应的行号 | ||
| 222 | + target_row = self._find_feed_row(feed_id) | ||
| 223 | + if target_row: | ||
| 224 | + cell.hyperlink = f"#'{SHEET_FEEDS}'!A{target_row}" | ||
| 225 | + cell.font = Font(color="0563C1", underline="single") | ||
| 226 | + | ||
| 227 | + # 设置数据行行高 | ||
| 228 | + self.comments_sheet.row_dimensions[row].height = 18 | ||
| 229 | + | ||
| 230 | + self.wb.save(self.file_path) | ||
| 231 | + return row | ||
| 232 | + | ||
| 233 | + def _find_feed_row(self, feed_id: str) -> int | None: | ||
| 234 | + """根据 feed_id 查找对应的行号。""" | ||
| 235 | + for row in range(2, self.feeds_sheet.max_row + 1): | ||
| 236 | + cell_feed_id = self.feeds_sheet.cell(row=row, column=9).value # feed_id 列 | ||
| 237 | + if cell_feed_id == feed_id: | ||
| 238 | + return row | ||
| 239 | + return None | ||
| 240 | + | ||
| 241 | + def update_feed_status(self, row: int, status: str) -> None: | ||
| 242 | + """更新文章的评论查询状态。""" | ||
| 243 | + status_col = 8 # 评论查询状态列 | ||
| 244 | + self.feeds_sheet.cell(row=row, column=status_col, value=status) | ||
| 245 | + | ||
| 246 | + # 设置状态颜色 | ||
| 247 | + cell = self.feeds_sheet.cell(row=row, column=status_col) | ||
| 248 | + if status == STATUS_SUCCESS: | ||
| 249 | + cell.fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid") | ||
| 250 | + cell.font = Font(color="006100") | ||
| 251 | + elif status == STATUS_FAILED: | ||
| 252 | + cell.fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid") | ||
| 253 | + cell.font = Font(color="9C0006") | ||
| 254 | + | ||
| 255 | + self.wb.save(self.file_path) | ||
| 256 | + | ||
| 257 | + def get_pending_feeds(self) -> list[dict]: | ||
| 258 | + """获取所有待查询评论的文章。""" | ||
| 259 | + pending = [] | ||
| 260 | + for row in range(2, self.feeds_sheet.max_row + 1): | ||
| 261 | + status = self.feeds_sheet.cell(row=row, column=8).value | ||
| 262 | + if status != STATUS_SUCCESS: | ||
| 263 | + pending.append({ | ||
| 264 | + "row": row, | ||
| 265 | + "feed_id": self.feeds_sheet.cell(row=row, column=9).value, | ||
| 266 | + "xsec_token": self.feeds_sheet.cell(row=row, column=10).value, | ||
| 267 | + "title": self.feeds_sheet.cell(row=row, column=2).value, | ||
| 268 | + }) | ||
| 269 | + return pending | ||
| 270 | + | ||
| 271 | + def get_saved_feeds_count(self) -> int: | ||
| 272 | + """获取已保存的文章数量。""" | ||
| 273 | + return max(0, self.feeds_sheet.max_row - 1) | ||
| 274 | + | ||
| 275 | + def save(self) -> None: | ||
| 276 | + """保存 Excel 文件。""" | ||
| 277 | + self.wb.save(self.file_path) | ||
| 278 | + | ||
| 279 | + | ||
| 280 | +def search_and_export( | ||
| 281 | + page, | ||
| 282 | + keyword: str, | ||
| 283 | + output_file: str, | ||
| 284 | + filter_option: FilterOption | None = None, | ||
| 285 | + limit: int = 0, | ||
| 286 | +) -> None: | ||
| 287 | + """搜索文章并导出到 Excel。 | ||
| 288 | + | ||
| 289 | + Args: | ||
| 290 | + page: CDP 页面对象 | ||
| 291 | + keyword: 搜索关键词 | ||
| 292 | + output_file: 输出 Excel 文件路径 | ||
| 293 | + filter_option: 筛选选项 | ||
| 294 | + limit: 限制搜索文章数量,0 表示不限制 | ||
| 295 | + """ | ||
| 296 | + export = ExportExcel(output_file, keyword) | ||
| 297 | + | ||
| 298 | + if export.load(): | ||
| 299 | + logger.info("文件已存在,将追加新数据") | ||
| 300 | + else: | ||
| 301 | + export.create() | ||
| 302 | + | ||
| 303 | + # 搜索文章 | ||
| 304 | + logger.info("开始搜索关键词: %s", keyword) | ||
| 305 | + feeds = search_feeds(page, keyword, filter_option) | ||
| 306 | + logger.info("搜索到 %d 篇文章", len(feeds)) | ||
| 307 | + | ||
| 308 | + # 限制数量 | ||
| 309 | + if limit > 0 and len(feeds) > limit: | ||
| 310 | + feeds = feeds[:limit] | ||
| 311 | + logger.info("限制为前 %d 篇", limit) | ||
| 312 | + | ||
| 313 | + start_row = export._get_feeds_next_row() if export.load() else 2 | ||
| 314 | + | ||
| 315 | + # 追加到 Excel | ||
| 316 | + for i, feed in enumerate(feeds): | ||
| 317 | + row = start_row + i | ||
| 318 | + export.append_feed(feed, row) | ||
| 319 | + logger.info("已保存文章 [%d/%d]: %s", i + 1, len(feeds), feed.note_card.display_title[:30]) | ||
| 320 | + | ||
| 321 | + logger.info("文章搜索完成,已保存到: %s", output_file) | ||
| 322 | + | ||
| 323 | + | ||
| 324 | +def poll_comments( | ||
| 325 | + page, | ||
| 326 | + excel_file: str, | ||
| 327 | + feed_index: int | None = None, | ||
| 328 | + delay: float = 1.0, | ||
| 329 | +) -> dict: | ||
| 330 | + """轮询查询评论并保存到 Excel。 | ||
| 331 | + | ||
| 332 | + Args: | ||
| 333 | + page: CDP 页面对象 | ||
| 334 | + excel_file: Excel 文件路径 | ||
| 335 | + feed_index: 指定文章序号(从 1 开始),None 表示查询所有未完成的 | ||
| 336 | + delay: 请求间隔(秒) | ||
| 337 | + | ||
| 338 | + Returns: | ||
| 339 | + 统计信息 {"success": count, "failed": count} | ||
| 340 | + """ | ||
| 341 | + export = ExportExcel(excel_file, "") | ||
| 342 | + | ||
| 343 | + if not export.load(): | ||
| 344 | + raise FileNotFoundError(f"Excel 文件不存在: {excel_file}") | ||
| 345 | + | ||
| 346 | + pending_feeds = export.get_pending_feeds() | ||
| 347 | + | ||
| 348 | + if feed_index is not None: | ||
| 349 | + # 只查询指定文章 | ||
| 350 | + feed_index_0 = feed_index - 1 # 转为 0-based | ||
| 351 | + pending_feeds = [ | ||
| 352 | + f for f in pending_feeds | ||
| 353 | + if f["row"] - 2 == feed_index_0 # row 2 = index 0 | ||
| 354 | + ] | ||
| 355 | + if not pending_feeds: | ||
| 356 | + # 检查是否已完成 | ||
| 357 | + row = feed_index + 1 | ||
| 358 | + status = export.feeds_sheet.cell(row=row, column=8).value | ||
| 359 | + if status == STATUS_SUCCESS: | ||
| 360 | + logger.info("文章 %d 的评论已查询完成", feed_index) | ||
| 361 | + return {"success": 0, "failed": 0, "skipped": 1} | ||
| 362 | + raise ValueError(f"文章序号 {feed_index} 不存在") | ||
| 363 | + | ||
| 364 | + if not pending_feeds: | ||
| 365 | + logger.info("所有文章的评论都已查询完成") | ||
| 366 | + return {"success": 0, "failed": 0, "skipped": 0} | ||
| 367 | + | ||
| 368 | + logger.info("开始查询 %d 篇文章的评论", len(pending_feeds)) | ||
| 369 | + | ||
| 370 | + stats = {"success": 0, "failed": 0} | ||
| 371 | + | ||
| 372 | + for i, feed_info in enumerate(pending_feeds): | ||
| 373 | + row = feed_info["row"] | ||
| 374 | + feed_id = feed_info["feed_id"] | ||
| 375 | + xsec_token = feed_info["xsec_token"] | ||
| 376 | + title = feed_info["title"] | ||
| 377 | + | ||
| 378 | + logger.info("[%d/%d] 查询文章评论: %s (row=%d)", | ||
| 379 | + i + 1, len(pending_feeds), str(title)[:30], row) | ||
| 380 | + | ||
| 381 | + try: | ||
| 382 | + # 获取文章详情(含评论) | ||
| 383 | + detail = get_feed_detail(page, feed_id, xsec_token, load_all_comments=True) | ||
| 384 | + | ||
| 385 | + # 保存评论 | ||
| 386 | + for comment in detail.comments.list_: | ||
| 387 | + export.append_comment(feed_id=feed_id, comment=comment) | ||
| 388 | + _append_sub_comments(export, feed_id, comment.sub_comments) | ||
| 389 | + | ||
| 390 | + # 更新状态为成功 | ||
| 391 | + export.update_feed_status(row, STATUS_SUCCESS) | ||
| 392 | + stats["success"] += 1 | ||
| 393 | + logger.info("文章评论查询成功: %s", str(title)[:30]) | ||
| 394 | + | ||
| 395 | + except PageNotAccessibleError as e: | ||
| 396 | + export.update_feed_status(row, STATUS_FAILED) | ||
| 397 | + stats["failed"] += 1 | ||
| 398 | + logger.warning("文章无法访问 [%s]: %s", title, e) | ||
| 399 | + except NoFeedDetailError as e: | ||
| 400 | + export.update_feed_status(row, STATUS_FAILED) | ||
| 401 | + stats["failed"] += 1 | ||
| 402 | + logger.warning("获取详情失败 [%s]: %s", title, e) | ||
| 403 | + except XHSError as e: | ||
| 404 | + export.update_feed_status(row, STATUS_FAILED) | ||
| 405 | + stats["failed"] += 1 | ||
| 406 | + logger.warning("查询失败 [%s]: %s", title, e) | ||
| 407 | + except Exception as e: | ||
| 408 | + export.update_feed_status(row, STATUS_FAILED) | ||
| 409 | + stats["failed"] += 1 | ||
| 410 | + logger.error("未知错误 [%s]: %s", title, e) | ||
| 411 | + | ||
| 412 | + # 请求间隔 | ||
| 413 | + if i < len(pending_feeds) - 1: | ||
| 414 | + time.sleep(delay) | ||
| 415 | + | ||
| 416 | + logger.info("评论查询完成: 成功=%d, 失败=%d", stats["success"], stats["failed"]) | ||
| 417 | + return stats | ||
| 418 | + | ||
| 419 | + | ||
| 420 | +def _append_sub_comments( | ||
| 421 | + export: ExportExcel, | ||
| 422 | + feed_id: str, | ||
| 423 | + sub_comments: list[Comment], | ||
| 424 | +) -> None: | ||
| 425 | + """递归追加子评论。""" | ||
| 426 | + for comment in sub_comments: | ||
| 427 | + export.append_comment(feed_id, comment) | ||
| 428 | + if comment.sub_comments: | ||
| 429 | + _append_sub_comments(export, feed_id, comment.sub_comments) |
| @@ -31,6 +31,7 @@ from .human import ( | @@ -31,6 +31,7 @@ from .human import ( | ||
| 31 | ) | 31 | ) |
| 32 | from .selectors import ( | 32 | from .selectors import ( |
| 33 | ACCESS_ERROR_WRAPPER, | 33 | ACCESS_ERROR_WRAPPER, |
| 34 | + COMMENTS_CONTAINER, | ||
| 34 | END_CONTAINER, | 35 | END_CONTAINER, |
| 35 | NO_COMMENTS_TEXT, | 36 | NO_COMMENTS_TEXT, |
| 36 | PARENT_COMMENT, | 37 | PARENT_COMMENT, |
| @@ -236,6 +237,11 @@ def _load_all_comments(page: Page, config: CommentLoadConfig) -> None: | @@ -236,6 +237,11 @@ def _load_all_comments(page: Page, config: CommentLoadConfig) -> None: | ||
| 236 | _scroll_to_comments_area(page) | 237 | _scroll_to_comments_area(page) |
| 237 | sleep_random(*HUMAN_DELAY) | 238 | sleep_random(*HUMAN_DELAY) |
| 238 | 239 | ||
| 240 | + # 检查评论区容器是否存在(下架/违规帖子可能没有评论区) | ||
| 241 | + if page.get_elements_count(COMMENTS_CONTAINER) == 0: | ||
| 242 | + logger.info("评论区容器不存在,可能是下架帖子,跳过加载") | ||
| 243 | + return | ||
| 244 | + | ||
| 239 | # 检查是否无评论 | 245 | # 检查是否无评论 |
| 240 | if _check_no_comments(page): | 246 | if _check_no_comments(page): |
| 241 | logger.info("检测到无评论区域,跳过加载") | 247 | logger.info("检测到无评论区域,跳过加载") |
| @@ -94,6 +94,15 @@ wheels = [ | @@ -94,6 +94,15 @@ wheels = [ | ||
| 94 | ] | 94 | ] |
| 95 | 95 | ||
| 96 | [[package]] | 96 | [[package]] |
| 97 | +name = "et-xmlfile" | ||
| 98 | +version = "2.0.0" | ||
| 99 | +source = { registry = "https://pypi.org/simple" } | ||
| 100 | +sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" } | ||
| 101 | +wheels = [ | ||
| 102 | + { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" }, | ||
| 103 | +] | ||
| 104 | + | ||
| 105 | +[[package]] | ||
| 97 | name = "idna" | 106 | name = "idna" |
| 98 | version = "3.11" | 107 | version = "3.11" |
| 99 | source = { registry = "https://pypi.org/simple" } | 108 | source = { registry = "https://pypi.org/simple" } |
| @@ -112,6 +121,18 @@ wheels = [ | @@ -112,6 +121,18 @@ wheels = [ | ||
| 112 | ] | 121 | ] |
| 113 | 122 | ||
| 114 | [[package]] | 123 | [[package]] |
| 124 | +name = "openpyxl" | ||
| 125 | +version = "3.1.5" | ||
| 126 | +source = { registry = "https://pypi.org/simple" } | ||
| 127 | +dependencies = [ | ||
| 128 | + { name = "et-xmlfile" }, | ||
| 129 | +] | ||
| 130 | +sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" } | ||
| 131 | +wheels = [ | ||
| 132 | + { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" }, | ||
| 133 | +] | ||
| 134 | + | ||
| 135 | +[[package]] | ||
| 115 | name = "packaging" | 136 | name = "packaging" |
| 116 | version = "26.0" | 137 | version = "26.0" |
| 117 | source = { registry = "https://pypi.org/simple" } | 138 | source = { registry = "https://pypi.org/simple" } |
| @@ -267,6 +288,7 @@ name = "xiaohongshu-skills" | @@ -267,6 +288,7 @@ name = "xiaohongshu-skills" | ||
| 267 | version = "0.1.0" | 288 | version = "0.1.0" |
| 268 | source = { virtual = "." } | 289 | source = { virtual = "." } |
| 269 | dependencies = [ | 290 | dependencies = [ |
| 291 | + { name = "openpyxl" }, | ||
| 270 | { name = "requests" }, | 292 | { name = "requests" }, |
| 271 | { name = "websockets" }, | 293 | { name = "websockets" }, |
| 272 | ] | 294 | ] |
| @@ -284,6 +306,7 @@ dev = [ | @@ -284,6 +306,7 @@ dev = [ | ||
| 284 | 306 | ||
| 285 | [package.metadata] | 307 | [package.metadata] |
| 286 | requires-dist = [ | 308 | requires-dist = [ |
| 309 | + { name = "openpyxl", specifier = ">=3.1.0" }, | ||
| 287 | { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" }, | 310 | { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" }, |
| 288 | { name = "requests", specifier = ">=2.28.0" }, | 311 | { name = "requests", specifier = ">=2.28.0" }, |
| 289 | { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.9.0" }, | 312 | { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.9.0" }, |
-
Please register or login to post a comment