Committed by
GitHub
Fixing dy data scraping (#400)
* Fixing dy data scraping * chore: revert base_config.py to default values and modify some comments --------- Co-authored-by: yling <yling@test.com>
Showing
6 changed files
with
63 additions
and
17 deletions
| @@ -104,7 +104,7 @@ class DouyinAweme(Base): | @@ -104,7 +104,7 @@ class DouyinAweme(Base): | ||
| 104 | ip_location = Column(Text) | 104 | ip_location = Column(Text) |
| 105 | add_ts = Column(BigInteger) | 105 | add_ts = Column(BigInteger) |
| 106 | last_modify_ts = Column(BigInteger) | 106 | last_modify_ts = Column(BigInteger) |
| 107 | - aweme_id = Column(BigInteger, index=True) | 107 | + aweme_id = Column(String(64), index=True) |
| 108 | aweme_type = Column(Text) | 108 | aweme_type = Column(Text) |
| 109 | title = Column(Text) | 109 | title = Column(Text) |
| 110 | desc = Column(Text) | 110 | desc = Column(Text) |
| @@ -133,8 +133,8 @@ class DouyinAwemeComment(Base): | @@ -133,8 +133,8 @@ class DouyinAwemeComment(Base): | ||
| 133 | ip_location = Column(Text) | 133 | ip_location = Column(Text) |
| 134 | add_ts = Column(BigInteger) | 134 | add_ts = Column(BigInteger) |
| 135 | last_modify_ts = Column(BigInteger) | 135 | last_modify_ts = Column(BigInteger) |
| 136 | - comment_id = Column(BigInteger, index=True) | ||
| 137 | - aweme_id = Column(BigInteger, index=True) | 136 | + comment_id = Column(String(64), index=True) |
| 137 | + aweme_id = Column(String(64), index=True) | ||
| 138 | content = Column(Text) | 138 | content = Column(Text) |
| 139 | create_time = Column(BigInteger) | 139 | create_time = Column(BigInteger) |
| 140 | sub_comment_count = Column(Text) | 140 | sub_comment_count = Column(Text) |
| @@ -431,4 +431,4 @@ class ZhihuCreator(Base): | @@ -431,4 +431,4 @@ class ZhihuCreator(Base): | ||
| 431 | column_count = Column(Integer, default=0) | 431 | column_count = Column(Integer, default=0) |
| 432 | get_voteup_count = Column(Integer, default=0) | 432 | get_voteup_count = Column(Integer, default=0) |
| 433 | add_ts = Column(BigInteger) | 433 | add_ts = Column(BigInteger) |
| 434 | - last_modify_ts = Column(BigInteger) | ||
| 434 | + last_modify_ts = Column(BigInteger) |
| @@ -151,24 +151,33 @@ class DouYinClient(AbstractApiClient): | @@ -151,24 +151,33 @@ class DouYinClient(AbstractApiClient): | ||
| 151 | :return: | 151 | :return: |
| 152 | """ | 152 | """ |
| 153 | query_params = { | 153 | query_params = { |
| 154 | - 'search_channel': search_channel.value, | ||
| 155 | - 'enable_history': '1', | 154 | + 'device_platform': 'webapp', |
| 155 | + 'aid': '6383', | ||
| 156 | + 'channel': 'channel_pc_web', | ||
| 157 | + 'search_channel': 'aweme_general', | ||
| 158 | + 'sort_type': '0', | ||
| 159 | + 'publish_time': '0', | ||
| 156 | 'keyword': keyword, | 160 | 'keyword': keyword, |
| 157 | - 'search_source': 'tab_search', | 161 | + 'search_source': 'normal_search', |
| 158 | 'query_correct_type': '1', | 162 | 'query_correct_type': '1', |
| 159 | 'is_filter_search': '0', | 163 | 'is_filter_search': '0', |
| 160 | - 'from_group_id': '7378810571505847586', | 164 | + # 'from_group_id': '', # 删掉或留空,不要硬编码过期的 ID |
| 161 | 'offset': offset, | 165 | 'offset': offset, |
| 162 | - 'count': '15', | 166 | + 'count': '10', |
| 163 | 'need_filter_settings': '1', | 167 | 'need_filter_settings': '1', |
| 164 | - 'list_type': 'multi', | 168 | + # 'list_type': 'multi', 注释掉 |
| 165 | 'search_id': search_id, | 169 | 'search_id': search_id, |
| 170 | + 'pc_client_type': '1', | ||
| 171 | + 'version_code': '190600', | ||
| 172 | + 'version_name': '19.6.0', | ||
| 173 | + 'cookie_enabled': 'true', | ||
| 174 | + 'platform': 'PC', | ||
| 175 | + 'downlink': '10', | ||
| 166 | } | 176 | } |
| 167 | if sort_type.value != SearchSortType.GENERAL.value or publish_time.value != PublishTimeType.UNLIMITED.value: | 177 | if sort_type.value != SearchSortType.GENERAL.value or publish_time.value != PublishTimeType.UNLIMITED.value: |
| 168 | query_params["filter_selected"] = json.dumps({"sort_type": str(sort_type.value), "publish_time": str(publish_time.value)}) | 178 | query_params["filter_selected"] = json.dumps({"sort_type": str(sort_type.value), "publish_time": str(publish_time.value)}) |
| 169 | query_params["is_filter_search"] = 1 | 179 | query_params["is_filter_search"] = 1 |
| 170 | - query_params["search_source"] = "tab_search" | ||
| 171 | - referer_url = f"https://www.douyin.com/search/{keyword}?aid=f594bbd9-a0e2-4651-9319-ebe3cb6298c1&type=general" | 180 | + referer_url = f"https://www.douyin.com/search/{keyword}?aid=6383&type=general" |
| 172 | headers = copy.copy(self.headers) | 181 | headers = copy.copy(self.headers) |
| 173 | headers["Referer"] = urllib.parse.quote(referer_url, safe=':/') | 182 | headers["Referer"] = urllib.parse.quote(referer_url, safe=':/') |
| 174 | return await self.get("/aweme/v1/web/general/search/single/", query_params, headers=headers) | 183 | return await self.get("/aweme/v1/web/general/search/single/", query_params, headers=headers) |
| @@ -129,8 +129,18 @@ class DouYinCrawler(AbstractCrawler): | @@ -129,8 +129,18 @@ class DouYinCrawler(AbstractCrawler): | ||
| 129 | publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE), | 129 | publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE), |
| 130 | search_id=dy_search_id, | 130 | search_id=dy_search_id, |
| 131 | ) | 131 | ) |
| 132 | + # === 新增调试代码 START === | ||
| 133 | + # 打印返回的所有 Key,看看有没有 'data' 或者 'aweme_list' | ||
| 134 | + utils.logger.info(f"[DEBUG] 接口返回的字段 keys: {list(posts_res.keys())}") | ||
| 135 | + | ||
| 136 | + # 如果返回里直接有 aweme_list,说明结构变了 | ||
| 137 | + if "aweme_list" in posts_res and "data" not in posts_res: | ||
| 138 | + utils.logger.info("[DEBUG] 检测到 aweme_list 在根节点,正在修正数据结构...") | ||
| 139 | + posts_res["data"] = [{"aweme_info": item} for item in posts_res["aweme_list"]] | ||
| 140 | + # === 新增调试代码 END === | ||
| 141 | + | ||
| 132 | if posts_res.get("data") is None or posts_res.get("data") == []: | 142 | if posts_res.get("data") is None or posts_res.get("data") == []: |
| 133 | - utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page} is empty,{posts_res.get('data')}`") | 143 | + utils.logger.info(f"[DouYinCrawler.search] 结果为空。Status: {posts_res.get('status_code')}, Msg: {posts_res.get('status_msg')}") |
| 134 | break | 144 | break |
| 135 | except DataFetchError: | 145 | except DataFetchError: |
| 136 | utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed") | 146 | utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed") |
| @@ -33,6 +33,23 @@ from tools import utils, words | @@ -33,6 +33,23 @@ from tools import utils, words | ||
| 33 | from var import crawler_type_var | 33 | from var import crawler_type_var |
| 34 | 34 | ||
| 35 | 35 | ||
| 36 | +def _sanitize_strings(data: Dict) -> Dict: | ||
| 37 | + """ | ||
| 38 | + Remove PostgreSQL-incompatible control characters (e.g., NULL) from all string fields. | ||
| 39 | + Args: | ||
| 40 | + data: original dict | ||
| 41 | + Returns: | ||
| 42 | + A new dict with sanitized string values | ||
| 43 | + """ | ||
| 44 | + cleaned = {} | ||
| 45 | + for key, value in data.items(): | ||
| 46 | + if isinstance(value, str): | ||
| 47 | + cleaned[key] = value.replace('\x00', '') | ||
| 48 | + else: | ||
| 49 | + cleaned[key] = value | ||
| 50 | + return cleaned | ||
| 51 | + | ||
| 52 | + | ||
| 36 | class BiliCsvStoreImplement(AbstractStore): | 53 | class BiliCsvStoreImplement(AbstractStore): |
| 37 | def __init__(self): | 54 | def __init__(self): |
| 38 | self.file_writer = AsyncFileWriter( | 55 | self.file_writer = AsyncFileWriter( |
| @@ -122,6 +139,8 @@ class BiliDbStoreImplement(AbstractStore): | @@ -122,6 +139,8 @@ class BiliDbStoreImplement(AbstractStore): | ||
| 122 | # 确保 video_id 为整数类型,匹配数据库 BigInteger 字段 | 139 | # 确保 video_id 为整数类型,匹配数据库 BigInteger 字段 |
| 123 | if video_id is not None: | 140 | if video_id is not None: |
| 124 | video_id = int(video_id) if not isinstance(video_id, int) else video_id | 141 | video_id = int(video_id) if not isinstance(video_id, int) else video_id |
| 142 | + content_item["video_id"] = video_id | ||
| 143 | + content_item = _sanitize_strings(content_item) | ||
| 125 | async with get_session() as session: | 144 | async with get_session() as session: |
| 126 | result = await session.execute(select(BilibiliVideo).where(BilibiliVideo.video_id == video_id)) | 145 | result = await session.execute(select(BilibiliVideo).where(BilibiliVideo.video_id == video_id)) |
| 127 | video_detail = result.scalar_one_or_none() | 146 | video_detail = result.scalar_one_or_none() |
| @@ -145,6 +164,8 @@ class BiliDbStoreImplement(AbstractStore): | @@ -145,6 +164,8 @@ class BiliDbStoreImplement(AbstractStore): | ||
| 145 | # 确保 comment_id 为整数类型,匹配数据库 BigInteger 字段 | 164 | # 确保 comment_id 为整数类型,匹配数据库 BigInteger 字段 |
| 146 | if comment_id is not None: | 165 | if comment_id is not None: |
| 147 | comment_id = int(comment_id) if not isinstance(comment_id, int) else comment_id | 166 | comment_id = int(comment_id) if not isinstance(comment_id, int) else comment_id |
| 167 | + comment_item["comment_id"] = comment_id | ||
| 168 | + comment_item = _sanitize_strings(comment_item) | ||
| 148 | async with get_session() as session: | 169 | async with get_session() as session: |
| 149 | result = await session.execute(select(BilibiliVideoComment).where(BilibiliVideoComment.comment_id == comment_id)) | 170 | result = await session.execute(select(BilibiliVideoComment).where(BilibiliVideoComment.comment_id == comment_id)) |
| 150 | comment_detail = result.scalar_one_or_none() | 171 | comment_detail = result.scalar_one_or_none() |
| @@ -168,6 +189,8 @@ class BiliDbStoreImplement(AbstractStore): | @@ -168,6 +189,8 @@ class BiliDbStoreImplement(AbstractStore): | ||
| 168 | # 确保 creator_id 为整数类型,匹配数据库 BigInteger 字段 | 189 | # 确保 creator_id 为整数类型,匹配数据库 BigInteger 字段 |
| 169 | if creator_id is not None: | 190 | if creator_id is not None: |
| 170 | creator_id = int(creator_id) if not isinstance(creator_id, int) else creator_id | 191 | creator_id = int(creator_id) if not isinstance(creator_id, int) else creator_id |
| 192 | + creator["user_id"] = creator_id | ||
| 193 | + creator = _sanitize_strings(creator) | ||
| 171 | async with get_session() as session: | 194 | async with get_session() as session: |
| 172 | result = await session.execute(select(BilibiliUpInfo).where(BilibiliUpInfo.user_id == creator_id)) | 195 | result = await session.execute(select(BilibiliUpInfo).where(BilibiliUpInfo.user_id == creator_id)) |
| 173 | creator_detail = result.scalar_one_or_none() | 196 | creator_detail = result.scalar_one_or_none() |
| @@ -192,8 +215,11 @@ class BiliDbStoreImplement(AbstractStore): | @@ -192,8 +215,11 @@ class BiliDbStoreImplement(AbstractStore): | ||
| 192 | # 确保 up_id 和 fan_id 为整数类型,匹配数据库 BigInteger 字段 | 215 | # 确保 up_id 和 fan_id 为整数类型,匹配数据库 BigInteger 字段 |
| 193 | if up_id is not None: | 216 | if up_id is not None: |
| 194 | up_id = int(up_id) if not isinstance(up_id, int) else up_id | 217 | up_id = int(up_id) if not isinstance(up_id, int) else up_id |
| 218 | + contact_item["up_id"] = up_id | ||
| 195 | if fan_id is not None: | 219 | if fan_id is not None: |
| 196 | fan_id = int(fan_id) if not isinstance(fan_id, int) else fan_id | 220 | fan_id = int(fan_id) if not isinstance(fan_id, int) else fan_id |
| 221 | + contact_item["fan_id"] = fan_id | ||
| 222 | + contact_item = _sanitize_strings(contact_item) | ||
| 197 | async with get_session() as session: | 223 | async with get_session() as session: |
| 198 | result = await session.execute( | 224 | result = await session.execute( |
| 199 | select(BilibiliContactInfo).where(BilibiliContactInfo.up_id == up_id, BilibiliContactInfo.fan_id == fan_id) | 225 | select(BilibiliContactInfo).where(BilibiliContactInfo.up_id == up_id, BilibiliContactInfo.fan_id == fan_id) |
| @@ -216,6 +242,7 @@ class BiliDbStoreImplement(AbstractStore): | @@ -216,6 +242,7 @@ class BiliDbStoreImplement(AbstractStore): | ||
| 216 | dynamic_item: dynamic item dict | 242 | dynamic_item: dynamic item dict |
| 217 | """ | 243 | """ |
| 218 | dynamic_id = dynamic_item.get("dynamic_id") | 244 | dynamic_id = dynamic_item.get("dynamic_id") |
| 245 | + dynamic_item = _sanitize_strings(dynamic_item) | ||
| 219 | async with get_session() as session: | 246 | async with get_session() as session: |
| 220 | result = await session.execute(select(BilibiliUpDynamic).where(BilibiliUpDynamic.dynamic_id == dynamic_id)) | 247 | result = await session.execute(select(BilibiliUpDynamic).where(BilibiliUpDynamic.dynamic_id == dynamic_id)) |
| 221 | dynamic_detail = result.scalar_one_or_none() | 248 | dynamic_detail = result.scalar_one_or_none() |
| @@ -206,7 +206,7 @@ async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict): | @@ -206,7 +206,7 @@ async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict): | ||
| 206 | "nickname": user_info.get("nickname"), | 206 | "nickname": user_info.get("nickname"), |
| 207 | "avatar": avatar_info.get("url_list", [""])[0], | 207 | "avatar": avatar_info.get("url_list", [""])[0], |
| 208 | "sub_comment_count": str(comment_item.get("reply_comment_total", 0)), | 208 | "sub_comment_count": str(comment_item.get("reply_comment_total", 0)), |
| 209 | - "like_count": (comment_item.get("digg_count") if comment_item.get("digg_count") else 0), | 209 | + "like_count": str(comment_item.get("digg_count") or 0), |
| 210 | "last_modify_ts": utils.get_current_timestamp(), | 210 | "last_modify_ts": utils.get_current_timestamp(), |
| 211 | "parent_comment_id": parent_comment_id, | 211 | "parent_comment_id": parent_comment_id, |
| 212 | "pictures": ",".join(_extract_comment_image_list(comment_item)), | 212 | "pictures": ",".join(_extract_comment_image_list(comment_item)), |
| @@ -121,7 +121,7 @@ class DouyinAweme(Base): | @@ -121,7 +121,7 @@ class DouyinAweme(Base): | ||
| 121 | ip_location: Mapped[str | None] = mapped_column(Text, nullable=True) | 121 | ip_location: Mapped[str | None] = mapped_column(Text, nullable=True) |
| 122 | add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True) | 122 | add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True) |
| 123 | last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True) | 123 | last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True) |
| 124 | - aweme_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True) | 124 | + aweme_id: Mapped[str | None] = mapped_column(String(64), index=True, nullable=True) |
| 125 | aweme_type: Mapped[str | None] = mapped_column(Text, nullable=True) | 125 | aweme_type: Mapped[str | None] = mapped_column(Text, nullable=True) |
| 126 | title: Mapped[str | None] = mapped_column(Text, nullable=True) | 126 | title: Mapped[str | None] = mapped_column(Text, nullable=True) |
| 127 | desc: Mapped[str | None] = mapped_column(Text, nullable=True) | 127 | desc: Mapped[str | None] = mapped_column(Text, nullable=True) |
| @@ -152,8 +152,8 @@ class DouyinAwemeComment(Base): | @@ -152,8 +152,8 @@ class DouyinAwemeComment(Base): | ||
| 152 | ip_location: Mapped[str | None] = mapped_column(Text, nullable=True) | 152 | ip_location: Mapped[str | None] = mapped_column(Text, nullable=True) |
| 153 | add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True) | 153 | add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True) |
| 154 | last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True) | 154 | last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True) |
| 155 | - comment_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True) | ||
| 156 | - aweme_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True) | 155 | + comment_id: Mapped[str | None] = mapped_column(String(64), index=True, nullable=True) |
| 156 | + aweme_id: Mapped[str | None] = mapped_column(String(64), index=True, nullable=True) | ||
| 157 | content: Mapped[str | None] = mapped_column(Text, nullable=True) | 157 | content: Mapped[str | None] = mapped_column(Text, nullable=True) |
| 158 | create_time: Mapped[int | None] = mapped_column(BigInteger, nullable=True) | 158 | create_time: Mapped[int | None] = mapped_column(BigInteger, nullable=True) |
| 159 | sub_comment_count: Mapped[str | None] = mapped_column(Text, nullable=True) | 159 | sub_comment_count: Mapped[str | None] = mapped_column(Text, nullable=True) |
-
Please register or login to post a comment