yelingdenghe
Committed by GitHub

Fixing dy data scraping (#400)

* Fixing dy data scraping

* chore: revert base_config.py to default values and modify some comments

---------

Co-authored-by: yling <yling@test.com>
@@ -104,7 +104,7 @@ class DouyinAweme(Base): @@ -104,7 +104,7 @@ class DouyinAweme(Base):
104 ip_location = Column(Text) 104 ip_location = Column(Text)
105 add_ts = Column(BigInteger) 105 add_ts = Column(BigInteger)
106 last_modify_ts = Column(BigInteger) 106 last_modify_ts = Column(BigInteger)
107 - aweme_id = Column(BigInteger, index=True) 107 + aweme_id = Column(String(64), index=True)
108 aweme_type = Column(Text) 108 aweme_type = Column(Text)
109 title = Column(Text) 109 title = Column(Text)
110 desc = Column(Text) 110 desc = Column(Text)
@@ -133,8 +133,8 @@ class DouyinAwemeComment(Base): @@ -133,8 +133,8 @@ class DouyinAwemeComment(Base):
133 ip_location = Column(Text) 133 ip_location = Column(Text)
134 add_ts = Column(BigInteger) 134 add_ts = Column(BigInteger)
135 last_modify_ts = Column(BigInteger) 135 last_modify_ts = Column(BigInteger)
136 - comment_id = Column(BigInteger, index=True)  
137 - aweme_id = Column(BigInteger, index=True) 136 + comment_id = Column(String(64), index=True)
  137 + aweme_id = Column(String(64), index=True)
138 content = Column(Text) 138 content = Column(Text)
139 create_time = Column(BigInteger) 139 create_time = Column(BigInteger)
140 sub_comment_count = Column(Text) 140 sub_comment_count = Column(Text)
@@ -431,4 +431,4 @@ class ZhihuCreator(Base): @@ -431,4 +431,4 @@ class ZhihuCreator(Base):
431 column_count = Column(Integer, default=0) 431 column_count = Column(Integer, default=0)
432 get_voteup_count = Column(Integer, default=0) 432 get_voteup_count = Column(Integer, default=0)
433 add_ts = Column(BigInteger) 433 add_ts = Column(BigInteger)
434 - last_modify_ts = Column(BigInteger)  
  434 + last_modify_ts = Column(BigInteger)
@@ -151,24 +151,33 @@ class DouYinClient(AbstractApiClient): @@ -151,24 +151,33 @@ class DouYinClient(AbstractApiClient):
151 :return: 151 :return:
152 """ 152 """
153 query_params = { 153 query_params = {
154 - 'search_channel': search_channel.value,  
155 - 'enable_history': '1', 154 + 'device_platform': 'webapp',
  155 + 'aid': '6383',
  156 + 'channel': 'channel_pc_web',
  157 + 'search_channel': 'aweme_general',
  158 + 'sort_type': '0',
  159 + 'publish_time': '0',
156 'keyword': keyword, 160 'keyword': keyword,
157 - 'search_source': 'tab_search', 161 + 'search_source': 'normal_search',
158 'query_correct_type': '1', 162 'query_correct_type': '1',
159 'is_filter_search': '0', 163 'is_filter_search': '0',
160 - 'from_group_id': '7378810571505847586', 164 + # 'from_group_id': '', # 删掉或留空,不要硬编码过期的 ID
161 'offset': offset, 165 'offset': offset,
162 - 'count': '15', 166 + 'count': '10',
163 'need_filter_settings': '1', 167 'need_filter_settings': '1',
164 - 'list_type': 'multi', 168 + # 'list_type': 'multi', 注释掉
165 'search_id': search_id, 169 'search_id': search_id,
  170 + 'pc_client_type': '1',
  171 + 'version_code': '190600',
  172 + 'version_name': '19.6.0',
  173 + 'cookie_enabled': 'true',
  174 + 'platform': 'PC',
  175 + 'downlink': '10',
166 } 176 }
167 if sort_type.value != SearchSortType.GENERAL.value or publish_time.value != PublishTimeType.UNLIMITED.value: 177 if sort_type.value != SearchSortType.GENERAL.value or publish_time.value != PublishTimeType.UNLIMITED.value:
168 query_params["filter_selected"] = json.dumps({"sort_type": str(sort_type.value), "publish_time": str(publish_time.value)}) 178 query_params["filter_selected"] = json.dumps({"sort_type": str(sort_type.value), "publish_time": str(publish_time.value)})
169 query_params["is_filter_search"] = 1 179 query_params["is_filter_search"] = 1
170 - query_params["search_source"] = "tab_search"  
171 - referer_url = f"https://www.douyin.com/search/{keyword}?aid=f594bbd9-a0e2-4651-9319-ebe3cb6298c1&type=general" 180 + referer_url = f"https://www.douyin.com/search/{keyword}?aid=6383&type=general"
172 headers = copy.copy(self.headers) 181 headers = copy.copy(self.headers)
173 headers["Referer"] = urllib.parse.quote(referer_url, safe=':/') 182 headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
174 return await self.get("/aweme/v1/web/general/search/single/", query_params, headers=headers) 183 return await self.get("/aweme/v1/web/general/search/single/", query_params, headers=headers)
@@ -129,8 +129,18 @@ class DouYinCrawler(AbstractCrawler): @@ -129,8 +129,18 @@ class DouYinCrawler(AbstractCrawler):
129 publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE), 129 publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE),
130 search_id=dy_search_id, 130 search_id=dy_search_id,
131 ) 131 )
  132 + # === 新增调试代码 START ===
  133 + # 打印返回的所有 Key,看看有没有 'data' 或者 'aweme_list'
  134 + utils.logger.info(f"[DEBUG] 接口返回的字段 keys: {list(posts_res.keys())}")
  135 +
  136 + # 如果返回里直接有 aweme_list,说明结构变了
  137 + if "aweme_list" in posts_res and "data" not in posts_res:
  138 + utils.logger.info("[DEBUG] 检测到 aweme_list 在根节点,正在修正数据结构...")
  139 + posts_res["data"] = [{"aweme_info": item} for item in posts_res["aweme_list"]]
  140 + # === 新增调试代码 END ===
  141 +
132 if posts_res.get("data") is None or posts_res.get("data") == []: 142 if posts_res.get("data") is None or posts_res.get("data") == []:
133 - utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page} is empty,{posts_res.get('data')}`") 143 + utils.logger.info(f"[DouYinCrawler.search] 结果为空。Status: {posts_res.get('status_code')}, Msg: {posts_res.get('status_msg')}")
134 break 144 break
135 except DataFetchError: 145 except DataFetchError:
136 utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed") 146 utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed")
@@ -33,6 +33,23 @@ from tools import utils, words @@ -33,6 +33,23 @@ from tools import utils, words
33 from var import crawler_type_var 33 from var import crawler_type_var
34 34
35 35
  36 +def _sanitize_strings(data: Dict) -> Dict:
  37 + """
  38 + Remove PostgreSQL-incompatible control characters (e.g., NULL) from all string fields.
  39 + Args:
  40 + data: original dict
  41 + Returns:
  42 + A new dict with sanitized string values
  43 + """
  44 + cleaned = {}
  45 + for key, value in data.items():
  46 + if isinstance(value, str):
  47 + cleaned[key] = value.replace('\x00', '')
  48 + else:
  49 + cleaned[key] = value
  50 + return cleaned
  51 +
  52 +
36 class BiliCsvStoreImplement(AbstractStore): 53 class BiliCsvStoreImplement(AbstractStore):
37 def __init__(self): 54 def __init__(self):
38 self.file_writer = AsyncFileWriter( 55 self.file_writer = AsyncFileWriter(
@@ -122,6 +139,8 @@ class BiliDbStoreImplement(AbstractStore): @@ -122,6 +139,8 @@ class BiliDbStoreImplement(AbstractStore):
122 # 确保 video_id 为整数类型,匹配数据库 BigInteger 字段 139 # 确保 video_id 为整数类型,匹配数据库 BigInteger 字段
123 if video_id is not None: 140 if video_id is not None:
124 video_id = int(video_id) if not isinstance(video_id, int) else video_id 141 video_id = int(video_id) if not isinstance(video_id, int) else video_id
  142 + content_item["video_id"] = video_id
  143 + content_item = _sanitize_strings(content_item)
125 async with get_session() as session: 144 async with get_session() as session:
126 result = await session.execute(select(BilibiliVideo).where(BilibiliVideo.video_id == video_id)) 145 result = await session.execute(select(BilibiliVideo).where(BilibiliVideo.video_id == video_id))
127 video_detail = result.scalar_one_or_none() 146 video_detail = result.scalar_one_or_none()
@@ -145,6 +164,8 @@ class BiliDbStoreImplement(AbstractStore): @@ -145,6 +164,8 @@ class BiliDbStoreImplement(AbstractStore):
145 # 确保 comment_id 为整数类型,匹配数据库 BigInteger 字段 164 # 确保 comment_id 为整数类型,匹配数据库 BigInteger 字段
146 if comment_id is not None: 165 if comment_id is not None:
147 comment_id = int(comment_id) if not isinstance(comment_id, int) else comment_id 166 comment_id = int(comment_id) if not isinstance(comment_id, int) else comment_id
  167 + comment_item["comment_id"] = comment_id
  168 + comment_item = _sanitize_strings(comment_item)
148 async with get_session() as session: 169 async with get_session() as session:
149 result = await session.execute(select(BilibiliVideoComment).where(BilibiliVideoComment.comment_id == comment_id)) 170 result = await session.execute(select(BilibiliVideoComment).where(BilibiliVideoComment.comment_id == comment_id))
150 comment_detail = result.scalar_one_or_none() 171 comment_detail = result.scalar_one_or_none()
@@ -168,6 +189,8 @@ class BiliDbStoreImplement(AbstractStore): @@ -168,6 +189,8 @@ class BiliDbStoreImplement(AbstractStore):
168 # 确保 creator_id 为整数类型,匹配数据库 BigInteger 字段 189 # 确保 creator_id 为整数类型,匹配数据库 BigInteger 字段
169 if creator_id is not None: 190 if creator_id is not None:
170 creator_id = int(creator_id) if not isinstance(creator_id, int) else creator_id 191 creator_id = int(creator_id) if not isinstance(creator_id, int) else creator_id
  192 + creator["user_id"] = creator_id
  193 + creator = _sanitize_strings(creator)
171 async with get_session() as session: 194 async with get_session() as session:
172 result = await session.execute(select(BilibiliUpInfo).where(BilibiliUpInfo.user_id == creator_id)) 195 result = await session.execute(select(BilibiliUpInfo).where(BilibiliUpInfo.user_id == creator_id))
173 creator_detail = result.scalar_one_or_none() 196 creator_detail = result.scalar_one_or_none()
@@ -192,8 +215,11 @@ class BiliDbStoreImplement(AbstractStore): @@ -192,8 +215,11 @@ class BiliDbStoreImplement(AbstractStore):
192 # 确保 up_id 和 fan_id 为整数类型,匹配数据库 BigInteger 字段 215 # 确保 up_id 和 fan_id 为整数类型,匹配数据库 BigInteger 字段
193 if up_id is not None: 216 if up_id is not None:
194 up_id = int(up_id) if not isinstance(up_id, int) else up_id 217 up_id = int(up_id) if not isinstance(up_id, int) else up_id
  218 + contact_item["up_id"] = up_id
195 if fan_id is not None: 219 if fan_id is not None:
196 fan_id = int(fan_id) if not isinstance(fan_id, int) else fan_id 220 fan_id = int(fan_id) if not isinstance(fan_id, int) else fan_id
  221 + contact_item["fan_id"] = fan_id
  222 + contact_item = _sanitize_strings(contact_item)
197 async with get_session() as session: 223 async with get_session() as session:
198 result = await session.execute( 224 result = await session.execute(
199 select(BilibiliContactInfo).where(BilibiliContactInfo.up_id == up_id, BilibiliContactInfo.fan_id == fan_id) 225 select(BilibiliContactInfo).where(BilibiliContactInfo.up_id == up_id, BilibiliContactInfo.fan_id == fan_id)
@@ -216,6 +242,7 @@ class BiliDbStoreImplement(AbstractStore): @@ -216,6 +242,7 @@ class BiliDbStoreImplement(AbstractStore):
216 dynamic_item: dynamic item dict 242 dynamic_item: dynamic item dict
217 """ 243 """
218 dynamic_id = dynamic_item.get("dynamic_id") 244 dynamic_id = dynamic_item.get("dynamic_id")
  245 + dynamic_item = _sanitize_strings(dynamic_item)
219 async with get_session() as session: 246 async with get_session() as session:
220 result = await session.execute(select(BilibiliUpDynamic).where(BilibiliUpDynamic.dynamic_id == dynamic_id)) 247 result = await session.execute(select(BilibiliUpDynamic).where(BilibiliUpDynamic.dynamic_id == dynamic_id))
221 dynamic_detail = result.scalar_one_or_none() 248 dynamic_detail = result.scalar_one_or_none()
@@ -206,7 +206,7 @@ async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict): @@ -206,7 +206,7 @@ async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict):
206 "nickname": user_info.get("nickname"), 206 "nickname": user_info.get("nickname"),
207 "avatar": avatar_info.get("url_list", [""])[0], 207 "avatar": avatar_info.get("url_list", [""])[0],
208 "sub_comment_count": str(comment_item.get("reply_comment_total", 0)), 208 "sub_comment_count": str(comment_item.get("reply_comment_total", 0)),
209 - "like_count": (comment_item.get("digg_count") if comment_item.get("digg_count") else 0), 209 + "like_count": str(comment_item.get("digg_count") or 0),
210 "last_modify_ts": utils.get_current_timestamp(), 210 "last_modify_ts": utils.get_current_timestamp(),
211 "parent_comment_id": parent_comment_id, 211 "parent_comment_id": parent_comment_id,
212 "pictures": ",".join(_extract_comment_image_list(comment_item)), 212 "pictures": ",".join(_extract_comment_image_list(comment_item)),
@@ -121,7 +121,7 @@ class DouyinAweme(Base): @@ -121,7 +121,7 @@ class DouyinAweme(Base):
121 ip_location: Mapped[str | None] = mapped_column(Text, nullable=True) 121 ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)
122 add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True) 122 add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
123 last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True) 123 last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
124 - aweme_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True) 124 + aweme_id: Mapped[str | None] = mapped_column(String(64), index=True, nullable=True)
125 aweme_type: Mapped[str | None] = mapped_column(Text, nullable=True) 125 aweme_type: Mapped[str | None] = mapped_column(Text, nullable=True)
126 title: Mapped[str | None] = mapped_column(Text, nullable=True) 126 title: Mapped[str | None] = mapped_column(Text, nullable=True)
127 desc: Mapped[str | None] = mapped_column(Text, nullable=True) 127 desc: Mapped[str | None] = mapped_column(Text, nullable=True)
@@ -152,8 +152,8 @@ class DouyinAwemeComment(Base): @@ -152,8 +152,8 @@ class DouyinAwemeComment(Base):
152 ip_location: Mapped[str | None] = mapped_column(Text, nullable=True) 152 ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)
153 add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True) 153 add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
154 last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True) 154 last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
155 - comment_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)  
156 - aweme_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True) 155 + comment_id: Mapped[str | None] = mapped_column(String(64), index=True, nullable=True)
  156 + aweme_id: Mapped[str | None] = mapped_column(String(64), index=True, nullable=True)
157 content: Mapped[str | None] = mapped_column(Text, nullable=True) 157 content: Mapped[str | None] = mapped_column(Text, nullable=True)
158 create_time: Mapped[int | None] = mapped_column(BigInteger, nullable=True) 158 create_time: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
159 sub_comment_count: Mapped[str | None] = mapped_column(Text, nullable=True) 159 sub_comment_count: Mapped[str | None] = mapped_column(Text, nullable=True)