Showing
2 changed files
with
362 additions
and
8 deletions
| @@ -1232,6 +1232,176 @@ class HTMLRenderer: | @@ -1232,6 +1232,176 @@ class HTMLRenderer: | ||
| 1232 | class_attr = f' class="{extra_class}"' if extra_class else "" | 1232 | class_attr = f' class="{extra_class}"' if extra_class else "" |
| 1233 | return f'<{tag}{class_attr}>{items_html}</{tag}>' | 1233 | return f'<{tag}{class_attr}>{items_html}</{tag}>' |
| 1234 | 1234 | ||
| 1235 | + def _flatten_nested_cells(self, cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]: | ||
| 1236 | + """ | ||
| 1237 | + 展平错误嵌套的单元格结构。 | ||
| 1238 | + | ||
| 1239 | + 某些 LLM 生成的表格数据中,单元格被错误地递归嵌套: | ||
| 1240 | + cells[0] 正常, cells[1].cells[0] 正常, cells[1].cells[1].cells[0] 正常... | ||
| 1241 | + 本方法将这种嵌套结构展平为标准的平行单元格数组。 | ||
| 1242 | + | ||
| 1243 | + 参数: | ||
| 1244 | + cells: 可能包含嵌套结构的单元格数组。 | ||
| 1245 | + | ||
| 1246 | + 返回: | ||
| 1247 | + List[Dict]: 展平后的单元格数组。 | ||
| 1248 | + """ | ||
| 1249 | + if not cells: | ||
| 1250 | + return [] | ||
| 1251 | + | ||
| 1252 | + flattened: List[Dict[str, Any]] = [] | ||
| 1253 | + | ||
| 1254 | + def _extract_cells(cell_or_list: Any) -> None: | ||
| 1255 | + """递归提取所有单元格""" | ||
| 1256 | + if not isinstance(cell_or_list, dict): | ||
| 1257 | + return | ||
| 1258 | + | ||
| 1259 | + # 如果当前对象有 blocks,说明它是一个有效的单元格 | ||
| 1260 | + if "blocks" in cell_or_list: | ||
| 1261 | + # 创建单元格副本,移除嵌套的 cells | ||
| 1262 | + clean_cell = { | ||
| 1263 | + k: v for k, v in cell_or_list.items() | ||
| 1264 | + if k != "cells" | ||
| 1265 | + } | ||
| 1266 | + flattened.append(clean_cell) | ||
| 1267 | + | ||
| 1268 | + # 如果当前对象有嵌套的 cells,递归处理 | ||
| 1269 | + nested_cells = cell_or_list.get("cells") | ||
| 1270 | + if isinstance(nested_cells, list): | ||
| 1271 | + for nested_cell in nested_cells: | ||
| 1272 | + _extract_cells(nested_cell) | ||
| 1273 | + | ||
| 1274 | + for cell in cells: | ||
| 1275 | + _extract_cells(cell) | ||
| 1276 | + | ||
| 1277 | + return flattened | ||
| 1278 | + | ||
| 1279 | + def _fix_nested_table_rows(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]: | ||
| 1280 | + """ | ||
| 1281 | + 修复嵌套错误的表格行结构。 | ||
| 1282 | + | ||
| 1283 | + 某些 LLM 生成的表格数据中,所有行的单元格都被嵌套在第一行中, | ||
| 1284 | + 导致表格只有1行但包含所有数据。本方法检测并修复这种情况。 | ||
| 1285 | + | ||
| 1286 | + 参数: | ||
| 1287 | + rows: 原始的表格行数组。 | ||
| 1288 | + | ||
| 1289 | + 返回: | ||
| 1290 | + List[Dict]: 修复后的表格行数组。 | ||
| 1291 | + """ | ||
| 1292 | + if not rows or len(rows) != 1: | ||
| 1293 | + # 只处理只有1行的异常情况 | ||
| 1294 | + return rows | ||
| 1295 | + | ||
| 1296 | + first_row = rows[0] | ||
| 1297 | + original_cells = first_row.get("cells", []) | ||
| 1298 | + | ||
| 1299 | + # 检查是否存在嵌套结构 | ||
| 1300 | + has_nested = any( | ||
| 1301 | + isinstance(cell.get("cells"), list) | ||
| 1302 | + for cell in original_cells | ||
| 1303 | + if isinstance(cell, dict) | ||
| 1304 | + ) | ||
| 1305 | + | ||
| 1306 | + if not has_nested: | ||
| 1307 | + return rows | ||
| 1308 | + | ||
| 1309 | + # 展平所有单元格 | ||
| 1310 | + all_cells = self._flatten_nested_cells(original_cells) | ||
| 1311 | + | ||
| 1312 | + if len(all_cells) <= 2: | ||
| 1313 | + # 单元格太少,不需要重组 | ||
| 1314 | + return rows | ||
| 1315 | + | ||
| 1316 | + # 辅助函数:获取单元格文本 | ||
| 1317 | + def _get_cell_text(cell: Dict[str, Any]) -> str: | ||
| 1318 | + """获取单元格的文本内容""" | ||
| 1319 | + blocks = cell.get("blocks", []) | ||
| 1320 | + for block in blocks: | ||
| 1321 | + if block.get("type") == "paragraph": | ||
| 1322 | + inlines = block.get("inlines", []) | ||
| 1323 | + for inline in inlines: | ||
| 1324 | + text = inline.get("text", "") | ||
| 1325 | + if text: | ||
| 1326 | + return text.strip() | ||
| 1327 | + return "" | ||
| 1328 | + | ||
| 1329 | + def _is_placeholder_cell(cell: Dict[str, Any]) -> bool: | ||
| 1330 | + """判断单元格是否是占位符(如 '--', '-', '—' 等)""" | ||
| 1331 | + text = _get_cell_text(cell) | ||
| 1332 | + return text in ("--", "-", "—", "——", "", "N/A", "n/a") | ||
| 1333 | + | ||
| 1334 | + # 先过滤掉占位符单元格 | ||
| 1335 | + all_cells = [c for c in all_cells if not _is_placeholder_cell(c)] | ||
| 1336 | + | ||
| 1337 | + if len(all_cells) <= 2: | ||
| 1338 | + return rows | ||
| 1339 | + | ||
| 1340 | + # 检测表头列数:查找带有 bold 标记的单元格 | ||
| 1341 | + def _is_header_cell(cell: Dict[str, Any]) -> bool: | ||
| 1342 | + """判断单元格是否像表头(通常有加粗标记)""" | ||
| 1343 | + blocks = cell.get("blocks", []) | ||
| 1344 | + for block in blocks: | ||
| 1345 | + if block.get("type") == "paragraph": | ||
| 1346 | + inlines = block.get("inlines", []) | ||
| 1347 | + for inline in inlines: | ||
| 1348 | + marks = inline.get("marks", []) | ||
| 1349 | + if any(m.get("type") == "bold" for m in marks): | ||
| 1350 | + return True | ||
| 1351 | + return False | ||
| 1352 | + | ||
| 1353 | + # 计算表头列数:统计连续的加粗单元格数量 | ||
| 1354 | + # 占位符已经在前面被过滤掉了 | ||
| 1355 | + header_count = 0 | ||
| 1356 | + for cell in all_cells: | ||
| 1357 | + if _is_header_cell(cell): | ||
| 1358 | + header_count += 1 | ||
| 1359 | + else: | ||
| 1360 | + # 遇到第一个非表头单元格,说明数据区开始 | ||
| 1361 | + break | ||
| 1362 | + | ||
| 1363 | + # 如果没有检测到表头,尝试使用启发式方法 | ||
| 1364 | + if header_count == 0: | ||
| 1365 | + # 假设列数为 4 或 5(常见的表格列数) | ||
| 1366 | + total = len(all_cells) | ||
| 1367 | + for possible_cols in [4, 5, 3, 6]: | ||
| 1368 | + if total % possible_cols == 0: | ||
| 1369 | + header_count = possible_cols | ||
| 1370 | + break | ||
| 1371 | + else: | ||
| 1372 | + # 尝试找到最接近的能整除的列数 | ||
| 1373 | + for possible_cols in [4, 5, 3, 6]: | ||
| 1374 | + remainder = total % possible_cols | ||
| 1375 | + # 允许最多3个多余的单元格(可能是尾部的总结或注释) | ||
| 1376 | + if remainder <= 3: | ||
| 1377 | + header_count = possible_cols | ||
| 1378 | + break | ||
| 1379 | + else: | ||
| 1380 | + # 无法确定列数,返回原始数据 | ||
| 1381 | + return rows | ||
| 1382 | + | ||
| 1383 | + # 计算有效的单元格数量(可能需要截断尾部多余的单元格) | ||
| 1384 | + total = len(all_cells) | ||
| 1385 | + remainder = total % header_count | ||
| 1386 | + if remainder > 0 and remainder <= 3: | ||
| 1387 | + # 截断尾部多余的单元格(可能是总结或注释) | ||
| 1388 | + all_cells = all_cells[:total - remainder] | ||
| 1389 | + elif remainder > 3: | ||
| 1390 | + # 余数太大,可能列数检测错误,返回原始数据 | ||
| 1391 | + return rows | ||
| 1392 | + | ||
| 1393 | + # 重新组织成多行 | ||
| 1394 | + fixed_rows: List[Dict[str, Any]] = [] | ||
| 1395 | + for i in range(0, len(all_cells), header_count): | ||
| 1396 | + row_cells = all_cells[i:i + header_count] | ||
| 1397 | + # 标记第一行为表头 | ||
| 1398 | + if i == 0: | ||
| 1399 | + for cell in row_cells: | ||
| 1400 | + cell["header"] = True | ||
| 1401 | + fixed_rows.append({"cells": row_cells}) | ||
| 1402 | + | ||
| 1403 | + return fixed_rows | ||
| 1404 | + | ||
| 1235 | def _render_table(self, block: Dict[str, Any]) -> str: | 1405 | def _render_table(self, block: Dict[str, Any]) -> str: |
| 1236 | """ | 1406 | """ |
| 1237 | 渲染表格,同时保留caption与单元格属性。 | 1407 | 渲染表格,同时保留caption与单元格属性。 |
| @@ -1242,11 +1412,16 @@ class HTMLRenderer: | @@ -1242,11 +1412,16 @@ class HTMLRenderer: | ||
| 1242 | 返回: | 1412 | 返回: |
| 1243 | str: 包含<table>结构的HTML。 | 1413 | str: 包含<table>结构的HTML。 |
| 1244 | """ | 1414 | """ |
| 1245 | - rows = self._normalize_table_rows(block.get("rows") or []) | 1415 | + # 先修复可能存在的嵌套行结构问题 |
| 1416 | + raw_rows = block.get("rows") or [] | ||
| 1417 | + fixed_rows = self._fix_nested_table_rows(raw_rows) | ||
| 1418 | + rows = self._normalize_table_rows(fixed_rows) | ||
| 1246 | rows_html = "" | 1419 | rows_html = "" |
| 1247 | for row in rows: | 1420 | for row in rows: |
| 1248 | row_cells = "" | 1421 | row_cells = "" |
| 1249 | - for cell in row.get("cells", []): | 1422 | + # 展平可能存在的嵌套单元格结构(作为额外保护) |
| 1423 | + cells = self._flatten_nested_cells(row.get("cells", [])) | ||
| 1424 | + for cell in cells: | ||
| 1250 | cell_tag = "th" if cell.get("header") or cell.get("isHeader") else "td" | 1425 | cell_tag = "th" if cell.get("header") or cell.get("isHeader") else "td" |
| 1251 | attr = [] | 1426 | attr = [] |
| 1252 | if cell.get("rowspan"): | 1427 | if cell.get("rowspan"): |
| @@ -168,24 +168,201 @@ class MarkdownRenderer: | @@ -168,24 +168,201 @@ class MarkdownRenderer: | ||
| 168 | lines.append(f" {cont}") | 168 | lines.append(f" {cont}") |
| 169 | return "\n".join(lines) | 169 | return "\n".join(lines) |
| 170 | 170 | ||
| 171 | + def _flatten_nested_cells(self, cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]: | ||
| 172 | + """ | ||
| 173 | + 展平错误嵌套的单元格结构。 | ||
| 174 | + | ||
| 175 | + 某些 LLM 生成的表格数据中,单元格被错误地递归嵌套: | ||
| 176 | + cells[0] 正常, cells[1].cells[0] 正常, cells[1].cells[1].cells[0] 正常... | ||
| 177 | + 本方法将这种嵌套结构展平为标准的平行单元格数组。 | ||
| 178 | + | ||
| 179 | + 参数: | ||
| 180 | + cells: 可能包含嵌套结构的单元格数组。 | ||
| 181 | + | ||
| 182 | + 返回: | ||
| 183 | + List[Dict]: 展平后的单元格数组。 | ||
| 184 | + """ | ||
| 185 | + if not cells: | ||
| 186 | + return [] | ||
| 187 | + | ||
| 188 | + flattened: List[Dict[str, Any]] = [] | ||
| 189 | + | ||
| 190 | + def _extract_cells(cell_or_list: Any) -> None: | ||
| 191 | + """递归提取所有单元格""" | ||
| 192 | + if not isinstance(cell_or_list, dict): | ||
| 193 | + return | ||
| 194 | + | ||
| 195 | + # 如果当前对象有 blocks,说明它是一个有效的单元格 | ||
| 196 | + if "blocks" in cell_or_list: | ||
| 197 | + # 创建单元格副本,移除嵌套的 cells | ||
| 198 | + clean_cell = { | ||
| 199 | + k: v for k, v in cell_or_list.items() | ||
| 200 | + if k != "cells" | ||
| 201 | + } | ||
| 202 | + flattened.append(clean_cell) | ||
| 203 | + | ||
| 204 | + # 如果当前对象有嵌套的 cells,递归处理 | ||
| 205 | + nested_cells = cell_or_list.get("cells") | ||
| 206 | + if isinstance(nested_cells, list): | ||
| 207 | + for nested_cell in nested_cells: | ||
| 208 | + _extract_cells(nested_cell) | ||
| 209 | + | ||
| 210 | + for cell in cells: | ||
| 211 | + _extract_cells(cell) | ||
| 212 | + | ||
| 213 | + return flattened | ||
| 214 | + | ||
| 215 | + def _fix_nested_table_rows(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]: | ||
| 216 | + """ | ||
| 217 | + 修复嵌套错误的表格行结构。 | ||
| 218 | + | ||
| 219 | + 某些 LLM 生成的表格数据中,所有行的单元格都被嵌套在第一行中, | ||
| 220 | + 导致表格只有1行但包含所有数据。本方法检测并修复这种情况。 | ||
| 221 | + | ||
| 222 | + 参数: | ||
| 223 | + rows: 原始的表格行数组。 | ||
| 224 | + | ||
| 225 | + 返回: | ||
| 226 | + List[Dict]: 修复后的表格行数组。 | ||
| 227 | + """ | ||
| 228 | + if not rows or len(rows) != 1: | ||
| 229 | + # 只处理只有1行的异常情况 | ||
| 230 | + return rows | ||
| 231 | + | ||
| 232 | + first_row = rows[0] | ||
| 233 | + original_cells = first_row.get("cells", []) | ||
| 234 | + | ||
| 235 | + # 检查是否存在嵌套结构 | ||
| 236 | + has_nested = any( | ||
| 237 | + isinstance(cell.get("cells"), list) | ||
| 238 | + for cell in original_cells | ||
| 239 | + if isinstance(cell, dict) | ||
| 240 | + ) | ||
| 241 | + | ||
| 242 | + if not has_nested: | ||
| 243 | + return rows | ||
| 244 | + | ||
| 245 | + # 展平所有单元格 | ||
| 246 | + all_cells = self._flatten_nested_cells(original_cells) | ||
| 247 | + | ||
| 248 | + if len(all_cells) <= 2: | ||
| 249 | + # 单元格太少,不需要重组 | ||
| 250 | + return rows | ||
| 251 | + | ||
| 252 | + # 辅助函数:获取单元格文本 | ||
| 253 | + def _get_cell_text(cell: Dict[str, Any]) -> str: | ||
| 254 | + """获取单元格的文本内容""" | ||
| 255 | + blocks = cell.get("blocks", []) | ||
| 256 | + for block in blocks: | ||
| 257 | + if block.get("type") == "paragraph": | ||
| 258 | + inlines = block.get("inlines", []) | ||
| 259 | + for inline in inlines: | ||
| 260 | + text = inline.get("text", "") | ||
| 261 | + if text: | ||
| 262 | + return text.strip() | ||
| 263 | + return "" | ||
| 264 | + | ||
| 265 | + def _is_placeholder_cell(cell: Dict[str, Any]) -> bool: | ||
| 266 | + """判断单元格是否是占位符(如 '--', '-', '—' 等)""" | ||
| 267 | + text = _get_cell_text(cell) | ||
| 268 | + return text in ("--", "-", "—", "——", "", "N/A", "n/a") | ||
| 269 | + | ||
| 270 | + # 先过滤掉占位符单元格 | ||
| 271 | + all_cells = [c for c in all_cells if not _is_placeholder_cell(c)] | ||
| 272 | + | ||
| 273 | + if len(all_cells) <= 2: | ||
| 274 | + return rows | ||
| 275 | + | ||
| 276 | + # 检测表头列数:查找带有 bold 标记的单元格 | ||
| 277 | + def _is_header_cell(cell: Dict[str, Any]) -> bool: | ||
| 278 | + """判断单元格是否像表头(通常有加粗标记)""" | ||
| 279 | + blocks = cell.get("blocks", []) | ||
| 280 | + for block in blocks: | ||
| 281 | + if block.get("type") == "paragraph": | ||
| 282 | + inlines = block.get("inlines", []) | ||
| 283 | + for inline in inlines: | ||
| 284 | + marks = inline.get("marks", []) | ||
| 285 | + if any(m.get("type") == "bold" for m in marks): | ||
| 286 | + return True | ||
| 287 | + return False | ||
| 288 | + | ||
| 289 | + # 计算表头列数:统计连续的加粗单元格数量 | ||
| 290 | + # 占位符已经在前面被过滤掉了 | ||
| 291 | + header_count = 0 | ||
| 292 | + for cell in all_cells: | ||
| 293 | + if _is_header_cell(cell): | ||
| 294 | + header_count += 1 | ||
| 295 | + else: | ||
| 296 | + # 遇到第一个非表头单元格,说明数据区开始 | ||
| 297 | + break | ||
| 298 | + | ||
| 299 | + # 如果没有检测到表头,尝试使用启发式方法 | ||
| 300 | + if header_count == 0: | ||
| 301 | + # 假设列数为 4 或 5(常见的表格列数) | ||
| 302 | + total = len(all_cells) | ||
| 303 | + for possible_cols in [4, 5, 3, 6]: | ||
| 304 | + if total % possible_cols == 0: | ||
| 305 | + header_count = possible_cols | ||
| 306 | + break | ||
| 307 | + else: | ||
| 308 | + # 尝试找到最接近的能整除的列数 | ||
| 309 | + for possible_cols in [4, 5, 3, 6]: | ||
| 310 | + remainder = total % possible_cols | ||
| 311 | + # 允许最多3个多余的单元格(可能是尾部的总结或注释) | ||
| 312 | + if remainder <= 3: | ||
| 313 | + header_count = possible_cols | ||
| 314 | + break | ||
| 315 | + else: | ||
| 316 | + # 无法确定列数,返回原始数据 | ||
| 317 | + return rows | ||
| 318 | + | ||
| 319 | + # 计算有效的单元格数量(可能需要截断尾部多余的单元格) | ||
| 320 | + total = len(all_cells) | ||
| 321 | + remainder = total % header_count | ||
| 322 | + if remainder > 0 and remainder <= 3: | ||
| 323 | + # 截断尾部多余的单元格(可能是总结或注释) | ||
| 324 | + all_cells = all_cells[:total - remainder] | ||
| 325 | + elif remainder > 3: | ||
| 326 | + # 余数太大,可能列数检测错误,返回原始数据 | ||
| 327 | + return rows | ||
| 328 | + | ||
| 329 | + # 重新组织成多行 | ||
| 330 | + fixed_rows: List[Dict[str, Any]] = [] | ||
| 331 | + for i in range(0, len(all_cells), header_count): | ||
| 332 | + row_cells = all_cells[i:i + header_count] | ||
| 333 | + # 标记第一行为表头 | ||
| 334 | + if i == 0: | ||
| 335 | + for cell in row_cells: | ||
| 336 | + cell["header"] = True | ||
| 337 | + fixed_rows.append({"cells": row_cells}) | ||
| 338 | + | ||
| 339 | + return fixed_rows | ||
| 340 | + | ||
| 171 | def _render_table(self, block: Dict[str, Any]) -> str: | 341 | def _render_table(self, block: Dict[str, Any]) -> str: |
| 172 | - rows = block.get("rows") or [] | ||
| 173 | - if not rows: | 342 | + raw_rows = block.get("rows") or [] |
| 343 | + if not raw_rows: | ||
| 174 | return "" | 344 | return "" |
| 175 | 345 | ||
| 346 | + # 先修复可能存在的嵌套行结构问题 | ||
| 347 | + rows = self._fix_nested_table_rows(raw_rows) | ||
| 348 | + | ||
| 176 | header_cells: List[str] = [] | 349 | header_cells: List[str] = [] |
| 177 | body_rows: List[List[str]] = [] | 350 | body_rows: List[List[str]] = [] |
| 178 | 351 | ||
| 352 | + # 展平可能存在的嵌套单元格结构(作为额外保护) | ||
| 353 | + first_row_cells_raw = rows[0].get("cells") if isinstance(rows[0], dict) else None | ||
| 354 | + first_row_cells = self._flatten_nested_cells(first_row_cells_raw) if first_row_cells_raw else None | ||
| 355 | + | ||
| 179 | # 检测首行是否声明为表头 | 356 | # 检测首行是否声明为表头 |
| 180 | - first_row_cells = rows[0].get("cells") if isinstance(rows[0], dict) else None | ||
| 181 | has_header = bool(first_row_cells and any(cell.get("header") or cell.get("isHeader") for cell in first_row_cells)) | 357 | has_header = bool(first_row_cells and any(cell.get("header") or cell.get("isHeader") for cell in first_row_cells)) |
| 182 | 358 | ||
| 183 | # 计算最大列数,忽略rowspan | 359 | # 计算最大列数,忽略rowspan |
| 184 | col_count = 0 | 360 | col_count = 0 |
| 185 | for row in rows: | 361 | for row in rows: |
| 186 | - cells = row.get("cells") if isinstance(row, dict) else None | 362 | + cells_raw = row.get("cells") if isinstance(row, dict) else None |
| 363 | + cells = self._flatten_nested_cells(cells_raw) if cells_raw else [] | ||
| 187 | span = 0 | 364 | span = 0 |
| 188 | - for cell in cells or []: | 365 | + for cell in cells: |
| 189 | span += int(cell.get("colspan") or 1) | 366 | span += int(cell.get("colspan") or 1) |
| 190 | col_count = max(col_count, span) | 367 | col_count = max(col_count, span) |
| 191 | 368 | ||
| @@ -198,7 +375,9 @@ class MarkdownRenderer: | @@ -198,7 +375,9 @@ class MarkdownRenderer: | ||
| 198 | for row in rows: | 375 | for row in rows: |
| 199 | if not isinstance(row, dict): | 376 | if not isinstance(row, dict): |
| 200 | continue | 377 | continue |
| 201 | - cells = row.get("cells") or [] | 378 | + cells_raw = row.get("cells") or [] |
| 379 | + # 展平可能存在的嵌套单元格结构 | ||
| 380 | + cells = self._flatten_nested_cells(cells_raw) | ||
| 202 | row_cells: List[str] = [] | 381 | row_cells: List[str] = [] |
| 203 | for cell in cells: | 382 | for cell in cells: |
| 204 | text = self._render_cell_content(cell) | 383 | text = self._render_cell_content(cell) |
-
Please register or login to post a comment