Showing 1 changed file with 78 additions and 21 deletions
| @@ -1329,8 +1329,84 @@ class HTMLRenderer: | @@ -1329,8 +1329,84 @@ class HTMLRenderer: | ||
| 1329 | 返回: | 1329 | 返回: |
| 1330 | List[Dict]: 修复后的表格行数组。 | 1330 | List[Dict]: 修复后的表格行数组。 |
| 1331 | """ | 1331 | """ |
| 1332 | - if not rows or len(rows) != 1: | ||
| 1333 | - # 只处理只有1行的异常情况 | 1332 | + if not rows: |
| 1333 | + return [] | ||
| 1334 | + | ||
| 1335 | + # 辅助函数:获取单元格文本 | ||
| 1336 | + def _get_cell_text(cell: Dict[str, Any]) -> str: | ||
| 1337 | + """获取单元格的文本内容""" | ||
| 1338 | + blocks = cell.get("blocks", []) | ||
| 1339 | + for block in blocks: | ||
| 1340 | + if isinstance(block, dict) and block.get("type") == "paragraph": | ||
| 1341 | + inlines = block.get("inlines", []) | ||
| 1342 | + for inline in inlines: | ||
| 1343 | + if isinstance(inline, dict): | ||
| 1344 | + text = inline.get("text", "") | ||
| 1345 | + if text: | ||
| 1346 | + return str(text).strip() | ||
| 1347 | + return "" | ||
| 1348 | + | ||
| 1349 | + def _is_placeholder_cell(cell: Dict[str, Any]) -> bool: | ||
| 1350 | + """判断单元格是否是占位符(如 '--', '-', '—' 等)""" | ||
| 1351 | + text = _get_cell_text(cell) | ||
| 1352 | + return text in ("--", "-", "—", "——", "", "N/A", "n/a") | ||
| 1353 | + | ||
| 1354 | + def _is_heading_like_cell(cell: Dict[str, Any]) -> bool: | ||
| 1355 | + """检测是否疑似被错误并入表格的章节/标题单元格""" | ||
| 1356 | + text = _get_cell_text(cell) | ||
| 1357 | + if not text: | ||
| 1358 | + return False | ||
| 1359 | + stripped = text.strip() | ||
| 1360 | + # 章节号或“第X章/部分”常见格式,避免误删正常数字值 | ||
| 1361 | + heading_patterns = ( | ||
| 1362 | + r"^\d{1,2}(?:\.\d{1,2}){1,3}\s+", | ||
| 1363 | + r"^第[一二三四五六七八九十]+[章节部分]", | ||
| 1364 | + ) | ||
| 1365 | + return any(re.match(pat, stripped) for pat in heading_patterns) | ||
| 1366 | + | ||
| 1367 | + # 第一阶段:处理“有表头行 + 数据被串在一行”的情况 | ||
| 1368 | + header_cells = self._flatten_nested_cells((rows[0] or {}).get("cells", [])) | ||
| 1369 | + header_count = len(header_cells) | ||
| 1370 | + overflow_fixed = None | ||
| 1371 | + if header_count >= 2: | ||
| 1372 | + rebuilt_rows: List[Dict[str, Any]] = [ | ||
| 1373 | + { | ||
| 1374 | + **{k: v for k, v in (rows[0] or {}).items() if k != "cells"}, | ||
| 1375 | + "cells": header_cells, | ||
| 1376 | + } | ||
| 1377 | + ] | ||
| 1378 | + changed = False | ||
| 1379 | + for row in rows[1:]: | ||
| 1380 | + cells = self._flatten_nested_cells((row or {}).get("cells", [])) | ||
| 1381 | + cell_count = len(cells) | ||
| 1382 | + if cell_count <= header_count: | ||
| 1383 | + rebuilt_rows.append({**{k: v for k, v in (row or {}).items() if k != "cells"}, "cells": cells}) | ||
| 1384 | + continue | ||
| 1385 | + | ||
| 1386 | + remainder = cell_count % header_count | ||
| 1387 | + trimmed_cells = cells | ||
| 1388 | + if remainder: | ||
| 1389 | + trailing = cells[-remainder:] | ||
| 1390 | + if all(_is_placeholder_cell(c) or _is_heading_like_cell(c) for c in trailing): | ||
| 1391 | + trimmed_cells = cells[:-remainder] | ||
| 1392 | + remainder = 0 | ||
| 1393 | + | ||
| 1394 | + if remainder == 0 and len(trimmed_cells) >= header_count * 2: | ||
| 1395 | + for i in range(0, len(trimmed_cells), header_count): | ||
| 1396 | + chunk = trimmed_cells[i : i + header_count] | ||
| 1397 | + rebuilt_rows.append({"cells": chunk}) | ||
| 1398 | + changed = True | ||
| 1399 | + else: | ||
| 1400 | + rebuilt_rows.append({**{k: v for k, v in (row or {}).items() if k != "cells"}, "cells": cells}) | ||
| 1401 | + | ||
| 1402 | + if changed: | ||
| 1403 | + overflow_fixed = rebuilt_rows | ||
| 1404 | + | ||
| 1405 | + if overflow_fixed is not None: | ||
| 1406 | + rows = overflow_fixed | ||
| 1407 | + | ||
| 1408 | + if len(rows) != 1: | ||
| 1409 | + # 只有一行的异常情况由后续逻辑处理;正常多行直接返回 | ||
| 1334 | return rows | 1410 | return rows |
| 1335 | 1411 | ||
| 1336 | first_row = rows[0] | 1412 | first_row = rows[0] |
| @@ -1353,25 +1429,6 @@ class HTMLRenderer: | @@ -1353,25 +1429,6 @@ class HTMLRenderer: | ||
| 1353 | # 单元格太少,不需要重组 | 1429 | # 单元格太少,不需要重组 |
| 1354 | return rows | 1430 | return rows |
| 1355 | 1431 | ||
| 1356 | - # 辅助函数:获取单元格文本 | ||
| 1357 | - def _get_cell_text(cell: Dict[str, Any]) -> str: | ||
| 1358 | - """获取单元格的文本内容""" | ||
| 1359 | - blocks = cell.get("blocks", []) | ||
| 1360 | - for block in blocks: | ||
| 1361 | - if isinstance(block, dict) and block.get("type") == "paragraph": | ||
| 1362 | - inlines = block.get("inlines", []) | ||
| 1363 | - for inline in inlines: | ||
| 1364 | - if isinstance(inline, dict): | ||
| 1365 | - text = inline.get("text", "") | ||
| 1366 | - if text: | ||
| 1367 | - return str(text).strip() | ||
| 1368 | - return "" | ||
| 1369 | - | ||
| 1370 | - def _is_placeholder_cell(cell: Dict[str, Any]) -> bool: | ||
| 1371 | - """判断单元格是否是占位符(如 '--', '-', '—' 等)""" | ||
| 1372 | - text = _get_cell_text(cell) | ||
| 1373 | - return text in ("--", "-", "—", "——", "", "N/A", "n/a") | ||
| 1374 | - | ||
| 1375 | # 先过滤掉占位符单元格 | 1432 | # 先过滤掉占位符单元格 |
| 1376 | all_cells = [c for c in all_cells if not _is_placeholder_cell(c)] | 1433 | all_cells = [c for c in all_cells if not _is_placeholder_cell(c)] |
| 1377 | 1434 |
-
Please register or log in to post a comment