Fix:
1. _parse_chapter 异常类型降级(影响:高)
```python
# 回退后(本 PR)
raise ValueError("LLM返回空内容")
raise ValueError("章节JSON缺少chapter字段")
# 回退前(当前 main)
raise ChapterJsonParseError("LLM返回空内容", raw_text=raw_text)
raise ChapterJsonParseError("章节JSON缺少chapter字段或结构不完整", raw_text=cleaned)
```
ChapterJsonParseError 是 ValueError 的子类,因此 run() 方法中的 except ChapterJsonParseError 捕获不到其父类 ValueError。当 LLM 返回空内容或 JSON 缺少 chapter 字段时,异常会直接穿透上层所有 except 块,导致整章生成失败且不会进入重试或降级逻辑。
2. agent.py 移除宽泛异常重试(影响:中高)
移除了对 AttributeError, TypeError, KeyError, IndexError, ValueError, json.JSONDecodeError 的捕获重试。如果 LLM 返回畸形 JSON 导致运行时异常,现在会直接崩溃而非重试。
3. 移除非字典 block 防御性处理(影响:中)
chapter_generation_node.py 中 walk() / _merge_fragment_sequences() / _merge_nested_fragments() 里对 LLM 返回非字典类型 block(string、list 等)的容错处理全部移除。如果 LLM 输出异常结构,现在会直接报错而非自动修复。
4. 移除 _normalize_list_type 和表格行溢出修复(影响:低)
- _normalize_list_type():将非法 listType(如 "unordered")自动映射为 "bullet" 的逻辑被移除
- html_renderer.py 的 _fix_nested_table_rows():多行表格数据溢出到单行时的重组逻辑被简化
Showing 3 changed files with 227 additions and 25 deletions
| @@ -663,6 +663,40 @@ class ReportAgent: | @@ -663,6 +663,40 @@ class ReportAgent: | ||
| 663 | raise | 663 | raise |
| 664 | attempt += 1 | 664 | attempt += 1 |
| 665 | continue | 665 | continue |
| 666 | + except (AttributeError, TypeError, KeyError, IndexError, ValueError, json.JSONDecodeError) as structure_error: | ||
| 667 | + # 捕获因 JSON 结构异常导致的运行时错误,包装为可重试异常 | ||
| 668 | + # 包括: | ||
| 669 | + # - AttributeError: 如 list.get() 调用失败 | ||
| 670 | + # - TypeError: 类型不匹配 | ||
| 671 | + # - KeyError: 字典键缺失 | ||
| 672 | + # - IndexError: 列表索引越界 | ||
| 673 | + # - ValueError: 值错误(如 LLM 返回空内容、缺少必要字段) | ||
| 674 | + # - json.JSONDecodeError: JSON 解析失败(未被内部捕获的情况) | ||
| 675 | + error_type = type(structure_error).__name__ | ||
| 676 | + logger.warning( | ||
| 677 | + "章节 {title} 生成过程中发生 {error_type}(第 {attempt}/{total} 次尝试),将尝试重新生成: {error}", | ||
| 678 | + title=section.title, | ||
| 679 | + error_type=error_type, | ||
| 680 | + attempt=attempt, | ||
| 681 | + total=chapter_max_attempts, | ||
| 682 | + error=structure_error, | ||
| 683 | + ) | ||
| 684 | + emit('chapter_status', { | ||
| 685 | + 'chapterId': section.chapter_id, | ||
| 686 | + 'title': section.title, | ||
| 687 | + 'status': 'retrying' if attempt < chapter_max_attempts else 'error', | ||
| 688 | + 'attempt': attempt, | ||
| 689 | + 'error': str(structure_error), | ||
| 690 | + 'reason': 'structure_error', | ||
| 691 | + 'error_type': error_type | ||
| 692 | + }) | ||
| 693 | + if attempt >= chapter_max_attempts: | ||
| 694 | + # 达到最大重试次数,包装为 ChapterJsonParseError 抛出 | ||
| 695 | + raise ChapterJsonParseError( | ||
| 696 | + f"{section.title} 章节因 {error_type} 在 {chapter_max_attempts} 次尝试后仍无法生成: {structure_error}" | ||
| 697 | + ) from structure_error | ||
| 698 | + attempt += 1 | ||
| 699 | + continue | ||
| 666 | except Exception as chapter_error: | 700 | except Exception as chapter_error: |
| 667 | if not self._should_retry_inappropriate_content_error(chapter_error): | 701 | if not self._should_retry_inappropriate_content_error(chapter_error): |
| 668 | raise | 702 | raise |
| @@ -642,7 +642,7 @@ class ChapterGenerationNode(BaseNode): | @@ -642,7 +642,7 @@ class ChapterGenerationNode(BaseNode): | ||
| 642 | cleaned = cleaned[:-3] | 642 | cleaned = cleaned[:-3] |
| 643 | cleaned = cleaned.strip() | 643 | cleaned = cleaned.strip() |
| 644 | if not cleaned: | 644 | if not cleaned: |
| 645 | - raise ValueError("LLM返回空内容") | 645 | + raise ChapterJsonParseError("LLM返回空内容", raw_text=raw_text) |
| 646 | 646 | ||
| 647 | candidate_payloads = [cleaned] | 647 | candidate_payloads = [cleaned] |
| 648 | repaired = self._repair_llm_json(cleaned) | 648 | repaired = self._repair_llm_json(cleaned) |
| @@ -685,7 +685,7 @@ class ChapterGenerationNode(BaseNode): | @@ -685,7 +685,7 @@ class ChapterGenerationNode(BaseNode): | ||
| 685 | return item["chapter"] | 685 | return item["chapter"] |
| 686 | if all(key in item for key in ("chapterId", "title", "blocks")): | 686 | if all(key in item for key in ("chapterId", "title", "blocks")): |
| 687 | return item | 687 | return item |
| 688 | - raise ValueError("章节JSON缺少chapter字段") | 688 | + raise ChapterJsonParseError("章节JSON缺少chapter字段或结构不完整", raw_text=cleaned) |
| 689 | 689 | ||
| 690 | def _persist_error_payload( | 690 | def _persist_error_payload( |
| 691 | self, | 691 | self, |
| @@ -967,13 +967,41 @@ class ChapterGenerationNode(BaseNode): | @@ -967,13 +967,41 @@ class ChapterGenerationNode(BaseNode): | ||
| 967 | """递归检查并修复嵌套结构,保证每个block合法""" | 967 | """递归检查并修复嵌套结构,保证每个block合法""" |
| 968 | if not isinstance(blocks, list): | 968 | if not isinstance(blocks, list): |
| 969 | return | 969 | return |
| 970 | - for block in blocks: | 970 | + # 先过滤掉非字典类型的异常 block |
| 971 | + valid_indices = [] | ||
| 972 | + for idx, block in enumerate(blocks): | ||
| 973 | + if not isinstance(block, dict): | ||
| 974 | + # 尝试将字符串转换为 paragraph | ||
| 975 | + if isinstance(block, str) and block.strip(): | ||
| 976 | + blocks[idx] = self._as_paragraph_block(block) | ||
| 977 | + valid_indices.append(idx) | ||
| 978 | + logger.warning(f"walk: 将字符串 block 转换为 paragraph") | ||
| 979 | + elif isinstance(block, list): | ||
| 980 | + # 尝试提取列表中的有效字典 | ||
| 981 | + for item in block: | ||
| 982 | + if isinstance(item, dict): | ||
| 983 | + self._ensure_block_type(item) | ||
| 984 | + blocks[idx] = item | ||
| 985 | + valid_indices.append(idx) | ||
| 986 | + logger.warning(f"walk: 从列表中提取字典 block") | ||
| 987 | + break | ||
| 988 | + else: | ||
| 989 | + logger.warning(f"walk: 跳过无效的列表 block: {block}") | ||
| 990 | + else: | ||
| 991 | + logger.warning(f"walk: 跳过无效的 block(类型: {type(block).__name__})") | ||
| 992 | + else: | ||
| 993 | + valid_indices.append(idx) | ||
| 994 | + | ||
| 995 | + for idx in valid_indices: | ||
| 996 | + block = blocks[idx] | ||
| 971 | if not isinstance(block, dict): | 997 | if not isinstance(block, dict): |
| 972 | continue | 998 | continue |
| 973 | self._ensure_block_type(block) | 999 | self._ensure_block_type(block) |
| 974 | self._sanitize_block_content(block) | 1000 | self._sanitize_block_content(block) |
| 975 | block_type = block.get("type") | 1001 | block_type = block.get("type") |
| 976 | if block_type == "list": | 1002 | if block_type == "list": |
| 1003 | + # 自动修复 listType:确保是合法值 | ||
| 1004 | + self._normalize_list_type(block) | ||
| 977 | items = block.get("items") | 1005 | items = block.get("items") |
| 978 | normalized = self._normalize_list_items(items) | 1006 | normalized = self._normalize_list_items(items) |
| 979 | if normalized: | 1007 | if normalized: |
| @@ -984,8 +1012,12 @@ class ChapterGenerationNode(BaseNode): | @@ -984,8 +1012,12 @@ class ChapterGenerationNode(BaseNode): | ||
| 984 | walk(block.get("blocks")) | 1012 | walk(block.get("blocks")) |
| 985 | elif block_type == "table": | 1013 | elif block_type == "table": |
| 986 | for row in block.get("rows", []): | 1014 | for row in block.get("rows", []): |
| 1015 | + if not isinstance(row, dict): | ||
| 1016 | + continue | ||
| 987 | cells = row.get("cells") or [] | 1017 | cells = row.get("cells") or [] |
| 988 | for cell in cells: | 1018 | for cell in cells: |
| 1019 | + if not isinstance(cell, dict): | ||
| 1020 | + continue | ||
| 989 | walk(cell.get("blocks")) | 1021 | walk(cell.get("blocks")) |
| 990 | elif block_type == "widget": | 1022 | elif block_type == "widget": |
| 991 | self._normalize_widget_block(block) | 1023 | self._normalize_widget_block(block) |
| @@ -998,7 +1030,9 @@ class ChapterGenerationNode(BaseNode): | @@ -998,7 +1030,9 @@ class ChapterGenerationNode(BaseNode): | ||
| 998 | 1030 | ||
| 999 | blocks = chapter.get("blocks") | 1031 | blocks = chapter.get("blocks") |
| 1000 | if isinstance(blocks, list): | 1032 | if isinstance(blocks, list): |
| 1001 | - chapter["blocks"] = self._merge_fragment_sequences(blocks) | 1033 | + # 在合并前先过滤掉所有非字典类型的 block |
| 1034 | + filtered_blocks = [b for b in blocks if isinstance(b, dict)] | ||
| 1035 | + chapter["blocks"] = self._merge_fragment_sequences(filtered_blocks) | ||
| 1002 | 1036 | ||
| 1003 | def _ensure_content_density(self, chapter: Dict[str, Any]): | 1037 | def _ensure_content_density(self, chapter: Dict[str, Any]): |
| 1004 | """ | 1038 | """ |
| @@ -1657,6 +1691,25 @@ class ChapterGenerationNode(BaseNode): | @@ -1657,6 +1691,25 @@ class ChapterGenerationNode(BaseNode): | ||
| 1657 | fragment_buffer = [] | 1691 | fragment_buffer = [] |
| 1658 | 1692 | ||
| 1659 | for block in blocks: | 1693 | for block in blocks: |
| 1694 | + # 类型检查:跳过非字典类型的异常 block,避免 AttributeError | ||
| 1695 | + if not isinstance(block, dict): | ||
| 1696 | + # 尝试将非字典类型转换为 paragraph | ||
| 1697 | + if isinstance(block, str) and block.strip(): | ||
| 1698 | + converted = self._as_paragraph_block(block) | ||
| 1699 | + logger.warning(f"检测到非字典类型的 block(字符串),已转换为 paragraph: {block[:50]}...") | ||
| 1700 | + merged.append(converted) | ||
| 1701 | + elif isinstance(block, list): | ||
| 1702 | + # 列表类型的 block 可能是 LLM 输出错误,尝试提取有效内容 | ||
| 1703 | + logger.warning(f"检测到列表类型的 block,尝试提取有效内容: {block}") | ||
| 1704 | + for item in block: | ||
| 1705 | + if isinstance(item, dict): | ||
| 1706 | + self._ensure_block_type(item) | ||
| 1707 | + merged.append(self._merge_nested_fragments(item)) | ||
| 1708 | + elif isinstance(item, str) and item.strip(): | ||
| 1709 | + merged.append(self._as_paragraph_block(item)) | ||
| 1710 | + else: | ||
| 1711 | + logger.warning(f"跳过无效的 block(类型: {type(block).__name__}): {block}") | ||
| 1712 | + continue | ||
| 1660 | if self._is_paragraph_fragment(block): | 1713 | if self._is_paragraph_fragment(block): |
| 1661 | fragment_buffer.append(block) | 1714 | fragment_buffer.append(block) |
| 1662 | continue | 1715 | continue |
| @@ -1668,6 +1721,24 @@ class ChapterGenerationNode(BaseNode): | @@ -1668,6 +1721,24 @@ class ChapterGenerationNode(BaseNode): | ||
| 1668 | 1721 | ||
| 1669 | def _merge_nested_fragments(self, block: Dict[str, Any]) -> Dict[str, Any]: | 1722 | def _merge_nested_fragments(self, block: Dict[str, Any]) -> Dict[str, Any]: |
| 1670 | """对嵌套结构(callout/blockquote/engineQuote/list/table)递归处理片段合并""" | 1723 | """对嵌套结构(callout/blockquote/engineQuote/list/table)递归处理片段合并""" |
| 1724 | + # 类型检查:确保 block 是字典类型 | ||
| 1725 | + if not isinstance(block, dict): | ||
| 1726 | + # 尝试将非字典类型转换为 paragraph | ||
| 1727 | + if isinstance(block, str) and block.strip(): | ||
| 1728 | + logger.warning(f"_merge_nested_fragments 收到字符串类型,已转换为 paragraph") | ||
| 1729 | + return self._as_paragraph_block(block) | ||
| 1730 | + elif isinstance(block, list): | ||
| 1731 | + # 尝试提取列表中的第一个有效字典 | ||
| 1732 | + for item in block: | ||
| 1733 | + if isinstance(item, dict): | ||
| 1734 | + self._ensure_block_type(item) | ||
| 1735 | + return self._merge_nested_fragments(item) | ||
| 1736 | + logger.warning(f"_merge_nested_fragments 收到无效列表,返回空 paragraph") | ||
| 1737 | + return self._as_paragraph_block("") | ||
| 1738 | + else: | ||
| 1739 | + logger.warning(f"_merge_nested_fragments 收到无效类型({type(block).__name__}),返回空 paragraph") | ||
| 1740 | + return self._as_paragraph_block("") | ||
| 1741 | + | ||
| 1671 | block_type = block.get("type") | 1742 | block_type = block.get("type") |
| 1672 | if block_type in {"callout", "blockquote", "engineQuote"}: | 1743 | if block_type in {"callout", "blockquote", "engineQuote"}: |
| 1673 | nested = block.get("blocks") | 1744 | nested = block.get("blocks") |
| @@ -1682,8 +1753,12 @@ class ChapterGenerationNode(BaseNode): | @@ -1682,8 +1753,12 @@ class ChapterGenerationNode(BaseNode): | ||
| 1682 | entry[:] = merged_entry | 1753 | entry[:] = merged_entry |
| 1683 | elif block_type == "table": | 1754 | elif block_type == "table": |
| 1684 | for row in block.get("rows", []): | 1755 | for row in block.get("rows", []): |
| 1756 | + if not isinstance(row, dict): | ||
| 1757 | + continue | ||
| 1685 | cells = row.get("cells") or [] | 1758 | cells = row.get("cells") or [] |
| 1686 | for cell in cells: | 1759 | for cell in cells: |
| 1760 | + if not isinstance(cell, dict): | ||
| 1761 | + continue | ||
| 1687 | nested_blocks = cell.get("blocks") | 1762 | nested_blocks = cell.get("blocks") |
| 1688 | if isinstance(nested_blocks, list): | 1763 | if isinstance(nested_blocks, list): |
| 1689 | cell["blocks"] = self._merge_fragment_sequences(nested_blocks) | 1764 | cell["blocks"] = self._merge_fragment_sequences(nested_blocks) |
| @@ -1819,6 +1894,42 @@ class ChapterGenerationNode(BaseNode): | @@ -1819,6 +1894,42 @@ class ChapterGenerationNode(BaseNode): | ||
| 1819 | return str(value) | 1894 | return str(value) |
| 1820 | return "" | 1895 | return "" |
| 1821 | 1896 | ||
| 1897 | + # 合法的 listType 值 | ||
| 1898 | + _ALLOWED_LIST_TYPES = {"ordered", "bullet", "task"} | ||
| 1899 | + # listType 的别名映射 | ||
| 1900 | + _LIST_TYPE_ALIASES = { | ||
| 1901 | + "unordered": "bullet", | ||
| 1902 | + "ul": "bullet", | ||
| 1903 | + "ol": "ordered", | ||
| 1904 | + "numbered": "ordered", | ||
| 1905 | + "checkbox": "task", | ||
| 1906 | + "check": "task", | ||
| 1907 | + "todo": "task", | ||
| 1908 | + } | ||
| 1909 | + | ||
| 1910 | + def _normalize_list_type(self, block: Dict[str, Any]): | ||
| 1911 | + """ | ||
| 1912 | + 确保 list block 的 listType 是合法值。 | ||
| 1913 | + | ||
| 1914 | + 如果 listType 缺失或非法,自动修复为 bullet。 | ||
| 1915 | + """ | ||
| 1916 | + list_type = block.get("listType") | ||
| 1917 | + if list_type in self._ALLOWED_LIST_TYPES: | ||
| 1918 | + return | ||
| 1919 | + # 尝试别名映射 | ||
| 1920 | + if isinstance(list_type, str): | ||
| 1921 | + lowered = list_type.strip().lower() | ||
| 1922 | + if lowered in self._LIST_TYPE_ALIASES: | ||
| 1923 | + block["listType"] = self._LIST_TYPE_ALIASES[lowered] | ||
| 1924 | + logger.warning(f"已将 listType '{list_type}' 映射为 '{block['listType']}'") | ||
| 1925 | + return | ||
| 1926 | + if lowered in self._ALLOWED_LIST_TYPES: | ||
| 1927 | + block["listType"] = lowered | ||
| 1928 | + return | ||
| 1929 | + # 无法识别,默认使用 bullet | ||
| 1930 | + logger.warning(f"检测到非法 listType: {list_type},已修复为 bullet") | ||
| 1931 | + block["listType"] = "bullet" | ||
| 1932 | + | ||
| 1822 | def _normalize_list_items(self, items: Any) -> List[List[Dict[str, Any]]]: | 1933 | def _normalize_list_items(self, items: Any) -> List[List[Dict[str, Any]]]: |
| 1823 | """确保list block的items为[[block, block], ...]结构""" | 1934 | """确保list block的items为[[block, block], ...]结构""" |
| 1824 | if not isinstance(items, list): | 1935 | if not isinstance(items, list): |
| @@ -1329,8 +1329,84 @@ class HTMLRenderer: | @@ -1329,8 +1329,84 @@ class HTMLRenderer: | ||
| 1329 | 返回: | 1329 | 返回: |
| 1330 | List[Dict]: 修复后的表格行数组。 | 1330 | List[Dict]: 修复后的表格行数组。 |
| 1331 | """ | 1331 | """ |
| 1332 | - if not rows or len(rows) != 1: | ||
| 1333 | - # 只处理只有1行的异常情况 | 1332 | + if not rows: |
| 1333 | + return [] | ||
| 1334 | + | ||
| 1335 | + # 辅助函数:获取单元格文本 | ||
| 1336 | + def _get_cell_text(cell: Dict[str, Any]) -> str: | ||
| 1337 | + """获取单元格的文本内容""" | ||
| 1338 | + blocks = cell.get("blocks", []) | ||
| 1339 | + for block in blocks: | ||
| 1340 | + if isinstance(block, dict) and block.get("type") == "paragraph": | ||
| 1341 | + inlines = block.get("inlines", []) | ||
| 1342 | + for inline in inlines: | ||
| 1343 | + if isinstance(inline, dict): | ||
| 1344 | + text = inline.get("text", "") | ||
| 1345 | + if text: | ||
| 1346 | + return str(text).strip() | ||
| 1347 | + return "" | ||
| 1348 | + | ||
| 1349 | + def _is_placeholder_cell(cell: Dict[str, Any]) -> bool: | ||
| 1350 | + """判断单元格是否是占位符(如 '--', '-', '—' 等)""" | ||
| 1351 | + text = _get_cell_text(cell) | ||
| 1352 | + return text in ("--", "-", "—", "——", "", "N/A", "n/a") | ||
| 1353 | + | ||
| 1354 | + def _is_heading_like_cell(cell: Dict[str, Any]) -> bool: | ||
| 1355 | + """检测是否疑似被错误并入表格的章节/标题单元格""" | ||
| 1356 | + text = _get_cell_text(cell) | ||
| 1357 | + if not text: | ||
| 1358 | + return False | ||
| 1359 | + stripped = text.strip() | ||
| 1360 | + # 章节号或"第X章/部分"常见格式,避免误删正常数字值 | ||
| 1361 | + heading_patterns = ( | ||
| 1362 | + r"^\d{1,2}(?:\.\d{1,2}){1,3}\s+", | ||
| 1363 | + r"^第[一二三四五六七八九十]+[章节部分]", | ||
| 1364 | + ) | ||
| 1365 | + return any(re.match(pat, stripped) for pat in heading_patterns) | ||
| 1366 | + | ||
| 1367 | + # 第一阶段:处理"有表头行 + 数据被串在一行"的情况 | ||
| 1368 | + header_cells = self._flatten_nested_cells((rows[0] or {}).get("cells", [])) | ||
| 1369 | + header_count = len(header_cells) | ||
| 1370 | + overflow_fixed = None | ||
| 1371 | + if header_count >= 2: | ||
| 1372 | + rebuilt_rows: List[Dict[str, Any]] = [ | ||
| 1373 | + { | ||
| 1374 | + **{k: v for k, v in (rows[0] or {}).items() if k != "cells"}, | ||
| 1375 | + "cells": header_cells, | ||
| 1376 | + } | ||
| 1377 | + ] | ||
| 1378 | + changed = False | ||
| 1379 | + for row in rows[1:]: | ||
| 1380 | + cells = self._flatten_nested_cells((row or {}).get("cells", [])) | ||
| 1381 | + cell_count = len(cells) | ||
| 1382 | + if cell_count <= header_count: | ||
| 1383 | + rebuilt_rows.append({**{k: v for k, v in (row or {}).items() if k != "cells"}, "cells": cells}) | ||
| 1384 | + continue | ||
| 1385 | + | ||
| 1386 | + remainder = cell_count % header_count | ||
| 1387 | + trimmed_cells = cells | ||
| 1388 | + if remainder: | ||
| 1389 | + trailing = cells[-remainder:] | ||
| 1390 | + if all(_is_placeholder_cell(c) or _is_heading_like_cell(c) for c in trailing): | ||
| 1391 | + trimmed_cells = cells[:-remainder] | ||
| 1392 | + remainder = 0 | ||
| 1393 | + | ||
| 1394 | + if remainder == 0 and len(trimmed_cells) >= header_count * 2: | ||
| 1395 | + for i in range(0, len(trimmed_cells), header_count): | ||
| 1396 | + chunk = trimmed_cells[i : i + header_count] | ||
| 1397 | + rebuilt_rows.append({"cells": chunk}) | ||
| 1398 | + changed = True | ||
| 1399 | + else: | ||
| 1400 | + rebuilt_rows.append({**{k: v for k, v in (row or {}).items() if k != "cells"}, "cells": cells}) | ||
| 1401 | + | ||
| 1402 | + if changed: | ||
| 1403 | + overflow_fixed = rebuilt_rows | ||
| 1404 | + | ||
| 1405 | + if overflow_fixed is not None: | ||
| 1406 | + rows = overflow_fixed | ||
| 1407 | + | ||
| 1408 | + if len(rows) != 1: | ||
| 1409 | + # 只有一行的异常情况由后续逻辑处理;正常多行直接返回 | ||
| 1334 | return rows | 1410 | return rows |
| 1335 | 1411 | ||
| 1336 | first_row = rows[0] | 1412 | first_row = rows[0] |
| @@ -1353,25 +1429,6 @@ class HTMLRenderer: | @@ -1353,25 +1429,6 @@ class HTMLRenderer: | ||
| 1353 | # 单元格太少,不需要重组 | 1429 | # 单元格太少,不需要重组 |
| 1354 | return rows | 1430 | return rows |
| 1355 | 1431 | ||
| 1356 | - # 辅助函数:获取单元格文本 | ||
| 1357 | - def _get_cell_text(cell: Dict[str, Any]) -> str: | ||
| 1358 | - """获取单元格的文本内容""" | ||
| 1359 | - blocks = cell.get("blocks", []) | ||
| 1360 | - for block in blocks: | ||
| 1361 | - if isinstance(block, dict) and block.get("type") == "paragraph": | ||
| 1362 | - inlines = block.get("inlines", []) | ||
| 1363 | - for inline in inlines: | ||
| 1364 | - if isinstance(inline, dict): | ||
| 1365 | - text = inline.get("text", "") | ||
| 1366 | - if text: | ||
| 1367 | - return str(text).strip() | ||
| 1368 | - return "" | ||
| 1369 | - | ||
| 1370 | - def _is_placeholder_cell(cell: Dict[str, Any]) -> bool: | ||
| 1371 | - """判断单元格是否是占位符(如 '--', '-', '—' 等)""" | ||
| 1372 | - text = _get_cell_text(cell) | ||
| 1373 | - return text in ("--", "-", "—", "——", "", "N/A", "n/a") | ||
| 1374 | - | ||
| 1375 | # 先过滤掉占位符单元格 | 1432 | # 先过滤掉占位符单元格 |
| 1376 | all_cells = [c for c in all_cells if not _is_placeholder_cell(c)] | 1433 | all_cells = [c for c in all_cells if not _is_placeholder_cell(c)] |
| 1377 | 1434 |
-
Please register or login to post a comment