MaYiding

Fix:

  1. _parse_chapter 异常类型降级(影响:高)

  # 回退后(本 PR)
  raise ValueError("LLM返回空内容")
  raise ValueError("章节JSON缺少chapter字段")

  # 回退前(当前 main)
  raise ChapterJsonParseError("LLM返回空内容", raw_text=raw_text)
  raise ChapterJsonParseError("章节JSON缺少chapter字段或结构不完整", raw_text=cleaned)

  ChapterJsonParseError 是 ValueError 的子类,因此 run() 方法中的 except ChapterJsonParseError 无法捕获父类 ValueError。
  当 LLM 返回空内容或 JSON 缺少 chapter 字段时,异常会绕过针对 ChapterJsonParseError 的处理分支,落入通用的 except Exception 分支;由于它不属于可重试的内容安全错误,会被直接重新抛出,导致整章生成失败且不会进入重试或降级逻辑。

  2. agent.py 移除宽泛异常重试(影响:中高)

  移除了对 AttributeError, TypeError, KeyError, IndexError, ValueError, json.JSONDecodeError 的捕获与重试逻辑。
  如果 LLM 返回畸形 JSON 导致上述运行时异常,现在会直接崩溃而非重试。

  3. 移除非字典 block 防御性处理(影响:中)

  chapter_generation_node.py 中 walk() / _merge_fragment_sequences() / _merge_nested_fragments() 里
  对 LLM 返回的非字典类型 block(string、list 等)的容错处理全部移除。如果 LLM 输出异常结构,现在会直接报错而非自动修复。

  4. 移除 _normalize_list_type 和表格行溢出修复(影响:低)

  - _normalize_list_type():将非法 listType(如 "unordered")自动映射为 "bullet" 的逻辑被移除
  - html_renderer.py 的 _fix_nested_table_rows():多行表格数据溢出到单行时的重组逻辑被简化
@@ -663,6 +663,40 @@ class ReportAgent: @@ -663,6 +663,40 @@ class ReportAgent:
663 raise 663 raise
664 attempt += 1 664 attempt += 1
665 continue 665 continue
  666 + except (AttributeError, TypeError, KeyError, IndexError, ValueError, json.JSONDecodeError) as structure_error:
  667 + # 捕获因 JSON 结构异常导致的运行时错误,包装为可重试异常
  668 + # 包括:
  669 + # - AttributeError: 如 list.get() 调用失败
  670 + # - TypeError: 类型不匹配
  671 + # - KeyError: 字典键缺失
  672 + # - IndexError: 列表索引越界
  673 + # - ValueError: 值错误(如 LLM 返回空内容、缺少必要字段)
  674 + # - json.JSONDecodeError: JSON 解析失败(未被内部捕获的情况)
  675 + error_type = type(structure_error).__name__
  676 + logger.warning(
  677 + "章节 {title} 生成过程中发生 {error_type}(第 {attempt}/{total} 次尝试),将尝试重新生成: {error}",
  678 + title=section.title,
  679 + error_type=error_type,
  680 + attempt=attempt,
  681 + total=chapter_max_attempts,
  682 + error=structure_error,
  683 + )
  684 + emit('chapter_status', {
  685 + 'chapterId': section.chapter_id,
  686 + 'title': section.title,
  687 + 'status': 'retrying' if attempt < chapter_max_attempts else 'error',
  688 + 'attempt': attempt,
  689 + 'error': str(structure_error),
  690 + 'reason': 'structure_error',
  691 + 'error_type': error_type
  692 + })
  693 + if attempt >= chapter_max_attempts:
  694 + # 达到最大重试次数,包装为 ChapterJsonParseError 抛出
  695 + raise ChapterJsonParseError(
  696 + f"{section.title} 章节因 {error_type} 在 {chapter_max_attempts} 次尝试后仍无法生成: {structure_error}"
  697 + ) from structure_error
  698 + attempt += 1
  699 + continue
666 except Exception as chapter_error: 700 except Exception as chapter_error:
667 if not self._should_retry_inappropriate_content_error(chapter_error): 701 if not self._should_retry_inappropriate_content_error(chapter_error):
668 raise 702 raise
@@ -642,7 +642,7 @@ class ChapterGenerationNode(BaseNode): @@ -642,7 +642,7 @@ class ChapterGenerationNode(BaseNode):
642 cleaned = cleaned[:-3] 642 cleaned = cleaned[:-3]
643 cleaned = cleaned.strip() 643 cleaned = cleaned.strip()
644 if not cleaned: 644 if not cleaned:
645 - raise ValueError("LLM返回空内容") 645 + raise ChapterJsonParseError("LLM返回空内容", raw_text=raw_text)
646 646
647 candidate_payloads = [cleaned] 647 candidate_payloads = [cleaned]
648 repaired = self._repair_llm_json(cleaned) 648 repaired = self._repair_llm_json(cleaned)
@@ -685,7 +685,7 @@ class ChapterGenerationNode(BaseNode): @@ -685,7 +685,7 @@ class ChapterGenerationNode(BaseNode):
685 return item["chapter"] 685 return item["chapter"]
686 if all(key in item for key in ("chapterId", "title", "blocks")): 686 if all(key in item for key in ("chapterId", "title", "blocks")):
687 return item 687 return item
688 - raise ValueError("章节JSON缺少chapter字段") 688 + raise ChapterJsonParseError("章节JSON缺少chapter字段或结构不完整", raw_text=cleaned)
689 689
690 def _persist_error_payload( 690 def _persist_error_payload(
691 self, 691 self,
@@ -967,13 +967,41 @@ class ChapterGenerationNode(BaseNode): @@ -967,13 +967,41 @@ class ChapterGenerationNode(BaseNode):
967 """递归检查并修复嵌套结构,保证每个block合法""" 967 """递归检查并修复嵌套结构,保证每个block合法"""
968 if not isinstance(blocks, list): 968 if not isinstance(blocks, list):
969 return 969 return
970 - for block in blocks: 970 + # 先过滤掉非字典类型的异常 block
  971 + valid_indices = []
  972 + for idx, block in enumerate(blocks):
  973 + if not isinstance(block, dict):
  974 + # 尝试将字符串转换为 paragraph
  975 + if isinstance(block, str) and block.strip():
  976 + blocks[idx] = self._as_paragraph_block(block)
  977 + valid_indices.append(idx)
  978 + logger.warning(f"walk: 将字符串 block 转换为 paragraph")
  979 + elif isinstance(block, list):
  980 + # 尝试提取列表中的有效字典
  981 + for item in block:
  982 + if isinstance(item, dict):
  983 + self._ensure_block_type(item)
  984 + blocks[idx] = item
  985 + valid_indices.append(idx)
  986 + logger.warning(f"walk: 从列表中提取字典 block")
  987 + break
  988 + else:
  989 + logger.warning(f"walk: 跳过无效的列表 block: {block}")
  990 + else:
  991 + logger.warning(f"walk: 跳过无效的 block(类型: {type(block).__name__})")
  992 + else:
  993 + valid_indices.append(idx)
  994 +
  995 + for idx in valid_indices:
  996 + block = blocks[idx]
971 if not isinstance(block, dict): 997 if not isinstance(block, dict):
972 continue 998 continue
973 self._ensure_block_type(block) 999 self._ensure_block_type(block)
974 self._sanitize_block_content(block) 1000 self._sanitize_block_content(block)
975 block_type = block.get("type") 1001 block_type = block.get("type")
976 if block_type == "list": 1002 if block_type == "list":
  1003 + # 自动修复 listType:确保是合法值
  1004 + self._normalize_list_type(block)
977 items = block.get("items") 1005 items = block.get("items")
978 normalized = self._normalize_list_items(items) 1006 normalized = self._normalize_list_items(items)
979 if normalized: 1007 if normalized:
@@ -984,8 +1012,12 @@ class ChapterGenerationNode(BaseNode): @@ -984,8 +1012,12 @@ class ChapterGenerationNode(BaseNode):
984 walk(block.get("blocks")) 1012 walk(block.get("blocks"))
985 elif block_type == "table": 1013 elif block_type == "table":
986 for row in block.get("rows", []): 1014 for row in block.get("rows", []):
  1015 + if not isinstance(row, dict):
  1016 + continue
987 cells = row.get("cells") or [] 1017 cells = row.get("cells") or []
988 for cell in cells: 1018 for cell in cells:
  1019 + if not isinstance(cell, dict):
  1020 + continue
989 walk(cell.get("blocks")) 1021 walk(cell.get("blocks"))
990 elif block_type == "widget": 1022 elif block_type == "widget":
991 self._normalize_widget_block(block) 1023 self._normalize_widget_block(block)
@@ -998,7 +1030,9 @@ class ChapterGenerationNode(BaseNode): @@ -998,7 +1030,9 @@ class ChapterGenerationNode(BaseNode):
998 1030
999 blocks = chapter.get("blocks") 1031 blocks = chapter.get("blocks")
1000 if isinstance(blocks, list): 1032 if isinstance(blocks, list):
1001 - chapter["blocks"] = self._merge_fragment_sequences(blocks) 1033 + # 在合并前先过滤掉所有非字典类型的 block
  1034 + filtered_blocks = [b for b in blocks if isinstance(b, dict)]
  1035 + chapter["blocks"] = self._merge_fragment_sequences(filtered_blocks)
1002 1036
1003 def _ensure_content_density(self, chapter: Dict[str, Any]): 1037 def _ensure_content_density(self, chapter: Dict[str, Any]):
1004 """ 1038 """
@@ -1657,6 +1691,25 @@ class ChapterGenerationNode(BaseNode): @@ -1657,6 +1691,25 @@ class ChapterGenerationNode(BaseNode):
1657 fragment_buffer = [] 1691 fragment_buffer = []
1658 1692
1659 for block in blocks: 1693 for block in blocks:
  1694 + # 类型检查:跳过非字典类型的异常 block,避免 AttributeError
  1695 + if not isinstance(block, dict):
  1696 + # 尝试将非字典类型转换为 paragraph
  1697 + if isinstance(block, str) and block.strip():
  1698 + converted = self._as_paragraph_block(block)
  1699 + logger.warning(f"检测到非字典类型的 block(字符串),已转换为 paragraph: {block[:50]}...")
  1700 + merged.append(converted)
  1701 + elif isinstance(block, list):
  1702 + # 列表类型的 block 可能是 LLM 输出错误,尝试提取有效内容
  1703 + logger.warning(f"检测到列表类型的 block,尝试提取有效内容: {block}")
  1704 + for item in block:
  1705 + if isinstance(item, dict):
  1706 + self._ensure_block_type(item)
  1707 + merged.append(self._merge_nested_fragments(item))
  1708 + elif isinstance(item, str) and item.strip():
  1709 + merged.append(self._as_paragraph_block(item))
  1710 + else:
  1711 + logger.warning(f"跳过无效的 block(类型: {type(block).__name__}): {block}")
  1712 + continue
1660 if self._is_paragraph_fragment(block): 1713 if self._is_paragraph_fragment(block):
1661 fragment_buffer.append(block) 1714 fragment_buffer.append(block)
1662 continue 1715 continue
@@ -1668,6 +1721,24 @@ class ChapterGenerationNode(BaseNode): @@ -1668,6 +1721,24 @@ class ChapterGenerationNode(BaseNode):
1668 1721
1669 def _merge_nested_fragments(self, block: Dict[str, Any]) -> Dict[str, Any]: 1722 def _merge_nested_fragments(self, block: Dict[str, Any]) -> Dict[str, Any]:
1670 """对嵌套结构(callout/blockquote/engineQuote/list/table)递归处理片段合并""" 1723 """对嵌套结构(callout/blockquote/engineQuote/list/table)递归处理片段合并"""
  1724 + # 类型检查:确保 block 是字典类型
  1725 + if not isinstance(block, dict):
  1726 + # 尝试将非字典类型转换为 paragraph
  1727 + if isinstance(block, str) and block.strip():
  1728 + logger.warning(f"_merge_nested_fragments 收到字符串类型,已转换为 paragraph")
  1729 + return self._as_paragraph_block(block)
  1730 + elif isinstance(block, list):
  1731 + # 尝试提取列表中的第一个有效字典
  1732 + for item in block:
  1733 + if isinstance(item, dict):
  1734 + self._ensure_block_type(item)
  1735 + return self._merge_nested_fragments(item)
  1736 + logger.warning(f"_merge_nested_fragments 收到无效列表,返回空 paragraph")
  1737 + return self._as_paragraph_block("")
  1738 + else:
  1739 + logger.warning(f"_merge_nested_fragments 收到无效类型({type(block).__name__}),返回空 paragraph")
  1740 + return self._as_paragraph_block("")
  1741 +
1671 block_type = block.get("type") 1742 block_type = block.get("type")
1672 if block_type in {"callout", "blockquote", "engineQuote"}: 1743 if block_type in {"callout", "blockquote", "engineQuote"}:
1673 nested = block.get("blocks") 1744 nested = block.get("blocks")
@@ -1682,8 +1753,12 @@ class ChapterGenerationNode(BaseNode): @@ -1682,8 +1753,12 @@ class ChapterGenerationNode(BaseNode):
1682 entry[:] = merged_entry 1753 entry[:] = merged_entry
1683 elif block_type == "table": 1754 elif block_type == "table":
1684 for row in block.get("rows", []): 1755 for row in block.get("rows", []):
  1756 + if not isinstance(row, dict):
  1757 + continue
1685 cells = row.get("cells") or [] 1758 cells = row.get("cells") or []
1686 for cell in cells: 1759 for cell in cells:
  1760 + if not isinstance(cell, dict):
  1761 + continue
1687 nested_blocks = cell.get("blocks") 1762 nested_blocks = cell.get("blocks")
1688 if isinstance(nested_blocks, list): 1763 if isinstance(nested_blocks, list):
1689 cell["blocks"] = self._merge_fragment_sequences(nested_blocks) 1764 cell["blocks"] = self._merge_fragment_sequences(nested_blocks)
@@ -1819,6 +1894,42 @@ class ChapterGenerationNode(BaseNode): @@ -1819,6 +1894,42 @@ class ChapterGenerationNode(BaseNode):
1819 return str(value) 1894 return str(value)
1820 return "" 1895 return ""
1821 1896
  1897 + # 合法的 listType
  1898 + _ALLOWED_LIST_TYPES = {"ordered", "bullet", "task"}
  1899 + # listType 的别名映射
  1900 + _LIST_TYPE_ALIASES = {
  1901 + "unordered": "bullet",
  1902 + "ul": "bullet",
  1903 + "ol": "ordered",
  1904 + "numbered": "ordered",
  1905 + "checkbox": "task",
  1906 + "check": "task",
  1907 + "todo": "task",
  1908 + }
  1909 +
  1910 + def _normalize_list_type(self, block: Dict[str, Any]):
  1911 + """
  1912 + 确保 list block listType 是合法值。
  1913 +
  1914 + 如果 listType 缺失或非法,自动修复为 bullet
  1915 + """
  1916 + list_type = block.get("listType")
  1917 + if list_type in self._ALLOWED_LIST_TYPES:
  1918 + return
  1919 + # 尝试别名映射
  1920 + if isinstance(list_type, str):
  1921 + lowered = list_type.strip().lower()
  1922 + if lowered in self._LIST_TYPE_ALIASES:
  1923 + block["listType"] = self._LIST_TYPE_ALIASES[lowered]
  1924 + logger.warning(f"已将 listType '{list_type}' 映射为 '{block['listType']}'")
  1925 + return
  1926 + if lowered in self._ALLOWED_LIST_TYPES:
  1927 + block["listType"] = lowered
  1928 + return
  1929 + # 无法识别,默认使用 bullet
  1930 + logger.warning(f"检测到非法 listType: {list_type},已修复为 bullet")
  1931 + block["listType"] = "bullet"
  1932 +
1822 def _normalize_list_items(self, items: Any) -> List[List[Dict[str, Any]]]: 1933 def _normalize_list_items(self, items: Any) -> List[List[Dict[str, Any]]]:
1823 """确保list blockitems为[[block, block], ...]结构""" 1934 """确保list blockitems为[[block, block], ...]结构"""
1824 if not isinstance(items, list): 1935 if not isinstance(items, list):
@@ -1329,8 +1329,84 @@ class HTMLRenderer: @@ -1329,8 +1329,84 @@ class HTMLRenderer:
1329 返回: 1329 返回:
1330 List[Dict]: 修复后的表格行数组。 1330 List[Dict]: 修复后的表格行数组。
1331 """ 1331 """
1332 - if not rows or len(rows) != 1:  
1333 - # 只处理只有1行的异常情况 1332 + if not rows:
  1333 + return []
  1334 +
  1335 + # 辅助函数:获取单元格文本
  1336 + def _get_cell_text(cell: Dict[str, Any]) -> str:
  1337 + """获取单元格的文本内容"""
  1338 + blocks = cell.get("blocks", [])
  1339 + for block in blocks:
  1340 + if isinstance(block, dict) and block.get("type") == "paragraph":
  1341 + inlines = block.get("inlines", [])
  1342 + for inline in inlines:
  1343 + if isinstance(inline, dict):
  1344 + text = inline.get("text", "")
  1345 + if text:
  1346 + return str(text).strip()
  1347 + return ""
  1348 +
  1349 + def _is_placeholder_cell(cell: Dict[str, Any]) -> bool:
  1350 + """判断单元格是否是占位符(如 '--', '-', '—' 等)"""
  1351 + text = _get_cell_text(cell)
  1352 + return text in ("--", "-", "—", "——", "", "N/A", "n/a")
  1353 +
  1354 + def _is_heading_like_cell(cell: Dict[str, Any]) -> bool:
  1355 + """检测是否疑似被错误并入表格的章节/标题单元格"""
  1356 + text = _get_cell_text(cell)
  1357 + if not text:
  1358 + return False
  1359 + stripped = text.strip()
  1360 + # 章节号或"第X章/部分"常见格式,避免误删正常数字值
  1361 + heading_patterns = (
  1362 + r"^\d{1,2}(?:\.\d{1,2}){1,3}\s+",
  1363 + r"^第[一二三四五六七八九十]+[章节部分]",
  1364 + )
  1365 + return any(re.match(pat, stripped) for pat in heading_patterns)
  1366 +
  1367 + # 第一阶段:处理"有表头行 + 数据被串在一行"的情况
  1368 + header_cells = self._flatten_nested_cells((rows[0] or {}).get("cells", []))
  1369 + header_count = len(header_cells)
  1370 + overflow_fixed = None
  1371 + if header_count >= 2:
  1372 + rebuilt_rows: List[Dict[str, Any]] = [
  1373 + {
  1374 + **{k: v for k, v in (rows[0] or {}).items() if k != "cells"},
  1375 + "cells": header_cells,
  1376 + }
  1377 + ]
  1378 + changed = False
  1379 + for row in rows[1:]:
  1380 + cells = self._flatten_nested_cells((row or {}).get("cells", []))
  1381 + cell_count = len(cells)
  1382 + if cell_count <= header_count:
  1383 + rebuilt_rows.append({**{k: v for k, v in (row or {}).items() if k != "cells"}, "cells": cells})
  1384 + continue
  1385 +
  1386 + remainder = cell_count % header_count
  1387 + trimmed_cells = cells
  1388 + if remainder:
  1389 + trailing = cells[-remainder:]
  1390 + if all(_is_placeholder_cell(c) or _is_heading_like_cell(c) for c in trailing):
  1391 + trimmed_cells = cells[:-remainder]
  1392 + remainder = 0
  1393 +
  1394 + if remainder == 0 and len(trimmed_cells) >= header_count * 2:
  1395 + for i in range(0, len(trimmed_cells), header_count):
  1396 + chunk = trimmed_cells[i : i + header_count]
  1397 + rebuilt_rows.append({"cells": chunk})
  1398 + changed = True
  1399 + else:
  1400 + rebuilt_rows.append({**{k: v for k, v in (row or {}).items() if k != "cells"}, "cells": cells})
  1401 +
  1402 + if changed:
  1403 + overflow_fixed = rebuilt_rows
  1404 +
  1405 + if overflow_fixed is not None:
  1406 + rows = overflow_fixed
  1407 +
  1408 + if len(rows) != 1:
  1409 + # 只有一行的异常情况由后续逻辑处理;正常多行直接返回
1334 return rows 1410 return rows
1335 1411
1336 first_row = rows[0] 1412 first_row = rows[0]
@@ -1353,25 +1429,6 @@ class HTMLRenderer: @@ -1353,25 +1429,6 @@ class HTMLRenderer:
1353 # 单元格太少,不需要重组 1429 # 单元格太少,不需要重组
1354 return rows 1430 return rows
1355 1431
1356 - # 辅助函数:获取单元格文本  
1357 - def _get_cell_text(cell: Dict[str, Any]) -> str:  
1358 - """获取单元格的文本内容"""  
1359 - blocks = cell.get("blocks", [])  
1360 - for block in blocks:  
1361 - if isinstance(block, dict) and block.get("type") == "paragraph":  
1362 - inlines = block.get("inlines", [])  
1363 - for inline in inlines:  
1364 - if isinstance(inline, dict):  
1365 - text = inline.get("text", "")  
1366 - if text:  
1367 - return str(text).strip()  
1368 - return ""  
1369 -  
1370 - def _is_placeholder_cell(cell: Dict[str, Any]) -> bool:  
1371 - """判断单元格是否是占位符(如 '--', '-', '—' 等)"""  
1372 - text = _get_cell_text(cell)  
1373 - return text in ("--", "-", "—", "——", "", "N/A", "n/a")  
1374 -  
1375 # 先过滤掉占位符单元格 1432 # 先过滤掉占位符单元格
1376 all_cells = [c for c in all_cells if not _is_placeholder_cell(c)] 1433 all_cells = [c for c in all_cells if not _is_placeholder_cell(c)]
1377 1434