MaYiding

Fix:

  1. _parse_chapter 异常类型降级(影响:高)

  # 回退后(本 PR)
  raise ValueError("LLM返回空内容")
  raise ValueError("章节JSON缺少chapter字段")

  # 回退前(当前 main)
  raise ChapterJsonParseError("LLM返回空内容", raw_text=raw_text)
  raise ChapterJsonParseError("章节JSON缺少chapter字段或结构不完整", raw_text=cleaned)

  ChapterJsonParseError 是 ValueError 的子类。run() 方法中 except ChapterJsonParseError 捕获不到父类 ValueError。当 LLM 返回空内容或 JSON 缺少 chapter
  字段时,异常会直接穿透上层所有 except 块,导致整章生成失败且不会进入重试或降级逻辑。

  2. agent.py 移除宽泛异常重试(影响:中高)

  移除了对 AttributeError, TypeError, KeyError, IndexError, ValueError, json.JSONDecodeError 的捕获重试。如果 LLM 返回畸形 JSON
  导致运行时异常,现在会直接崩溃而非重试。

  3. 移除非字典 block 防御性处理(影响:中)

  chapter_generation_node.py 中 walk() / _merge_fragment_sequences() / _merge_nested_fragments() 里对 LLM 返回非字典类型 block(string、list
  等)的容错处理全部移除。如果 LLM 输出异常结构,现在会直接报错而非自动修复。

  4. 移除 _normalize_list_type 和表格行溢出修复(影响:低)

  - _normalize_list_type():将非法 listType(如 "unordered")自动映射为 "bullet" 的逻辑被移除
  - html_renderer.py 的 _fix_nested_table_rows():多行表格数据溢出到单行时的重组逻辑被简化
... ... @@ -663,6 +663,40 @@ class ReportAgent:
raise
attempt += 1
continue
except (AttributeError, TypeError, KeyError, IndexError, ValueError, json.JSONDecodeError) as structure_error:
# 捕获因 JSON 结构异常导致的运行时错误,包装为可重试异常
# 包括:
# - AttributeError: 如 list.get() 调用失败
# - TypeError: 类型不匹配
# - KeyError: 字典键缺失
# - IndexError: 列表索引越界
# - ValueError: 值错误(如 LLM 返回空内容、缺少必要字段)
# - json.JSONDecodeError: JSON 解析失败(未被内部捕获的情况)
error_type = type(structure_error).__name__
logger.warning(
"章节 {title} 生成过程中发生 {error_type}(第 {attempt}/{total} 次尝试),将尝试重新生成: {error}",
title=section.title,
error_type=error_type,
attempt=attempt,
total=chapter_max_attempts,
error=structure_error,
)
emit('chapter_status', {
'chapterId': section.chapter_id,
'title': section.title,
'status': 'retrying' if attempt < chapter_max_attempts else 'error',
'attempt': attempt,
'error': str(structure_error),
'reason': 'structure_error',
'error_type': error_type
})
if attempt >= chapter_max_attempts:
# 达到最大重试次数,包装为 ChapterJsonParseError 抛出
raise ChapterJsonParseError(
f"{section.title} 章节因 {error_type} 在 {chapter_max_attempts} 次尝试后仍无法生成: {structure_error}"
) from structure_error
attempt += 1
continue
except Exception as chapter_error:
if not self._should_retry_inappropriate_content_error(chapter_error):
raise
... ...
... ... @@ -642,7 +642,7 @@ class ChapterGenerationNode(BaseNode):
cleaned = cleaned[:-3]
cleaned = cleaned.strip()
if not cleaned:
raise ValueError("LLM返回空内容")
raise ChapterJsonParseError("LLM返回空内容", raw_text=raw_text)
candidate_payloads = [cleaned]
repaired = self._repair_llm_json(cleaned)
... ... @@ -685,7 +685,7 @@ class ChapterGenerationNode(BaseNode):
return item["chapter"]
if all(key in item for key in ("chapterId", "title", "blocks")):
return item
raise ValueError("章节JSON缺少chapter字段")
raise ChapterJsonParseError("章节JSON缺少chapter字段或结构不完整", raw_text=cleaned)
def _persist_error_payload(
self,
... ... @@ -967,13 +967,41 @@ class ChapterGenerationNode(BaseNode):
"""递归检查并修复嵌套结构,保证每个block合法"""
if not isinstance(blocks, list):
return
for block in blocks:
# 先过滤掉非字典类型的异常 block
valid_indices = []
for idx, block in enumerate(blocks):
if not isinstance(block, dict):
# 尝试将字符串转换为 paragraph
if isinstance(block, str) and block.strip():
blocks[idx] = self._as_paragraph_block(block)
valid_indices.append(idx)
logger.warning(f"walk: 将字符串 block 转换为 paragraph")
elif isinstance(block, list):
# 尝试提取列表中的有效字典
for item in block:
if isinstance(item, dict):
self._ensure_block_type(item)
blocks[idx] = item
valid_indices.append(idx)
logger.warning(f"walk: 从列表中提取字典 block")
break
else:
logger.warning(f"walk: 跳过无效的列表 block: {block}")
else:
logger.warning(f"walk: 跳过无效的 block(类型: {type(block).__name__})")
else:
valid_indices.append(idx)
for idx in valid_indices:
block = blocks[idx]
if not isinstance(block, dict):
continue
self._ensure_block_type(block)
self._sanitize_block_content(block)
block_type = block.get("type")
if block_type == "list":
# 自动修复 listType:确保是合法值
self._normalize_list_type(block)
items = block.get("items")
normalized = self._normalize_list_items(items)
if normalized:
... ... @@ -984,8 +1012,12 @@ class ChapterGenerationNode(BaseNode):
walk(block.get("blocks"))
elif block_type == "table":
for row in block.get("rows", []):
if not isinstance(row, dict):
continue
cells = row.get("cells") or []
for cell in cells:
if not isinstance(cell, dict):
continue
walk(cell.get("blocks"))
elif block_type == "widget":
self._normalize_widget_block(block)
... ... @@ -998,7 +1030,9 @@ class ChapterGenerationNode(BaseNode):
blocks = chapter.get("blocks")
if isinstance(blocks, list):
chapter["blocks"] = self._merge_fragment_sequences(blocks)
# 在合并前先过滤掉所有非字典类型的 block
filtered_blocks = [b for b in blocks if isinstance(b, dict)]
chapter["blocks"] = self._merge_fragment_sequences(filtered_blocks)
def _ensure_content_density(self, chapter: Dict[str, Any]):
"""
... ... @@ -1657,6 +1691,25 @@ class ChapterGenerationNode(BaseNode):
fragment_buffer = []
for block in blocks:
# 类型检查:跳过非字典类型的异常 block,避免 AttributeError
if not isinstance(block, dict):
# 尝试将非字典类型转换为 paragraph
if isinstance(block, str) and block.strip():
converted = self._as_paragraph_block(block)
logger.warning(f"检测到非字典类型的 block(字符串),已转换为 paragraph: {block[:50]}...")
merged.append(converted)
elif isinstance(block, list):
# 列表类型的 block 可能是 LLM 输出错误,尝试提取有效内容
logger.warning(f"检测到列表类型的 block,尝试提取有效内容: {block}")
for item in block:
if isinstance(item, dict):
self._ensure_block_type(item)
merged.append(self._merge_nested_fragments(item))
elif isinstance(item, str) and item.strip():
merged.append(self._as_paragraph_block(item))
else:
logger.warning(f"跳过无效的 block(类型: {type(block).__name__}): {block}")
continue
if self._is_paragraph_fragment(block):
fragment_buffer.append(block)
continue
... ... @@ -1668,6 +1721,24 @@ class ChapterGenerationNode(BaseNode):
def _merge_nested_fragments(self, block: Dict[str, Any]) -> Dict[str, Any]:
"""对嵌套结构(callout/blockquote/engineQuote/list/table)递归处理片段合并"""
# 类型检查:确保 block 是字典类型
if not isinstance(block, dict):
# 尝试将非字典类型转换为 paragraph
if isinstance(block, str) and block.strip():
logger.warning(f"_merge_nested_fragments 收到字符串类型,已转换为 paragraph")
return self._as_paragraph_block(block)
elif isinstance(block, list):
# 尝试提取列表中的第一个有效字典
for item in block:
if isinstance(item, dict):
self._ensure_block_type(item)
return self._merge_nested_fragments(item)
logger.warning(f"_merge_nested_fragments 收到无效列表,返回空 paragraph")
return self._as_paragraph_block("")
else:
logger.warning(f"_merge_nested_fragments 收到无效类型({type(block).__name__}),返回空 paragraph")
return self._as_paragraph_block("")
block_type = block.get("type")
if block_type in {"callout", "blockquote", "engineQuote"}:
nested = block.get("blocks")
... ... @@ -1682,8 +1753,12 @@ class ChapterGenerationNode(BaseNode):
entry[:] = merged_entry
elif block_type == "table":
for row in block.get("rows", []):
if not isinstance(row, dict):
continue
cells = row.get("cells") or []
for cell in cells:
if not isinstance(cell, dict):
continue
nested_blocks = cell.get("blocks")
if isinstance(nested_blocks, list):
cell["blocks"] = self._merge_fragment_sequences(nested_blocks)
... ... @@ -1819,6 +1894,42 @@ class ChapterGenerationNode(BaseNode):
return str(value)
return ""
# Allowed listType values; _normalize_list_type leaves these untouched.
_ALLOWED_LIST_TYPES = {"ordered", "bullet", "task"}
# Alias mapping: alternative listType spellings -> the allowed value they stand for.
# Keys are lower-case because _normalize_list_type strips and lower-cases before lookup.
_LIST_TYPE_ALIASES = {
"unordered": "bullet",
"ul": "bullet",
"ol": "ordered",
"numbered": "ordered",
"checkbox": "task",
"check": "task",
"todo": "task",
}
def _normalize_list_type(self, block: Dict[str, Any]):
    """
    Ensure a list block's ``listType`` holds an allowed value, repairing it in place.

    Resolution order:
      1. A string already present in ``_ALLOWED_LIST_TYPES`` is left untouched.
      2. Any other string is stripped and lower-cased. If the result is a key of
         ``_LIST_TYPE_ALIASES`` the mapped value is stored (with a warning); if the
         result is itself an allowed value, that normalized form is stored.
      3. Everything else (missing key, non-string value, unrecognized string)
         is replaced by ``"bullet"`` with a warning.

    Args:
        block: The list block dict; its ``listType`` key may be rewritten.
    """
    list_type = block.get("listType")
    if isinstance(list_type, str):
        # Check the type before the set lookup: every allowed value is a string,
        # and an unhashable listType (list/dict) would otherwise make the
        # membership test raise TypeError instead of being repaired.
        if list_type in self._ALLOWED_LIST_TYPES:
            return
        # Try the alias mapping on the normalized spelling.
        lowered = list_type.strip().lower()
        if lowered in self._LIST_TYPE_ALIASES:
            block["listType"] = self._LIST_TYPE_ALIASES[lowered]
            logger.warning(f"已将 listType '{list_type}' 映射为 '{block['listType']}'")
            return
        if lowered in self._ALLOWED_LIST_TYPES:
            block["listType"] = lowered
            return
    # Unrecognized or non-string value: fall back to bullet.
    logger.warning(f"检测到非法 listType: {list_type},已修复为 bullet")
    block["listType"] = "bullet"
def _normalize_list_items(self, items: Any) -> List[List[Dict[str, Any]]]:
"""确保list blockitems为[[block, block], ...]结构"""
if not isinstance(items, list):
... ...
... ... @@ -1329,8 +1329,84 @@ class HTMLRenderer:
返回:
List[Dict]: 修复后的表格行数组。
"""
if not rows or len(rows) != 1:
# 只处理只有1行的异常情况
if not rows:
return []
# 辅助函数:获取单元格文本
def _get_cell_text(cell: Dict[str, Any]) -> str:
"""获取单元格的文本内容"""
blocks = cell.get("blocks", [])
for block in blocks:
if isinstance(block, dict) and block.get("type") == "paragraph":
inlines = block.get("inlines", [])
for inline in inlines:
if isinstance(inline, dict):
text = inline.get("text", "")
if text:
return str(text).strip()
return ""
def _is_placeholder_cell(cell: Dict[str, Any]) -> bool:
    """Tell whether the cell's text is a placeholder marker (dashes, empty text, N/A)."""
    placeholder_markers = ("--", "-", "—", "——", "", "N/A", "n/a")
    return _get_cell_text(cell) in placeholder_markers
def _is_heading_like_cell(cell: Dict[str, Any]) -> bool:
    """Detect a cell whose text looks like a chapter/section heading merged into the table by mistake."""
    text = _get_cell_text(cell)
    if not text:
        return False
    candidate = text.strip()
    # Common shapes of section numbers ("1.2 ...") and "第X章/节/部/分" headings;
    # kept narrow so ordinary numeric cell values are not treated as headings.
    for pattern in (
        r"^\d{1,2}(?:\.\d{1,2}){1,3}\s+",
        r"^第[一二三四五六七八九十]+[章节部分]",
    ):
        if re.match(pattern, candidate):
            return True
    return False
# 第一阶段:处理"有表头行 + 数据被串在一行"的情况
header_cells = self._flatten_nested_cells((rows[0] or {}).get("cells", []))
header_count = len(header_cells)
overflow_fixed = None
if header_count >= 2:
rebuilt_rows: List[Dict[str, Any]] = [
{
**{k: v for k, v in (rows[0] or {}).items() if k != "cells"},
"cells": header_cells,
}
]
changed = False
for row in rows[1:]:
cells = self._flatten_nested_cells((row or {}).get("cells", []))
cell_count = len(cells)
if cell_count <= header_count:
rebuilt_rows.append({**{k: v for k, v in (row or {}).items() if k != "cells"}, "cells": cells})
continue
remainder = cell_count % header_count
trimmed_cells = cells
if remainder:
trailing = cells[-remainder:]
if all(_is_placeholder_cell(c) or _is_heading_like_cell(c) for c in trailing):
trimmed_cells = cells[:-remainder]
remainder = 0
if remainder == 0 and len(trimmed_cells) >= header_count * 2:
for i in range(0, len(trimmed_cells), header_count):
chunk = trimmed_cells[i : i + header_count]
rebuilt_rows.append({"cells": chunk})
changed = True
else:
rebuilt_rows.append({**{k: v for k, v in (row or {}).items() if k != "cells"}, "cells": cells})
if changed:
overflow_fixed = rebuilt_rows
if overflow_fixed is not None:
rows = overflow_fixed
if len(rows) != 1:
# 只有一行的异常情况由后续逻辑处理;正常多行直接返回
return rows
first_row = rows[0]
... ... @@ -1353,25 +1429,6 @@ class HTMLRenderer:
# 单元格太少,不需要重组
return rows
# 辅助函数:获取单元格文本
def _get_cell_text(cell: Dict[str, Any]) -> str:
"""获取单元格的文本内容"""
blocks = cell.get("blocks", [])
for block in blocks:
if isinstance(block, dict) and block.get("type") == "paragraph":
inlines = block.get("inlines", [])
for inline in inlines:
if isinstance(inline, dict):
text = inline.get("text", "")
if text:
return str(text).strip()
return ""
def _is_placeholder_cell(cell: Dict[str, Any]) -> bool:
"""判断单元格是否是占位符(如 '--', '-', '—' 等)"""
text = _get_cell_text(cell)
return text in ("--", "-", "—", "——", "", "N/A", "n/a")
# 先过滤掉占位符单元格
all_cells = [c for c in all_cells if not _is_placeholder_cell(c)]
... ...