Fix:

1. _parse_chapter 异常类型降级（影响：高）

       # 回退后（本 PR）
       raise ValueError("LLM返回空内容")
       raise ValueError("章节JSON缺少chapter字段")

       # 回退前（当前 main）
       raise ChapterJsonParseError("LLM返回空内容", raw_text=raw_text)
       raise ChapterJsonParseError("章节JSON缺少chapter字段或结构不完整", raw_text=cleaned)

   ChapterJsonParseError 是 ValueError 的子类，因此 run() 方法中的 except ChapterJsonParseError 捕获不到父类 ValueError。当 LLM 返回空内容或 JSON 缺少 chapter 字段时，异常会直接穿透上层所有 except 块，导致整章生成失败且不会进入重试或降级逻辑。

2. agent.py 移除宽泛异常重试（影响：中高）

   移除了对 AttributeError, TypeError, KeyError, IndexError, ValueError, json.JSONDecodeError 的捕获重试。如果 LLM 返回畸形 JSON 导致运行时异常，现在会直接崩溃而非重试。

3. 移除非字典 block 防御性处理（影响：中）

   chapter_generation_node.py 中 walk() / _merge_fragment_sequences() / _merge_nested_fragments() 里对 LLM 返回非字典类型 block（string、list 等）的容错处理全部移除。如果 LLM 输出异常结构，现在会直接报错而非自动修复。

4. 移除 _normalize_list_type 和表格行溢出修复（影响：低）

   - _normalize_list_type()：将非法 listType（如 "unordered"）自动映射为 "bullet" 的逻辑被移除
   - html_renderer.py 的 _fix_nested_table_rows()：多行表格数据溢出到单行时的重组逻辑被简化

Fix:
1. _parse_chapter 异常类型降级（影响：高）

       # 回退后（本 PR）
       raise ValueError("LLM返回空内容")
       raise ValueError("章节JSON缺少chapter字段")

       # 回退前（当前 main）
       raise ChapterJsonParseError("LLM返回空内容", raw_text=raw_text)
       raise ChapterJsonParseError("章节JSON缺少chapter字段或结构不完整", raw_text=cleaned)

   ChapterJsonParseError 是 ValueError 的子类，因此 run() 方法中的 except ChapterJsonParseError 捕获不到父类 ValueError。当 LLM 返回空内容或 JSON 缺少 chapter 字段时，异常会直接穿透上层所有 except 块，导致整章生成失败且不会进入重试或降级逻辑。

2. agent.py 移除宽泛异常重试（影响：中高）

   移除了对 AttributeError, TypeError, KeyError, IndexError, ValueError, json.JSONDecodeError 的捕获重试。如果 LLM 返回畸形 JSON 导致运行时异常，现在会直接崩溃而非重试。

3. 移除非字典 block 防御性处理（影响：中）

   chapter_generation_node.py 中 walk() / _merge_fragment_sequences() / _merge_nested_fragments() 里对 LLM 返回非字典类型 block（string、list 等）的容错处理全部移除。如果 LLM 输出异常结构，现在会直接报错而非自动修复。

4. 移除 _normalize_list_type 和表格行溢出修复（影响：低）

   - _normalize_list_type()：将非法 listType（如 "unordered"）自动映射为 "bullet" 的逻辑被移除
   - html_renderer.py 的 _fix_nested_table_rows()：多行表格数据溢出到单行时的重组逻辑被简化
MaYiding
Commit e5210bfc83c623031cd5aab8b6bf5298f943215b e5210bfc 1 parent fdbb0f21
Showing 3 changed files with 227 additions and 25 deletions
ReportEngine/agent.py
ReportEngine/nodes/chapter_generation_node.py
ReportEngine/renderers/html_renderer.py
--- a/ReportEngine/agent.py
View file @e5210bf
+++ b/ReportEngine/agent.py
View file @e5210bf
@@ -663,6 +663,40 @@ class ReportAgent:
                             raise
                         attempt += 1
                         continue
+                    except (AttributeError, TypeError, KeyError, IndexError, ValueError, json.JSONDecodeError) as structure_error:
+                        # 捕获因 JSON 结构异常导致的运行时错误，包装为可重试异常
+                        # 包括：
+                        # - AttributeError: 如 list.get() 调用失败
+                        # - TypeError: 类型不匹配
+                        # - KeyError: 字典键缺失
+                        # - IndexError: 列表索引越界
+                        # - ValueError: 值错误（如 LLM 返回空内容、缺少必要字段）
+                        # - json.JSONDecodeError: JSON 解析失败（未被内部捕获的情况）
+                        error_type = type(structure_error).__name__
+                        logger.warning(
+                            "章节 {title} 生成过程中发生 {error_type}（第 {attempt}/{total} 次尝试），将尝试重新生成: {error}",
+                            title=section.title,
+                            error_type=error_type,
+                            attempt=attempt,
+                            total=chapter_max_attempts,
+                            error=structure_error,
+                        )
+                        emit('chapter_status', {
+                            'chapterId': section.chapter_id,
+                            'title': section.title,
+                            'status': 'retrying' if attempt < chapter_max_attempts else 'error',
+                            'attempt': attempt,
+                            'error': str(structure_error),
+                            'reason': 'structure_error',
+                            'error_type': error_type
+                        })
+                        if attempt >= chapter_max_attempts:
+                            # 达到最大重试次数，包装为 ChapterJsonParseError 抛出
+                            raise ChapterJsonParseError(
+                                f"{section.title} 章节因 {error_type} 在 {chapter_max_attempts} 次尝试后仍无法生成: {structure_error}"
+                            ) from structure_error
+                        attempt += 1
+                        continue
                     except Exception as chapter_error:
                         if not self._should_retry_inappropriate_content_error(chapter_error):
                             raise
--- a/ReportEngine/nodes/chapter_generation_node.py
View file @e5210bf
+++ b/ReportEngine/nodes/chapter_generation_node.py
View file @e5210bf
@@ -642,7 +642,7 @@ class ChapterGenerationNode(BaseNode):
             cleaned = cleaned[:-3]
         cleaned = cleaned.strip()
         if not cleaned:
-            raise ValueError("LLM返回空内容")
+            raise ChapterJsonParseError("LLM返回空内容", raw_text=raw_text)
         candidate_payloads = [cleaned]
         repaired = self._repair_llm_json(cleaned)
@@ -685,7 +685,7 @@ class ChapterGenerationNode(BaseNode):
                         return item["chapter"]
                     if all(key in item for key in ("chapterId", "title", "blocks")):
                         return item
-        raise ValueError("章节JSON缺少chapter字段")
+        raise ChapterJsonParseError("章节JSON缺少chapter字段或结构不完整", raw_text=cleaned)
     def _persist_error_payload(
         self,
@@ -967,13 +967,41 @@ class ChapterGenerationNode(BaseNode):
             """递归检查并修复嵌套结构，保证每个block合法"""
             if not isinstance(blocks, list):
                 return
-            for block in blocks:
+            # 先过滤掉非字典类型的异常 block
+            valid_indices = []
+            for idx, block in enumerate(blocks):
+                if not isinstance(block, dict):
+                    # 尝试将字符串转换为 paragraph
+                    if isinstance(block, str) and block.strip():
+                        blocks[idx] = self._as_paragraph_block(block)
+                        valid_indices.append(idx)
+                        logger.warning(f"walk: 将字符串 block 转换为 paragraph")
+                    elif isinstance(block, list):
+                        # 尝试提取列表中的有效字典
+                        for item in block:
+                            if isinstance(item, dict):
+                                self._ensure_block_type(item)
+                                blocks[idx] = item
+                                valid_indices.append(idx)
+                                logger.warning(f"walk: 从列表中提取字典 block")
+                                break
+                        else:
+                            logger.warning(f"walk: 跳过无效的列表 block: {block}")
+                    else:
+                        logger.warning(f"walk: 跳过无效的 block（类型: {type(block).__name__}）")
+                else:
+                    valid_indices.append(idx)
+
+            for idx in valid_indices:
+                block = blocks[idx]
                 if not isinstance(block, dict):
                     continue
                 self._ensure_block_type(block)
                 self._sanitize_block_content(block)
                 block_type = block.get("type")
                 if block_type == "list":
+                    # 自动修复 listType：确保是合法值
+                    self._normalize_list_type(block)
                     items = block.get("items")
                     normalized = self._normalize_list_items(items)
                     if normalized:
@@ -984,8 +1012,12 @@ class ChapterGenerationNode(BaseNode):
                     walk(block.get("blocks"))
                 elif block_type == "table":
                     for row in block.get("rows", []):
+                        if not isinstance(row, dict):
+                            continue
                         cells = row.get("cells") or []
                         for cell in cells:
+                            if not isinstance(cell, dict):
+                                continue
                             walk(cell.get("blocks"))
                 elif block_type == "widget":
                     self._normalize_widget_block(block)
@@ -998,7 +1030,9 @@ class ChapterGenerationNode(BaseNode):
         blocks = chapter.get("blocks")
         if isinstance(blocks, list):
-            chapter["blocks"] = self._merge_fragment_sequences(blocks)
+            # 在合并前先过滤掉所有非字典类型的 block
+            filtered_blocks = [b for b in blocks if isinstance(b, dict)]
+            chapter["blocks"] = self._merge_fragment_sequences(filtered_blocks)
     def _ensure_content_density(self, chapter: Dict[str, Any]):
         """
@@ -1657,6 +1691,25 @@ class ChapterGenerationNode(BaseNode):
             fragment_buffer = []
         for block in blocks:
+            # 类型检查：跳过非字典类型的异常 block，避免 AttributeError
+            if not isinstance(block, dict):
+                # 尝试将非字典类型转换为 paragraph
+                if isinstance(block, str) and block.strip():
+                    converted = self._as_paragraph_block(block)
+                    logger.warning(f"检测到非字典类型的 block（字符串），已转换为 paragraph: {block[:50]}...")
+                    merged.append(converted)
+                elif isinstance(block, list):
+                    # 列表类型的 block 可能是 LLM 输出错误，尝试提取有效内容
+                    logger.warning(f"检测到列表类型的 block，尝试提取有效内容: {block}")
+                    for item in block:
+                        if isinstance(item, dict):
+                            self._ensure_block_type(item)
+                            merged.append(self._merge_nested_fragments(item))
+                        elif isinstance(item, str) and item.strip():
+                            merged.append(self._as_paragraph_block(item))
+                else:
+                    logger.warning(f"跳过无效的 block（类型: {type(block).__name__}）: {block}")
+                continue
             if self._is_paragraph_fragment(block):
                 fragment_buffer.append(block)
                 continue
@@ -1668,6 +1721,24 @@ class ChapterGenerationNode(BaseNode):
     def _merge_nested_fragments(self, block: Dict[str, Any]) -> Dict[str, Any]:
         """对嵌套结构（callout/blockquote/engineQuote/list/table）递归处理片段合并"""
+        # 类型检查：确保 block 是字典类型
+        if not isinstance(block, dict):
+            # 尝试将非字典类型转换为 paragraph
+            if isinstance(block, str) and block.strip():
+                logger.warning(f"_merge_nested_fragments 收到字符串类型，已转换为 paragraph")
+                return self._as_paragraph_block(block)
+            elif isinstance(block, list):
+                # 尝试提取列表中的第一个有效字典
+                for item in block:
+                    if isinstance(item, dict):
+                        self._ensure_block_type(item)
+                        return self._merge_nested_fragments(item)
+                logger.warning(f"_merge_nested_fragments 收到无效列表，返回空 paragraph")
+                return self._as_paragraph_block("")
+            else:
+                logger.warning(f"_merge_nested_fragments 收到无效类型（{type(block).__name__}），返回空 paragraph")
+                return self._as_paragraph_block("")
+
         block_type = block.get("type")
         if block_type in {"callout", "blockquote", "engineQuote"}:
             nested = block.get("blocks")
@@ -1682,8 +1753,12 @@ class ChapterGenerationNode(BaseNode):
                         entry[:] = merged_entry
         elif block_type == "table":
             for row in block.get("rows", []):
+                if not isinstance(row, dict):
+                    continue
                 cells = row.get("cells") or []
                 for cell in cells:
+                    if not isinstance(cell, dict):
+                        continue
                     nested_blocks = cell.get("blocks")
                     if isinstance(nested_blocks, list):
                         cell["blocks"] = self._merge_fragment_sequences(nested_blocks)
@@ -1819,6 +1894,42 @@ class ChapterGenerationNode(BaseNode):
                 return str(value)
         return ""
+    # 合法的 listType 值
+    _ALLOWED_LIST_TYPES = {"ordered", "bullet", "task"}
+    # listType 的别名映射
+    _LIST_TYPE_ALIASES = {
+        "unordered": "bullet",
+        "ul": "bullet",
+        "ol": "ordered",
+        "numbered": "ordered",
+        "checkbox": "task",
+        "check": "task",
+        "todo": "task",
+    }
+
+    def _normalize_list_type(self, block: Dict[str, Any]):
+        """
+        确保 list block 的 listType 是合法值。
+
+        如果 listType 缺失或非法，自动修复为 bullet。
+        """
+        list_type = block.get("listType")
+        if list_type in self._ALLOWED_LIST_TYPES:
+            return
+        # 尝试别名映射
+        if isinstance(list_type, str):
+            lowered = list_type.strip().lower()
+            if lowered in self._LIST_TYPE_ALIASES:
+                block["listType"] = self._LIST_TYPE_ALIASES[lowered]
+                logger.warning(f"已将 listType '{list_type}' 映射为 '{block['listType']}'")
+                return
+            if lowered in self._ALLOWED_LIST_TYPES:
+                block["listType"] = lowered
+                return
+        # 无法识别，默认使用 bullet
+        logger.warning(f"检测到非法 listType: {list_type}，已修复为 bullet")
+        block["listType"] = "bullet"
+
     def _normalize_list_items(self, items: Any) -> List[List[Dict[str, Any]]]:
         """确保list block的items为[[block, block], ...]结构"""
         if not isinstance(items, list):
--- a/ReportEngine/renderers/html_renderer.py
View file @e5210bf
+++ b/ReportEngine/renderers/html_renderer.py
View file @e5210bf
@@ -1329,8 +1329,84 @@ class HTMLRenderer:
         返回:
             List[Dict]: 修复后的表格行数组。
         """
-        if not rows or len(rows) != 1:
-            # 只处理只有1行的异常情况
+        if not rows:
+            return []
+
+        # 辅助函数：获取单元格文本
+        def _get_cell_text(cell: Dict[str, Any]) -> str:
+            """获取单元格的文本内容"""
+            blocks = cell.get("blocks", [])
+            for block in blocks:
+                if isinstance(block, dict) and block.get("type") == "paragraph":
+                    inlines = block.get("inlines", [])
+                    for inline in inlines:
+                        if isinstance(inline, dict):
+                            text = inline.get("text", "")
+                            if text:
+                                return str(text).strip()
+            return ""
+
+        def _is_placeholder_cell(cell: Dict[str, Any]) -> bool:
+            """判断单元格是否是占位符（如 '--', '-', '—' 等）"""
+            text = _get_cell_text(cell)
+            return text in ("--", "-", "—", "——", "", "N/A", "n/a")
+
+        def _is_heading_like_cell(cell: Dict[str, Any]) -> bool:
+            """检测是否疑似被错误并入表格的章节/标题单元格"""
+            text = _get_cell_text(cell)
+            if not text:
+                return False
+            stripped = text.strip()
+            # 章节号或"第X章/部分"常见格式，避免误删正常数字值
+            heading_patterns = (
+                r"^\d{1,2}(?:\.\d{1,2}){1,3}\s+",
+                r"^第[一二三四五六七八九十]+[章节部分]",
+            )
+            return any(re.match(pat, stripped) for pat in heading_patterns)
+
+        # 第一阶段：处理"有表头行 + 数据被串在一行"的情况
+        header_cells = self._flatten_nested_cells((rows[0] or {}).get("cells", []))
+        header_count = len(header_cells)
+        overflow_fixed = None
+        if header_count >= 2:
+            rebuilt_rows: List[Dict[str, Any]] = [
+                {
+                    **{k: v for k, v in (rows[0] or {}).items() if k != "cells"},
+                    "cells": header_cells,
+                }
+            ]
+            changed = False
+            for row in rows[1:]:
+                cells = self._flatten_nested_cells((row or {}).get("cells", []))
+                cell_count = len(cells)
+                if cell_count <= header_count:
+                    rebuilt_rows.append({**{k: v for k, v in (row or {}).items() if k != "cells"}, "cells": cells})
+                    continue
+
+                remainder = cell_count % header_count
+                trimmed_cells = cells
+                if remainder:
+                    trailing = cells[-remainder:]
+                    if all(_is_placeholder_cell(c) or _is_heading_like_cell(c) for c in trailing):
+                        trimmed_cells = cells[:-remainder]
+                        remainder = 0
+
+                if remainder == 0 and len(trimmed_cells) >= header_count * 2:
+                    for i in range(0, len(trimmed_cells), header_count):
+                        chunk = trimmed_cells[i : i + header_count]
+                        rebuilt_rows.append({"cells": chunk})
+                    changed = True
+                else:
+                    rebuilt_rows.append({**{k: v for k, v in (row or {}).items() if k != "cells"}, "cells": cells})
+
+            if changed:
+                overflow_fixed = rebuilt_rows
+
+        if overflow_fixed is not None:
+            rows = overflow_fixed
+
+        if len(rows) != 1:
+            # 只有一行的异常情况由后续逻辑处理；正常多行直接返回
             return rows
         first_row = rows[0]
@@ -1353,25 +1429,6 @@ class HTMLRenderer:
             # 单元格太少，不需要重组
             return rows
-        # 辅助函数：获取单元格文本
-        def _get_cell_text(cell: Dict[str, Any]) -> str:
-            """获取单元格的文本内容"""
-            blocks = cell.get("blocks", [])
-            for block in blocks:
-                if isinstance(block, dict) and block.get("type") == "paragraph":
-                    inlines = block.get("inlines", [])
-                    for inline in inlines:
-                        if isinstance(inline, dict):
-                            text = inline.get("text", "")
-                            if text:
-                                return str(text).strip()
-            return ""
-
-        def _is_placeholder_cell(cell: Dict[str, Any]) -> bool:
-            """判断单元格是否是占位符（如 '--', '-', '—' 等）"""
-            text = _get_cell_text(cell)
-            return text in ("--", "-", "—", "——", "", "N/A", "n/a")
-
         # 先过滤掉占位符单元格
         all_cells = [c for c in all_cells if not _is_placeholder_cell(c)]