Further fixes to table rendering logic

马一丁
Commit 95de1d6e45efe0361df9b75f5736a80a9dfea1a5 95de1d6e 1 parent f4c59f22
Showing 3 changed files with 238 additions and 33 deletions
ReportEngine/nodes/chapter_generation_node.py
ReportEngine/renderers/html_renderer.py
ReportEngine/renderers/markdown_renderer.py
--- a/ReportEngine/nodes/chapter_generation_node.py
View file @95de1d6
+++ b/ReportEngine/nodes/chapter_generation_node.py
View file @95de1d6
@@ -1168,9 +1168,192 @@ class ChapterGenerationNode(BaseNode):
 
     def _sanitize_table_block(self, block: Dict[str, Any]):
         """保证表格的rows/cells结构合法且每个单元格包含至少一个block"""
-         rows = self._normalize_table_rows(block.get("rows"))
+         raw_rows = block.get("rows")
+         # 先检测是否存在嵌套行结构问题（只有1行但cells中有嵌套）
+         if isinstance(raw_rows, list) and len(raw_rows) == 1:
+             first_row = raw_rows[0]
+             if isinstance(first_row, dict):
+                 cells = first_row.get("cells", [])
+                 # 检测是否存在嵌套结构
+                 has_nested = any(
+                     isinstance(cell, dict) and "cells" in cell and "blocks" not in cell
+                     for cell in cells
+                     if isinstance(cell, dict)
+                 )
+                 if has_nested:
+                     # 修复嵌套行结构
+                     fixed_rows = self._fix_nested_rows_structure(raw_rows)
+                     block["rows"] = fixed_rows
+                     return
+         # 正常情况下，使用标准规范化
+         rows = self._normalize_table_rows(raw_rows)
         block["rows"] = rows
 
+     def _fix_nested_rows_structure(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+         """
+         修复嵌套错误的表格行结构。
+ 
+         当LLM生成的表格只有1行但所有数据被嵌套在cells中时，
+         本方法会展平所有单元格并重新组织成正确的多行结构。
+ 
+         参数:
+             rows: 原始的表格行数组（应该只有1行）。
+ 
+         返回:
+             List[Dict]: 修复后的多行表格结构。
+         """
+         if not rows or len(rows) != 1:
+             return self._normalize_table_rows(rows)
+ 
+         first_row = rows[0]
+         original_cells = first_row.get("cells", [])
+ 
+         # 递归展平所有嵌套的单元格
+         all_cells = self._flatten_all_cells_recursive(original_cells)
+ 
+         if len(all_cells) <= 1:
+             return self._normalize_table_rows(rows)
+ 
+         # 辅助函数：获取单元格文本
+         def _get_cell_text(cell: Dict[str, Any]) -> str:
+             blocks = cell.get("blocks", [])
+             for block in blocks:
+                 if isinstance(block, dict) and block.get("type") == "paragraph":
+                     inlines = block.get("inlines", [])
+                     for inline in inlines:
+                         if isinstance(inline, dict):
+                             text = inline.get("text", "")
+                             if text:
+                                 return str(text).strip()
+             return ""
+ 
+         def _is_placeholder_cell(cell: Dict[str, Any]) -> bool:
+             """判断单元格是否是占位符"""
+             text = _get_cell_text(cell)
+             return text in ("--", "-", "—", "——", "", "N/A", "n/a")
+ 
+         def _is_header_cell(cell: Dict[str, Any]) -> bool:
+             """判断单元格是否像表头（通常有加粗标记或是典型表头词）"""
+             blocks = cell.get("blocks", [])
+             for block in blocks:
+                 if isinstance(block, dict) and block.get("type") == "paragraph":
+                     inlines = block.get("inlines", [])
+                     for inline in inlines:
+                         if isinstance(inline, dict):
+                             marks = inline.get("marks", [])
+                             if any(isinstance(m, dict) and m.get("type") == "bold" for m in marks):
+                                 return True
+             # 也检查典型的表头词
+             text = _get_cell_text(cell)
+             header_keywords = {
+                 "时间", "日期", "名称", "类型", "状态", "数量", "金额", "比例", "指标",
+                 "平台", "渠道", "来源", "描述", "说明", "备注", "序号", "编号",
+                 "事件", "关键", "数据", "支撑", "反应", "市场", "情感", "节点",
+                 "维度", "要点", "详情", "标签", "影响", "趋势", "权重", "类别",
+                 "信息", "内容", "风格", "偏好", "主要", "用户", "核心", "特征",
+                 "分类", "范围", "对象", "项目", "阶段", "周期", "频率", "等级",
+             }
+             return any(kw in text for kw in header_keywords) and len(text) <= 20
+ 
+         # 过滤掉占位符单元格
+         valid_cells = [c for c in all_cells if not _is_placeholder_cell(c)]
+ 
+         if len(valid_cells) <= 1:
+             return self._normalize_table_rows(rows)
+ 
+         # 检测表头列数：统计连续的表头单元格数量
+         header_count = 0
+         for cell in valid_cells:
+             if _is_header_cell(cell):
+                 header_count += 1
+             else:
+                 break
+ 
+         # 如果没有检测到表头，使用启发式方法
+         if header_count == 0:
+             total = len(valid_cells)
+             for possible_cols in [4, 5, 3, 6, 2]:
+                 if total % possible_cols == 0:
+                     header_count = possible_cols
+                     break
+             else:
+                 # 尝试找到最接近的能整除的列数
+                 for possible_cols in [4, 5, 3, 6, 2]:
+                     remainder = total % possible_cols
+                     if remainder <= 3:
+                         header_count = possible_cols
+                         break
+                 else:
+                     # 无法确定列数，使用原始数据
+                     return self._normalize_table_rows(rows)
+ 
+         # 计算有效的单元格数量
+         total = len(valid_cells)
+         remainder = total % header_count
+         if remainder > 0 and remainder <= 3:
+             # 截断尾部多余的单元格
+             valid_cells = valid_cells[:total - remainder]
+         elif remainder > 3:
+             # 余数太大，可能列数检测错误
+             return self._normalize_table_rows(rows)
+ 
+         # 重新组织成多行
+         fixed_rows: List[Dict[str, Any]] = []
+         for i in range(0, len(valid_cells), header_count):
+             row_cells = valid_cells[i:i + header_count]
+             # 标记第一行为表头
+             if i == 0:
+                 for cell in row_cells:
+                     cell["header"] = True
+             fixed_rows.append({"cells": row_cells})
+ 
+         return fixed_rows if fixed_rows else self._normalize_table_rows(rows)
+ 
+     def _flatten_all_cells_recursive(self, cells: List[Any]) -> List[Dict[str, Any]]:
+         """
+         递归展平所有嵌套的单元格结构。
+ 
+         参数:
+             cells: 可能包含嵌套结构的单元格数组。
+ 
+         返回:
+             List[Dict]: 展平后的单元格数组，每个单元格都有blocks。
+         """
+         if not cells:
+             return []
+ 
+         flattened: List[Dict[str, Any]] = []
+ 
+         def _extract_cells(cell_or_list: Any) -> None:
+             if not isinstance(cell_or_list, dict):
+                 if isinstance(cell_or_list, (str, int, float)):
+                     flattened.append({"blocks": [self._as_paragraph_block(str(cell_or_list))]})
+                 return
+ 
+             # 如果当前对象有 blocks，说明它是一个有效的单元格
+             if "blocks" in cell_or_list:
+                 # 创建单元格副本，移除嵌套的 cells
+                 clean_cell = {
+                     k: v for k, v in cell_or_list.items()
+                     if k != "cells"
+                 }
+                 # 确保blocks有效
+                 blocks = clean_cell.get("blocks")
+                 if not isinstance(blocks, list) or not blocks:
+                     clean_cell["blocks"] = [self._as_paragraph_block("")]
+                 flattened.append(clean_cell)
+ 
+             # 如果当前对象有嵌套的 cells，递归处理
+             nested_cells = cell_or_list.get("cells")
+             if isinstance(nested_cells, list):
+                 for nested_cell in nested_cells:
+                     _extract_cells(nested_cell)
+ 
+         for cell in cells:
+             _extract_cells(cell)
+ 
+         return flattened
+ 
     def _sanitize_engine_quote_block(self, block: Dict[str, Any]):
         """engineQuote仅用于单Agent发言，内部仅允许paragraph且title需锁定Agent名称"""
         engine_raw = block.get("engine")
--- a/ReportEngine/renderers/html_renderer.py
View file @95de1d6
+++ b/ReportEngine/renderers/html_renderer.py
View file @95de1d6
@@ -1318,12 +1318,13 @@ class HTMLRenderer:
             """获取单元格的文本内容"""
             blocks = cell.get("blocks", [])
             for block in blocks:
-                 if block.get("type") == "paragraph":
+                 if isinstance(block, dict) and block.get("type") == "paragraph":
                     inlines = block.get("inlines", [])
                     for inline in inlines:
-                         text = inline.get("text", "")
-                         if text:
-                             return text.strip()
+                         if isinstance(inline, dict):
+                             text = inline.get("text", "")
+                             if text:
+                                 return str(text).strip()
             return ""
 
         def _is_placeholder_cell(cell: Dict[str, Any]) -> bool:
@@ -1337,21 +1338,31 @@ class HTMLRenderer:
         if len(all_cells) <= 2:
             return rows
 
-         # 检测表头列数：查找带有 bold 标记的单元格
+         # 检测表头列数：查找带有 bold 标记或典型表头词的单元格
         def _is_header_cell(cell: Dict[str, Any]) -> bool:
-             """判断单元格是否像表头（通常有加粗标记）"""
+             """判断单元格是否像表头（有加粗标记或是典型表头词）"""
             blocks = cell.get("blocks", [])
             for block in blocks:
-                 if block.get("type") == "paragraph":
+                 if isinstance(block, dict) and block.get("type") == "paragraph":
                     inlines = block.get("inlines", [])
                     for inline in inlines:
-                         marks = inline.get("marks", [])
-                         if any(m.get("type") == "bold" for m in marks):
-                             return True
-             return False
- 
-         # 计算表头列数：统计连续的加粗单元格数量
-         # 占位符已经在前面被过滤掉了
+                         if isinstance(inline, dict):
+                             marks = inline.get("marks", [])
+                             if any(isinstance(m, dict) and m.get("type") == "bold" for m in marks):
+                                 return True
+             # 也检查典型的表头词
+             text = _get_cell_text(cell)
+             header_keywords = {
+                 "时间", "日期", "名称", "类型", "状态", "数量", "金额", "比例", "指标",
+                 "平台", "渠道", "来源", "描述", "说明", "备注", "序号", "编号",
+                 "事件", "关键", "数据", "支撑", "反应", "市场", "情感", "节点",
+                 "维度", "要点", "详情", "标签", "影响", "趋势", "权重", "类别",
+                 "信息", "内容", "风格", "偏好", "主要", "用户", "核心", "特征",
+                 "分类", "范围", "对象", "项目", "阶段", "周期", "频率", "等级",
+             }
+             return any(kw in text for kw in header_keywords) and len(text) <= 20
+ 
+         # 计算表头列数：统计连续的表头单元格数量
         header_count = 0
         for cell in all_cells:
             if _is_header_cell(cell):
@@ -1364,13 +1375,13 @@ class HTMLRenderer:
         if header_count == 0:
             # 假设列数为 4 或 5（常见的表格列数）
             total = len(all_cells)
-             for possible_cols in [4, 5, 3, 6]:
+             for possible_cols in [4, 5, 3, 6, 2]:
                 if total % possible_cols == 0:
                     header_count = possible_cols
                     break
             else:
                 # 尝试找到最接近的能整除的列数
-                 for possible_cols in [4, 5, 3, 6]:
+                 for possible_cols in [4, 5, 3, 6, 2]:
                     remainder = total % possible_cols
                     # 允许最多3个多余的单元格（可能是尾部的总结或注释）
                     if remainder <= 3:
--- a/ReportEngine/renderers/markdown_renderer.py
View file @95de1d6
+++ b/ReportEngine/renderers/markdown_renderer.py
View file @95de1d6
@@ -254,12 +254,13 @@ class MarkdownRenderer:
             """获取单元格的文本内容"""
             blocks = cell.get("blocks", [])
             for block in blocks:
-                 if block.get("type") == "paragraph":
+                 if isinstance(block, dict) and block.get("type") == "paragraph":
                     inlines = block.get("inlines", [])
                     for inline in inlines:
-                         text = inline.get("text", "")
-                         if text:
-                             return text.strip()
+                         if isinstance(inline, dict):
+                             text = inline.get("text", "")
+                             if text:
+                                 return str(text).strip()
             return ""
 
         def _is_placeholder_cell(cell: Dict[str, Any]) -> bool:
@@ -273,21 +274,31 @@ class MarkdownRenderer:
         if len(all_cells) <= 2:
             return rows
 
-         # 检测表头列数：查找带有 bold 标记的单元格
+         # 检测表头列数：查找带有 bold 标记或典型表头词的单元格
         def _is_header_cell(cell: Dict[str, Any]) -> bool:
-             """判断单元格是否像表头（通常有加粗标记）"""
+             """判断单元格是否像表头（有加粗标记或是典型表头词）"""
             blocks = cell.get("blocks", [])
             for block in blocks:
-                 if block.get("type") == "paragraph":
+                 if isinstance(block, dict) and block.get("type") == "paragraph":
                     inlines = block.get("inlines", [])
                     for inline in inlines:
-                         marks = inline.get("marks", [])
-                         if any(m.get("type") == "bold" for m in marks):
-                             return True
-             return False
- 
-         # 计算表头列数：统计连续的加粗单元格数量
-         # 占位符已经在前面被过滤掉了
+                         if isinstance(inline, dict):
+                             marks = inline.get("marks", [])
+                             if any(isinstance(m, dict) and m.get("type") == "bold" for m in marks):
+                                 return True
+             # 也检查典型的表头词
+             text = _get_cell_text(cell)
+             header_keywords = {
+                 "时间", "日期", "名称", "类型", "状态", "数量", "金额", "比例", "指标",
+                 "平台", "渠道", "来源", "描述", "说明", "备注", "序号", "编号",
+                 "事件", "关键", "数据", "支撑", "反应", "市场", "情感", "节点",
+                 "维度", "要点", "详情", "标签", "影响", "趋势", "权重", "类别",
+                 "信息", "内容", "风格", "偏好", "主要", "用户", "核心", "特征",
+                 "分类", "范围", "对象", "项目", "阶段", "周期", "频率", "等级",
+             }
+             return any(kw in text for kw in header_keywords) and len(text) <= 20
+ 
+         # 计算表头列数：统计连续的表头单元格数量
         header_count = 0
         for cell in all_cells:
             if _is_header_cell(cell):
@@ -300,13 +311,13 @@ class MarkdownRenderer:
         if header_count == 0:
             # 假设列数为 4 或 5（常见的表格列数）
             total = len(all_cells)
-             for possible_cols in [4, 5, 3, 6]:
+             for possible_cols in [4, 5, 3, 6, 2]:
                 if total % possible_cols == 0:
                     header_count = possible_cols
                     break
             else:
                 # 尝试找到最接近的能整除的列数
-                 for possible_cols in [4, 5, 3, 6]:
+                 for possible_cols in [4, 5, 3, 6, 2]:
                     remainder = total % possible_cols
                     # 允许最多3个多余的单元格（可能是尾部的总结或注释）
                     if remainder <= 3: