Fix table rendering logic

马一丁
Commit f4c59f228e487cf1f476d0b1b4081c077aab5534 f4c59f22 1 parent 6d98e935
Showing 2 changed files with 362 additions and 8 deletions
ReportEngine/renderers/html_renderer.py
ReportEngine/renderers/markdown_renderer.py
--- a/ReportEngine/renderers/html_renderer.py
View file @f4c59f2
+++ b/ReportEngine/renderers/html_renderer.py
View file @f4c59f2
@@ -1232,6 +1232,176 @@ class HTMLRenderer:
         class_attr = f' class="{extra_class}"' if extra_class else ""
         return f'<{tag}{class_attr}>{items_html}</{tag}>'
+    def _flatten_nested_cells(self, cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """
+        Flatten wrongly nested table cells into one flat list.
+
+        Some LLM-generated tables nest cells recursively, e.g. cells[0] is
+        fine, cells[1].cells[0] is fine, cells[1].cells[1].cells[0] is fine,
+        and so on. Every dict that has a "blocks" key is emitted as a cell
+        (a shallow copy without its "cells" key), in depth-first document
+        order. Dicts without "blocks" only act as wrappers around their
+        nested "cells", and non-dict entries are ignored.
+
+        Args:
+            cells: Cell list that may contain nested "cells" lists.
+
+        Returns:
+            List[Dict]: The flattened cells; an empty list for empty input.
+        """
+        if not cells:
+            return []
+
+        flattened: List[Dict[str, Any]] = []
+
+        # Walk with an explicit stack rather than recursion: the nesting depth
+        # grows with the number of cells in the malformed table, so a large
+        # table could otherwise exceed the interpreter's recursion limit.
+        # Entries are pushed in reverse so they are popped in document order.
+        pending: List[Any] = list(cells)[::-1]
+        while pending:
+            node = pending.pop()
+            if not isinstance(node, dict):
+                continue
+
+            # A dict carrying "blocks" is a real cell; keep everything except
+            # the nested "cells" so the result is flat.
+            if "blocks" in node:
+                flattened.append({k: v for k, v in node.items() if k != "cells"})
+
+            # Children go on top of the stack so they are visited before the
+            # remaining siblings, i.e. a depth-first pre-order walk.
+            nested_cells = node.get("cells")
+            if isinstance(nested_cells, list):
+                pending.extend(reversed(nested_cells))
+
+        return flattened
+
+    def _fix_nested_table_rows(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """
+        Repair a table whose rows were all nested inside its single row.
+
+        Some LLM-generated tables put the cells of every row inside the first
+        row, so the table has one row that holds all the data. This method
+        detects that shape, flattens the cells, drops placeholder cells,
+        guesses the column count and regroups the cells into rows of that
+        width.
+
+        The column count is the number of leading cells that carry a bold
+        mark. When no leading cell is bold, the widths 4, 5, 3 and 6 are
+        considered in that order: the first one that divides the cell count
+        wins, otherwise the one that leaves the fewest trailing cells.
+
+        Args:
+            rows: Original table rows. Only a single-row table whose cells
+                contain a nested "cells" list is rewritten.
+
+        Returns:
+            List[Dict]: The regrouped rows, with every cell of the first row
+            flagged "header"; or ``rows`` itself, unchanged, when the table
+            does not match the anomaly or cannot be regrouped sensibly.
+        """
+        # Only the "everything ended up in one row" anomaly is handled here.
+        if not rows or len(rows) != 1:
+            return rows
+
+        # Malformed input (non-dict row, missing or null "cells") is left for
+        # the caller to deal with instead of raising here.
+        first_row = rows[0]
+        if not isinstance(first_row, dict):
+            return rows
+        original_cells = first_row.get("cells")
+        if not isinstance(original_cells, list):
+            return rows
+
+        # A nested "cells" list inside any cell is the marker of the anomaly.
+        has_nested = any(
+            isinstance(cell, dict) and isinstance(cell.get("cells"), list)
+            for cell in original_cells
+        )
+        if not has_nested:
+            return rows
+
+        all_cells = self._flatten_nested_cells(original_cells)
+
+        if len(all_cells) <= 2:
+            # Too few cells to be worth regrouping.
+            return rows
+
+        def _paragraph_inlines(cell: Dict[str, Any]):
+            """Yield the dict inlines of the cell's paragraph blocks."""
+            blocks = cell.get("blocks")
+            if not isinstance(blocks, (list, tuple)):
+                return
+            for block in blocks:
+                if not isinstance(block, dict) or block.get("type") != "paragraph":
+                    continue
+                inlines = block.get("inlines")
+                if not isinstance(inlines, (list, tuple)):
+                    continue
+                for inline in inlines:
+                    if isinstance(inline, dict):
+                        yield inline
+
+        def _get_cell_text(cell: Dict[str, Any]) -> str:
+            """Return the first non-blank inline text of the cell, stripped."""
+            for inline in _paragraph_inlines(cell):
+                text = inline.get("text", "")
+                if not isinstance(text, str):
+                    continue
+                stripped = text.strip()
+                # Skip whitespace-only inlines so a leading blank run does
+                # not make a cell with real text look empty.
+                if stripped:
+                    return stripped
+            return ""
+
+        def _is_placeholder_cell(cell: Dict[str, Any]) -> bool:
+            """Tell whether the cell only holds a filler such as '--' or '—'."""
+            return _get_cell_text(cell) in ("--", "-", "—", "——", "", "N/A", "n/a")
+
+        # Placeholder cells are removed before the column count is guessed.
+        all_cells = [c for c in all_cells if not _is_placeholder_cell(c)]
+
+        if len(all_cells) <= 2:
+            return rows
+
+        def _is_header_cell(cell: Dict[str, Any]) -> bool:
+            """Tell whether the cell looks like a header (has a bold mark)."""
+            for inline in _paragraph_inlines(cell):
+                marks = inline.get("marks")
+                if not isinstance(marks, (list, tuple)):
+                    continue
+                if any(isinstance(m, dict) and m.get("type") == "bold" for m in marks):
+                    return True
+            return False
+
+        # Column count = length of the leading run of bold cells; the first
+        # non-bold cell marks the start of the data area.
+        header_count = 0
+        for cell in all_cells:
+            if not _is_header_cell(cell):
+                break
+            header_count += 1
+
+        total = len(all_cells)
+
+        if header_count == 0:
+            # No bold header: fall back to common table widths. min() keeps
+            # the earliest candidate among ties, so an exact divisor is
+            # chosen in preference order and otherwise the width that
+            # discards the fewest trailing cells is used.
+            header_count = min((4, 5, 3, 6), key=lambda cols: total % cols)
+
+        remainder = total % header_count
+        if remainder > 3:
+            # Too many leftover cells: the detected width is probably wrong.
+            return rows
+        if remainder:
+            # Drop the few trailing cells that do not fill a row (possibly a
+            # summary or a note appended after the table data).
+            all_cells = all_cells[:total - remainder]
+
+        # Regroup into rows of header_count cells; the first row is the header.
+        fixed_rows: List[Dict[str, Any]] = []
+        for start in range(0, len(all_cells), header_count):
+            row_cells = all_cells[start:start + header_count]
+            if start == 0:
+                for cell in row_cells:
+                    cell["header"] = True
+            fixed_rows.append({"cells": row_cells})
+
+        return fixed_rows
+
     def _render_table(self, block: Dict[str, Any]) -> str:
         """
         渲染表格，同时保留caption与单元格属性。
@@ -1242,11 +1412,16 @@ class HTMLRenderer:
         返回:
             str: 包含<table>结构的HTML。
         """
-        rows = self._normalize_table_rows(block.get("rows") or [])
+        # 先修复可能存在的嵌套行结构问题
+        raw_rows = block.get("rows") or []
+        fixed_rows = self._fix_nested_table_rows(raw_rows)
+        rows = self._normalize_table_rows(fixed_rows)
         rows_html = ""
         for row in rows:
             row_cells = ""
-            for cell in row.get("cells", []):
+            # 展平可能存在的嵌套单元格结构（作为额外保护）
+            cells = self._flatten_nested_cells(row.get("cells", []))
+            for cell in cells:
                 cell_tag = "th" if cell.get("header") or cell.get("isHeader") else "td"
                 attr = []
                 if cell.get("rowspan"):
--- a/ReportEngine/renderers/markdown_renderer.py
View file @f4c59f2
+++ b/ReportEngine/renderers/markdown_renderer.py
View file @f4c59f2
@@ -168,24 +168,201 @@ class MarkdownRenderer:
                 lines.append(f"  {cont}")
         return "\n".join(lines)
+    def _flatten_nested_cells(self, cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """
+        Flatten wrongly nested table cells into one flat list.
+
+        Some LLM-generated tables nest cells recursively, e.g. cells[0] is
+        fine, cells[1].cells[0] is fine, cells[1].cells[1].cells[0] is fine,
+        and so on. Every dict that has a "blocks" key is emitted as a cell
+        (a shallow copy without its "cells" key), in depth-first document
+        order. Dicts without "blocks" only act as wrappers around their
+        nested "cells", and non-dict entries are ignored.
+
+        Args:
+            cells: Cell list that may contain nested "cells" lists.
+
+        Returns:
+            List[Dict]: The flattened cells; an empty list for empty input.
+        """
+        if not cells:
+            return []
+
+        flattened: List[Dict[str, Any]] = []
+
+        # Walk with an explicit stack rather than recursion: the nesting depth
+        # grows with the number of cells in the malformed table, so a large
+        # table could otherwise exceed the interpreter's recursion limit.
+        # Entries are pushed in reverse so they are popped in document order.
+        pending: List[Any] = list(cells)[::-1]
+        while pending:
+            node = pending.pop()
+            if not isinstance(node, dict):
+                continue
+
+            # A dict carrying "blocks" is a real cell; keep everything except
+            # the nested "cells" so the result is flat.
+            if "blocks" in node:
+                flattened.append({k: v for k, v in node.items() if k != "cells"})
+
+            # Children go on top of the stack so they are visited before the
+            # remaining siblings, i.e. a depth-first pre-order walk.
+            nested_cells = node.get("cells")
+            if isinstance(nested_cells, list):
+                pending.extend(reversed(nested_cells))
+
+        return flattened
+
+    def _fix_nested_table_rows(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """
+        Repair a table whose rows were all nested inside its single row.
+
+        Some LLM-generated tables put the cells of every row inside the first
+        row, so the table has one row that holds all the data. This method
+        detects that shape, flattens the cells, drops placeholder cells,
+        guesses the column count and regroups the cells into rows of that
+        width.
+
+        The column count is the number of leading cells that carry a bold
+        mark. When no leading cell is bold, the widths 4, 5, 3 and 6 are
+        considered in that order: the first one that divides the cell count
+        wins, otherwise the one that leaves the fewest trailing cells.
+
+        Args:
+            rows: Original table rows. Only a single-row table whose cells
+                contain a nested "cells" list is rewritten.
+
+        Returns:
+            List[Dict]: The regrouped rows, with every cell of the first row
+            flagged "header"; or ``rows`` itself, unchanged, when the table
+            does not match the anomaly or cannot be regrouped sensibly.
+        """
+        # Only the "everything ended up in one row" anomaly is handled here.
+        if not rows or len(rows) != 1:
+            return rows
+
+        # Malformed input (non-dict row, missing or null "cells") is left for
+        # the caller to deal with instead of raising here.
+        first_row = rows[0]
+        if not isinstance(first_row, dict):
+            return rows
+        original_cells = first_row.get("cells")
+        if not isinstance(original_cells, list):
+            return rows
+
+        # A nested "cells" list inside any cell is the marker of the anomaly.
+        has_nested = any(
+            isinstance(cell, dict) and isinstance(cell.get("cells"), list)
+            for cell in original_cells
+        )
+        if not has_nested:
+            return rows
+
+        all_cells = self._flatten_nested_cells(original_cells)
+
+        if len(all_cells) <= 2:
+            # Too few cells to be worth regrouping.
+            return rows
+
+        def _paragraph_inlines(cell: Dict[str, Any]):
+            """Yield the dict inlines of the cell's paragraph blocks."""
+            blocks = cell.get("blocks")
+            if not isinstance(blocks, (list, tuple)):
+                return
+            for block in blocks:
+                if not isinstance(block, dict) or block.get("type") != "paragraph":
+                    continue
+                inlines = block.get("inlines")
+                if not isinstance(inlines, (list, tuple)):
+                    continue
+                for inline in inlines:
+                    if isinstance(inline, dict):
+                        yield inline
+
+        def _get_cell_text(cell: Dict[str, Any]) -> str:
+            """Return the first non-blank inline text of the cell, stripped."""
+            for inline in _paragraph_inlines(cell):
+                text = inline.get("text", "")
+                if not isinstance(text, str):
+                    continue
+                stripped = text.strip()
+                # Skip whitespace-only inlines so a leading blank run does
+                # not make a cell with real text look empty.
+                if stripped:
+                    return stripped
+            return ""
+
+        def _is_placeholder_cell(cell: Dict[str, Any]) -> bool:
+            """Tell whether the cell only holds a filler such as '--' or '—'."""
+            return _get_cell_text(cell) in ("--", "-", "—", "——", "", "N/A", "n/a")
+
+        # Placeholder cells are removed before the column count is guessed.
+        all_cells = [c for c in all_cells if not _is_placeholder_cell(c)]
+
+        if len(all_cells) <= 2:
+            return rows
+
+        def _is_header_cell(cell: Dict[str, Any]) -> bool:
+            """Tell whether the cell looks like a header (has a bold mark)."""
+            for inline in _paragraph_inlines(cell):
+                marks = inline.get("marks")
+                if not isinstance(marks, (list, tuple)):
+                    continue
+                if any(isinstance(m, dict) and m.get("type") == "bold" for m in marks):
+                    return True
+            return False
+
+        # Column count = length of the leading run of bold cells; the first
+        # non-bold cell marks the start of the data area.
+        header_count = 0
+        for cell in all_cells:
+            if not _is_header_cell(cell):
+                break
+            header_count += 1
+
+        total = len(all_cells)
+
+        if header_count == 0:
+            # No bold header: fall back to common table widths. min() keeps
+            # the earliest candidate among ties, so an exact divisor is
+            # chosen in preference order and otherwise the width that
+            # discards the fewest trailing cells is used.
+            header_count = min((4, 5, 3, 6), key=lambda cols: total % cols)
+
+        remainder = total % header_count
+        if remainder > 3:
+            # Too many leftover cells: the detected width is probably wrong.
+            return rows
+        if remainder:
+            # Drop the few trailing cells that do not fill a row (possibly a
+            # summary or a note appended after the table data).
+            all_cells = all_cells[:total - remainder]
+
+        # Regroup into rows of header_count cells; the first row is the header.
+        fixed_rows: List[Dict[str, Any]] = []
+        for start in range(0, len(all_cells), header_count):
+            row_cells = all_cells[start:start + header_count]
+            if start == 0:
+                for cell in row_cells:
+                    cell["header"] = True
+            fixed_rows.append({"cells": row_cells})
+
+        return fixed_rows
+
     def _render_table(self, block: Dict[str, Any]) -> str:
-        rows = block.get("rows") or []
-        if not rows:
+        raw_rows = block.get("rows") or []
+        if not raw_rows:
             return ""
+        # 先修复可能存在的嵌套行结构问题
+        rows = self._fix_nested_table_rows(raw_rows)
+
         header_cells: List[str] = []
         body_rows: List[List[str]] = []
+        # 展平可能存在的嵌套单元格结构（作为额外保护）
+        first_row_cells_raw = rows[0].get("cells") if isinstance(rows[0], dict) else None
+        first_row_cells = self._flatten_nested_cells(first_row_cells_raw) if first_row_cells_raw else None
+
         # 检测首行是否声明为表头
-        first_row_cells = rows[0].get("cells") if isinstance(rows[0], dict) else None
         has_header = bool(first_row_cells and any(cell.get("header") or cell.get("isHeader") for cell in first_row_cells))
         # 计算最大列数，忽略rowspan
         col_count = 0
         for row in rows:
-            cells = row.get("cells") if isinstance(row, dict) else None
+            cells_raw = row.get("cells") if isinstance(row, dict) else None
+            cells = self._flatten_nested_cells(cells_raw) if cells_raw else []
             span = 0
-            for cell in cells or []:
+            for cell in cells:
                 span += int(cell.get("colspan") or 1)
             col_count = max(col_count, span)
@@ -198,7 +375,9 @@ class MarkdownRenderer:
         for row in rows:
             if not isinstance(row, dict):
                 continue
-            cells = row.get("cells") or []
+            cells_raw = row.get("cells") or []
+            # 展平可能存在的嵌套单元格结构
+            cells = self._flatten_nested_cells(cells_raw)
             row_cells: List[str] = []
             for cell in cells:
                 text = self._render_cell_content(cell)