马一丁

Fixed an issue where tables might only display as one row

@@ -1329,8 +1329,84 @@ class HTMLRenderer: @@ -1329,8 +1329,84 @@ class HTMLRenderer:
1329 返回: 1329 返回:
1330 List[Dict]: 修复后的表格行数组。 1330 List[Dict]: 修复后的表格行数组。
1331 """ 1331 """
1332 - if not rows or len(rows) != 1:  
1333 - # 只处理只有1行的异常情况 1332 + if not rows:
  1333 + return []
  1334 +
  1335 + # 辅助函数:获取单元格文本
  1336 + def _get_cell_text(cell: Dict[str, Any]) -> str:
  1337 + """获取单元格的文本内容"""
  1338 + blocks = cell.get("blocks", [])
  1339 + for block in blocks:
  1340 + if isinstance(block, dict) and block.get("type") == "paragraph":
  1341 + inlines = block.get("inlines", [])
  1342 + for inline in inlines:
  1343 + if isinstance(inline, dict):
  1344 + text = inline.get("text", "")
  1345 + if text:
  1346 + return str(text).strip()
  1347 + return ""
  1348 +
  1349 + def _is_placeholder_cell(cell: Dict[str, Any]) -> bool:
  1350 + """判断单元格是否是占位符(如 '--', '-', '—' 等)"""
  1351 + text = _get_cell_text(cell)
  1352 + return text in ("--", "-", "—", "——", "", "N/A", "n/a")
  1353 +
  1354 + def _is_heading_like_cell(cell: Dict[str, Any]) -> bool:
  1355 + """检测是否疑似被错误并入表格的章节/标题单元格"""
  1356 + text = _get_cell_text(cell)
  1357 + if not text:
  1358 + return False
  1359 + stripped = text.strip()
  1360 + # 章节号或“第X章/部分”常见格式,避免误删正常数字值
  1361 + heading_patterns = (
  1362 + r"^\d{1,2}(?:\.\d{1,2}){1,3}\s+",
  1363 + r"^第[一二三四五六七八九十]+[章节部分]",
  1364 + )
  1365 + return any(re.match(pat, stripped) for pat in heading_patterns)
  1366 +
  1367 + # 第一阶段:处理“有表头行 + 数据被串在一行”的情况
  1368 + header_cells = self._flatten_nested_cells((rows[0] or {}).get("cells", []))
  1369 + header_count = len(header_cells)
  1370 + overflow_fixed = None
  1371 + if header_count >= 2:
  1372 + rebuilt_rows: List[Dict[str, Any]] = [
  1373 + {
  1374 + **{k: v for k, v in (rows[0] or {}).items() if k != "cells"},
  1375 + "cells": header_cells,
  1376 + }
  1377 + ]
  1378 + changed = False
  1379 + for row in rows[1:]:
  1380 + cells = self._flatten_nested_cells((row or {}).get("cells", []))
  1381 + cell_count = len(cells)
  1382 + if cell_count <= header_count:
  1383 + rebuilt_rows.append({**{k: v for k, v in (row or {}).items() if k != "cells"}, "cells": cells})
  1384 + continue
  1385 +
  1386 + remainder = cell_count % header_count
  1387 + trimmed_cells = cells
  1388 + if remainder:
  1389 + trailing = cells[-remainder:]
  1390 + if all(_is_placeholder_cell(c) or _is_heading_like_cell(c) for c in trailing):
  1391 + trimmed_cells = cells[:-remainder]
  1392 + remainder = 0
  1393 +
  1394 + if remainder == 0 and len(trimmed_cells) >= header_count * 2:
  1395 + for i in range(0, len(trimmed_cells), header_count):
  1396 + chunk = trimmed_cells[i : i + header_count]
  1397 + rebuilt_rows.append({"cells": chunk})
  1398 + changed = True
  1399 + else:
  1400 + rebuilt_rows.append({**{k: v for k, v in (row or {}).items() if k != "cells"}, "cells": cells})
  1401 +
  1402 + if changed:
  1403 + overflow_fixed = rebuilt_rows
  1404 +
  1405 + if overflow_fixed is not None:
  1406 + rows = overflow_fixed
  1407 +
  1408 + if len(rows) != 1:
  1409 + # 只有一行的异常情况由后续逻辑处理;正常多行直接返回
1334 return rows 1410 return rows
1335 1411
1336 first_row = rows[0] 1412 first_row = rows[0]
@@ -1353,25 +1429,6 @@ class HTMLRenderer: @@ -1353,25 +1429,6 @@ class HTMLRenderer:
1353 # 单元格太少,不需要重组 1429 # 单元格太少,不需要重组
1354 return rows 1430 return rows
1355 1431
1356 - # 辅助函数:获取单元格文本  
1357 - def _get_cell_text(cell: Dict[str, Any]) -> str:  
1358 - """获取单元格的文本内容"""  
1359 - blocks = cell.get("blocks", [])  
1360 - for block in blocks:  
1361 - if isinstance(block, dict) and block.get("type") == "paragraph":  
1362 - inlines = block.get("inlines", [])  
1363 - for inline in inlines:  
1364 - if isinstance(inline, dict):  
1365 - text = inline.get("text", "")  
1366 - if text:  
1367 - return str(text).strip()  
1368 - return ""  
1369 -  
1370 - def _is_placeholder_cell(cell: Dict[str, Any]) -> bool:  
1371 - """判断单元格是否是占位符(如 '--', '-', '—' 等)"""  
1372 - text = _get_cell_text(cell)  
1373 - return text in ("--", "-", "—", "——", "", "N/A", "n/a")  
1374 -  
1375 # 先过滤掉占位符单元格 1432 # 先过滤掉占位符单元格
1376 all_cells = [c for c in all_cells if not _is_placeholder_cell(c)] 1433 all_cells = [c for c in all_cells if not _is_placeholder_cell(c)]
1377 1434