马一丁

Further fixes to table rendering logic

@@ -1168,9 +1168,192 @@ class ChapterGenerationNode(BaseNode): @@ -1168,9 +1168,192 @@ class ChapterGenerationNode(BaseNode):
1168 1168
def _sanitize_table_block(self, block: Dict[str, Any]):
    """Ensure the table's rows/cells structure is valid; each cell should hold at least one block.

    Two paths are taken depending on the shape of ``block["rows"]``:

    * If there is exactly one row and at least one of its cells is a dict
      that carries a nested ``"cells"`` key but no ``"blocks"`` key, the
      row is considered wrongly nested and is rebuilt through
      ``_fix_nested_rows_structure``.
    * Otherwise the rows go through ``_normalize_table_rows``.

    The result is written back to ``block["rows"]`` in place; nothing is
    returned.

    Args:
        block: The table block to sanitize (mutated in place).
    """
    raw_rows = block.get("rows")

    # Only a single-row table can exhibit the "everything nested in one row" defect.
    if isinstance(raw_rows, list) and len(raw_rows) == 1 and isinstance(raw_rows[0], dict):
        # dict.get() only falls back to its default when the key is missing;
        # an explicit null ("cells": None) must not be iterated.
        cells = raw_rows[0].get("cells")
        if isinstance(cells, (list, tuple)) and any(
            isinstance(cell, dict) and "cells" in cell and "blocks" not in cell
            for cell in cells
        ):
            # Nested row structure detected: rebuild it into proper rows.
            block["rows"] = self._fix_nested_rows_structure(raw_rows)
            return

    # Regular case: standard normalization.
    block["rows"] = self._normalize_table_rows(raw_rows)
1173 1191
  1192 + def _fix_nested_rows_structure(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
  1193 + """
  1194 + 修复嵌套错误的表格行结构。
  1195 +
  1196 + LLM生成的表格只有1行但所有数据被嵌套在cells中时,
  1197 + 本方法会展平所有单元格并重新组织成正确的多行结构。
  1198 +
  1199 + 参数:
  1200 + rows: 原始的表格行数组(应该只有1行)。
  1201 +
  1202 + 返回:
  1203 + List[Dict]: 修复后的多行表格结构。
  1204 + """
  1205 + if not rows or len(rows) != 1:
  1206 + return self._normalize_table_rows(rows)
  1207 +
  1208 + first_row = rows[0]
  1209 + original_cells = first_row.get("cells", [])
  1210 +
  1211 + # 递归展平所有嵌套的单元格
  1212 + all_cells = self._flatten_all_cells_recursive(original_cells)
  1213 +
  1214 + if len(all_cells) <= 1:
  1215 + return self._normalize_table_rows(rows)
  1216 +
  1217 + # 辅助函数:获取单元格文本
  1218 + def _get_cell_text(cell: Dict[str, Any]) -> str:
  1219 + blocks = cell.get("blocks", [])
  1220 + for block in blocks:
  1221 + if isinstance(block, dict) and block.get("type") == "paragraph":
  1222 + inlines = block.get("inlines", [])
  1223 + for inline in inlines:
  1224 + if isinstance(inline, dict):
  1225 + text = inline.get("text", "")
  1226 + if text:
  1227 + return str(text).strip()
  1228 + return ""
  1229 +
  1230 + def _is_placeholder_cell(cell: Dict[str, Any]) -> bool:
  1231 + """判断单元格是否是占位符"""
  1232 + text = _get_cell_text(cell)
  1233 + return text in ("--", "-", "—", "——", "", "N/A", "n/a")
  1234 +
  1235 + def _is_header_cell(cell: Dict[str, Any]) -> bool:
  1236 + """判断单元格是否像表头(通常有加粗标记或是典型表头词)"""
  1237 + blocks = cell.get("blocks", [])
  1238 + for block in blocks:
  1239 + if isinstance(block, dict) and block.get("type") == "paragraph":
  1240 + inlines = block.get("inlines", [])
  1241 + for inline in inlines:
  1242 + if isinstance(inline, dict):
  1243 + marks = inline.get("marks", [])
  1244 + if any(isinstance(m, dict) and m.get("type") == "bold" for m in marks):
  1245 + return True
  1246 + # 也检查典型的表头词
  1247 + text = _get_cell_text(cell)
  1248 + header_keywords = {
  1249 + "时间", "日期", "名称", "类型", "状态", "数量", "金额", "比例", "指标",
  1250 + "平台", "渠道", "来源", "描述", "说明", "备注", "序号", "编号",
  1251 + "事件", "关键", "数据", "支撑", "反应", "市场", "情感", "节点",
  1252 + "维度", "要点", "详情", "标签", "影响", "趋势", "权重", "类别",
  1253 + "信息", "内容", "风格", "偏好", "主要", "用户", "核心", "特征",
  1254 + "分类", "范围", "对象", "项目", "阶段", "周期", "频率", "等级",
  1255 + }
  1256 + return any(kw in text for kw in header_keywords) and len(text) <= 20
  1257 +
  1258 + # 过滤掉占位符单元格
  1259 + valid_cells = [c for c in all_cells if not _is_placeholder_cell(c)]
  1260 +
  1261 + if len(valid_cells) <= 1:
  1262 + return self._normalize_table_rows(rows)
  1263 +
  1264 + # 检测表头列数:统计连续的表头单元格数量
  1265 + header_count = 0
  1266 + for cell in valid_cells:
  1267 + if _is_header_cell(cell):
  1268 + header_count += 1
  1269 + else:
  1270 + break
  1271 +
  1272 + # 如果没有检测到表头,使用启发式方法
  1273 + if header_count == 0:
  1274 + total = len(valid_cells)
  1275 + for possible_cols in [4, 5, 3, 6, 2]:
  1276 + if total % possible_cols == 0:
  1277 + header_count = possible_cols
  1278 + break
  1279 + else:
  1280 + # 尝试找到最接近的能整除的列数
  1281 + for possible_cols in [4, 5, 3, 6, 2]:
  1282 + remainder = total % possible_cols
  1283 + if remainder <= 3:
  1284 + header_count = possible_cols
  1285 + break
  1286 + else:
  1287 + # 无法确定列数,使用原始数据
  1288 + return self._normalize_table_rows(rows)
  1289 +
  1290 + # 计算有效的单元格数量
  1291 + total = len(valid_cells)
  1292 + remainder = total % header_count
  1293 + if remainder > 0 and remainder <= 3:
  1294 + # 截断尾部多余的单元格
  1295 + valid_cells = valid_cells[:total - remainder]
  1296 + elif remainder > 3:
  1297 + # 余数太大,可能列数检测错误
  1298 + return self._normalize_table_rows(rows)
  1299 +
  1300 + # 重新组织成多行
  1301 + fixed_rows: List[Dict[str, Any]] = []
  1302 + for i in range(0, len(valid_cells), header_count):
  1303 + row_cells = valid_cells[i:i + header_count]
  1304 + # 标记第一行为表头
  1305 + if i == 0:
  1306 + for cell in row_cells:
  1307 + cell["header"] = True
  1308 + fixed_rows.append({"cells": row_cells})
  1309 +
  1310 + return fixed_rows if fixed_rows else self._normalize_table_rows(rows)
  1311 +
  1312 + def _flatten_all_cells_recursive(self, cells: List[Any]) -> List[Dict[str, Any]]:
  1313 + """
  1314 + 递归展平所有嵌套的单元格结构。
  1315 +
  1316 + 参数:
  1317 + cells: 可能包含嵌套结构的单元格数组。
  1318 +
  1319 + 返回:
  1320 + List[Dict]: 展平后的单元格数组,每个单元格都有blocks
  1321 + """
  1322 + if not cells:
  1323 + return []
  1324 +
  1325 + flattened: List[Dict[str, Any]] = []
  1326 +
  1327 + def _extract_cells(cell_or_list: Any) -> None:
  1328 + if not isinstance(cell_or_list, dict):
  1329 + if isinstance(cell_or_list, (str, int, float)):
  1330 + flattened.append({"blocks": [self._as_paragraph_block(str(cell_or_list))]})
  1331 + return
  1332 +
  1333 + # 如果当前对象有 blocks,说明它是一个有效的单元格
  1334 + if "blocks" in cell_or_list:
  1335 + # 创建单元格副本,移除嵌套的 cells
  1336 + clean_cell = {
  1337 + k: v for k, v in cell_or_list.items()
  1338 + if k != "cells"
  1339 + }
  1340 + # 确保blocks有效
  1341 + blocks = clean_cell.get("blocks")
  1342 + if not isinstance(blocks, list) or not blocks:
  1343 + clean_cell["blocks"] = [self._as_paragraph_block("")]
  1344 + flattened.append(clean_cell)
  1345 +
  1346 + # 如果当前对象有嵌套的 cells,递归处理
  1347 + nested_cells = cell_or_list.get("cells")
  1348 + if isinstance(nested_cells, list):
  1349 + for nested_cell in nested_cells:
  1350 + _extract_cells(nested_cell)
  1351 +
  1352 + for cell in cells:
  1353 + _extract_cells(cell)
  1354 +
  1355 + return flattened
  1356 +
1174 def _sanitize_engine_quote_block(self, block: Dict[str, Any]): 1357 def _sanitize_engine_quote_block(self, block: Dict[str, Any]):
1175 """engineQuote仅用于单Agent发言,内部仅允许paragraphtitle需锁定Agent名称""" 1358 """engineQuote仅用于单Agent发言,内部仅允许paragraphtitle需锁定Agent名称"""
1176 engine_raw = block.get("engine") 1359 engine_raw = block.get("engine")
@@ -1318,12 +1318,13 @@ class HTMLRenderer: @@ -1318,12 +1318,13 @@ class HTMLRenderer:
1318 """获取单元格的文本内容""" 1318 """获取单元格的文本内容"""
1319 blocks = cell.get("blocks", []) 1319 blocks = cell.get("blocks", [])
1320 for block in blocks: 1320 for block in blocks:
1321 - if block.get("type") == "paragraph": 1321 + if isinstance(block, dict) and block.get("type") == "paragraph":
1322 inlines = block.get("inlines", []) 1322 inlines = block.get("inlines", [])
1323 for inline in inlines: 1323 for inline in inlines:
1324 - text = inline.get("text", "")  
1325 - if text:  
1326 - return text.strip() 1324 + if isinstance(inline, dict):
  1325 + text = inline.get("text", "")
  1326 + if text:
  1327 + return str(text).strip()
1327 return "" 1328 return ""
1328 1329
1329 def _is_placeholder_cell(cell: Dict[str, Any]) -> bool: 1330 def _is_placeholder_cell(cell: Dict[str, Any]) -> bool:
@@ -1337,21 +1338,31 @@ class HTMLRenderer: @@ -1337,21 +1338,31 @@ class HTMLRenderer:
1337 if len(all_cells) <= 2: 1338 if len(all_cells) <= 2:
1338 return rows 1339 return rows
1339 1340
1340 - # 检测表头列数:查找带有 bold 标记的单元格 1341 + # 检测表头列数:查找带有 bold 标记或典型表头词的单元格
1341 def _is_header_cell(cell: Dict[str, Any]) -> bool: 1342 def _is_header_cell(cell: Dict[str, Any]) -> bool:
1342 - """判断单元格是否像表头(通常有加粗标记)""" 1343 + """判断单元格是否像表头(有加粗标记或是典型表头词)"""
1343 blocks = cell.get("blocks", []) 1344 blocks = cell.get("blocks", [])
1344 for block in blocks: 1345 for block in blocks:
1345 - if block.get("type") == "paragraph": 1346 + if isinstance(block, dict) and block.get("type") == "paragraph":
1346 inlines = block.get("inlines", []) 1347 inlines = block.get("inlines", [])
1347 for inline in inlines: 1348 for inline in inlines:
1348 - marks = inline.get("marks", [])  
1349 - if any(m.get("type") == "bold" for m in marks):  
1350 - return True  
1351 - return False  
1352 -  
1353 - # 计算表头列数:统计连续的加粗单元格数量  
1354 - # 占位符已经在前面被过滤掉了 1349 + if isinstance(inline, dict):
  1350 + marks = inline.get("marks", [])
  1351 + if any(isinstance(m, dict) and m.get("type") == "bold" for m in marks):
  1352 + return True
  1353 + # 也检查典型的表头词
  1354 + text = _get_cell_text(cell)
  1355 + header_keywords = {
  1356 + "时间", "日期", "名称", "类型", "状态", "数量", "金额", "比例", "指标",
  1357 + "平台", "渠道", "来源", "描述", "说明", "备注", "序号", "编号",
  1358 + "事件", "关键", "数据", "支撑", "反应", "市场", "情感", "节点",
  1359 + "维度", "要点", "详情", "标签", "影响", "趋势", "权重", "类别",
  1360 + "信息", "内容", "风格", "偏好", "主要", "用户", "核心", "特征",
  1361 + "分类", "范围", "对象", "项目", "阶段", "周期", "频率", "等级",
  1362 + }
  1363 + return any(kw in text for kw in header_keywords) and len(text) <= 20
  1364 +
  1365 + # 计算表头列数:统计连续的表头单元格数量
1355 header_count = 0 1366 header_count = 0
1356 for cell in all_cells: 1367 for cell in all_cells:
1357 if _is_header_cell(cell): 1368 if _is_header_cell(cell):
@@ -1364,13 +1375,13 @@ class HTMLRenderer: @@ -1364,13 +1375,13 @@ class HTMLRenderer:
1364 if header_count == 0: 1375 if header_count == 0:
1365 # 假设列数为 4 或 5(常见的表格列数) 1376 # 假设列数为 4 或 5(常见的表格列数)
1366 total = len(all_cells) 1377 total = len(all_cells)
1367 - for possible_cols in [4, 5, 3, 6]: 1378 + for possible_cols in [4, 5, 3, 6, 2]:
1368 if total % possible_cols == 0: 1379 if total % possible_cols == 0:
1369 header_count = possible_cols 1380 header_count = possible_cols
1370 break 1381 break
1371 else: 1382 else:
1372 # 尝试找到最接近的能整除的列数 1383 # 尝试找到最接近的能整除的列数
1373 - for possible_cols in [4, 5, 3, 6]: 1384 + for possible_cols in [4, 5, 3, 6, 2]:
1374 remainder = total % possible_cols 1385 remainder = total % possible_cols
1375 # 允许最多3个多余的单元格(可能是尾部的总结或注释) 1386 # 允许最多3个多余的单元格(可能是尾部的总结或注释)
1376 if remainder <= 3: 1387 if remainder <= 3:
@@ -254,12 +254,13 @@ class MarkdownRenderer: @@ -254,12 +254,13 @@ class MarkdownRenderer:
254 """获取单元格的文本内容""" 254 """获取单元格的文本内容"""
255 blocks = cell.get("blocks", []) 255 blocks = cell.get("blocks", [])
256 for block in blocks: 256 for block in blocks:
257 - if block.get("type") == "paragraph": 257 + if isinstance(block, dict) and block.get("type") == "paragraph":
258 inlines = block.get("inlines", []) 258 inlines = block.get("inlines", [])
259 for inline in inlines: 259 for inline in inlines:
260 - text = inline.get("text", "")  
261 - if text:  
262 - return text.strip() 260 + if isinstance(inline, dict):
  261 + text = inline.get("text", "")
  262 + if text:
  263 + return str(text).strip()
263 return "" 264 return ""
264 265
265 def _is_placeholder_cell(cell: Dict[str, Any]) -> bool: 266 def _is_placeholder_cell(cell: Dict[str, Any]) -> bool:
@@ -273,21 +274,31 @@ class MarkdownRenderer: @@ -273,21 +274,31 @@ class MarkdownRenderer:
273 if len(all_cells) <= 2: 274 if len(all_cells) <= 2:
274 return rows 275 return rows
275 276
276 - # 检测表头列数:查找带有 bold 标记的单元格 277 + # 检测表头列数:查找带有 bold 标记或典型表头词的单元格
277 def _is_header_cell(cell: Dict[str, Any]) -> bool: 278 def _is_header_cell(cell: Dict[str, Any]) -> bool:
278 - """判断单元格是否像表头(通常有加粗标记)""" 279 + """判断单元格是否像表头(有加粗标记或是典型表头词)"""
279 blocks = cell.get("blocks", []) 280 blocks = cell.get("blocks", [])
280 for block in blocks: 281 for block in blocks:
281 - if block.get("type") == "paragraph": 282 + if isinstance(block, dict) and block.get("type") == "paragraph":
282 inlines = block.get("inlines", []) 283 inlines = block.get("inlines", [])
283 for inline in inlines: 284 for inline in inlines:
284 - marks = inline.get("marks", [])  
285 - if any(m.get("type") == "bold" for m in marks):  
286 - return True  
287 - return False  
288 -  
289 - # 计算表头列数:统计连续的加粗单元格数量  
290 - # 占位符已经在前面被过滤掉了 285 + if isinstance(inline, dict):
  286 + marks = inline.get("marks", [])
  287 + if any(isinstance(m, dict) and m.get("type") == "bold" for m in marks):
  288 + return True
  289 + # 也检查典型的表头词
  290 + text = _get_cell_text(cell)
  291 + header_keywords = {
  292 + "时间", "日期", "名称", "类型", "状态", "数量", "金额", "比例", "指标",
  293 + "平台", "渠道", "来源", "描述", "说明", "备注", "序号", "编号",
  294 + "事件", "关键", "数据", "支撑", "反应", "市场", "情感", "节点",
  295 + "维度", "要点", "详情", "标签", "影响", "趋势", "权重", "类别",
  296 + "信息", "内容", "风格", "偏好", "主要", "用户", "核心", "特征",
  297 + "分类", "范围", "对象", "项目", "阶段", "周期", "频率", "等级",
  298 + }
  299 + return any(kw in text for kw in header_keywords) and len(text) <= 20
  300 +
  301 + # 计算表头列数:统计连续的表头单元格数量
291 header_count = 0 302 header_count = 0
292 for cell in all_cells: 303 for cell in all_cells:
293 if _is_header_cell(cell): 304 if _is_header_cell(cell):
@@ -300,13 +311,13 @@ class MarkdownRenderer: @@ -300,13 +311,13 @@ class MarkdownRenderer:
300 if header_count == 0: 311 if header_count == 0:
301 # 假设列数为 4 或 5(常见的表格列数) 312 # 假设列数为 4 或 5(常见的表格列数)
302 total = len(all_cells) 313 total = len(all_cells)
303 - for possible_cols in [4, 5, 3, 6]: 314 + for possible_cols in [4, 5, 3, 6, 2]:
304 if total % possible_cols == 0: 315 if total % possible_cols == 0:
305 header_count = possible_cols 316 header_count = possible_cols
306 break 317 break
307 else: 318 else:
308 # 尝试找到最接近的能整除的列数 319 # 尝试找到最接近的能整除的列数
309 - for possible_cols in [4, 5, 3, 6]: 320 + for possible_cols in [4, 5, 3, 6, 2]:
310 remainder = total % possible_cols 321 remainder = total % possible_cols
311 # 允许最多3个多余的单元格(可能是尾部的总结或注释) 322 # 允许最多3个多余的单元格(可能是尾部的总结或注释)
312 if remainder <= 3: 323 if remainder <= 3: