Showing
3 changed files
with
238 additions
and
33 deletions
| @@ -1168,9 +1168,192 @@ class ChapterGenerationNode(BaseNode): | @@ -1168,9 +1168,192 @@ class ChapterGenerationNode(BaseNode): | ||
| 1168 | 1168 | ||
| 1169 | def _sanitize_table_block(self, block: Dict[str, Any]): | 1169 | def _sanitize_table_block(self, block: Dict[str, Any]): |
| 1170 | """保证表格的rows/cells结构合法且每个单元格包含至少一个block""" | 1170 | """保证表格的rows/cells结构合法且每个单元格包含至少一个block""" |
| 1171 | - rows = self._normalize_table_rows(block.get("rows")) | 1171 | + raw_rows = block.get("rows") |
| 1172 | + # 先检测是否存在嵌套行结构问题(只有1行但cells中有嵌套) | ||
| 1173 | + if isinstance(raw_rows, list) and len(raw_rows) == 1: | ||
| 1174 | + first_row = raw_rows[0] | ||
| 1175 | + if isinstance(first_row, dict): | ||
| 1176 | + cells = first_row.get("cells", []) | ||
| 1177 | + # 检测是否存在嵌套结构 | ||
| 1178 | + has_nested = any( | ||
| 1179 | + isinstance(cell, dict) and "cells" in cell and "blocks" not in cell | ||
| 1180 | + for cell in cells | ||
| 1181 | + if isinstance(cell, dict) | ||
| 1182 | + ) | ||
| 1183 | + if has_nested: | ||
| 1184 | + # 修复嵌套行结构 | ||
| 1185 | + fixed_rows = self._fix_nested_rows_structure(raw_rows) | ||
| 1186 | + block["rows"] = fixed_rows | ||
| 1187 | + return | ||
| 1188 | + # 正常情况下,使用标准规范化 | ||
| 1189 | + rows = self._normalize_table_rows(raw_rows) | ||
| 1172 | block["rows"] = rows | 1190 | block["rows"] = rows |
| 1173 | 1191 | ||
| 1192 | + def _fix_nested_rows_structure(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]: | ||
| 1193 | + """ | ||
| 1194 | + 修复嵌套错误的表格行结构。 | ||
| 1195 | + | ||
| 1196 | + 当LLM生成的表格只有1行但所有数据被嵌套在cells中时, | ||
| 1197 | + 本方法会展平所有单元格并重新组织成正确的多行结构。 | ||
| 1198 | + | ||
| 1199 | + 参数: | ||
| 1200 | + rows: 原始的表格行数组(应该只有1行)。 | ||
| 1201 | + | ||
| 1202 | + 返回: | ||
| 1203 | + List[Dict]: 修复后的多行表格结构。 | ||
| 1204 | + """ | ||
| 1205 | + if not rows or len(rows) != 1: | ||
| 1206 | + return self._normalize_table_rows(rows) | ||
| 1207 | + | ||
| 1208 | + first_row = rows[0] | ||
| 1209 | + original_cells = first_row.get("cells", []) | ||
| 1210 | + | ||
| 1211 | + # 递归展平所有嵌套的单元格 | ||
| 1212 | + all_cells = self._flatten_all_cells_recursive(original_cells) | ||
| 1213 | + | ||
| 1214 | + if len(all_cells) <= 1: | ||
| 1215 | + return self._normalize_table_rows(rows) | ||
| 1216 | + | ||
| 1217 | + # 辅助函数:获取单元格文本 | ||
| 1218 | + def _get_cell_text(cell: Dict[str, Any]) -> str: | ||
| 1219 | + blocks = cell.get("blocks", []) | ||
| 1220 | + for block in blocks: | ||
| 1221 | + if isinstance(block, dict) and block.get("type") == "paragraph": | ||
| 1222 | + inlines = block.get("inlines", []) | ||
| 1223 | + for inline in inlines: | ||
| 1224 | + if isinstance(inline, dict): | ||
| 1225 | + text = inline.get("text", "") | ||
| 1226 | + if text: | ||
| 1227 | + return str(text).strip() | ||
| 1228 | + return "" | ||
| 1229 | + | ||
| 1230 | + def _is_placeholder_cell(cell: Dict[str, Any]) -> bool: | ||
| 1231 | + """判断单元格是否是占位符""" | ||
| 1232 | + text = _get_cell_text(cell) | ||
| 1233 | + return text in ("--", "-", "—", "——", "", "N/A", "n/a") | ||
| 1234 | + | ||
| 1235 | + def _is_header_cell(cell: Dict[str, Any]) -> bool: | ||
| 1236 | + """判断单元格是否像表头(通常有加粗标记或是典型表头词)""" | ||
| 1237 | + blocks = cell.get("blocks", []) | ||
| 1238 | + for block in blocks: | ||
| 1239 | + if isinstance(block, dict) and block.get("type") == "paragraph": | ||
| 1240 | + inlines = block.get("inlines", []) | ||
| 1241 | + for inline in inlines: | ||
| 1242 | + if isinstance(inline, dict): | ||
| 1243 | + marks = inline.get("marks", []) | ||
| 1244 | + if any(isinstance(m, dict) and m.get("type") == "bold" for m in marks): | ||
| 1245 | + return True | ||
| 1246 | + # 也检查典型的表头词 | ||
| 1247 | + text = _get_cell_text(cell) | ||
| 1248 | + header_keywords = { | ||
| 1249 | + "时间", "日期", "名称", "类型", "状态", "数量", "金额", "比例", "指标", | ||
| 1250 | + "平台", "渠道", "来源", "描述", "说明", "备注", "序号", "编号", | ||
| 1251 | + "事件", "关键", "数据", "支撑", "反应", "市场", "情感", "节点", | ||
| 1252 | + "维度", "要点", "详情", "标签", "影响", "趋势", "权重", "类别", | ||
| 1253 | + "信息", "内容", "风格", "偏好", "主要", "用户", "核心", "特征", | ||
| 1254 | + "分类", "范围", "对象", "项目", "阶段", "周期", "频率", "等级", | ||
| 1255 | + } | ||
| 1256 | + return any(kw in text for kw in header_keywords) and len(text) <= 20 | ||
| 1257 | + | ||
| 1258 | + # 过滤掉占位符单元格 | ||
| 1259 | + valid_cells = [c for c in all_cells if not _is_placeholder_cell(c)] | ||
| 1260 | + | ||
| 1261 | + if len(valid_cells) <= 1: | ||
| 1262 | + return self._normalize_table_rows(rows) | ||
| 1263 | + | ||
| 1264 | + # 检测表头列数:统计连续的表头单元格数量 | ||
| 1265 | + header_count = 0 | ||
| 1266 | + for cell in valid_cells: | ||
| 1267 | + if _is_header_cell(cell): | ||
| 1268 | + header_count += 1 | ||
| 1269 | + else: | ||
| 1270 | + break | ||
| 1271 | + | ||
| 1272 | + # 如果没有检测到表头,使用启发式方法 | ||
| 1273 | + if header_count == 0: | ||
| 1274 | + total = len(valid_cells) | ||
| 1275 | + for possible_cols in [4, 5, 3, 6, 2]: | ||
| 1276 | + if total % possible_cols == 0: | ||
| 1277 | + header_count = possible_cols | ||
| 1278 | + break | ||
| 1279 | + else: | ||
| 1280 | + # 尝试找到最接近的能整除的列数 | ||
| 1281 | + for possible_cols in [4, 5, 3, 6, 2]: | ||
| 1282 | + remainder = total % possible_cols | ||
| 1283 | + if remainder <= 3: | ||
| 1284 | + header_count = possible_cols | ||
| 1285 | + break | ||
| 1286 | + else: | ||
| 1287 | + # 无法确定列数,使用原始数据 | ||
| 1288 | + return self._normalize_table_rows(rows) | ||
| 1289 | + | ||
| 1290 | + # 计算有效的单元格数量 | ||
| 1291 | + total = len(valid_cells) | ||
| 1292 | + remainder = total % header_count | ||
| 1293 | + if remainder > 0 and remainder <= 3: | ||
| 1294 | + # 截断尾部多余的单元格 | ||
| 1295 | + valid_cells = valid_cells[:total - remainder] | ||
| 1296 | + elif remainder > 3: | ||
| 1297 | + # 余数太大,可能列数检测错误 | ||
| 1298 | + return self._normalize_table_rows(rows) | ||
| 1299 | + | ||
| 1300 | + # 重新组织成多行 | ||
| 1301 | + fixed_rows: List[Dict[str, Any]] = [] | ||
| 1302 | + for i in range(0, len(valid_cells), header_count): | ||
| 1303 | + row_cells = valid_cells[i:i + header_count] | ||
| 1304 | + # 标记第一行为表头 | ||
| 1305 | + if i == 0: | ||
| 1306 | + for cell in row_cells: | ||
| 1307 | + cell["header"] = True | ||
| 1308 | + fixed_rows.append({"cells": row_cells}) | ||
| 1309 | + | ||
| 1310 | + return fixed_rows if fixed_rows else self._normalize_table_rows(rows) | ||
| 1311 | + | ||
| 1312 | + def _flatten_all_cells_recursive(self, cells: List[Any]) -> List[Dict[str, Any]]: | ||
| 1313 | + """ | ||
| 1314 | + 递归展平所有嵌套的单元格结构。 | ||
| 1315 | + | ||
| 1316 | + 参数: | ||
| 1317 | + cells: 可能包含嵌套结构的单元格数组。 | ||
| 1318 | + | ||
| 1319 | + 返回: | ||
| 1320 | + List[Dict]: 展平后的单元格数组,每个单元格都有blocks。 | ||
| 1321 | + """ | ||
| 1322 | + if not cells: | ||
| 1323 | + return [] | ||
| 1324 | + | ||
| 1325 | + flattened: List[Dict[str, Any]] = [] | ||
| 1326 | + | ||
| 1327 | + def _extract_cells(cell_or_list: Any) -> None: | ||
| 1328 | + if not isinstance(cell_or_list, dict): | ||
| 1329 | + if isinstance(cell_or_list, (str, int, float)): | ||
| 1330 | + flattened.append({"blocks": [self._as_paragraph_block(str(cell_or_list))]}) | ||
| 1331 | + return | ||
| 1332 | + | ||
| 1333 | + # 如果当前对象有 blocks,说明它是一个有效的单元格 | ||
| 1334 | + if "blocks" in cell_or_list: | ||
| 1335 | + # 创建单元格副本,移除嵌套的 cells | ||
| 1336 | + clean_cell = { | ||
| 1337 | + k: v for k, v in cell_or_list.items() | ||
| 1338 | + if k != "cells" | ||
| 1339 | + } | ||
| 1340 | + # 确保blocks有效 | ||
| 1341 | + blocks = clean_cell.get("blocks") | ||
| 1342 | + if not isinstance(blocks, list) or not blocks: | ||
| 1343 | + clean_cell["blocks"] = [self._as_paragraph_block("")] | ||
| 1344 | + flattened.append(clean_cell) | ||
| 1345 | + | ||
| 1346 | + # 如果当前对象有嵌套的 cells,递归处理 | ||
| 1347 | + nested_cells = cell_or_list.get("cells") | ||
| 1348 | + if isinstance(nested_cells, list): | ||
| 1349 | + for nested_cell in nested_cells: | ||
| 1350 | + _extract_cells(nested_cell) | ||
| 1351 | + | ||
| 1352 | + for cell in cells: | ||
| 1353 | + _extract_cells(cell) | ||
| 1354 | + | ||
| 1355 | + return flattened | ||
| 1356 | + | ||
| 1174 | def _sanitize_engine_quote_block(self, block: Dict[str, Any]): | 1357 | def _sanitize_engine_quote_block(self, block: Dict[str, Any]): |
| 1175 | """engineQuote仅用于单Agent发言,内部仅允许paragraph且title需锁定Agent名称""" | 1358 | """engineQuote仅用于单Agent发言,内部仅允许paragraph且title需锁定Agent名称""" |
| 1176 | engine_raw = block.get("engine") | 1359 | engine_raw = block.get("engine") |
| @@ -1318,12 +1318,13 @@ class HTMLRenderer: | @@ -1318,12 +1318,13 @@ class HTMLRenderer: | ||
| 1318 | """获取单元格的文本内容""" | 1318 | """获取单元格的文本内容""" |
| 1319 | blocks = cell.get("blocks", []) | 1319 | blocks = cell.get("blocks", []) |
| 1320 | for block in blocks: | 1320 | for block in blocks: |
| 1321 | - if block.get("type") == "paragraph": | 1321 | + if isinstance(block, dict) and block.get("type") == "paragraph": |
| 1322 | inlines = block.get("inlines", []) | 1322 | inlines = block.get("inlines", []) |
| 1323 | for inline in inlines: | 1323 | for inline in inlines: |
| 1324 | - text = inline.get("text", "") | ||
| 1325 | - if text: | ||
| 1326 | - return text.strip() | 1324 | + if isinstance(inline, dict): |
| 1325 | + text = inline.get("text", "") | ||
| 1326 | + if text: | ||
| 1327 | + return str(text).strip() | ||
| 1327 | return "" | 1328 | return "" |
| 1328 | 1329 | ||
| 1329 | def _is_placeholder_cell(cell: Dict[str, Any]) -> bool: | 1330 | def _is_placeholder_cell(cell: Dict[str, Any]) -> bool: |
| @@ -1337,21 +1338,31 @@ class HTMLRenderer: | @@ -1337,21 +1338,31 @@ class HTMLRenderer: | ||
| 1337 | if len(all_cells) <= 2: | 1338 | if len(all_cells) <= 2: |
| 1338 | return rows | 1339 | return rows |
| 1339 | 1340 | ||
| 1340 | - # 检测表头列数:查找带有 bold 标记的单元格 | 1341 | + # 检测表头列数:查找带有 bold 标记或典型表头词的单元格 |
| 1341 | def _is_header_cell(cell: Dict[str, Any]) -> bool: | 1342 | def _is_header_cell(cell: Dict[str, Any]) -> bool: |
| 1342 | - """判断单元格是否像表头(通常有加粗标记)""" | 1343 | + """判断单元格是否像表头(有加粗标记或是典型表头词)""" |
| 1343 | blocks = cell.get("blocks", []) | 1344 | blocks = cell.get("blocks", []) |
| 1344 | for block in blocks: | 1345 | for block in blocks: |
| 1345 | - if block.get("type") == "paragraph": | 1346 | + if isinstance(block, dict) and block.get("type") == "paragraph": |
| 1346 | inlines = block.get("inlines", []) | 1347 | inlines = block.get("inlines", []) |
| 1347 | for inline in inlines: | 1348 | for inline in inlines: |
| 1348 | - marks = inline.get("marks", []) | ||
| 1349 | - if any(m.get("type") == "bold" for m in marks): | ||
| 1350 | - return True | ||
| 1351 | - return False | ||
| 1352 | - | ||
| 1353 | - # 计算表头列数:统计连续的加粗单元格数量 | ||
| 1354 | - # 占位符已经在前面被过滤掉了 | 1349 | + if isinstance(inline, dict): |
| 1350 | + marks = inline.get("marks", []) | ||
| 1351 | + if any(isinstance(m, dict) and m.get("type") == "bold" for m in marks): | ||
| 1352 | + return True | ||
| 1353 | + # 也检查典型的表头词 | ||
| 1354 | + text = _get_cell_text(cell) | ||
| 1355 | + header_keywords = { | ||
| 1356 | + "时间", "日期", "名称", "类型", "状态", "数量", "金额", "比例", "指标", | ||
| 1357 | + "平台", "渠道", "来源", "描述", "说明", "备注", "序号", "编号", | ||
| 1358 | + "事件", "关键", "数据", "支撑", "反应", "市场", "情感", "节点", | ||
| 1359 | + "维度", "要点", "详情", "标签", "影响", "趋势", "权重", "类别", | ||
| 1360 | + "信息", "内容", "风格", "偏好", "主要", "用户", "核心", "特征", | ||
| 1361 | + "分类", "范围", "对象", "项目", "阶段", "周期", "频率", "等级", | ||
| 1362 | + } | ||
| 1363 | + return any(kw in text for kw in header_keywords) and len(text) <= 20 | ||
| 1364 | + | ||
| 1365 | + # 计算表头列数:统计连续的表头单元格数量 | ||
| 1355 | header_count = 0 | 1366 | header_count = 0 |
| 1356 | for cell in all_cells: | 1367 | for cell in all_cells: |
| 1357 | if _is_header_cell(cell): | 1368 | if _is_header_cell(cell): |
| @@ -1364,13 +1375,13 @@ class HTMLRenderer: | @@ -1364,13 +1375,13 @@ class HTMLRenderer: | ||
| 1364 | if header_count == 0: | 1375 | if header_count == 0: |
| 1365 | # 假设列数为 4 或 5(常见的表格列数) | 1376 | # 假设列数为 4 或 5(常见的表格列数) |
| 1366 | total = len(all_cells) | 1377 | total = len(all_cells) |
| 1367 | - for possible_cols in [4, 5, 3, 6]: | 1378 | + for possible_cols in [4, 5, 3, 6, 2]: |
| 1368 | if total % possible_cols == 0: | 1379 | if total % possible_cols == 0: |
| 1369 | header_count = possible_cols | 1380 | header_count = possible_cols |
| 1370 | break | 1381 | break |
| 1371 | else: | 1382 | else: |
| 1372 | # 尝试找到最接近的能整除的列数 | 1383 | # 尝试找到最接近的能整除的列数 |
| 1373 | - for possible_cols in [4, 5, 3, 6]: | 1384 | + for possible_cols in [4, 5, 3, 6, 2]: |
| 1374 | remainder = total % possible_cols | 1385 | remainder = total % possible_cols |
| 1375 | # 允许最多3个多余的单元格(可能是尾部的总结或注释) | 1386 | # 允许最多3个多余的单元格(可能是尾部的总结或注释) |
| 1376 | if remainder <= 3: | 1387 | if remainder <= 3: |
| @@ -254,12 +254,13 @@ class MarkdownRenderer: | @@ -254,12 +254,13 @@ class MarkdownRenderer: | ||
| 254 | """获取单元格的文本内容""" | 254 | """获取单元格的文本内容""" |
| 255 | blocks = cell.get("blocks", []) | 255 | blocks = cell.get("blocks", []) |
| 256 | for block in blocks: | 256 | for block in blocks: |
| 257 | - if block.get("type") == "paragraph": | 257 | + if isinstance(block, dict) and block.get("type") == "paragraph": |
| 258 | inlines = block.get("inlines", []) | 258 | inlines = block.get("inlines", []) |
| 259 | for inline in inlines: | 259 | for inline in inlines: |
| 260 | - text = inline.get("text", "") | ||
| 261 | - if text: | ||
| 262 | - return text.strip() | 260 | + if isinstance(inline, dict): |
| 261 | + text = inline.get("text", "") | ||
| 262 | + if text: | ||
| 263 | + return str(text).strip() | ||
| 263 | return "" | 264 | return "" |
| 264 | 265 | ||
| 265 | def _is_placeholder_cell(cell: Dict[str, Any]) -> bool: | 266 | def _is_placeholder_cell(cell: Dict[str, Any]) -> bool: |
| @@ -273,21 +274,31 @@ class MarkdownRenderer: | @@ -273,21 +274,31 @@ class MarkdownRenderer: | ||
| 273 | if len(all_cells) <= 2: | 274 | if len(all_cells) <= 2: |
| 274 | return rows | 275 | return rows |
| 275 | 276 | ||
| 276 | - # 检测表头列数:查找带有 bold 标记的单元格 | 277 | + # 检测表头列数:查找带有 bold 标记或典型表头词的单元格 |
| 277 | def _is_header_cell(cell: Dict[str, Any]) -> bool: | 278 | def _is_header_cell(cell: Dict[str, Any]) -> bool: |
| 278 | - """判断单元格是否像表头(通常有加粗标记)""" | 279 | + """判断单元格是否像表头(有加粗标记或是典型表头词)""" |
| 279 | blocks = cell.get("blocks", []) | 280 | blocks = cell.get("blocks", []) |
| 280 | for block in blocks: | 281 | for block in blocks: |
| 281 | - if block.get("type") == "paragraph": | 282 | + if isinstance(block, dict) and block.get("type") == "paragraph": |
| 282 | inlines = block.get("inlines", []) | 283 | inlines = block.get("inlines", []) |
| 283 | for inline in inlines: | 284 | for inline in inlines: |
| 284 | - marks = inline.get("marks", []) | ||
| 285 | - if any(m.get("type") == "bold" for m in marks): | ||
| 286 | - return True | ||
| 287 | - return False | ||
| 288 | - | ||
| 289 | - # 计算表头列数:统计连续的加粗单元格数量 | ||
| 290 | - # 占位符已经在前面被过滤掉了 | 285 | + if isinstance(inline, dict): |
| 286 | + marks = inline.get("marks", []) | ||
| 287 | + if any(isinstance(m, dict) and m.get("type") == "bold" for m in marks): | ||
| 288 | + return True | ||
| 289 | + # 也检查典型的表头词 | ||
| 290 | + text = _get_cell_text(cell) | ||
| 291 | + header_keywords = { | ||
| 292 | + "时间", "日期", "名称", "类型", "状态", "数量", "金额", "比例", "指标", | ||
| 293 | + "平台", "渠道", "来源", "描述", "说明", "备注", "序号", "编号", | ||
| 294 | + "事件", "关键", "数据", "支撑", "反应", "市场", "情感", "节点", | ||
| 295 | + "维度", "要点", "详情", "标签", "影响", "趋势", "权重", "类别", | ||
| 296 | + "信息", "内容", "风格", "偏好", "主要", "用户", "核心", "特征", | ||
| 297 | + "分类", "范围", "对象", "项目", "阶段", "周期", "频率", "等级", | ||
| 298 | + } | ||
| 299 | + return any(kw in text for kw in header_keywords) and len(text) <= 20 | ||
| 300 | + | ||
| 301 | + # 计算表头列数:统计连续的表头单元格数量 | ||
| 291 | header_count = 0 | 302 | header_count = 0 |
| 292 | for cell in all_cells: | 303 | for cell in all_cells: |
| 293 | if _is_header_cell(cell): | 304 | if _is_header_cell(cell): |
| @@ -300,13 +311,13 @@ class MarkdownRenderer: | @@ -300,13 +311,13 @@ class MarkdownRenderer: | ||
| 300 | if header_count == 0: | 311 | if header_count == 0: |
| 301 | # 假设列数为 4 或 5(常见的表格列数) | 312 | # 假设列数为 4 或 5(常见的表格列数) |
| 302 | total = len(all_cells) | 313 | total = len(all_cells) |
| 303 | - for possible_cols in [4, 5, 3, 6]: | 314 | + for possible_cols in [4, 5, 3, 6, 2]: |
| 304 | if total % possible_cols == 0: | 315 | if total % possible_cols == 0: |
| 305 | header_count = possible_cols | 316 | header_count = possible_cols |
| 306 | break | 317 | break |
| 307 | else: | 318 | else: |
| 308 | # 尝试找到最接近的能整除的列数 | 319 | # 尝试找到最接近的能整除的列数 |
| 309 | - for possible_cols in [4, 5, 3, 6]: | 320 | + for possible_cols in [4, 5, 3, 6, 2]: |
| 310 | remainder = total % possible_cols | 321 | remainder = total % possible_cols |
| 311 | # 允许最多3个多余的单元格(可能是尾部的总结或注释) | 322 | # 允许最多3个多余的单元格(可能是尾部的总结或注释) |
| 312 | if remainder <= 3: | 323 | if remainder <= 3: |
-
Please register or login to post a comment