Showing
6 changed files
with
1583 additions
and
3 deletions
| @@ -1270,18 +1270,73 @@ class ChapterGenerationNode(BaseNode): | @@ -1270,18 +1270,73 @@ class ChapterGenerationNode(BaseNode): | ||
| 1270 | 1270 | ||
| 1271 | normalized_cells: List[Dict[str, Any]] = [] | 1271 | normalized_cells: List[Dict[str, Any]] = [] |
| 1272 | for cell in cell_entries: | 1272 | for cell in cell_entries: |
| 1273 | - sanitized = self._normalize_table_cell(cell) | ||
| 1274 | - if sanitized: | ||
| 1275 | - normalized_cells.append(sanitized) | 1273 | + # 检测错误嵌套的 cells 结构:有 cells 但没有 blocks |
| 1274 | + # 需要展平成多个独立的 cells | ||
| 1275 | + if isinstance(cell, dict) and "cells" in cell and "blocks" not in cell: | ||
| 1276 | + flattened = self._flatten_all_nested_cells(cell) | ||
| 1277 | + normalized_cells.extend(flattened) | ||
| 1278 | + else: | ||
| 1279 | + sanitized = self._normalize_table_cell(cell) | ||
| 1280 | + if sanitized: | ||
| 1281 | + normalized_cells.append(sanitized) | ||
| 1276 | 1282 | ||
| 1277 | return normalized_cells | 1283 | return normalized_cells |
| 1278 | 1284 | ||
| 1285 | + def _flatten_all_nested_cells(self, cell: Dict[str, Any]) -> List[Dict[str, Any]]: | ||
| 1286 | + """ | ||
| 1287 | + 展平错误嵌套的 cells 结构,返回所有展平后的 cells。 | ||
| 1288 | + | ||
| 1289 | + LLM 有时会生成类似这样的错误结构: | ||
| 1290 | + { "cells": [ | ||
| 1291 | + { "blocks": [...] }, | ||
| 1292 | + { "cells": [ | ||
| 1293 | + { "blocks": [...] }, | ||
| 1294 | + { "cells": [...] } | ||
| 1295 | + ] | ||
| 1296 | + } | ||
| 1297 | + ] | ||
| 1298 | + } | ||
| 1299 | + | ||
| 1300 | + 应该展平为独立的 cells 列表。 | ||
| 1301 | + """ | ||
| 1302 | + nested_cells = cell.get("cells") | ||
| 1303 | + if not isinstance(nested_cells, list) or not nested_cells: | ||
| 1304 | + return [{"blocks": [self._as_paragraph_block("")]}] | ||
| 1305 | + | ||
| 1306 | + result: List[Dict[str, Any]] = [] | ||
| 1307 | + for nested in nested_cells: | ||
| 1308 | + if isinstance(nested, dict): | ||
| 1309 | + if "blocks" in nested and "cells" not in nested: | ||
| 1310 | + # 正常的 cell,直接规范化添加 | ||
| 1311 | + sanitized = self._normalize_table_cell(nested) | ||
| 1312 | + if sanitized: | ||
| 1313 | + result.append(sanitized) | ||
| 1314 | + elif "cells" in nested and "blocks" not in nested: | ||
| 1315 | + # 继续递归展平嵌套的 cells | ||
| 1316 | + result.extend(self._flatten_all_nested_cells(nested)) | ||
| 1317 | + else: | ||
| 1318 | + # 其他情况,尝试规范化 | ||
| 1319 | + sanitized = self._normalize_table_cell(nested) | ||
| 1320 | + if sanitized: | ||
| 1321 | + result.append(sanitized) | ||
| 1322 | + elif isinstance(nested, (str, int, float)): | ||
| 1323 | + result.append({"blocks": [self._as_paragraph_block(str(nested))]}) | ||
| 1324 | + | ||
| 1325 | + return result if result else [{"blocks": [self._as_paragraph_block("")]}] | ||
| 1326 | + | ||
| 1279 | def _normalize_table_cell(self, cell: Any) -> Dict[str, Any] | None: | 1327 | def _normalize_table_cell(self, cell: Any) -> Dict[str, Any] | None: |
| 1280 | """把各种单元格写法规整为schema认可的形式""" | 1328 | """把各种单元格写法规整为schema认可的形式""" |
| 1281 | if cell is None: | 1329 | if cell is None: |
| 1282 | return {"blocks": [self._as_paragraph_block("")]} | 1330 | return {"blocks": [self._as_paragraph_block("")]} |
| 1283 | 1331 | ||
| 1284 | if isinstance(cell, dict): | 1332 | if isinstance(cell, dict): |
| 1333 | + # 检测错误嵌套的 cells 结构:有 cells 但没有 blocks | ||
| 1334 | + # 这是 LLM 常见的错误,把同级 cell 嵌套进了 cells 数组 | ||
| 1335 | + if "cells" in cell and "blocks" not in cell: | ||
| 1336 | + # 展平嵌套的 cells 并返回第一个有效 cell | ||
| 1337 | + # 注意:其余嵌套的 cells 会在 _normalize_table_cells 中被处理 | ||
| 1338 | + return self._flatten_nested_cell(cell) | ||
| 1339 | + | ||
| 1285 | normalized = dict(cell) | 1340 | normalized = dict(cell) |
| 1286 | blocks = self._coerce_cell_blocks(normalized.get("blocks"), normalized) | 1341 | blocks = self._coerce_cell_blocks(normalized.get("blocks"), normalized) |
| 1287 | elif isinstance(cell, list): | 1342 | elif isinstance(cell, list): |
| @@ -1297,6 +1352,40 @@ class ChapterGenerationNode(BaseNode): | @@ -1297,6 +1352,40 @@ class ChapterGenerationNode(BaseNode): | ||
| 1297 | normalized["blocks"] = blocks or [self._as_paragraph_block("")] | 1352 | normalized["blocks"] = blocks or [self._as_paragraph_block("")] |
| 1298 | return normalized | 1353 | return normalized |
| 1299 | 1354 | ||
| 1355 | + def _flatten_nested_cell(self, cell: Dict[str, Any]) -> Dict[str, Any]: | ||
| 1356 | + """ | ||
| 1357 | + 展平错误嵌套的 cell 结构。 | ||
| 1358 | + | ||
| 1359 | + LLM 有时会生成类似这样的错误结构: | ||
| 1360 | + { "cells": [ { "blocks": [...] }, { "cells": [...] } ] } | ||
| 1361 | + | ||
| 1362 | + 应该返回第一个有效的 cell 内容。 | ||
| 1363 | + """ | ||
| 1364 | + nested_cells = cell.get("cells") | ||
| 1365 | + if not isinstance(nested_cells, list) or not nested_cells: | ||
| 1366 | + # 没有有效的嵌套内容,返回空 cell | ||
| 1367 | + return {"blocks": [self._as_paragraph_block("")]} | ||
| 1368 | + | ||
| 1369 | + # 递归查找第一个包含 blocks 的有效 cell | ||
| 1370 | + for nested in nested_cells: | ||
| 1371 | + if isinstance(nested, dict): | ||
| 1372 | + if "blocks" in nested: | ||
| 1373 | + # 找到有效 cell,递归规范化 | ||
| 1374 | + return self._normalize_table_cell(nested) | ||
| 1375 | + elif "cells" in nested: | ||
| 1376 | + # 继续递归展平 | ||
| 1377 | + result = self._flatten_nested_cell(nested) | ||
| 1378 | + if result: | ||
| 1379 | + return result | ||
| 1380 | + | ||
| 1381 | + # 没有找到有效内容,尝试从第一个嵌套元素提取文本 | ||
| 1382 | + first_nested = nested_cells[0] | ||
| 1383 | + if isinstance(first_nested, dict): | ||
| 1384 | + text = self._extract_block_text(first_nested) | ||
| 1385 | + return {"blocks": [self._as_paragraph_block(text or "")]} | ||
| 1386 | + | ||
| 1387 | + return {"blocks": [self._as_paragraph_block("")]} | ||
| 1388 | + | ||
| 1300 | def _coerce_cell_blocks( | 1389 | def _coerce_cell_blocks( |
| 1301 | self, blocks: Any, source: Dict[str, Any] | None | 1390 | self, blocks: Any, source: Dict[str, Any] | None |
| 1302 | ) -> List[Dict[str, Any]]: | 1391 | ) -> List[Dict[str, Any]]: |
| @@ -605,6 +605,8 @@ class MarkdownRenderer: | @@ -605,6 +605,8 @@ class MarkdownRenderer: | ||
| 605 | elif isinstance(data_field, dict): | 605 | elif isinstance(data_field, dict): |
| 606 | if isinstance(data_field.get("items"), list): | 606 | if isinstance(data_field.get("items"), list): |
| 607 | candidates.append(data_field.get("items")) | 607 | candidates.append(data_field.get("items")) |
| 608 | + if isinstance(data_field.get("words"), list): | ||
| 609 | + candidates.append(data_field.get("words")) | ||
| 608 | 610 | ||
| 609 | items: List[Dict[str, Any]] = [] | 611 | items: List[Dict[str, Any]] = [] |
| 610 | seen: set[str] = set() | 612 | seen: set[str] = set() |
ReportEngine/scripts/validate_ir.py
0 → 100644
| 1 | +#!/usr/bin/env python3 | ||
| 2 | +""" | ||
| 3 | +IR 文档验证工具。 | ||
| 4 | + | ||
| 5 | +命令行工具,用于: | ||
| 6 | +- 扫描指定 JSON 文件中的所有图表和表格 | ||
| 7 | +- 报告结构问题和数据缺失 | ||
| 8 | +- 支持自动修复常见问题 | ||
| 9 | +- 支持批量处理 | ||
| 10 | + | ||
| 11 | +使用方法: | ||
| 12 | + python -m ReportEngine.scripts.validate_ir chapter-030-section-3-0.json | ||
| 13 | + python -m ReportEngine.scripts.validate_ir *.json --fix | ||
| 14 | + python -m ReportEngine.scripts.validate_ir ./output/ --recursive --fix --verbose | ||
| 15 | +""" | ||
| 16 | + | ||
| 17 | +from __future__ import annotations | ||
| 18 | + | ||
| 19 | +import argparse | ||
| 20 | +import json | ||
| 21 | +import sys | ||
| 22 | +from pathlib import Path | ||
| 23 | +from typing import Any, Dict, List, Optional, Tuple | ||
| 24 | +from dataclasses import dataclass, field | ||
| 25 | + | ||
| 26 | +# 添加项目根目录到路径 | ||
| 27 | +project_root = Path(__file__).parent.parent.parent | ||
| 28 | +if str(project_root) not in sys.path: | ||
| 29 | + sys.path.insert(0, str(project_root)) | ||
| 30 | + | ||
| 31 | +from loguru import logger | ||
| 32 | + | ||
| 33 | +from ReportEngine.utils.chart_validator import ( | ||
| 34 | + ChartValidator, | ||
| 35 | + ChartRepairer, | ||
| 36 | + ValidationResult, | ||
| 37 | +) | ||
| 38 | +from ReportEngine.utils.table_validator import ( | ||
| 39 | + TableValidator, | ||
| 40 | + TableRepairer, | ||
| 41 | + TableValidationResult, | ||
| 42 | +) | ||
| 43 | + | ||
| 44 | + | ||
@dataclass
class BlockIssue:
    """Problems found on one block of an IR document."""

    # Kind of block the issue belongs to, e.g. "chart", "table", "wordcloud"
    # or "document" for file-level failures.
    block_type: str
    # Identifier used in reports (widgetId / id, or a positional fallback).
    block_id: str
    # Location of the block inside the document, e.g. "chapters[0].blocks[2]".
    path: str
    # Error messages for this block.
    errors: List[str] = field(default_factory=list)
    # Warning messages for this block.
    warnings: List[str] = field(default_factory=list)
    # True when the issue is considered automatically repairable.
    is_fixable: bool = False
| 54 | + | ||
| 55 | + | ||
@dataclass
class DocumentReport:
    """Validation report for a single document."""

    # Path of the validated file (used for display only).
    file_path: str
    # Number of dict blocks visited, including nested ones.
    total_blocks: int = 0
    # Number of chart widgets seen.
    chart_count: int = 0
    # Number of table blocks seen.
    table_count: int = 0
    # Number of word-cloud widgets seen.
    wordcloud_count: int = 0
    # One entry per problematic block.
    issues: List[BlockIssue] = field(default_factory=list)
    # Number of repairs reported for this document.
    fixed_count: int = 0

    @property
    def has_issues(self) -> bool:
        """Return True when at least one issue was recorded."""
        return bool(self.issues)

    @property
    def error_count(self) -> int:
        """Return the total number of error messages across all issues."""
        total = 0
        for entry in self.issues:
            total += len(entry.errors)
        return total

    @property
    def warning_count(self) -> int:
        """Return the total number of warning messages across all issues."""
        total = 0
        for entry in self.issues:
            total += len(entry.warnings)
        return total
| 78 | + | ||
| 79 | + | ||
class IRValidator:
    """
    Validate and repair the chart, table and word-cloud blocks of an IR document.

    The document is walked chapter by chapter; blocks nested under ``blocks``,
    table cells and list items are visited recursively. Input is expected to be
    LLM-generated JSON, so every container is type-checked before it is
    iterated instead of being trusted to be a list or dict.
    """

    def __init__(
        self,
        chart_validator: Optional[ChartValidator] = None,
        table_validator: Optional[TableValidator] = None,
        chart_repairer: Optional[ChartRepairer] = None,
        table_repairer: Optional[TableRepairer] = None,
    ):
        """
        Create a validator, building default collaborators where none is given.

        Args:
            chart_validator: Validator used for chart widgets.
            table_validator: Validator used for table blocks.
            chart_repairer: Repairer used for chart widgets; by default built
                around ``chart_validator``.
            table_repairer: Repairer used for table blocks; by default built
                around ``table_validator``.
        """
        self.chart_validator = chart_validator or ChartValidator()
        self.table_validator = table_validator or TableValidator()
        self.chart_repairer = chart_repairer or ChartRepairer(self.chart_validator)
        self.table_repairer = table_repairer or TableRepairer(self.table_validator)

    @staticmethod
    def _as_list(value: Any) -> List[Any]:
        """Return ``value`` when it is a list, otherwise an empty list."""
        return value if isinstance(value, list) else []

    def validate_document(
        self,
        document: Dict[str, Any],
        file_path: str = "<unknown>",
    ) -> DocumentReport:
        """
        Validate a whole document.

        Args:
            document: IR document data.
            file_path: File path, used only for reporting.

        Returns:
            DocumentReport: The validation report. A root that is not a JSON
            object yields a report with a single document-level error.
        """
        report = DocumentReport(file_path=file_path)

        if not isinstance(document, dict):
            # e.g. the file holds a JSON array or scalar; there is nothing to walk.
            report.issues.append(BlockIssue(
                block_type="document",
                block_id="root",
                path="",
                errors=[f"文档根节点不是 JSON 对象: {type(document).__name__}"],
            ))
            return report

        # A null / non-list "chapters" value is treated as "no chapters".
        for chapter_idx, chapter in enumerate(self._as_list(document.get("chapters"))):
            if not isinstance(chapter, dict):
                continue

            chapter_id = chapter.get("chapterId", f"chapter-{chapter_idx}")
            self._validate_blocks(
                chapter.get("blocks", []),
                f"chapters[{chapter_idx}].blocks",
                chapter_id,
                report,
            )

        return report

    def _validate_blocks(
        self,
        blocks: List[Any],
        path: str,
        chapter_id: str,
        report: DocumentReport,
    ):
        """
        Recursively validate a list of blocks, recording findings in ``report``.

        Non-list input and non-dict entries are ignored.
        """
        if not isinstance(blocks, list):
            return

        for idx, block in enumerate(blocks):
            if not isinstance(block, dict):
                continue

            report.total_blocks += 1
            block_path = f"{path}[{idx}]"
            block_type = block.get("type", "")
            block_id = block.get("widgetId") or block.get("id") or f"block-{idx}"

            # Dispatch on the block type.
            if block_type == "widget":
                widget_type = (block.get("widgetType") or "").lower()
                if "chart.js" in widget_type:
                    report.chart_count += 1
                    self._validate_chart(block, block_path, block_id, report)
                elif "wordcloud" in widget_type:
                    report.wordcloud_count += 1
                    self._validate_wordcloud(block, block_path, block_id, report)

            elif block_type == "table":
                report.table_count += 1
                self._validate_table(block, block_path, block_id, report)

            # Blocks nested directly under this block.
            nested_blocks = block.get("blocks")
            if isinstance(nested_blocks, list):
                self._validate_blocks(nested_blocks, f"{block_path}.blocks", chapter_id, report)

            # Blocks held by table cells.
            if block_type == "table":
                for row_idx, row in enumerate(self._as_list(block.get("rows"))):
                    if not isinstance(row, dict):
                        continue
                    for cell_idx, cell in enumerate(self._as_list(row.get("cells"))):
                        if isinstance(cell, dict):
                            self._validate_blocks(
                                cell.get("blocks", []),
                                f"{block_path}.rows[{row_idx}].cells[{cell_idx}].blocks",
                                chapter_id,
                                report,
                            )

            # Blocks held by list items (each item is itself a list of blocks).
            if block_type == "list":
                for item_idx, item in enumerate(self._as_list(block.get("items"))):
                    if isinstance(item, list):
                        self._validate_blocks(
                            item,
                            f"{block_path}.items[{item_idx}]",
                            chapter_id,
                            report,
                        )

    def _validate_chart(
        self,
        block: Dict[str, Any],
        path: str,
        block_id: str,
        report: DocumentReport,
    ):
        """Validate a chart widget and record an issue if it is invalid or has warnings."""
        result = self.chart_validator.validate(block)

        if not result.is_valid or result.warnings:
            issue = BlockIssue(
                block_type="chart",
                block_id=block_id,
                path=path,
                errors=result.errors,
                warnings=result.warnings,
                is_fixable=result.has_critical_errors(),
            )
            report.issues.append(issue)

    def _validate_table(
        self,
        block: Dict[str, Any],
        path: str,
        block_id: str,
        report: DocumentReport,
    ):
        """Validate a table block and record an issue if anything was found."""
        result = self.table_validator.validate(block)

        if not result.is_valid or result.warnings or result.nested_cells_detected:
            issue = BlockIssue(
                block_type="table",
                block_id=block_id,
                path=path,
                errors=result.errors,
                # Copy: extra warnings are added below and must not leak back
                # into the validator's own result object.
                warnings=list(result.warnings or []),
                is_fixable=result.nested_cells_detected or result.has_critical_errors(),
            )

            # Put the nested-cells warning first.
            if result.nested_cells_detected:
                issue.warnings.insert(0, "检测到嵌套 cells 结构(LLM 常见错误)")

            # Report how many cells are empty.
            if result.empty_cells_count > 0:
                issue.warnings.append(
                    f"空单元格数量: {result.empty_cells_count}/{result.total_cells_count}"
                )

            report.issues.append(issue)

    def _validate_wordcloud(
        self,
        block: Dict[str, Any],
        path: str,
        block_id: str,
        report: DocumentReport,
    ):
        """
        Validate a word-cloud widget.

        The first non-empty list found among the known data locations is taken
        as the word list; only its first five entries are format-checked.
        """
        errors: List[str] = []
        warnings: List[str] = []

        data = block.get("data")
        props = block.get("props", {})
        data_map = data if isinstance(data, dict) else {}
        props_map = props if isinstance(props, dict) else {}

        # Candidate locations of the word list, in lookup order.
        data_paths = [
            ("data.words", data_map.get("words")),
            ("data.items", data_map.get("items")),
            ("data", data if isinstance(data, list) else None),
            ("props.words", props_map.get("words")),
            ("props.items", props_map.get("items")),
            ("props.data", props_map.get("data")),
        ]

        words_found = False
        for path_name, value in data_paths:
            if not (isinstance(value, list) and value):
                continue
            words_found = True

            # Only the first 5 entries are inspected.
            for idx, item in enumerate(value[:5]):
                if isinstance(item, dict):
                    word = item.get("word") or item.get("text") or item.get("label")
                    # Explicit None check: a weight of 0 is a present value and
                    # must not fall through to the "value" key.
                    weight = item.get("weight")
                    if weight is None:
                        weight = item.get("value")
                    if not word:
                        warnings.append(f"{path_name}[{idx}] 缺少 word/text/label 字段")
                    if weight is None:
                        warnings.append(f"{path_name}[{idx}] 缺少 weight/value 字段")
                elif not isinstance(item, (str, list, tuple)):
                    warnings.append(f"{path_name}[{idx}] 格式不正确")
            break

        if not words_found:
            errors.append("词云数据缺失:未在 data.words, data.items, props.words 等路径找到有效数据")

        if errors or warnings:
            issue = BlockIssue(
                block_type="wordcloud",
                block_id=block_id,
                path=path,
                errors=errors,
                warnings=warnings,
                is_fixable=False,  # missing word-cloud data is never auto-repaired here
            )
            report.issues.append(issue)

    def repair_document(
        self,
        document: Dict[str, Any],
        report: DocumentReport,
    ) -> Tuple[Dict[str, Any], int]:
        """
        Repair the problems in a document, in place.

        Args:
            document: IR document data.
            report: Validation report (accepted for interface compatibility;
                not consulted by the repair pass).

        Returns:
            Tuple[Dict[str, Any], int]: (repaired document, number of repairs).
            A root that is not a JSON object is returned unchanged with 0.
        """
        if not isinstance(document, dict):
            return document, 0

        fixed_count = 0
        for chapter in self._as_list(document.get("chapters")):
            if not isinstance(chapter, dict):
                continue

            blocks = chapter.get("blocks", [])
            chapter["blocks"], chapter_fixed = self._repair_blocks(blocks)
            fixed_count += chapter_fixed

        return document, fixed_count

    def _repair_blocks(
        self,
        blocks: List[Any],
    ) -> Tuple[List[Any], int]:
        """
        Recursively repair a list of blocks.

        Returns:
            (blocks, number of repaired blocks). Non-list input is returned
            untouched with a count of 0.
        """
        if not isinstance(blocks, list):
            return blocks, 0

        fixed_count = 0
        repaired_blocks: List[Any] = []

        for block in blocks:
            if not isinstance(block, dict):
                repaired_blocks.append(block)
                continue

            block_type = block.get("type", "")

            # Repair tables.
            if block_type == "table":
                result = self.table_repairer.repair(block)
                if result.has_changes():
                    block = result.repaired_block
                    fixed_count += 1
                    logger.info(f"修复表格: {result.changes}")

            # Repair chart widgets.
            elif block_type == "widget":
                widget_type = (block.get("widgetType") or "").lower()
                if "chart.js" in widget_type:
                    result = self.chart_repairer.repair(block)
                    if result.has_changes():
                        block = result.repaired_block
                        fixed_count += 1
                        logger.info(f"修复图表: {result.changes}")

            # Blocks nested directly under this block.
            nested_blocks = block.get("blocks")
            if isinstance(nested_blocks, list):
                block["blocks"], nested_fixed = self._repair_blocks(nested_blocks)
                fixed_count += nested_fixed

            # Blocks held by table cells.
            if block_type == "table":
                for row in self._as_list(block.get("rows")):
                    if not isinstance(row, dict):
                        continue
                    for cell in self._as_list(row.get("cells")):
                        if isinstance(cell, dict):
                            cell_blocks = cell.get("blocks", [])
                            cell["blocks"], cell_fixed = self._repair_blocks(cell_blocks)
                            fixed_count += cell_fixed

            # Blocks held by list items.
            if block_type == "list":
                items = block.get("items")
                if isinstance(items, list):
                    for i, item in enumerate(items):
                        if isinstance(item, list):
                            items[i], item_fixed = self._repair_blocks(item)
                            fixed_count += item_fixed

            repaired_blocks.append(block)

        return repaired_blocks, fixed_count
| 407 | + | ||
| 408 | + | ||
def print_report(report: DocumentReport, verbose: bool = False):
    """
    Print a validation report to standard output.

    Args:
        report: The report to print.
        verbose: When True, every issue is listed with its path, errors,
            warnings and fixability; otherwise only the totals are shown.
    """
    divider = "=" * 60
    lines = [
        f"\n{divider}",
        f"文件: {report.file_path}",
        f"{divider}",
        "\n📊 统计:",
        f" - 总 blocks: {report.total_blocks}",
        f" - 图表数量: {report.chart_count}",
        f" - 表格数量: {report.table_count}",
        f" - 词云数量: {report.wordcloud_count}",
    ]

    if not report.has_issues:
        lines.append("\n✅ 未发现问题")
    else:
        lines.append(f"\n⚠️ 发现 {len(report.issues)} 个问题:")
        lines.append(f" - 错误: {report.error_count}")
        lines.append(f" - 警告: {report.warning_count}")

        if verbose:
            # One section per problematic block.
            for issue in report.issues:
                lines.append(f"\n [{issue.block_type}] {issue.block_id}")
                lines.append(f" 路径: {issue.path}")
                lines.extend(f" ❌ {error}" for error in (issue.errors or ()))
                lines.extend(f" ⚠️ {warning}" for warning in (issue.warnings or ()))
                if issue.is_fixable:
                    lines.append(" 🔧 可自动修复")

    if report.fixed_count > 0:
        lines.append(f"\n🔧 已修复 {report.fixed_count} 个问题")

    # Emitting everything at once gives the same text as line-by-line prints.
    print("\n".join(lines))
| 443 | + | ||
| 444 | + | ||
def validate_file(
    file_path: Path,
    validator: IRValidator,
    fix: bool = False,
    verbose: bool = False,
) -> DocumentReport:
    """
    Validate a single JSON file and optionally repair it in place.

    Args:
        file_path: Path of the JSON file.
        validator: Validator used for validation and repair.
        fix: When True and fixable issues exist, repair the document, copy the
            original to ``<name>.bak<suffix>`` and overwrite the file.
        verbose: Accepted for interface compatibility; not used here.

    Returns:
        DocumentReport: The validation report. Read/parse failures and a root
        that is not a JSON object are reported as a single "document" issue.
        ``fixed_count`` is only non-zero when the repaired file was saved.
    """

    def _document_failure(message: str) -> DocumentReport:
        # Build a report that carries one document-level error.
        failed = DocumentReport(file_path=str(file_path))
        failed.issues.append(BlockIssue(
            block_type="document",
            block_id="root",
            path="",
            errors=[message],
        ))
        return failed

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            document = json.load(f)
    except json.JSONDecodeError as e:
        logger.error(f"JSON 解析错误: {file_path}: {e}")
        return _document_failure(f"JSON 解析错误: {e}")
    except Exception as e:
        # Batch boundary: one unreadable file must not abort the whole run.
        logger.error(f"读取文件错误: {file_path}: {e}")
        return _document_failure(f"读取文件错误: {e}")

    if not isinstance(document, dict):
        # Valid JSON, but an array/scalar root cannot be walked as a document.
        message = f"文档根节点不是 JSON 对象: {type(document).__name__}"
        logger.error(f"{message}: {file_path}")
        return _document_failure(message)

    # Validate the document.
    report = validator.validate_document(document, str(file_path))

    if not (fix and report.has_issues):
        return report

    fixable_issues = [i for i in report.issues if i.is_fixable]
    if not fixable_issues:
        return report

    logger.info(f"尝试修复 {len(fixable_issues)} 个问题...")
    document, fixed_count = validator.repair_document(document, report)
    report.fixed_count = fixed_count
    if fixed_count <= 0:
        return report

    backup_path = file_path.with_suffix(f".bak{file_path.suffix}")
    try:
        import shutil

        # Serialize before touching the disk so a serialization failure
        # cannot leave a truncated file behind.
        payload = json.dumps(document, ensure_ascii=False, indent=2)

        # Keep a copy of the original next to it.
        shutil.copy(file_path, backup_path)
        logger.info(f"已创建备份: {backup_path}")

        # Overwrite the file with the repaired document.
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(payload)
        logger.info(f"已保存修复后的文件: {file_path}")
    except (OSError, TypeError, ValueError) as e:
        logger.error(f"保存文件失败: {e}")
        # Nothing was persisted, so the repairs must not be counted as fixed.
        report.fixed_count = 0
        report.issues.append(BlockIssue(
            block_type="document",
            block_id="root",
            path="",
            errors=[f"保存文件失败: {e}"],
        ))

    return report
| 504 | + | ||
| 505 | + | ||
def _collect_json_files(path_args: List[str], recursive: bool) -> List[Path]:
    """Expand CLI path arguments into an ordered, de-duplicated list of JSON files."""
    import glob

    def _is_backup(candidate: Path) -> bool:
        # validate_file() writes backups as "<name>.bak.json"; scanning them
        # again would re-validate stale copies and stack ".bak.bak.json" files.
        return candidate.name.lower().endswith(".bak.json")

    collected: List[Path] = []
    for path_str in path_args:
        path = Path(path_str)
        if path.is_file():
            # An explicitly named file is honoured as given.
            if path.suffix.lower() == ".json":
                collected.append(path)
        elif path.is_dir():
            found = path.rglob("*.json") if recursive else path.glob("*.json")
            collected.extend(p for p in found if p.is_file() and not _is_backup(p))
        else:
            # Possibly a glob pattern the shell did not expand.
            for matched in glob.glob(path_str):
                mp = Path(matched)
                if mp.is_file() and mp.suffix.lower() == ".json" and not _is_backup(mp):
                    collected.append(mp)

    # Drop duplicates (e.g. a file named twice or also matched by a directory)
    # while keeping first-seen order.
    return list(dict.fromkeys(collected))


def main():
    """
    Command-line entry point.

    Parses the arguments, collects the JSON files, validates (and with
    ``--fix`` repairs) each one, prints per-file reports and a summary, and
    exits with status 1 when no file was found or when issues remain.
    """
    parser = argparse.ArgumentParser(
        description="IR 文档验证工具",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
 %(prog)s chapter-030-section-3-0.json
 %(prog)s *.json --fix
 %(prog)s ./output/ --recursive --fix --verbose
 """,
    )
    parser.add_argument(
        "paths",
        nargs="+",
        help="要验证的 JSON 文件或目录",
    )
    parser.add_argument(
        "-r", "--recursive",
        action="store_true",
        help="递归处理目录",
    )
    parser.add_argument(
        "-f", "--fix",
        action="store_true",
        help="自动修复常见问题",
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="显示详细信息",
    )
    # Accepted for command-line compatibility; nothing in this function reads it.
    parser.add_argument(
        "--no-color",
        action="store_true",
        help="禁用彩色输出",
    )

    args = parser.parse_args()

    # Configure logging.
    logger.remove()
    logger.add(sys.stderr, level="DEBUG" if args.verbose else "INFO")

    # Collect the files to process.
    files = _collect_json_files(args.paths, args.recursive)
    if not files:
        print("未找到 JSON 文件")
        sys.exit(1)

    print(f"找到 {len(files)} 个文件")

    validator = IRValidator()

    # Validate every file and accumulate totals.
    total_issues = 0
    total_fixed = 0
    for file_path in files:
        report = validate_file(file_path, validator, args.fix, args.verbose)
        total_issues += len(report.issues)
        total_fixed += report.fixed_count

        if args.verbose or report.has_issues:
            print_report(report, args.verbose)

    # Print the summary.
    print(f"\n{'=' * 60}")
    print("总结")
    print(f"{'=' * 60}")
    print(f" - 文件数: {len(files)}")
    print(f" - 问题总数: {total_issues}")
    if args.fix:
        print(f" - 已修复: {total_fixed}")

    # Non-zero exit status when issues remain.
    if total_issues > 0 and total_fixed < total_issues:
        sys.exit(1)
    sys.exit(0)
| 610 | + | ||
| 611 | + | ||
| 612 | +if __name__ == "__main__": | ||
| 613 | + main() |
| @@ -10,8 +10,23 @@ from ReportEngine.utils.chart_review_service import ( | @@ -10,8 +10,23 @@ from ReportEngine.utils.chart_review_service import ( | ||
| 10 | review_document_charts, | 10 | review_document_charts, |
| 11 | ) | 11 | ) |
| 12 | 12 | ||
| 13 | +from ReportEngine.utils.table_validator import ( | ||
| 14 | + TableValidator, | ||
| 15 | + TableRepairer, | ||
| 16 | + TableValidationResult, | ||
| 17 | + TableRepairResult, | ||
| 18 | + create_table_validator, | ||
| 19 | + create_table_repairer, | ||
| 20 | +) | ||
| 21 | + | ||
| 13 | __all__ = [ | 22 | __all__ = [ |
| 14 | "ChartReviewService", | 23 | "ChartReviewService", |
| 15 | "get_chart_review_service", | 24 | "get_chart_review_service", |
| 16 | "review_document_charts", | 25 | "review_document_charts", |
| 26 | + "TableValidator", | ||
| 27 | + "TableRepairer", | ||
| 28 | + "TableValidationResult", | ||
| 29 | + "TableRepairResult", | ||
| 30 | + "create_table_validator", | ||
| 31 | + "create_table_repairer", | ||
| 17 | ] | 32 | ] |
| @@ -83,6 +83,243 @@ CHART_REPAIR_SYSTEM_PROMPT = """你是一个专业的图表数据修复助手。 | @@ -83,6 +83,243 @@ CHART_REPAIR_SYSTEM_PROMPT = """你是一个专业的图表数据修复助手。 | ||
| 83 | """ | 83 | """ |
| 84 | 84 | ||
| 85 | 85 | ||
| 86 | +# 表格修复提示词 | ||
| 87 | +TABLE_REPAIR_SYSTEM_PROMPT = """你是一个专业的表格数据修复助手。你的任务是修复IR表格数据中的格式错误,确保表格能够正常渲染。 | ||
| 88 | + | ||
| 89 | +**标准表格数据格式:** | ||
| 90 | + | ||
| 91 | +```json | ||
| 92 | +{ | ||
| 93 | + "type": "table", | ||
| 94 | + "rows": [ | ||
| 95 | + { | ||
| 96 | + "cells": [ | ||
| 97 | + { | ||
| 98 | + "header": true, | ||
| 99 | + "blocks": [ | ||
| 100 | + { | ||
| 101 | + "type": "paragraph", | ||
| 102 | + "inlines": [{"text": "列标题", "marks": []}] | ||
| 103 | + } | ||
| 104 | + ] | ||
| 105 | + }, | ||
| 106 | + { | ||
| 107 | + "header": true, | ||
| 108 | + "blocks": [ | ||
| 109 | + { | ||
| 110 | + "type": "paragraph", | ||
| 111 | + "inlines": [{"text": "另一列", "marks": []}] | ||
| 112 | + } | ||
| 113 | + ] | ||
| 114 | + } | ||
| 115 | + ] | ||
| 116 | + }, | ||
| 117 | + { | ||
| 118 | + "cells": [ | ||
| 119 | + { | ||
| 120 | + "blocks": [ | ||
| 121 | + { | ||
| 122 | + "type": "paragraph", | ||
| 123 | + "inlines": [{"text": "数据内容", "marks": []}] | ||
| 124 | + } | ||
| 125 | + ] | ||
| 126 | + }, | ||
| 127 | + { | ||
| 128 | + "blocks": [ | ||
| 129 | + { | ||
| 130 | + "type": "paragraph", | ||
| 131 | + "inlines": [{"text": "另一数据", "marks": []}] | ||
| 132 | + } | ||
| 133 | + ] | ||
| 134 | + } | ||
| 135 | + ] | ||
| 136 | + } | ||
| 137 | + ] | ||
| 138 | +} | ||
| 139 | +``` | ||
| 140 | + | ||
| 141 | +**⚠️ 常见错误:嵌套 cells 结构** | ||
| 142 | + | ||
| 143 | +这是一个非常常见的错误,LLM 经常把同级的 cells 错误地嵌套起来: | ||
| 144 | + | ||
| 145 | +❌ **错误示例:** | ||
| 146 | +```json | ||
| 147 | +{ | ||
| 148 | + "cells": [ | ||
| 149 | + { "blocks": [...], "colspan": 1 }, | ||
| 150 | + { "cells": [ | ||
| 151 | + { "blocks": [...] }, | ||
| 152 | + { "cells": [...] } | ||
| 153 | + ] | ||
| 154 | + } | ||
| 155 | + ] | ||
| 156 | +} | ||
| 157 | +``` | ||
| 158 | + | ||
| 159 | +✅ **正确格式:** | ||
| 160 | +```json | ||
| 161 | +{ | ||
| 162 | + "cells": [ | ||
| 163 | + { "blocks": [...], "colspan": 1 }, | ||
| 164 | + { "blocks": [...] }, | ||
| 165 | + { "blocks": [...] } | ||
| 166 | + ] | ||
| 167 | +} | ||
| 168 | +``` | ||
| 169 | + | ||
| 170 | +**修复原则:** | ||
| 171 | +1. **展平嵌套 cells** - 将错误嵌套的 cells 展平为同级 | ||
| 172 | +2. **确保每个 cell 有 blocks** - 每个单元格必须有 blocks 数组 | ||
| 173 | +3. **blocks 内使用 paragraph** - 文本内容应放在 paragraph block 内 | ||
| 174 | +4. **保持数据完整性** - 不要丢失原始内容 | ||
| 175 | + | ||
| 176 | +**修复方法:** | ||
| 177 | +1. 嵌套 cells 结构 → 展平为同级 cells 数组 | ||
| 178 | +2. 缺少 blocks 字段 → 添加包含 paragraph 的 blocks | ||
| 179 | +3. 空 cells 数组 → 添加默认空单元格 | ||
| 180 | +4. 非法 cell 类型 → 转换为标准格式 | ||
| 181 | + | ||
| 182 | +请根据错误信息修复表格数据,并返回修复后的完整 table block(JSON格式)。 | ||
| 183 | +""" | ||
| 184 | + | ||
| 185 | + | ||
| 186 | +# 词云修复提示词 | ||
| 187 | +WORDCLOUD_REPAIR_SYSTEM_PROMPT = """你是一个专业的词云数据修复助手。你的任务是修复词云 widget 数据中的格式错误,确保词云能够正常渲染。 | ||
| 188 | + | ||
| 189 | +**标准词云数据格式:** | ||
| 190 | + | ||
| 191 | +```json | ||
| 192 | +{ | ||
| 193 | + "type": "widget", | ||
| 194 | + "widgetType": "wordcloud", | ||
| 195 | + "widgetId": "wordcloud-001", | ||
| 196 | + "title": "词云标题", | ||
| 197 | + "data": { | ||
| 198 | + "words": [ | ||
| 199 | + {"text": "关键词1", "weight": 10}, | ||
| 200 | + {"text": "关键词2", "weight": 8}, | ||
| 201 | + {"text": "关键词3", "weight": 6} | ||
| 202 | + ] | ||
| 203 | + } | ||
| 204 | +} | ||
| 205 | +``` | ||
| 206 | + | ||
| 207 | +**⚠️ 数据路径说明:** | ||
| 208 | + | ||
| 209 | +词云数据可以位于以下路径(按优先级): | ||
| 210 | +1. `data.words` - 推荐路径 | ||
| 211 | +2. `data.items` - 备选路径 | ||
| 212 | +3. `props.words` - 备选路径 | ||
| 213 | +4. `props.items` - 备选路径 | ||
| 214 | +5. `props.data` - 备选路径 | ||
| 215 | + | ||
| 216 | +**词云项目格式:** | ||
| 217 | + | ||
| 218 | +每个词云项目应该是一个对象,包含: | ||
| 219 | +- `text` 或 `word` 或 `label`: 词语文本(必需) | ||
| 220 | +- `weight` 或 `value`: 权重/频率(必需) | ||
| 221 | +- `category`: 类别(可选) | ||
| 222 | + | ||
| 223 | +**修复原则:** | ||
| 224 | +1. **规范化数据路径** - 优先使用 `data.words` | ||
| 225 | +2. **确保必需字段** - 每个词项必须有文本和权重 | ||
| 226 | +3. **转换兼容格式** - 将其他格式转换为标准格式 | ||
| 227 | +4. **保持数据完整性** - 不要丢失原始词语 | ||
| 228 | + | ||
| 229 | +**常见错误及修复方法:** | ||
| 230 | +1. 数据位于错误路径 → 移动到 `data.words` | ||
| 231 | +2. 缺少 weight 字段 → 根据位置生成默认权重 | ||
| 232 | +3. 使用 word 而非 text → 统一为 text 字段 | ||
| 233 | +4. 数组元素是字符串 → 转换为对象格式 | ||
| 234 | + | ||
| 235 | +请根据错误信息修复词云数据,并返回修复后的完整 widget block(JSON格式)。 | ||
| 236 | +""" | ||
| 237 | + | ||
| 238 | + | ||
def build_table_repair_prompt(
    table_block: Dict[str, Any],
    validation_errors: List[str]
) -> str:
    """
    Build the user prompt asking the LLM to repair a table block.

    Args:
        table_block: The original table block; it is serialised as
            pretty-printed JSON (non-ASCII characters kept as-is).
        validation_errors: Validation messages; each one is rendered as a
            ``- `` bullet on its own line.

    Returns:
        str: The prompt text embedding the JSON and the error bullets.
    """
    bullet_lines = [f"- {message}" for message in validation_errors]
    errors_text = "\n".join(bullet_lines)
    block_json = json.dumps(table_block, indent=2, ensure_ascii=False)

    # The template below is runtime text sent to the model; keep it verbatim.
    return f"""请修复以下表格数据中的错误:

**原始数据:**
```json
{block_json}
```

**检测到的错误:**
{errors_text}

**要求:**
1. 返回修复后的完整 table block(JSON格式)
2. 特别注意展平嵌套的 cells 结构
3. 确保每个 cell 都有 blocks 数组
4. 如果无法确定如何修复,保持原始数据

**重要的输出格式要求:**
1. 只返回纯JSON对象,不要添加任何说明文字
2. 不要使用```json```标记包裹
3. 确保JSON语法完全正确
4. 所有字符串使用双引号
"""
| 279 | + | ||
| 280 | + | ||
def build_wordcloud_repair_prompt(
    widget_block: Dict[str, Any],
    validation_errors: List[str]
) -> str:
    """
    Build the user prompt asking the LLM to repair a word-cloud widget block.

    Args:
        widget_block: The original word-cloud widget block; it is serialised
            as pretty-printed JSON (non-ASCII characters kept as-is).
        validation_errors: Validation messages; each one is rendered as a
            ``- `` bullet on its own line.

    Returns:
        str: The prompt text embedding the JSON and the error bullets.
    """
    bullet_lines = [f"- {message}" for message in validation_errors]
    errors_text = "\n".join(bullet_lines)
    block_json = json.dumps(widget_block, indent=2, ensure_ascii=False)

    # The template below is runtime text sent to the model; keep it verbatim.
    return f"""请修复以下词云数据中的错误:

**原始数据:**
```json
{block_json}
```

**检测到的错误:**
{errors_text}

**要求:**
1. 返回修复后的完整 widget block(JSON格式)
2. 确保词云数据位于 data.words 路径
3. 每个词项必须有 text 和 weight 字段
4. 如果无法确定如何修复,保持原始数据

**重要的输出格式要求:**
1. 只返回纯JSON对象,不要添加任何说明文字
2. 不要使用```json```标记包裹
3. 确保JSON语法完全正确
4. 所有字符串使用双引号
"""
| 321 | + | ||
| 322 | + | ||
| 86 | def build_chart_repair_prompt( | 323 | def build_chart_repair_prompt( |
| 87 | widget_block: Dict[str, Any], | 324 | widget_block: Dict[str, Any], |
| 88 | validation_errors: List[str] | 325 | validation_errors: List[str] |
| @@ -283,3 +520,111 @@ def create_llm_repair_functions() -> List: | @@ -283,3 +520,111 @@ def create_llm_repair_functions() -> List: | ||
| 283 | logger.info(f"图表API修复功能已启用,共 {len(repair_functions)} 个Engine可用") | 520 | logger.info(f"图表API修复功能已启用,共 {len(repair_functions)} 个Engine可用") |
| 284 | 521 | ||
| 285 | return repair_functions | 522 | return repair_functions |
| 523 | + | ||
| 524 | + | ||
def _parse_llm_json_object(response: Any) -> Dict[str, Any]:
    """
    Parse an LLM reply that is expected to contain a single JSON object.

    A reply wrapped in a Markdown code fence (with or without a language
    tag on the opening fence) has the fence removed before parsing, because
    models frequently add one even when told not to.

    Args:
        response: Raw reply returned by the LLM client.

    Returns:
        Dict[str, Any]: The decoded JSON object.

    Raises:
        ValueError: If the reply is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError``) or decodes to something other than an object.
        TypeError: If ``response`` is of a type ``json.loads`` cannot accept.
    """
    text = response
    if isinstance(text, str):
        text = text.strip()
        if text.startswith("```"):
            # Drop the opening fence line; fall back to trimming the three
            # backticks when the whole reply sits on a single line.
            head, newline, rest = text.partition("\n")
            text = rest if newline else head[3:]
            text = text.rstrip()
            if text.endswith("```"):
                text = text[:-3]

    parsed = json.loads(text)
    if not isinstance(parsed, dict):
        # Repair functions promise a block dict; a list/scalar is unusable.
        raise ValueError(
            f"expected a JSON object, got {type(parsed).__name__}"
        )
    return parsed


def create_table_repair_functions() -> List:
    """
    Create the list of LLM-backed table repair functions.

    A repair function is registered only when both
    ``settings.REPORT_ENGINE_API_KEY`` and ``settings.REPORT_ENGINE_BASE_URL``
    are set. Each function takes ``(table_block, errors)`` and returns the
    repaired block dict, or ``None`` when the call or the parsing fails.

    Returns:
        List[Callable]: Repair functions (possibly empty).
    """
    repair_functions = []

    # Repair tables through the ReportEngine LLM.
    if settings.REPORT_ENGINE_API_KEY and settings.REPORT_ENGINE_BASE_URL:
        def repair_table_with_report_engine(table_block: Dict[str, Any], errors: List[str]) -> Optional[Dict[str, Any]]:
            """Repair a table block with the ReportEngine LLM."""
            try:
                from ReportEngine.llms import LLMClient

                client = LLMClient(
                    api_key=settings.REPORT_ENGINE_API_KEY,
                    base_url=settings.REPORT_ENGINE_BASE_URL,
                    model_name=settings.REPORT_ENGINE_MODEL_NAME or "gpt-4",
                )

                prompt = build_table_repair_prompt(table_block, errors)
                response = client.invoke(
                    TABLE_REPAIR_SYSTEM_PROMPT,
                    prompt,
                    temperature=0.0,
                    top_p=0.05
                )

                if not response:
                    return None

                # Tolerates fenced replies; rejects non-object JSON.
                return _parse_llm_json_object(response)

            except Exception as e:
                # Best-effort repair: any failure is logged and reported as None.
                logger.exception(f"ReportEngine 表格修复失败: {e}")
                return None

        repair_functions.append(repair_table_with_report_engine)
        logger.debug("已添加 ReportEngine 表格修复函数")

    if not repair_functions:
        logger.warning("未配置任何 Engine API,表格 API 修复功能将不可用")
    else:
        logger.info(f"表格 API 修复功能已启用,共 {len(repair_functions)} 个 Engine 可用")

    return repair_functions


def create_wordcloud_repair_functions() -> List:
    """
    Create the list of LLM-backed word-cloud repair functions.

    A repair function is registered only when both
    ``settings.REPORT_ENGINE_API_KEY`` and ``settings.REPORT_ENGINE_BASE_URL``
    are set. Each function takes ``(widget_block, errors)`` and returns the
    repaired block dict, or ``None`` when the call or the parsing fails.

    Returns:
        List[Callable]: Repair functions (possibly empty).
    """
    repair_functions = []

    # Repair word clouds through the ReportEngine LLM.
    if settings.REPORT_ENGINE_API_KEY and settings.REPORT_ENGINE_BASE_URL:
        def repair_wordcloud_with_report_engine(widget_block: Dict[str, Any], errors: List[str]) -> Optional[Dict[str, Any]]:
            """Repair a word-cloud widget block with the ReportEngine LLM."""
            try:
                from ReportEngine.llms import LLMClient

                client = LLMClient(
                    api_key=settings.REPORT_ENGINE_API_KEY,
                    base_url=settings.REPORT_ENGINE_BASE_URL,
                    model_name=settings.REPORT_ENGINE_MODEL_NAME or "gpt-4",
                )

                prompt = build_wordcloud_repair_prompt(widget_block, errors)
                response = client.invoke(
                    WORDCLOUD_REPAIR_SYSTEM_PROMPT,
                    prompt,
                    temperature=0.0,
                    top_p=0.05
                )

                if not response:
                    return None

                # Tolerates fenced replies; rejects non-object JSON.
                return _parse_llm_json_object(response)

            except Exception as e:
                # Best-effort repair: any failure is logged and reported as None.
                logger.exception(f"ReportEngine 词云修复失败: {e}")
                return None

        repair_functions.append(repair_wordcloud_with_report_engine)
        logger.debug("已添加 ReportEngine 词云修复函数")

    if not repair_functions:
        logger.warning("未配置任何 Engine API,词云 API 修复功能将不可用")
    else:
        logger.info(f"词云 API 修复功能已启用,共 {len(repair_functions)} 个 Engine 可用")

    return repair_functions
ReportEngine/utils/table_validator.py
0 → 100644
| 1 | +""" | ||
| 2 | +表格验证和修复工具。 | ||
| 3 | + | ||
| 4 | +提供对 IR 表格数据的验证和修复能力: | ||
| 5 | +1. 验证表格数据格式是否符合 IR schema 要求 | ||
| 6 | +2. 检测嵌套 cells 结构问题 | ||
| 7 | +3. 验证 rows/cells 基本格式 | ||
| 8 | +4. 检查数据完整性 | ||
| 9 | +5. 本地规则修复常见问题 | ||
| 10 | +""" | ||
| 11 | + | ||
| 12 | +from __future__ import annotations | ||
| 13 | + | ||
| 14 | +import copy | ||
| 15 | +from typing import Any, Dict, List, Optional, Tuple | ||
| 16 | +from dataclasses import dataclass | ||
| 17 | +from loguru import logger | ||
| 18 | + | ||
| 19 | + | ||
@dataclass
class TableValidationResult:
    """Outcome of validating a table block."""
    # True when validation produced no errors.
    is_valid: bool
    # Problems that make the table invalid.
    errors: List[str]
    # Non-fatal observations.
    warnings: List[str]
    # Set when a cell carries ``cells`` instead of ``blocks``.
    nested_cells_detected: bool = False
    # Number of cells found to hold no content.
    empty_cells_count: int = 0
    # Number of cells inspected.
    total_cells_count: int = 0

    def has_critical_errors(self) -> bool:
        """Return True when the table is invalid and at least one error was recorded."""
        if self.is_valid:
            return False
        return len(self.errors) > 0
| 33 | + | ||
| 34 | + | ||
@dataclass
class TableRepairResult:
    """Outcome of a table repair attempt."""
    # Whether the repair was judged successful.
    success: bool
    # The resulting block, or None when none is available.
    repaired_block: Optional[Dict[str, Any]]
    # Human-readable descriptions of the modifications applied.
    changes: List[str]

    def has_changes(self) -> bool:
        """Return True when at least one modification was recorded."""
        return not len(self.changes) == 0
| 45 | + | ||
| 46 | + | ||
class TableValidator:
    """
    Validate the structure of an IR table block.

    Checks performed:
    1. Basic structure: the ``type`` and ``rows`` fields.
    2. Row structure: every row must carry a ``cells`` list.
    3. Cell structure: every cell must carry a ``blocks`` list.
    4. Nested cells: a cell holding ``cells`` instead of ``blocks`` is reported.
    5. Completeness: empty cells are counted and surfaced as a warning.

    Validation never raises on malformed input; every problem is reported
    through the returned errors/warnings.
    """

    def __init__(self):
        """Initialise the validator (it keeps no state)."""
        pass

    @staticmethod
    def _column_span(cell: Any) -> int:
        """
        Return how many columns ``cell`` occupies for the column-count check.

        A non-dict cell, or a ``colspan`` that cannot be converted to an
        integer, counts as one column so a bad value cannot abort validation.
        Values below 1 are clamped to 1.
        """
        if not isinstance(cell, dict):
            return 1
        try:
            span = int(cell.get('colspan', 1))
        except (TypeError, ValueError, OverflowError):
            return 1
        return max(span, 1)

    @staticmethod
    def _is_valid_span(value: Any) -> bool:
        """Return True for a positive integer span (``bool`` is rejected)."""
        return (
            isinstance(value, int)
            and not isinstance(value, bool)
            and value >= 1
        )

    @staticmethod
    def _inline_has_text(inline: Any) -> bool:
        """Return True when an inline dict carries non-blank text."""
        if not isinstance(inline, dict):
            return False
        text = inline.get('text', '')
        if isinstance(text, str):
            return bool(text.strip())
        # A bare number is real content; other non-string values are not.
        return isinstance(text, (int, float)) and not isinstance(text, bool)

    def _blocks_have_content(self, blocks: List[Any]) -> bool:
        """Return True when any block in ``blocks`` holds visible content."""
        for block in blocks:
            if not isinstance(block, dict):
                continue
            if block.get('type') == 'paragraph':
                inlines = block.get('inlines')
                # ``inlines`` may be missing or malformed; only scan sequences.
                if isinstance(inlines, (list, tuple)) and any(
                    self._inline_has_text(inline) for inline in inlines
                ):
                    return True
            elif block.get('text') or block.get('content'):
                return True
        return False

    def validate(self, table_block: Dict[str, Any]) -> TableValidationResult:
        """
        Validate a table block.

        Args:
            table_block: Block of type ``table`` with ``type``, ``rows``, etc.

        Returns:
            TableValidationResult: Errors, warnings and cell statistics.
        """
        errors: List[str] = []
        warnings: List[str] = []
        nested_cells_detected = False
        empty_cells_count = 0
        total_cells_count = 0

        # 1. The block itself must be a dict.
        if not isinstance(table_block, dict):
            errors.append("table_block 必须是字典类型")
            return TableValidationResult(
                False, errors, warnings, nested_cells_detected,
                empty_cells_count, total_cells_count
            )

        # 2. Check the declared type (reported, but validation continues).
        block_type = table_block.get('type')
        if block_type != 'table':
            errors.append(f"block type 应为 'table',实际为 '{block_type}'")

        # 3. ``rows`` must exist and be a list.
        rows = table_block.get('rows')
        if rows is None:
            errors.append("缺少 rows 字段")
            return TableValidationResult(
                False, errors, warnings, nested_cells_detected,
                empty_cells_count, total_cells_count
            )

        if not isinstance(rows, list):
            errors.append("rows 必须是数组类型")
            return TableValidationResult(
                False, errors, warnings, nested_cells_detected,
                empty_cells_count, total_cells_count
            )

        if not rows:
            warnings.append("rows 数组为空,表格可能无法正常显示")

        # 4. Validate each row and accumulate its findings.
        for row_idx, row in enumerate(rows):
            row_result = self._validate_row(row, row_idx)
            errors.extend(row_result['errors'])
            warnings.extend(row_result['warnings'])
            if row_result['nested_cells_detected']:
                nested_cells_detected = True
            empty_cells_count += row_result['empty_cells_count']
            total_cells_count += row_result['total_cells_count']

        # 5. Column-count consistency (colspan-aware; rows whose ``cells``
        #    is not a list are skipped here, they were reported above).
        column_counts = [
            sum(self._column_span(cell) for cell in row.get('cells', []))
            for row in rows
            if isinstance(row, dict) and isinstance(row.get('cells', []), list)
        ]

        if column_counts and len(set(column_counts)) > 1:
            warnings.append(
                f"各行列数不一致: {column_counts},可能导致渲染问题"
            )

        # 6. Warn when more than half of the cells are empty.
        if total_cells_count > 0 and empty_cells_count > total_cells_count * 0.5:
            warnings.append(
                f"超过50%的单元格为空 ({empty_cells_count}/{total_cells_count}),"
                "表格可能缺少数据"
            )

        is_valid = len(errors) == 0
        return TableValidationResult(
            is_valid, errors, warnings, nested_cells_detected,
            empty_cells_count, total_cells_count
        )

    def _validate_row(self, row: Any, row_idx: int) -> Dict[str, Any]:
        """
        Validate a single row.

        Returns a dict with ``errors``, ``warnings``,
        ``nested_cells_detected``, ``empty_cells_count`` and
        ``total_cells_count``.
        """
        result: Dict[str, Any] = {
            'errors': [],
            'warnings': [],
            'nested_cells_detected': False,
            'empty_cells_count': 0,
            'total_cells_count': 0,
        }

        if not isinstance(row, dict):
            result['errors'].append(f"rows[{row_idx}] 必须是对象类型")
            return result

        cells = row.get('cells')
        if cells is None:
            result['errors'].append(f"rows[{row_idx}] 缺少 cells 字段")
            return result

        if not isinstance(cells, list):
            result['errors'].append(f"rows[{row_idx}].cells 必须是数组类型")
            return result

        if not cells:
            result['warnings'].append(f"rows[{row_idx}].cells 数组为空")

        # Validate every cell; each one counts towards the total.
        for cell_idx, cell in enumerate(cells):
            cell_result = self._validate_cell(cell, row_idx, cell_idx)
            result['errors'].extend(cell_result['errors'])
            result['warnings'].extend(cell_result['warnings'])
            if cell_result['nested_cells_detected']:
                result['nested_cells_detected'] = True
            if cell_result['is_empty']:
                result['empty_cells_count'] += 1
            result['total_cells_count'] += 1

        return result

    def _validate_cell(self, cell: Any, row_idx: int, cell_idx: int) -> Dict[str, Any]:
        """
        Validate a single cell.

        Returns a dict with ``errors``, ``warnings``,
        ``nested_cells_detected`` and ``is_empty``.
        """
        result: Dict[str, Any] = {
            'errors': [],
            'warnings': [],
            'nested_cells_detected': False,
            'is_empty': False,
        }

        if not isinstance(cell, dict):
            result['errors'].append(
                f"rows[{row_idx}].cells[{cell_idx}] 必须是对象类型"
            )
            return result

        # A cell holding ``cells`` but no ``blocks`` is the nested-cells
        # mistake LLMs commonly make.
        if 'cells' in cell and 'blocks' not in cell:
            result['nested_cells_detected'] = True
            result['errors'].append(
                f"rows[{row_idx}].cells[{cell_idx}] 检测到错误的嵌套 cells 结构,"
                "应该是 blocks 而不是 cells"
            )
            return result

        # ``blocks`` must exist and be a list.
        blocks = cell.get('blocks')
        if blocks is None:
            result['errors'].append(
                f"rows[{row_idx}].cells[{cell_idx}] 缺少 blocks 字段"
            )
            return result

        if not isinstance(blocks, list):
            result['errors'].append(
                f"rows[{row_idx}].cells[{cell_idx}].blocks 必须是数组类型"
            )
            return result

        # An empty list, or blocks without any visible content, is "empty".
        if not self._blocks_have_content(blocks):
            result['is_empty'] = True

        # Invalid spans only produce warnings.
        colspan = cell.get('colspan')
        if colspan is not None and not self._is_valid_span(colspan):
            result['warnings'].append(
                f"rows[{row_idx}].cells[{cell_idx}].colspan 值无效: {colspan}"
            )

        rowspan = cell.get('rowspan')
        if rowspan is not None and not self._is_valid_span(rowspan):
            result['warnings'].append(
                f"rows[{row_idx}].cells[{cell_idx}].rowspan 值无效: {rowspan}"
            )

        return result

    def can_render(self, table_block: Dict[str, Any]) -> bool:
        """
        Quick check: return True when ``validate`` reports no errors.

        Args:
            table_block: Block of type ``table``.

        Returns:
            bool: Whether the table validated without errors.
        """
        result = self.validate(table_block)
        return result.is_valid

    def has_nested_cells(self, table_block: Dict[str, Any]) -> bool:
        """
        Return True when the table contains a nested ``cells`` structure.

        Args:
            table_block: Block of type ``table``.

        Returns:
            bool: Whether nested cells were detected.
        """
        result = self.validate(table_block)
        return result.nested_cells_detected
| 299 | + | ||
| 300 | + | ||
class TableRepairer:
    """
    Rule-based repair of IR table blocks.

    Strategy:
    1. Flatten nested ``cells`` structures.
    2. Add missing ``blocks`` fields.
    3. Normalise each cell's structure.
    4. Re-validate the repaired block.
    """

    # Keys probed, in this order, for content when a cell has no ``blocks``.
    _TEXT_FALLBACK_KEYS = ('text', 'content', 'value')

    def __init__(self, validator: Optional[TableValidator] = None):
        """
        Initialise the repairer.

        Args:
            validator: Validator instance; a new ``TableValidator`` is
                created when omitted.
        """
        self.validator = validator or TableValidator()

    def repair(
        self,
        table_block: Dict[str, Any],
        validation_result: Optional[TableValidationResult] = None
    ) -> TableRepairResult:
        """
        Try to repair a table block.

        The input is never mutated; repairs are applied to a deep copy.

        Args:
            table_block: Block of type ``table``.
            validation_result: Prior validation result; the block is
                validated first when this is omitted.

        Returns:
            TableRepairResult: ``success`` reflects re-validation of the
            repaired block. A non-dict ``table_block`` cannot be repaired
            and yields ``TableRepairResult(False, None, [])``.
        """
        # A non-dict block has nothing to repair in place; report failure
        # instead of failing on the membership / item-assignment below.
        if not isinstance(table_block, dict):
            return TableRepairResult(False, None, [])

        # 1. Validate first when no result was supplied.
        if validation_result is None:
            validation_result = self.validator.validate(table_block)

        # 2. Already valid: hand the original back untouched.
        if validation_result.is_valid and not validation_result.nested_cells_detected:
            return TableRepairResult(True, table_block, [])

        # 3. Repair a deep copy.
        repaired = copy.deepcopy(table_block)
        changes: List[str] = []

        if 'type' not in repaired:
            repaired['type'] = 'table'
            changes.append("添加缺失的 type 字段")

        if not isinstance(repaired.get('rows'), list):
            repaired['rows'] = []
            changes.append("添加缺失的 rows 字段")

        repaired_rows: List[Dict[str, Any]] = []
        for row_idx, row in enumerate(repaired['rows']):
            repaired_row, row_changes = self._repair_row(row, row_idx)
            repaired_rows.append(repaired_row)
            changes.extend(row_changes)

        repaired['rows'] = repaired_rows

        # 4. Re-validate to decide whether the repair succeeded.
        repaired_validation = self.validator.validate(repaired)
        success = repaired_validation.is_valid

        if not success:
            logger.warning(
                f"表格修复后仍有问题: {repaired_validation.errors}"
            )

        return TableRepairResult(success, repaired, changes)

    def _repair_row(
        self, row: Any, row_idx: int
    ) -> Tuple[Dict[str, Any], List[str]]:
        """Repair one row; return the repaired row and the change log."""
        changes: List[str] = []

        if not isinstance(row, dict):
            return {'cells': [self._default_cell()]}, [
                f"rows[{row_idx}] 类型错误,已重建"
            ]

        repaired_row = dict(row)

        # A missing or non-list ``cells`` is replaced by one default cell.
        if not isinstance(repaired_row.get('cells'), list):
            repaired_row['cells'] = [self._default_cell()]
            changes.append(f"rows[{row_idx}] 添加缺失的 cells 字段")
            return repaired_row, changes

        repaired_cells: List[Dict[str, Any]] = []
        for cell_idx, cell in enumerate(repaired_row['cells']):
            if isinstance(cell, dict) and 'cells' in cell and 'blocks' not in cell:
                # Wrongly nested cells become siblings of this row.
                repaired_cells.extend(self._flatten_nested_cells(cell))
                changes.append(
                    f"rows[{row_idx}].cells[{cell_idx}] 展平嵌套 cells 结构"
                )
            else:
                repaired_cell, cell_changes = self._repair_cell(cell, row_idx, cell_idx)
                repaired_cells.append(repaired_cell)
                changes.extend(cell_changes)

        repaired_row['cells'] = repaired_cells
        return repaired_row, changes

    def _repair_cell(
        self, cell: Any, row_idx: int, cell_idx: int
    ) -> Tuple[Dict[str, Any], List[str]]:
        """Repair one cell; return the repaired cell and the change log."""
        changes: List[str] = []

        if not isinstance(cell, dict):
            if isinstance(cell, (str, int, float)):
                return {
                    'blocks': [self._text_to_paragraph(str(cell))]
                }, [f"rows[{row_idx}].cells[{cell_idx}] 转换为标准格式"]
            return self._default_cell(), [
                f"rows[{row_idx}].cells[{cell_idx}] 类型错误,已重建"
            ]

        repaired_cell = dict(cell)

        if 'blocks' not in repaired_cell:
            # Recover content from a fallback key when one is present.
            text = ''
            for key in self._TEXT_FALLBACK_KEYS:
                value = repaired_cell.get(key)
                # Numeric zero is real table data and must not be dropped by
                # a plain truthiness test; other falsy values are skipped.
                is_number = (
                    isinstance(value, (int, float))
                    and not isinstance(value, bool)
                )
                if value or is_number:
                    text = str(value)
                    break

            repaired_cell['blocks'] = [self._text_to_paragraph(text)]
            changes.append(
                f"rows[{row_idx}].cells[{cell_idx}] 添加缺失的 blocks 字段"
            )
        elif not isinstance(repaired_cell['blocks'], list):
            repaired_cell['blocks'] = [self._text_to_paragraph('')]
            changes.append(
                f"rows[{row_idx}].cells[{cell_idx}].blocks 类型错误,已重建"
            )
        elif len(repaired_cell['blocks']) == 0:
            repaired_cell['blocks'] = [self._text_to_paragraph('')]
            changes.append(
                f"rows[{row_idx}].cells[{cell_idx}].blocks 为空,添加默认内容"
            )

        return repaired_cell, changes

    def _flatten_nested_cells(self, cell: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Flatten a wrongly nested ``cells`` structure into a flat cell list.

        Nested wrappers are expanded recursively. Every other dict entry is
        passed through ``_repair_cell`` so flattened cells get the same
        ``blocks`` normalisation as top-level ones. Scalars become text
        cells; entries of any other type are skipped. At least one
        (default) cell is always returned.
        """
        nested_cells = cell.get('cells', [])
        if not isinstance(nested_cells, list):
            return [self._default_cell()]

        result: List[Dict[str, Any]] = []
        for nested in nested_cells:
            if isinstance(nested, dict):
                if 'cells' in nested and 'blocks' not in nested:
                    # Another nesting level: keep flattening.
                    result.extend(self._flatten_nested_cells(nested))
                else:
                    # Indices are unknown after flattening; the change log
                    # for these inner repairs is intentionally discarded.
                    repaired, _ = self._repair_cell(nested, 0, 0)
                    result.append(repaired)
            elif isinstance(nested, (str, int, float)):
                result.append({
                    'blocks': [self._text_to_paragraph(str(nested))]
                })

        return result if result else [self._default_cell()]

    def _default_cell(self) -> Dict[str, Any]:
        """Create a cell holding a single empty paragraph."""
        return {
            'blocks': [self._text_to_paragraph('')]
        }

    def _text_to_paragraph(self, text: str) -> Dict[str, Any]:
        """Wrap ``text`` in a paragraph block with one unmarked inline."""
        return {
            'type': 'paragraph',
            'inlines': [{'text': text, 'marks': []}]
        }
| 495 | + | ||
| 496 | + | ||
def create_table_validator() -> TableValidator:
    """Factory: build and return a new ``TableValidator``."""
    validator = TableValidator()
    return validator
| 500 | + | ||
| 501 | + | ||
def create_table_repairer(
    validator: Optional[TableValidator] = None
) -> TableRepairer:
    """Factory: build a ``TableRepairer``, forwarding the optional validator."""
    repairer = TableRepairer(validator)
    return repairer
| 507 | + | ||
| 508 | + | ||
| 509 | +__all__ = [ | ||
| 510 | + 'TableValidator', | ||
| 511 | + 'TableRepairer', | ||
| 512 | + 'TableValidationResult', | ||
| 513 | + 'TableRepairResult', | ||
| 514 | + 'create_table_validator', | ||
| 515 | + 'create_table_repairer', | ||
| 516 | +] |
-
Please register or login to post a comment