Showing
1 changed file
with
69 additions
and
2 deletions
| @@ -427,13 +427,80 @@ class ChartReviewService: | @@ -427,13 +427,80 @@ class ChartReviewService: | ||
| 427 | f"失败 {self._stats['failed']} 个" | 427 | f"失败 {self._stats['failed']} 个" |
| 428 | ) | 428 | ) |
| 429 | 429 | ||
# Internal metadata keys; they must not be saved to the IR file.
_INTERNAL_METADATA_KEYS = frozenset({
    "_chart_reviewed",
    "_chart_renderable",
    "_chart_review_status",
    "_chart_review_method",
    "_chart_error_reason",
})
| 438 | + | ||
def _strip_internal_metadata(self, document_ir: Dict[str, Any]) -> Dict[str, Any]:
    """
    Return a deep copy of ``document_ir`` with all internal metadata keys removed.

    The keys listed in ``self._INTERNAL_METADATA_KEYS`` are markers used only
    for state tracking during rendering. They should not be written to the IR
    file, where they would pollute the document structure and cause
    inconsistent behavior when the file is reused.

    The input document is never mutated. Keys are removed from every block
    reachable through:

    * ``chapters[*].blocks``
    * a block's nested ``blocks`` list
    * the ``items`` of a ``type == "list"`` block (each item is a list of blocks)
    * ``rows[*].cells[*].blocks`` of a ``type == "table"`` block

    Containers that are missing, ``None`` or of an unexpected type are skipped
    rather than raising, so a malformed IR can still be saved.

    Args:
        document_ir: The document IR to clean.

    Returns:
        A cleaned deep copy of ``document_ir``.
    """
    cleaned = copy.deepcopy(document_ir)
    internal_keys = self._INTERNAL_METADATA_KEYS

    def as_sequence(value: Any) -> Any:
        """Return ``value`` if it is a list/tuple, otherwise an empty tuple."""
        # ``dict.get(key, [])`` only falls back when the key is absent; a key
        # that is present with ``None`` (JSON null) must be guarded here.
        return value if isinstance(value, (list, tuple)) else ()

    def strip_from_block(block: Any) -> None:
        """Recursively remove internal keys from a block and its nested blocks."""
        if not isinstance(block, dict):
            return

        # Remove the internal keys from the current block.
        for key in internal_keys:
            block.pop(key, None)

        # Recurse into nested blocks.
        nested_blocks = block.get("blocks")
        if isinstance(nested_blocks, list):
            for nested in nested_blocks:
                strip_from_block(nested)

        block_type = block.get("type")
        if block_type == "list":
            # Each list item is itself a list of blocks.
            for item in as_sequence(block.get("items")):
                if isinstance(item, list):
                    for sub_block in item:
                        strip_from_block(sub_block)
        elif block_type == "table":
            # Only the blocks inside each cell are cleaned; rows and cells
            # themselves are left as they are.
            for row in as_sequence(block.get("rows")):
                if not isinstance(row, dict):
                    continue
                for cell in as_sequence(row.get("cells")):
                    if not isinstance(cell, dict):
                        continue
                    cell_blocks = cell.get("blocks")
                    if isinstance(cell_blocks, list):
                        for cell_block in cell_blocks:
                            strip_from_block(cell_block)

    # Walk every chapter; only the blocks inside a chapter are cleaned.
    for chapter in as_sequence(cleaned.get("chapters")):
        if not isinstance(chapter, dict):
            continue
        chapter_blocks = chapter.get("blocks")
        if isinstance(chapter_blocks, list):
            for block in chapter_blocks:
                strip_from_block(block)

    return cleaned
| 492 | + | ||
| 430 | def _save_ir_to_file(self, document_ir: Dict[str, Any], file_path: str | Path) -> None: | 493 | def _save_ir_to_file(self, document_ir: Dict[str, Any], file_path: str | Path) -> None: |
| 431 | - """保存 IR 到文件""" | 494 | + """保存 IR 到文件(移除内部元数据后)""" |
| 432 | try: | 495 | try: |
| 433 | path = Path(file_path) | 496 | path = Path(file_path) |
| 434 | path.parent.mkdir(parents=True, exist_ok=True) | 497 | path.parent.mkdir(parents=True, exist_ok=True) |
| 498 | + | ||
| 499 | + # 移除内部元数据键,保持 IR 文件干净 | ||
| 500 | + cleaned_ir = self._strip_internal_metadata(document_ir) | ||
| 501 | + | ||
| 435 | path.write_text( | 502 | path.write_text( |
| 436 | - json.dumps(document_ir, ensure_ascii=False, indent=2), | 503 | + json.dumps(cleaned_ir, ensure_ascii=False, indent=2), |
| 437 | encoding="utf-8" | 504 | encoding="utf-8" |
| 438 | ) | 505 | ) |
| 439 | logger.info(f"ChartReviewService: 修复后的 IR 已保存到 {path}") | 506 | logger.info(f"ChartReviewService: 修复后的 IR 已保存到 {path}") |
-
Please register or login to post a comment