马一丁

Review metadata persisted to the IR file pollutes the data
@@ -427,13 +427,80 @@ class ChartReviewService: @@ -427,13 +427,80 @@ class ChartReviewService:
427 f"失败 {self._stats['failed']} 个" 427 f"失败 {self._stats['failed']} 个"
428 ) 428 )
429 429
  430 + # 内部元数据键,不应保存到 IR 文件
  431 + _INTERNAL_METADATA_KEYS = frozenset([
  432 + "_chart_reviewed",
  433 + "_chart_renderable",
  434 + "_chart_review_status",
  435 + "_chart_review_method",
  436 + "_chart_error_reason",
  437 + ])
  438 +
  439 + def _strip_internal_metadata(self, document_ir: Dict[str, Any]) -> Dict[str, Any]:
  440 + """
  441 + 移除文档中所有内部元数据键,返回干净的副本用于持久化。
  442 +
  443 + 这些内部标记仅用于渲染过程的状态跟踪,不应保存到 IR 文件中,
  444 + 以避免污染文档结构和导致重复使用时的不一致行为。
  445 + """
  446 + cleaned = copy.deepcopy(document_ir)
  447 +
  448 + def strip_from_block(block: Dict[str, Any]) -> None:
  449 + """递归移除 block 及其嵌套结构中的内部元数据"""
  450 + if not isinstance(block, dict):
  451 + return
  452 +
  453 + # 移除当前 block 的内部键
  454 + for key in self._INTERNAL_METADATA_KEYS:
  455 + block.pop(key, None)
  456 +
  457 + # 递归处理嵌套的 blocks
  458 + nested_blocks = block.get("blocks")
  459 + if isinstance(nested_blocks, list):
  460 + for nested in nested_blocks:
  461 + strip_from_block(nested)
  462 +
  463 + # 处理 list 类型的 items
  464 + if block.get("type") == "list":
  465 + for item in block.get("items", []):
  466 + if isinstance(item, list):
  467 + for sub_block in item:
  468 + strip_from_block(sub_block)
  469 +
  470 + # 处理 table 类型的 cells
  471 + if block.get("type") == "table":
  472 + for row in block.get("rows", []):
  473 + if not isinstance(row, dict):
  474 + continue
  475 + for cell in row.get("cells", []):
  476 + if isinstance(cell, dict):
  477 + cell_blocks = cell.get("blocks", [])
  478 + if isinstance(cell_blocks, list):
  479 + for cell_block in cell_blocks:
  480 + strip_from_block(cell_block)
  481 +
  482 + # 处理所有章节
  483 + for chapter in cleaned.get("chapters", []) or []:
  484 + if not isinstance(chapter, dict):
  485 + continue
  486 + blocks = chapter.get("blocks", [])
  487 + if isinstance(blocks, list):
  488 + for block in blocks:
  489 + strip_from_block(block)
  490 +
  491 + return cleaned
  492 +
430 def _save_ir_to_file(self, document_ir: Dict[str, Any], file_path: str | Path) -> None: 493 def _save_ir_to_file(self, document_ir: Dict[str, Any], file_path: str | Path) -> None:
431 - """保存 IR 到文件""" 494 + """保存 IR 到文件(移除内部元数据后)"""
432 try: 495 try:
433 path = Path(file_path) 496 path = Path(file_path)
434 path.parent.mkdir(parents=True, exist_ok=True) 497 path.parent.mkdir(parents=True, exist_ok=True)
  498 +
  499 + # 移除内部元数据键,保持 IR 文件干净
  500 + cleaned_ir = self._strip_internal_metadata(document_ir)
  501 +
435 path.write_text( 502 path.write_text(
436 - json.dumps(document_ir, ensure_ascii=False, indent=2), 503 + json.dumps(cleaned_ir, ensure_ascii=False, indent=2),
437 encoding="utf-8" 504 encoding="utf-8"
438 ) 505 )
439 logger.info(f"ChartReviewService: 修复后的 IR 已保存到 {path}") 506 logger.info(f"ChartReviewService: 修复后的 IR 已保存到 {path}")