马一丁

Avoid repeated chart repairs

... ... @@ -182,6 +182,18 @@ class HTMLRenderer:
self._pdf_font_base64 = ""
return self._pdf_font_base64
def _reset_chart_validation_stats(self) -> None:
    """Zero all chart-validation counters and clear the counted-failure markers.

    Replaces ``self.chart_validation_stats`` with a fresh dict whose five
    counters are all 0, and replaces ``self._chart_failure_recorded`` with an
    empty set. No other attribute is touched; per the original author's note,
    the cache of failure reasons is intentionally kept across renders.
    """
    counter_names = ('total', 'valid', 'repaired_locally', 'repaired_api', 'failed')
    # A brand-new dict each time, so earlier references to the old stats stay frozen.
    self.chart_validation_stats = dict.fromkeys(counter_names, 0)
    # Only the "already counted in this render" markers are dropped here.
    self._chart_failure_recorded = set()
def _build_script_with_fallback(
self,
inline_code: str,
... ... @@ -267,6 +279,8 @@ class HTMLRenderer:
str: 可直接写入磁盘的完整HTML文档。
"""
self.document = document_ir or {}
# 先对图表做统一审查与修复,并将结果回写,供后续PDF/HTML共用
self.review_and_patch_document(self.document, reset_stats=True)
self.widget_scripts = []
self.chart_counter = 0
self.heading_counter = 0
... ... @@ -282,17 +296,6 @@ class HTMLRenderer:
self.heading_label_map = self._compute_heading_labels(self.chapters)
self.toc_entries = self._collect_toc_entries(self.chapters)
# 重置图表验证统计
self.chart_validation_stats = {
'total': 0,
'valid': 0,
'repaired_locally': 0,
'repaired_api': 0,
'failed': 0
}
# 每次渲染重新统计失败计数,但保留失败原因,避免重复LLM调用
self._chart_failure_recorded = set()
metadata = self.metadata
theme_tokens = metadata.get("themeTokens") or self.document.get("themeTokens", {})
title = metadata.get("title") or metadata.get("query") or "智能舆情报告"
... ... @@ -2087,6 +2090,31 @@ class HTMLRenderer:
if cache_key:
self._chart_failure_recorded.add(cache_key)
def _apply_cached_review_stats(self, block: Dict[str, Any]) -> None:
    """Re-count an already reviewed chart without validating or repairing it again.

    Used when the statistics were reset but the block already carries review
    markers: the counters are rebuilt from ``_chart_review_status`` and
    ``_chart_review_method`` stored on the block, so the repairer is not
    invoked a second time.

    Args:
        block: Widget block carrying the cached review markers. Anything that
            is not a dict is ignored.
    """
    if not isinstance(block, dict):
        return

    # Missing/empty markers fall back to "valid" / "none".
    review_status = block.get("_chart_review_status") or "valid"
    review_method = (block.get("_chart_review_method") or "none").lower()
    key = self._chart_cache_key(block)

    counters = self.chart_validation_stats
    counters['total'] += 1

    if review_status == "failed":
        # Failure counting is delegated so the shared bookkeeping stays in one place.
        self._record_chart_failure_stat(key)
        return

    if review_status != "repaired":
        counters['valid'] += 1
        return

    # Repaired: only an explicit "api" method lands in the API bucket.
    bucket = 'repaired_api' if review_method == "api" else 'repaired_locally'
    counters[bucket] += 1
def _format_chart_error_reason(
self,
validation_result: ValidationResult | None = None,
... ... @@ -2211,6 +2239,177 @@ class HTMLRenderer:
if labels_from_data:
data_ref["labels"] = labels_from_data
def _ensure_chart_reviewed(
self,
block: Dict[str, Any],
chapter_context: Dict[str, Any] | None = None,
*,
increment_stats: bool = True
) -> tuple[bool, str | None]:
"""
确保图表已完成审查/修复,并将结果回写到原始block。
返回:
(renderable, fail_reason)
"""
if not isinstance(block, dict):
return True, None
widget_type = block.get('widgetType', '')
is_chart = isinstance(widget_type, str) and widget_type.startswith('chart.js')
if not is_chart:
return True, None
is_wordcloud = 'wordcloud' in widget_type.lower() if isinstance(widget_type, str) else False
cache_key = self._chart_cache_key(block)
# 已有失败记录或显式标记为不可渲染,直接复用结果
if block.get("_chart_renderable") is False:
if increment_stats:
self.chart_validation_stats['total'] += 1
self._record_chart_failure_stat(cache_key)
reason = block.get("_chart_error_reason")
block["_chart_reviewed"] = True
block["_chart_review_status"] = block.get("_chart_review_status") or "failed"
block["_chart_review_method"] = block.get("_chart_review_method") or "none"
if reason:
self._note_chart_failure(cache_key, reason)
return False, reason
if block.get("_chart_reviewed"):
if increment_stats:
self._apply_cached_review_stats(block)
failed, cached_reason = self._has_chart_failure(block)
renderable = not failed and block.get("_chart_renderable", True) is not False
return renderable, block.get("_chart_error_reason") or cached_reason
# 首次审查:先补全结构,再验证/修复
self._normalize_chart_block(block, chapter_context)
if increment_stats:
self.chart_validation_stats['total'] += 1
if is_wordcloud:
if increment_stats:
self.chart_validation_stats['valid'] += 1
block["_chart_reviewed"] = True
block["_chart_review_status"] = "valid"
block["_chart_review_method"] = "none"
return True, None
validation_result = self.chart_validator.validate(block)
if not validation_result.is_valid:
logger.warning(
f"图表 {block.get('widgetId', 'unknown')} 验证失败: {validation_result.errors}"
)
repair_result = self.chart_repairer.repair(block, validation_result)
if repair_result.success and repair_result.repaired_block:
# 修复成功,回写修复后的数据
repaired_block = repair_result.repaired_block
block.clear()
block.update(repaired_block)
method = repair_result.method or "local"
logger.info(
f"图表 {block.get('widgetId', 'unknown')} 修复成功 "
f"(方法: {method}): {repair_result.changes}"
)
if increment_stats:
if method == 'local':
self.chart_validation_stats['repaired_locally'] += 1
elif method == 'api':
self.chart_validation_stats['repaired_api'] += 1
block["_chart_review_status"] = "repaired"
block["_chart_review_method"] = method
block["_chart_reviewed"] = True
return True, None
# 修复失败,记录失败并输出占位提示
fail_reason = self._format_chart_error_reason(validation_result)
block["_chart_renderable"] = False
block["_chart_error_reason"] = fail_reason
block["_chart_review_status"] = "failed"
block["_chart_review_method"] = "none"
block["_chart_reviewed"] = True
self._note_chart_failure(cache_key, fail_reason)
if increment_stats:
self._record_chart_failure_stat(cache_key)
logger.warning(
f"图表 {block.get('widgetId', 'unknown')} 修复失败,已跳过渲染: {fail_reason}"
)
return False, fail_reason
# 验证通过
if increment_stats:
self.chart_validation_stats['valid'] += 1
if validation_result.warnings:
logger.info(
f"图表 {block.get('widgetId', 'unknown')} 验证通过,"
f"但有警告: {validation_result.warnings}"
)
block["_chart_review_status"] = "valid"
block["_chart_review_method"] = "none"
block["_chart_reviewed"] = True
return True, None
def review_and_patch_document(
self,
document_ir: Dict[str, Any],
*,
reset_stats: bool = True,
clone: bool = False
) -> Dict[str, Any]:
"""
全局审查并修复图表,将修复结果回写到原始 IR,避免多次渲染重复修复。
参数:
document_ir: 原始 Document IR
reset_stats: 是否重置统计数据
clone: 是否返回修复后的深拷贝(原始 IR 仍会被回写修复结果)
返回:
修复后的 IR(可能是原对象或其深拷贝)
"""
if reset_stats:
self._reset_chart_validation_stats()
target_ir = document_ir or {}
def _walk_blocks(blocks: list, chapter_ctx: Dict[str, Any] | None = None) -> None:
for blk in blocks or []:
if not isinstance(blk, dict):
continue
if blk.get("type") == "widget":
self._ensure_chart_reviewed(blk, chapter_ctx, increment_stats=True)
nested_blocks = blk.get("blocks")
if isinstance(nested_blocks, list):
_walk_blocks(nested_blocks, chapter_ctx)
if blk.get("type") == "list":
for item in blk.get("items", []):
if isinstance(item, list):
_walk_blocks(item, chapter_ctx)
if blk.get("type") == "table":
for row in blk.get("rows", []):
cells = row.get("cells", [])
for cell in cells:
if isinstance(cell, dict):
cell_blocks = cell.get("blocks", [])
if isinstance(cell_blocks, list):
_walk_blocks(cell_blocks, chapter_ctx)
for chapter in target_ir.get("chapters", []) or []:
if not isinstance(chapter, dict):
continue
_walk_blocks(chapter.get("blocks", []), chapter)
return copy.deepcopy(target_ir) if clone else target_ir
def _render_widget(self, block: Dict[str, Any]) -> str:
"""
渲染Chart.js等交互组件的占位容器,并记录配置JSON。
... ... @@ -2230,75 +2429,28 @@ class HTMLRenderer:
返回:
str: 含canvas与配置脚本的HTML。
"""
# 先在block层面做一次容错补全(scales、章节级数据等)
self._normalize_chart_block(block, getattr(self, "_current_chapter", None))
# 统计
# 统一的审查/修复入口,避免后续重复修复
widget_type = block.get('widgetType', '')
is_chart = isinstance(widget_type, str) and widget_type.startswith('chart.js')
is_wordcloud = isinstance(widget_type, str) and 'wordcloud' in widget_type.lower()
widget_id = block.get('widgetId')
cache_key = self._chart_cache_key(block) if is_chart else ""
props_snapshot = block.get("props") if isinstance(block.get("props"), dict) else {}
display_title = props_snapshot.get("title") or block.get("title") or widget_id or "图表"
reviewed = bool(block.get("_chart_reviewed"))
renderable = True
fail_reason = None
if is_chart:
self.chart_validation_stats['total'] += 1
# 词云使用专用渲染逻辑,不按Chart.js规则验证,直接跳过防止误判
if is_wordcloud:
self.chart_validation_stats['valid'] += 1
else:
# 如果此前已记录失败,直接使用占位提示,避免重复修复
has_failed, cached_reason = self._has_chart_failure(block)
if has_failed:
self._record_chart_failure_stat(cache_key)
reason = cached_reason or "LLM返回的图表信息格式有误,无法正常显示"
return self._render_chart_error_placeholder(display_title, reason, widget_id)
# 验证图表数据
validation_result = self.chart_validator.validate(block)
renderable, fail_reason = self._ensure_chart_reviewed(
block,
getattr(self, "_current_chapter", None),
increment_stats=not reviewed
)
if not validation_result.is_valid:
logger.warning(
f"图表 {block.get('widgetId', 'unknown')} 验证失败: {validation_result.errors}"
)
widget_id = block.get('widgetId')
props_snapshot = block.get("props") if isinstance(block.get("props"), dict) else {}
display_title = props_snapshot.get("title") or block.get("title") or widget_id or "图表"
# 尝试修复
repair_result = self.chart_repairer.repair(block, validation_result)
if repair_result.success and repair_result.repaired_block:
# 修复成功,使用修复后的数据
block = repair_result.repaired_block
logger.info(
f"图表 {block.get('widgetId', 'unknown')} 修复成功 "
f"(方法: {repair_result.method}): {repair_result.changes}"
)
# 更新统计
if repair_result.method == 'local':
self.chart_validation_stats['repaired_locally'] += 1
elif repair_result.method == 'api':
self.chart_validation_stats['repaired_api'] += 1
else:
# 修复失败,记录失败并输出占位提示
fail_reason = self._format_chart_error_reason(validation_result)
block["_chart_renderable"] = False
block["_chart_error_reason"] = fail_reason
self._note_chart_failure(cache_key, fail_reason)
self._record_chart_failure_stat(cache_key)
logger.warning(
f"图表 {block.get('widgetId', 'unknown')} 修复失败,已跳过渲染: {fail_reason}"
)
return self._render_chart_error_placeholder(display_title, fail_reason, widget_id)
else:
# 验证通过
self.chart_validation_stats['valid'] += 1
if validation_result.warnings:
logger.info(
f"图表 {block.get('widgetId', 'unknown')} 验证通过,"
f"但有警告: {validation_result.warnings}"
)
if is_chart and not renderable:
reason = fail_reason or "LLM返回的图表信息格式有误,无法正常显示"
return self._render_chart_error_placeholder(display_title, reason, widget_id)
# 渲染图表HTML
self.chart_counter += 1
... ...
... ... @@ -157,10 +157,11 @@ class PDFRenderer:
def _preprocess_charts(self, document_ir: Dict[str, Any]) -> Dict[str, Any]:
"""
预处理图表:验证和修复所有图表数据
预处理图表:验证并修复所有图表数据,结果回写原始IR。
这个方法确保在转换为SVG之前,所有图表数据都是有效的。
使用与HTMLRenderer相同的验证和修复逻辑,保证PDF和HTML的一致性。
先统一审查并修复图表,把修复结果直接写回传入的 IR,
然后返回修复后的深拷贝供后续 SVG/词云转换使用,避免
HTML 和 PDF 分别重复触发 ChartRepairer。
参数:
document_ir: Document IR数据
... ... @@ -168,101 +169,24 @@ class PDFRenderer:
返回:
Dict[str, Any]: 修复后的Document IR(深拷贝)
"""
# 深拷贝以避免修改原始IR
ir_copy = copy.deepcopy(document_ir)
repair_stats = {
'total': 0,
'repaired': 0,
'failed': 0
}
def repair_widgets_in_blocks(blocks: list, chapter_context: Dict[str, Any] | None = None) -> None:
"""递归修复blocks中的所有widget"""
for block in blocks:
if not isinstance(block, dict):
continue
# 处理widget类型
if block.get('type') == 'widget':
# 先用HTML渲染器的容错逻辑补全字段
try:
self.html_renderer._normalize_chart_block(block, chapter_context)
except Exception as exc: # 防御性处理,避免单个图表阻断流程
logger.debug(f"预处理图表 {block.get('widgetId')} 时出错: {exc}")
widget_type = block.get('widgetType', '')
if widget_type.startswith('chart.js'):
repair_stats['total'] += 1
# 使用HTMLRenderer的验证器和修复器
validation = self.html_renderer.chart_validator.validate(block)
if not validation.is_valid:
logger.debug(f"图表 {block.get('widgetId')} 需要修复: {validation.errors}")
# 尝试修复
repair_result = self.html_renderer.chart_repairer.repair(block, validation)
if repair_result.success and repair_result.repaired_block:
# 更新block内容(在副本中)
block.update(repair_result.repaired_block)
repair_stats['repaired'] += 1
logger.debug(
f"图表 {block.get('widgetId')} 已修复 "
f"(方法: {repair_result.method})"
)
else:
repair_stats['failed'] += 1
reason = self.html_renderer._format_chart_error_reason(validation)
block["_chart_renderable"] = False
block["_chart_error_reason"] = reason
self.html_renderer._note_chart_failure(
self.html_renderer._chart_cache_key(block),
reason
)
logger.warning(
f"图表 {block.get('widgetId')} 修复失败,将使用占位提示: {reason}"
)
# 递归处理嵌套的blocks
nested_blocks = block.get('blocks')
if isinstance(nested_blocks, list):
repair_widgets_in_blocks(nested_blocks, chapter_context)
# 处理列表项
if block.get('type') == 'list':
items = block.get('items', [])
for item in items:
if isinstance(item, list):
repair_widgets_in_blocks(item, chapter_context)
# 处理表格单元格
if block.get('type') == 'table':
rows = block.get('rows', [])
for row in rows:
cells = row.get('cells', [])
for cell in cells:
cell_blocks = cell.get('blocks', [])
if isinstance(cell_blocks, list):
repair_widgets_in_blocks(cell_blocks, chapter_context)
# 处理所有章节
chapters = ir_copy.get('chapters', [])
for chapter in chapters:
blocks = chapter.get('blocks', [])
repair_widgets_in_blocks(blocks, chapter)
reviewed_ir = self.html_renderer.review_and_patch_document(
document_ir,
reset_stats=True,
clone=False
)
# 输出统计信息
if repair_stats['total'] > 0:
stats = self.html_renderer.chart_validation_stats
if stats.get('total', 0) > 0:
repaired_count = stats.get('repaired_locally', 0) + stats.get('repaired_api', 0)
logger.info(
f"PDF图表预处理完成: "
f"总计 {repair_stats['total']} 个图表, "
f"修复 {repair_stats['repaired']} 个, "
f"失败 {repair_stats['failed']} 个"
f"总计 {stats.get('total', 0)} 个图表, "
f"修复 {repaired_count} 个, "
f"失败 {stats.get('failed', 0)} 个"
)
return ir_copy
# 返回深拷贝,避免后续 SVG 转换过程影响回写后的原始 IR
return copy.deepcopy(reviewed_ir)
def _convert_charts_to_svg(self, document_ir: Dict[str, Any]) -> Dict[str, str]:
"""
... ...