Add a Program for Quickly Regenerating HTML

马一丁
Commit 09c83af057c31267c8bfc3bc8240b9d0022ada72 09c83af0 1 parent 2e0a526d
Showing 1 changed file with 211 additions and 0 deletions
regenerate_latest_html.py
--- a/regenerate_latest_html.py 0 → 100644
View file @09c83af
+++ b/regenerate_latest_html.py 0 → 100644
View file @09c83af
+"""
+Re-stitch the latest chapter JSON files into a document and render it as an HTML report.
+"""
+
+import json
+import sys
+from datetime import datetime
+from pathlib import Path
+from loguru import logger
+
+# Put this script's directory on sys.path so project-local modules can be imported
+sys.path.insert(0, str(Path(__file__).parent))
+
+from ReportEngine.core import ChapterStorage, DocumentComposer
+from ReportEngine.ir import IRValidator
+from ReportEngine.renderers import HTMLRenderer
+from ReportEngine.utils.config import settings
+
+
+def find_latest_run_dir(chapter_root: Path):
+    """定位包含 manifest.json 的最新章节输出目录。"""
+    if not chapter_root.exists():
+        logger.error(f"章节目录不存在: {chapter_root}")
+        return None
+
+    run_dirs = []
+    for candidate in chapter_root.iterdir():
+        if not candidate.is_dir():
+            continue
+        manifest_path = candidate / "manifest.json"
+        if manifest_path.exists():
+            run_dirs.append((candidate, manifest_path.stat().st_mtime))
+
+    if not run_dirs:
+        logger.error("未找到带 manifest.json 的章节目录")
+        return None
+
+    latest_dir = sorted(run_dirs, key=lambda item: item[1], reverse=True)[0][0]
+    logger.info(f"找到最新run目录: {latest_dir.name}")
+    return latest_dir
+
+
+def load_manifest(run_dir: Path):
+    """读取manifest.json并返回report_id与metadata。"""
+    manifest_path = run_dir / "manifest.json"
+    try:
+        with manifest_path.open("r", encoding="utf-8") as f:
+            manifest = json.load(f)
+        report_id = manifest.get("reportId") or run_dir.name
+        metadata = manifest.get("metadata") or {}
+        logger.info(f"报告ID: {report_id}")
+        if manifest.get("createdAt"):
+            logger.info(f"创建时间: {manifest['createdAt']}")
+        return report_id, metadata
+    except Exception as exc:
+        logger.error(f"读取manifest失败: {exc}")
+        return None, None
+
+
+def load_chapters(run_dir: Path):
+    """加载章节JSON列表。"""
+    storage = ChapterStorage(settings.CHAPTER_OUTPUT_DIR)
+    chapters = storage.load_chapters(run_dir)
+    logger.info(f"加载章节数: {len(chapters)}")
+    return chapters
+
+
+def validate_chapters(chapters):
+    """使用IRValidator做快速校验，仅记录警告不阻断流程。"""
+    validator = IRValidator()
+    invalid = []
+    for chapter in chapters:
+        ok, errors = validator.validate_chapter(chapter)
+        if not ok:
+            invalid.append((chapter.get("chapterId") or "unknown", errors))
+
+    if invalid:
+        logger.warning(f"有 {len(invalid)} 个章节未通过结构校验，将继续装订：")
+        for chapter_id, errors in invalid:
+            preview = "; ".join(errors[:3])
+            logger.warning(f"  - {chapter_id}: {preview}")
+    else:
+        logger.info("章节结构校验通过")
+
+
+def stitch_document(report_id, metadata, chapters):
+    """将章节装订为整本Document IR。"""
+    composer = DocumentComposer()
+    document_ir = composer.build_document(report_id, metadata, chapters)
+    logger.info(
+        f"装订完成: {len(document_ir.get('chapters', []))} 个章节，"
+        f"{count_charts(document_ir)} 个图表"
+    )
+    return document_ir
+
+
+def count_charts(document_ir):
+    """统计IR中的图表数量。"""
+    chart_count = 0
+    for chapter in document_ir.get("chapters", []):
+        blocks = chapter.get("blocks", [])
+        chart_count += _count_chart_blocks(blocks)
+    return chart_count
+
+
+def _count_chart_blocks(blocks):
+    """递归统计chart.js组件。"""
+    count = 0
+    for block in blocks:
+        if not isinstance(block, dict):
+            continue
+        if block.get("type") == "widget" and str(block.get("widgetType", "")).startswith("chart.js"):
+            count += 1
+        nested = block.get("blocks")
+        if isinstance(nested, list):
+            count += _count_chart_blocks(nested)
+        if block.get("type") == "list":
+            for item in block.get("items", []):
+                if isinstance(item, list):
+                    count += _count_chart_blocks(item)
+        if block.get("type") == "table":
+            for row in block.get("rows", []):
+                for cell in row.get("cells", []):
+                    if isinstance(cell, dict):
+                        cell_blocks = cell.get("blocks", [])
+                        if isinstance(cell_blocks, list):
+                            count += _count_chart_blocks(cell_blocks)
+    return count
+
+
+def save_document_ir(document_ir, base_name, timestamp):
+    """将装订好的IR重新落盘，便于后续复用。"""
+    output_dir = Path(settings.DOCUMENT_IR_OUTPUT_DIR)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    ir_filename = f"report_ir_{base_name}_{timestamp}_regen.json"
+    ir_path = output_dir / ir_filename
+    ir_path.write_text(json.dumps(document_ir, ensure_ascii=False, indent=2), encoding="utf-8")
+    logger.info(f"IR已保存: {ir_path}")
+    return ir_path
+
+
+def render_html(document_ir, base_name, timestamp):
+    """使用HTMLRenderer渲染并落盘HTML文件。"""
+    renderer = HTMLRenderer()
+    html_content = renderer.render(document_ir)
+
+    output_dir = Path(settings.OUTPUT_DIR) / "html"
+    output_dir.mkdir(parents=True, exist_ok=True)
+    html_filename = f"report_html_{base_name}_{timestamp}.html"
+    html_path = output_dir / html_filename
+    html_path.write_text(html_content, encoding="utf-8")
+
+    file_size_mb = html_path.stat().st_size / (1024 * 1024)
+    logger.info(f"HTML生成成功: {html_path} ({file_size_mb:.2f} MB)")
+    logger.info(
+        "图表验证统计: "
+        f"total={renderer.chart_validation_stats.get('total', 0)}, "
+        f"valid={renderer.chart_validation_stats.get('valid', 0)}, "
+        f"repaired={renderer.chart_validation_stats.get('repaired_locally', 0) + renderer.chart_validation_stats.get('repaired_api', 0)}, "
+        f"failed={renderer.chart_validation_stats.get('failed', 0)}"
+    )
+    return html_path
+
+
+def build_slug(text):
+    """将主题/标题转换为安全的文件名片段。"""
+    text = str(text or "report")
+    sanitized = "".join(c for c in text if c.isalnum() or c in (" ", "-", "_")).strip()
+    sanitized = sanitized.replace(" ", "_")
+    return sanitized[:60] or "report"
+
+
+def main():
+    """主入口：装订最新章节并渲染HTML。"""
+    logger.info("🚀 使用最新的LLM章节重新装订并渲染HTML")
+
+    chapter_root = Path(settings.CHAPTER_OUTPUT_DIR)
+    latest_run = find_latest_run_dir(chapter_root)
+    if not latest_run:
+        return 1
+
+    report_id, metadata = load_manifest(latest_run)
+    if not report_id or metadata is None:
+        return 1
+
+    chapters = load_chapters(latest_run)
+    if not chapters:
+        logger.error("未找到章节JSON，无法装订")
+        return 1
+
+    validate_chapters(chapters)
+
+    document_ir = stitch_document(report_id, metadata, chapters)
+
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    base_name = build_slug(
+        metadata.get("query") or metadata.get("title") or metadata.get("reportId") or report_id
+    )
+
+    ir_path = save_document_ir(document_ir, base_name, timestamp)
+    html_path = render_html(document_ir, base_name, timestamp)
+
+    logger.info("")
+    logger.info("🎉 HTML装订与渲染完成")
+    logger.info(f"IR文件: {ir_path.resolve()}")
+    logger.info(f"HTML文件: {html_path.resolve()}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())