马一丁

Add a Program for Quickly Regenerating HTML

  1 +"""
  2 +使用最新的章节JSON重新装订并渲染HTML报告。
  3 +"""
  4 +
  5 +import json
  6 +import sys
  7 +from datetime import datetime
  8 +from pathlib import Path
  9 +from loguru import logger
  10 +
  11 +# 确保可以找到项目内模块
  12 +sys.path.insert(0, str(Path(__file__).parent))
  13 +
  14 +from ReportEngine.core import ChapterStorage, DocumentComposer
  15 +from ReportEngine.ir import IRValidator
  16 +from ReportEngine.renderers import HTMLRenderer
  17 +from ReportEngine.utils.config import settings
  18 +
  19 +
def find_latest_run_dir(chapter_root: Path):
    """Locate the most recently written chapter run directory.

    Scans the immediate children of ``chapter_root`` and considers only the
    sub-directories that contain a ``manifest.json`` file. Among those, the
    directory whose manifest has the newest modification time is returned.

    Args:
        chapter_root: Directory whose children are the per-run directories.

    Returns:
        The winning directory as a ``Path``, or ``None`` (after logging an
        error) when ``chapter_root`` does not exist or no child directory
        holds a manifest.
    """
    if not chapter_root.exists():
        logger.error(f"章节目录不存在: {chapter_root}")
        return None

    newest_dir = None
    newest_mtime = None
    for entry in chapter_root.iterdir():
        manifest_file = entry / "manifest.json"
        if not (entry.is_dir() and manifest_file.exists()):
            continue
        mtime = manifest_file.stat().st_mtime
        # Strict ">" keeps the first directory seen when mtimes are equal.
        if newest_mtime is None or mtime > newest_mtime:
            newest_dir, newest_mtime = entry, mtime

    if newest_dir is None:
        logger.error("未找到带 manifest.json 的章节目录")
        return None

    logger.info(f"找到最新run目录: {newest_dir.name}")
    return newest_dir
  41 +
  42 +
def load_manifest(run_dir: Path):
    """Read ``manifest.json`` from a run directory.

    Args:
        run_dir: Directory expected to contain ``manifest.json``.

    Returns:
        A ``(report_id, metadata)`` tuple. ``report_id`` falls back to the
        directory name when the manifest has no truthy ``reportId``;
        ``metadata`` falls back to an empty dict. On any failure the error is
        logged and ``(None, None)`` is returned instead of raising.
    """
    manifest_file = run_dir / "manifest.json"
    try:
        with manifest_file.open("r", encoding="utf-8") as handle:
            data = json.load(handle)

        rid = data.get("reportId") or run_dir.name
        meta = data.get("metadata") or {}
        logger.info(f"报告ID: {rid}")

        created_at = data.get("createdAt")
        if created_at:
            logger.info(f"创建时间: {created_at}")

        return rid, meta
    except Exception as exc:
        # Best-effort boundary: the caller treats (None, None) as "abort".
        logger.error(f"读取manifest失败: {exc}")
        return None, None
  58 +
  59 +
def load_chapters(run_dir: Path):
    """Load the chapter JSON objects stored under ``run_dir``.

    Builds a ``ChapterStorage`` rooted at ``settings.CHAPTER_OUTPUT_DIR`` and
    delegates to its ``load_chapters`` method, logging how many chapters came
    back.

    Args:
        run_dir: The run directory to load chapters from.

    Returns:
        Whatever ``ChapterStorage.load_chapters`` returns (a sized collection,
        since its length is logged).
    """
    loaded = ChapterStorage(settings.CHAPTER_OUTPUT_DIR).load_chapters(run_dir)
    logger.info(f"加载章节数: {len(loaded)}")
    return loaded
  66 +
  67 +
def validate_chapters(chapters):
    """Run a quick ``IRValidator`` pass over every chapter.

    Validation is advisory only: failures are logged as warnings and never
    stop the pipeline. For each failing chapter at most the first three
    error messages are shown.

    Args:
        chapters: Iterable of chapter mappings; each is passed to
            ``IRValidator.validate_chapter`` and queried for ``chapterId``.

    Returns:
        None.
    """
    checker = IRValidator()
    failures = []
    for chapter in chapters:
        passed, problems = checker.validate_chapter(chapter)
        if passed:
            continue
        failures.append((chapter.get("chapterId") or "unknown", problems))

    if not failures:
        logger.info("章节结构校验通过")
        return

    logger.warning(f"有 {len(failures)} 个章节未通过结构校验,将继续装订:")
    for chapter_id, problems in failures:
        # Cap the preview so one badly broken chapter cannot flood the log.
        preview = "; ".join(problems[:3])
        logger.warning(f" - {chapter_id}: {preview}")
  84 +
  85 +
def stitch_document(report_id, metadata, chapters):
    """Bind the chapters into a single Document IR.

    Delegates to ``DocumentComposer.build_document`` and logs the resulting
    chapter count together with the number of charts found by
    ``count_charts``.

    Args:
        report_id: Identifier forwarded to the composer.
        metadata: Report metadata forwarded to the composer.
        chapters: Chapter objects forwarded to the composer.

    Returns:
        The Document IR produced by the composer.
    """
    document_ir = DocumentComposer().build_document(report_id, metadata, chapters)

    chapter_total = len(document_ir.get("chapters", []))
    chart_total = count_charts(document_ir)
    logger.info(f"装订完成: {chapter_total} 个章节,{chart_total} 个图表")

    return document_ir
  95 +
  96 +
def count_charts(document_ir):
    """Count the chart widgets contained in a Document IR.

    Sums the result of ``_count_chart_blocks`` over the ``blocks`` list of
    every entry under the ``chapters`` key; both keys default to an empty
    list when absent.

    Args:
        document_ir: Mapping that may contain a ``chapters`` list.

    Returns:
        Total number of charts as an ``int`` (0 when there are no chapters).
    """
    return sum(
        _count_chart_blocks(chapter.get("blocks", []))
        for chapter in document_ir.get("chapters", [])
    )
  104 +
  105 +
  106 +def _count_chart_blocks(blocks):
  107 + """递归统计chart.js组件。"""
  108 + count = 0
  109 + for block in blocks:
  110 + if not isinstance(block, dict):
  111 + continue
  112 + if block.get("type") == "widget" and str(block.get("widgetType", "")).startswith("chart.js"):
  113 + count += 1
  114 + nested = block.get("blocks")
  115 + if isinstance(nested, list):
  116 + count += _count_chart_blocks(nested)
  117 + if block.get("type") == "list":
  118 + for item in block.get("items", []):
  119 + if isinstance(item, list):
  120 + count += _count_chart_blocks(item)
  121 + if block.get("type") == "table":
  122 + for row in block.get("rows", []):
  123 + for cell in row.get("cells", []):
  124 + if isinstance(cell, dict):
  125 + cell_blocks = cell.get("blocks", [])
  126 + if isinstance(cell_blocks, list):
  127 + count += _count_chart_blocks(cell_blocks)
  128 + return count
  129 +
  130 +
def save_document_ir(document_ir, base_name, timestamp):
    """Persist the stitched Document IR as pretty-printed UTF-8 JSON.

    The file is written to ``settings.DOCUMENT_IR_OUTPUT_DIR`` (created if
    missing) under the name ``report_ir_<base_name>_<timestamp>_regen.json``.

    Args:
        document_ir: JSON-serialisable Document IR.
        base_name: File-name fragment identifying the report.
        timestamp: File-name fragment identifying this run.

    Returns:
        ``Path`` of the written file.
    """
    target_dir = Path(settings.DOCUMENT_IR_OUTPUT_DIR)
    target_dir.mkdir(parents=True, exist_ok=True)

    ir_path = target_dir / f"report_ir_{base_name}_{timestamp}_regen.json"
    # ensure_ascii=False keeps non-ASCII text readable in the saved file.
    payload = json.dumps(document_ir, ensure_ascii=False, indent=2)
    ir_path.write_text(payload, encoding="utf-8")

    logger.info(f"IR已保存: {ir_path}")
    return ir_path
  140 +
  141 +
def render_html(document_ir, base_name, timestamp):
    """Render the Document IR to HTML and write it to disk.

    The output goes to ``<settings.OUTPUT_DIR>/html`` (created if missing)
    as ``report_html_<base_name>_<timestamp>.html``. After writing, the file
    size and the renderer's chart validation counters are logged.

    Args:
        document_ir: Document IR passed to ``HTMLRenderer.render``.
        base_name: File-name fragment identifying the report.
        timestamp: File-name fragment identifying this run.

    Returns:
        ``Path`` of the written HTML file.
    """
    renderer = HTMLRenderer()
    markup = renderer.render(document_ir)

    html_dir = Path(settings.OUTPUT_DIR) / "html"
    html_dir.mkdir(parents=True, exist_ok=True)
    html_path = html_dir / f"report_html_{base_name}_{timestamp}.html"
    html_path.write_text(markup, encoding="utf-8")

    size_mb = html_path.stat().st_size / (1024 * 1024)
    logger.info(f"HTML生成成功: {html_path} ({size_mb:.2f} MB)")

    # NOTE(review): assumes chart_validation_stats is a plain mapping that is
    # stable across reads — confirm against HTMLRenderer.
    stats = renderer.chart_validation_stats
    total = stats.get("total", 0)
    valid = stats.get("valid", 0)
    # Local and API repairs are reported as one combined figure.
    repaired = stats.get("repaired_locally", 0) + stats.get("repaired_api", 0)
    failed = stats.get("failed", 0)
    logger.info(
        f"图表验证统计: total={total}, valid={valid}, repaired={repaired}, failed={failed}"
    )
    return html_path
  163 +
  164 +
def build_slug(text):
    """Turn a topic or title into a file-name-safe fragment.

    Keeps alphanumeric characters (per ``str.isalnum``, so non-ASCII letters
    and digits survive) plus space, hyphen and underscore. The result is
    stripped of surrounding whitespace, spaces become underscores, and the
    fragment is cut to 60 characters.

    Args:
        text: Any value; falsy values are treated as ``"report"``.

    Returns:
        The slug, or ``"report"`` when nothing usable remains.
    """
    raw = str(text or "report")
    extra_allowed = (" ", "-", "_")
    kept = [ch for ch in raw if ch.isalnum() or ch in extra_allowed]
    slug = "".join(kept).strip().replace(" ", "_")
    if not slug:
        return "report"
    return slug[:60]
  171 +
  172 +
def main():
    """Stitch the latest chapter run into a Document IR and render it to HTML.

    Steps: locate the newest run directory under
    ``settings.CHAPTER_OUTPUT_DIR``, read its manifest, load and validate
    the chapters (validation is advisory), stitch the document, then save
    the IR and the rendered HTML under a shared slug and timestamp.

    Returns:
        ``0`` on success; ``1`` when no run directory is found, the manifest
        cannot be read, or no chapters are loaded.
    """
    logger.info("🚀 使用最新的LLM章节重新装订并渲染HTML")

    run_dir = find_latest_run_dir(Path(settings.CHAPTER_OUTPUT_DIR))
    if not run_dir:
        return 1

    report_id, metadata = load_manifest(run_dir)
    if metadata is None or not report_id:
        return 1

    chapters = load_chapters(run_dir)
    if not chapters:
        logger.error("未找到章节JSON,无法装订")
        return 1

    validate_chapters(chapters)
    document_ir = stitch_document(report_id, metadata, chapters)

    # One timestamp and one slug are shared by both output files so they can
    # be matched up later.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    slug_source = (
        metadata.get("query")
        or metadata.get("title")
        or metadata.get("reportId")
        or report_id
    )
    base_name = build_slug(slug_source)

    ir_path = save_document_ir(document_ir, base_name, timestamp)
    html_path = render_html(document_ir, base_name, timestamp)

    logger.info("")
    logger.info("🎉 HTML装订与渲染完成")
    logger.info(f"IR文件: {ir_path.resolve()}")
    logger.info(f"HTML文件: {html_path.resolve()}")
    return 0
  208 +
  209 +
if __name__ == "__main__":
    # Hand main()'s status code to the shell as the process exit code.
    exit_code = main()
    sys.exit(exit_code)