Showing
1 changed file
with
211 additions
and
0 deletions
regenerate_latest_html.py
0 → 100644
| 1 | +""" | ||
| 2 | +使用最新的章节JSON重新装订并渲染HTML报告。 | ||
| 3 | +""" | ||
| 4 | + | ||
| 5 | +import json | ||
| 6 | +import sys | ||
| 7 | +from datetime import datetime | ||
| 8 | +from pathlib import Path | ||
| 9 | +from loguru import logger | ||
| 10 | + | ||
| 11 | +# 确保可以找到项目内模块 | ||
| 12 | +sys.path.insert(0, str(Path(__file__).parent)) | ||
| 13 | + | ||
| 14 | +from ReportEngine.core import ChapterStorage, DocumentComposer | ||
| 15 | +from ReportEngine.ir import IRValidator | ||
| 16 | +from ReportEngine.renderers import HTMLRenderer | ||
| 17 | +from ReportEngine.utils.config import settings | ||
| 18 | + | ||
| 19 | + | ||
def find_latest_run_dir(chapter_root: Path):
    """Locate the most recent chapter run directory that has a manifest.json.

    "Most recent" is decided by the modification time of each run's
    ``manifest.json``; sub-directories without a manifest are ignored.

    Args:
        chapter_root: Directory whose immediate sub-directories are runs.

    Returns:
        The ``Path`` of the newest run directory, or ``None`` (after logging
        an error) when the root is not a directory or no run has a manifest.
    """
    # is_dir() also rejects a plain file, which would make iterdir() raise.
    if not chapter_root.is_dir():
        logger.error(f"章节目录不存在: {chapter_root}")
        return None

    run_dirs = []
    for candidate in chapter_root.iterdir():
        if not candidate.is_dir():
            continue
        # EAFP: stat() directly instead of exists()+stat(), so a manifest
        # that disappears between the two calls cannot crash the scan.
        try:
            mtime = (candidate / "manifest.json").stat().st_mtime
        except OSError:
            continue
        run_dirs.append((mtime, candidate))

    if not run_dirs:
        logger.error("未找到带 manifest.json 的章节目录")
        return None

    # max() returns the first of equally-new entries, matching the stable
    # descending sort it replaces, without sorting the whole list.
    latest_dir = max(run_dirs, key=lambda item: item[0])[1]
    logger.info(f"找到最新run目录: {latest_dir.name}")
    return latest_dir
| 41 | + | ||
| 42 | + | ||
def load_manifest(run_dir: Path):
    """Read ``manifest.json`` from a run directory.

    Args:
        run_dir: Run directory expected to contain ``manifest.json``.

    Returns:
        A ``(report_id, metadata)`` tuple. ``report_id`` falls back to the
        directory name when the manifest has no ``reportId``; ``metadata``
        falls back to an empty dict. ``(None, None)`` is returned (after
        logging an error) when the file cannot be read, is not valid JSON,
        or its top level is not a JSON object.
    """
    manifest_path = run_dir / "manifest.json"
    # Only the I/O and parsing can legitimately fail; keep the try narrow so
    # programming errors further down are not silently swallowed.
    try:
        with manifest_path.open("r", encoding="utf-8") as f:
            manifest = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        logger.error(f"读取manifest失败: {exc}")
        return None, None

    if not isinstance(manifest, dict):
        logger.error(
            f"读取manifest失败: 顶层结构应为JSON对象,实际为 {type(manifest).__name__}"
        )
        return None, None

    report_id = manifest.get("reportId") or run_dir.name
    metadata = manifest.get("metadata") or {}
    if not isinstance(metadata, dict):
        # Callers use metadata.get(...); a non-object value would crash them.
        logger.warning("manifest中的metadata不是JSON对象,已忽略")
        metadata = {}

    logger.info(f"报告ID: {report_id}")
    if manifest.get("createdAt"):
        logger.info(f"创建时间: {manifest['createdAt']}")
    return report_id, metadata
| 58 | + | ||
| 59 | + | ||
def load_chapters(run_dir: Path):
    """Load the chapter JSON payloads of one run.

    A ``ChapterStorage`` rooted at ``settings.CHAPTER_OUTPUT_DIR`` is asked to
    load the chapters of *run_dir*; the number loaded is logged.

    Args:
        run_dir: Run directory whose chapters should be loaded.

    Returns:
        Whatever ``ChapterStorage.load_chapters`` returns (a sized
        collection of chapters).
    """
    loaded = ChapterStorage(settings.CHAPTER_OUTPUT_DIR).load_chapters(run_dir)
    logger.info(f"加载章节数: {len(loaded)}")
    return loaded
| 66 | + | ||
| 67 | + | ||
def validate_chapters(chapters):
    """Run a quick IRValidator pass over every chapter.

    Validation is advisory only: failures are logged as warnings (at most the
    first three errors per chapter) and never stop the pipeline.

    Args:
        chapters: Iterable of chapter mappings to validate.
    """
    checker = IRValidator()
    failures = []
    for chapter in chapters:
        passed, problems = checker.validate_chapter(chapter)
        if passed:
            continue
        failures.append((chapter.get("chapterId") or "unknown", problems))

    if not failures:
        logger.info("章节结构校验通过")
        return

    logger.warning(f"有 {len(failures)} 个章节未通过结构校验,将继续装订:")
    for chapter_id, problems in failures:
        # Keep the log short: show only the first three errors.
        preview = "; ".join(problems[:3])
        logger.warning(f"  - {chapter_id}: {preview}")
| 84 | + | ||
| 85 | + | ||
def stitch_document(report_id, metadata, chapters):
    """Bind the chapters into a single Document IR.

    Args:
        report_id: Identifier passed through to the composer.
        metadata: Report metadata passed through to the composer.
        chapters: Chapters to be bound together.

    Returns:
        The document IR produced by ``DocumentComposer.build_document``.
    """
    document_ir = DocumentComposer().build_document(report_id, metadata, chapters)
    chapter_total = len(document_ir.get("chapters", []))
    chart_total = count_charts(document_ir)
    logger.info(f"装订完成: {chapter_total} 个章节,{chart_total} 个图表")
    return document_ir
| 95 | + | ||
| 96 | + | ||
def count_charts(document_ir):
    """Return the total number of chart widgets across all chapters of the IR.

    Args:
        document_ir: Document IR mapping with an optional ``chapters`` list.

    Returns:
        Sum of the per-chapter chart counts (0 when there are no chapters).
    """
    return sum(
        _count_chart_blocks(chapter.get("blocks", []))
        for chapter in document_ir.get("chapters", [])
    )
| 104 | + | ||
| 105 | + | ||
| 106 | +def _count_chart_blocks(blocks): | ||
| 107 | + """递归统计chart.js组件。""" | ||
| 108 | + count = 0 | ||
| 109 | + for block in blocks: | ||
| 110 | + if not isinstance(block, dict): | ||
| 111 | + continue | ||
| 112 | + if block.get("type") == "widget" and str(block.get("widgetType", "")).startswith("chart.js"): | ||
| 113 | + count += 1 | ||
| 114 | + nested = block.get("blocks") | ||
| 115 | + if isinstance(nested, list): | ||
| 116 | + count += _count_chart_blocks(nested) | ||
| 117 | + if block.get("type") == "list": | ||
| 118 | + for item in block.get("items", []): | ||
| 119 | + if isinstance(item, list): | ||
| 120 | + count += _count_chart_blocks(item) | ||
| 121 | + if block.get("type") == "table": | ||
| 122 | + for row in block.get("rows", []): | ||
| 123 | + for cell in row.get("cells", []): | ||
| 124 | + if isinstance(cell, dict): | ||
| 125 | + cell_blocks = cell.get("blocks", []) | ||
| 126 | + if isinstance(cell_blocks, list): | ||
| 127 | + count += _count_chart_blocks(cell_blocks) | ||
| 128 | + return count | ||
| 129 | + | ||
| 130 | + | ||
def save_document_ir(document_ir, base_name, timestamp):
    """Write the bound IR to disk as JSON so it can be reused later.

    The file is created under ``settings.DOCUMENT_IR_OUTPUT_DIR`` (created if
    missing) as ``report_ir_<base_name>_<timestamp>_regen.json``, UTF-8
    encoded, with non-ASCII characters kept as-is and 2-space indentation.

    Args:
        document_ir: JSON-serialisable document IR.
        base_name: Slug used in the file name.
        timestamp: Timestamp string used in the file name.

    Returns:
        ``Path`` of the written file.
    """
    target_dir = Path(settings.DOCUMENT_IR_OUTPUT_DIR)
    target_dir.mkdir(parents=True, exist_ok=True)

    ir_path = target_dir / f"report_ir_{base_name}_{timestamp}_regen.json"
    serialized = json.dumps(document_ir, ensure_ascii=False, indent=2)
    ir_path.write_text(serialized, encoding="utf-8")

    logger.info(f"IR已保存: {ir_path}")
    return ir_path
| 140 | + | ||
| 141 | + | ||
def render_html(document_ir, base_name, timestamp):
    """Render the document IR to HTML with HTMLRenderer and write it to disk.

    The file is written to ``<settings.OUTPUT_DIR>/html`` (created if
    missing) as ``report_html_<base_name>_<timestamp>.html``. The file size
    and the renderer's chart validation counters are logged afterwards.

    Args:
        document_ir: Document IR to render.
        base_name: Slug used in the file name.
        timestamp: Timestamp string used in the file name.

    Returns:
        ``Path`` of the written HTML file.
    """
    renderer = HTMLRenderer()
    html_content = renderer.render(document_ir)

    target_dir = Path(settings.OUTPUT_DIR) / "html"
    target_dir.mkdir(parents=True, exist_ok=True)
    html_path = target_dir / f"report_html_{base_name}_{timestamp}.html"
    html_path.write_text(html_content, encoding="utf-8")

    size_mb = html_path.stat().st_size / (1024 * 1024)
    logger.info(f"HTML生成成功: {html_path} ({size_mb:.2f} MB)")

    stats = renderer.chart_validation_stats
    total = stats.get("total", 0)
    valid = stats.get("valid", 0)
    # "repaired" merges local repairs and API-assisted repairs.
    repaired = stats.get("repaired_locally", 0) + stats.get("repaired_api", 0)
    failed = stats.get("failed", 0)
    logger.info(
        f"图表验证统计: total={total}, valid={valid}, repaired={repaired}, failed={failed}"
    )
    return html_path
| 163 | + | ||
| 164 | + | ||
def build_slug(text):
    """Turn a topic/title into a fragment that is safe to use in a file name.

    Alphanumeric characters (including non-ASCII letters and digits), ``-``
    and ``_`` are kept. Every whitespace character (spaces, tabs, newlines)
    is treated as a word separator and becomes ``_``, so multi-line input
    does not fuse adjacent words. All other characters are dropped. The
    result is capped at 60 characters.

    Args:
        text: Any value; falsy values are treated as ``"report"``.

    Returns:
        The sanitised slug, or ``"report"`` when nothing usable remains.
    """
    text = str(text or "report")
    kept = "".join(
        " " if ch.isspace() else ch
        for ch in text
        if ch.isalnum() or ch.isspace() or ch in ("-", "_")
    ).strip()
    return kept.replace(" ", "_")[:60] or "report"
| 171 | + | ||
| 172 | + | ||
def main():
    """Entry point: re-bind the newest chapter run and render it to HTML.

    Steps: find the newest run directory, read its manifest, load and
    (non-blockingly) validate the chapters, bind them into a document IR,
    then save the IR and the rendered HTML under a timestamped name.

    Returns:
        ``0`` on success, ``1`` when the run directory, the manifest or the
        chapters cannot be obtained.
    """
    logger.info("🚀 使用最新的LLM章节重新装订并渲染HTML")

    latest_run = find_latest_run_dir(Path(settings.CHAPTER_OUTPUT_DIR))
    if not latest_run:
        return 1

    report_id, metadata = load_manifest(latest_run)
    if metadata is None or not report_id:
        return 1

    chapters = load_chapters(latest_run)
    if not chapters:
        logger.error("未找到章节JSON,无法装订")
        return 1

    validate_chapters(chapters)
    document_ir = stitch_document(report_id, metadata, chapters)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Prefer the most descriptive label available for the file name.
    slug_source = (
        metadata.get("query")
        or metadata.get("title")
        or metadata.get("reportId")
        or report_id
    )
    base_name = build_slug(slug_source)

    ir_path = save_document_ir(document_ir, base_name, timestamp)
    html_path = render_html(document_ir, base_name, timestamp)

    logger.info("")
    logger.info("🎉 HTML装订与渲染完成")
    logger.info(f"IR文件: {ir_path.resolve()}")
    logger.info(f"HTML文件: {html_path.resolve()}")
    return 0
| 208 | + | ||
| 209 | + | ||
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
-
Please register or login to post a comment