Showing
1 changed file
with
328 additions
and
0 deletions
regenerate_latest_md.py
0 → 100644
| 1 | +""" | ||
| 2 | +使用最新的章节JSON重新装订并渲染Markdown报告。 | ||
| 3 | +""" | ||
| 4 | + | ||
| 5 | +import json | ||
| 6 | +import sys | ||
| 7 | +from datetime import datetime | ||
| 8 | +from pathlib import Path | ||
| 9 | +from loguru import logger | ||
| 10 | + | ||
| 11 | +# 确保可以找到项目内模块 | ||
| 12 | +sys.path.insert(0, str(Path(__file__).parent)) | ||
| 13 | + | ||
| 14 | +from ReportEngine.core import ChapterStorage, DocumentComposer | ||
| 15 | +from ReportEngine.ir import IRValidator | ||
| 16 | +from ReportEngine.renderers import MarkdownRenderer | ||
| 17 | +from ReportEngine.utils.config import settings | ||
| 18 | + | ||
| 19 | + | ||
def find_latest_run_dir(chapter_root: Path):
    """
    Locate the most recent run directory under the chapter root.

    Every immediate subdirectory of ``chapter_root`` that contains a
    ``manifest.json`` is a candidate; the candidate whose manifest has the
    newest modification time is returned. An error is logged and ``None`` is
    returned when the root is not a directory or no candidate exists.

    Args:
        chapter_root: Root directory holding one subdirectory per run
            (the caller in this file passes settings.CHAPTER_OUTPUT_DIR).

    Returns:
        Path | None: The newest run directory, or None if none was found.
    """
    # is_dir() instead of exists(): a regular file at this path would pass an
    # existence check and then make iterdir() raise NotADirectoryError.
    if not chapter_root.is_dir():
        logger.error(f"章节目录不存在: {chapter_root}")
        return None

    run_dirs = []
    for candidate in chapter_root.iterdir():
        if not candidate.is_dir():
            continue
        try:
            # stat() directly (EAFP) so a manifest that disappears between a
            # separate existence check and the stat cannot crash the scan.
            manifest_mtime = (candidate / "manifest.json").stat().st_mtime
        except OSError:
            # No manifest, or it is not accessible: not a usable run directory.
            continue
        run_dirs.append((candidate, manifest_mtime))

    if not run_dirs:
        logger.error("未找到带 manifest.json 的章节目录")
        return None

    # max() returns the first entry with the largest mtime, the same winner a
    # stable descending sort followed by [0] would pick.
    latest_dir = max(run_dirs, key=lambda item: item[1])[0]
    logger.info(f"找到最新run目录: {latest_dir.name}")
    return latest_dir
| 53 | + | ||
| 54 | + | ||
def load_manifest(run_dir: Path):
    """
    Read ``manifest.json`` from a single run directory.

    On success returns the report id (falling back to the directory name when
    the manifest has no ``reportId``) together with the ``metadata`` dict
    (an empty dict when missing). Read or parse failures, or a manifest whose
    top level is not a JSON object, are logged and reported as
    ``(None, None)`` so the caller can stop early.

    Args:
        run_dir: Run directory that contains manifest.json.

    Returns:
        tuple[str | None, dict | None]: (report_id, metadata)
    """
    manifest_path = run_dir / "manifest.json"
    try:
        with manifest_path.open("r", encoding="utf-8") as f:
            manifest = json.load(f)
    except (OSError, ValueError) as exc:
        # OSError covers missing/unreadable files; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        logger.error(f"读取manifest失败: {exc}")
        return None, None

    if not isinstance(manifest, dict):
        # Valid JSON but not an object (e.g. a list): there is nothing to read.
        logger.error(
            f"读取manifest失败: 顶层应为JSON对象,实际为 {type(manifest).__name__}"
        )
        return None, None

    report_id = manifest.get("reportId") or run_dir.name
    metadata = manifest.get("metadata") or {}
    if not isinstance(metadata, dict):
        # Callers use metadata.get(...); a non-dict value would crash them.
        logger.warning(
            f"manifest中的metadata不是对象,已忽略: {type(metadata).__name__}"
        )
        metadata = {}

    logger.info(f"报告ID: {report_id}")
    if manifest.get("createdAt"):
        logger.info(f"创建时间: {manifest['createdAt']}")
    return report_id, metadata
| 81 | + | ||
| 82 | + | ||
def load_chapters(run_dir: Path):
    """
    Load every chapter JSON stored in the given run directory.

    Delegates to ``ChapterStorage.load_chapters`` (constructed with
    settings.CHAPTER_OUTPUT_DIR) and logs how many chapters came back so
    completeness can be checked at a glance.
    NOTE(review): ordering and the empty-directory result are decided by
    ChapterStorage, which is not visible here — presumably sorted by order.

    Args:
        run_dir: Chapter directory of a single report run.

    Returns:
        The chapter collection returned by ChapterStorage.load_chapters.
    """
    loaded = ChapterStorage(settings.CHAPTER_OUTPUT_DIR).load_chapters(run_dir)
    logger.info(f"加载章节数: {len(loaded)}")
    return loaded
| 100 | + | ||
| 101 | + | ||
def validate_chapters(chapters):
    """
    Run a quick structural check on each chapter with IRValidator.

    Failures never abort the flow: each failing chapter is logged as a
    warning together with its first three errors, so structural problems are
    visible before the document is re-stitched.

    Args:
        chapters: List of chapter JSON dicts.
    """
    checker = IRValidator()
    failures = []
    for chapter in chapters:
        passed, problems = checker.validate_chapter(chapter)
        if passed:
            continue
        # Chapters without an id are reported under a placeholder name.
        failures.append((chapter.get("chapterId") or "unknown", problems))

    if not failures:
        logger.info("章节结构校验通过")
        return

    logger.warning(f"有 {len(failures)} 个章节未通过结构校验,将继续装订:")
    for chapter_id, problems in failures:
        # Only the first three errors are shown to keep the log readable.
        summary = "; ".join(problems[:3])
        logger.warning(f"  - {chapter_id}: {summary}")
| 126 | + | ||
| 127 | + | ||
def stitch_document(report_id, metadata, chapters):
    """
    Assemble the chapters and metadata into a complete Document IR.

    The assembly itself is delegated to ``DocumentComposer.build_document``;
    afterwards the number of chapters and charts in the result is logged.

    Args:
        report_id: Report id (from the manifest or the directory name).
        metadata: Global metadata taken from the manifest.
        chapters: Chapters that have already been loaded.

    Returns:
        dict: The Document IR produced by the composer.
    """
    document_ir = DocumentComposer().build_document(report_id, metadata, chapters)
    chapter_total = len(document_ir.get("chapters", []))
    chart_total = count_charts(document_ir)
    logger.info(f"装订完成: {chapter_total} 个章节,{chart_total} 个图表")
    return document_ir
| 150 | + | ||
| 151 | + | ||
def count_charts(document_ir):
    """
    Count the Chart.js charts in a whole Document IR.

    Sums, over every chapter, the charts found in that chapter's ``blocks``
    (the recursive search is done by ``_count_chart_blocks``).

    Args:
        document_ir: Complete Document IR dict.

    Returns:
        int: Total number of charts.
    """
    return sum(
        _count_chart_blocks(chapter.get("blocks", []))
        for chapter in document_ir.get("chapters", [])
    )
| 170 | + | ||
| 171 | + | ||
| 172 | +def _count_chart_blocks(blocks): | ||
| 173 | + """ | ||
| 174 | + 递归统计 block 列表中的 Chart.js 组件数量。 | ||
| 175 | + | ||
| 176 | + 兼容嵌套的 blocks/list/table 结构,确保所有层级的图表都被计入。 | ||
| 177 | + | ||
| 178 | + 参数: | ||
| 179 | + blocks: 任意层级的 block 列表 | ||
| 180 | + | ||
| 181 | + 返回: | ||
| 182 | + int: 统计到的 chart.js 图表数量 | ||
| 183 | + """ | ||
| 184 | + count = 0 | ||
| 185 | + for block in blocks: | ||
| 186 | + if not isinstance(block, dict): | ||
| 187 | + continue | ||
| 188 | + if block.get("type") == "widget" and str(block.get("widgetType", "")).startswith("chart.js"): | ||
| 189 | + count += 1 | ||
| 190 | + nested = block.get("blocks") | ||
| 191 | + if isinstance(nested, list): | ||
| 192 | + count += _count_chart_blocks(nested) | ||
| 193 | + if block.get("type") == "list": | ||
| 194 | + for item in block.get("items", []): | ||
| 195 | + if isinstance(item, list): | ||
| 196 | + count += _count_chart_blocks(item) | ||
| 197 | + if block.get("type") == "table": | ||
| 198 | + for row in block.get("rows", []): | ||
| 199 | + for cell in row.get("cells", []): | ||
| 200 | + if isinstance(cell, dict): | ||
| 201 | + cell_blocks = cell.get("blocks", []) | ||
| 202 | + if isinstance(cell_blocks, list): | ||
| 203 | + count += _count_chart_blocks(cell_blocks) | ||
| 204 | + return count | ||
| 205 | + | ||
| 206 | + | ||
def save_document_ir(document_ir, base_name, timestamp):
    """
    Write the re-stitched Document IR to disk.

    The file is named ``report_ir_{base_name}_{timestamp}_regen.json`` and is
    placed in ``settings.DOCUMENT_IR_OUTPUT_DIR``, which is created first if
    it does not exist. The JSON is written as UTF-8 with non-ASCII characters
    kept as-is and a two-space indent.

    Args:
        document_ir: The fully stitched IR.
        base_name: Filename-safe fragment derived from the topic/title.
        timestamp: Timestamp string that distinguishes regenerations.

    Returns:
        Path: Path of the saved IR file.
    """
    target_dir = Path(settings.DOCUMENT_IR_OUTPUT_DIR)
    target_dir.mkdir(parents=True, exist_ok=True)

    ir_path = target_dir / f"report_ir_{base_name}_{timestamp}_regen.json"
    serialized = json.dumps(document_ir, ensure_ascii=False, indent=2)
    ir_path.write_text(serialized, encoding="utf-8")

    logger.info(f"IR已保存: {ir_path}")
    return ir_path
| 229 | + | ||
| 230 | + | ||
def render_markdown(document_ir, base_name, timestamp):
    """
    Render the Document IR to Markdown with MarkdownRenderer and save it.

    The output goes to the ``md`` subdirectory of ``settings.OUTPUT_DIR``
    (created if needed) as ``report_md_{base_name}_{timestamp}.md``; the
    resulting file size is logged in KB.

    Args:
        document_ir: The fully stitched IR.
        base_name: Filename fragment derived from the report topic/title.
        timestamp: Timestamp string.

    Returns:
        Path: Path of the generated Markdown file.
    """
    # Render before touching the filesystem so a renderer failure leaves no
    # output directory or file behind.
    rendered = MarkdownRenderer().render(document_ir)

    md_dir = Path(settings.OUTPUT_DIR) / "md"
    md_dir.mkdir(parents=True, exist_ok=True)
    md_path = md_dir / f"report_md_{base_name}_{timestamp}.md"
    md_path.write_text(rendered, encoding="utf-8")

    size_kb = md_path.stat().st_size / 1024
    logger.info(f"Markdown生成成功: {md_path} ({size_kb:.1f} KB)")
    return md_path
| 258 | + | ||
| 259 | + | ||
def build_slug(text):
    """
    Turn a topic/title into a filesystem-safe filename fragment.

    Keeps only alphanumeric characters (per ``str.isalnum``, so non-ASCII
    letters and digits survive), spaces, hyphens and underscores; trims
    surrounding spaces; limits the result to 60 characters; and converts the
    remaining spaces to underscores. Falls back to ``"report"`` when the input
    is empty/falsy or nothing survives the filtering.

    Args:
        text: Original topic or title (any object; it is passed through str()).

    Returns:
        str: The sanitized fragment, at most 60 characters long.
    """
    text = str(text or "report")
    kept = "".join(c for c in text if c.isalnum() or c in (" ", "-", "_")).strip()
    # Truncate while spaces are still spaces and strip again: cutting after
    # the space->underscore conversion could leave a dangling "_" when the
    # 60-character boundary falls right after a word separator.
    truncated = kept[:60].strip()
    return truncated.replace(" ", "_") or "report"
| 277 | + | ||
| 278 | + | ||
def main():
    """
    Entry point: load the latest chapters, stitch the IR and render Markdown.

    Steps:
        1) Find the newest chapter run directory and read its manifest.
        2) Load the chapters and run the structural check (warnings only).
        3) Stitch the full IR and save a copy of it.
        4) Render Markdown and log the output paths.

    Returns:
        int: 0 on success, 1 when a required input could not be found/read.
    """
    logger.info("🚀 使用最新的LLM章节重新装订并渲染Markdown")

    run_dir = find_latest_run_dir(Path(settings.CHAPTER_OUTPUT_DIR))
    if not run_dir:
        return 1

    report_id, metadata = load_manifest(run_dir)
    if metadata is None or not report_id:
        return 1

    chapters = load_chapters(run_dir)
    if not chapters:
        logger.error("未找到章节JSON,无法装订")
        return 1

    validate_chapters(chapters)
    document_ir = stitch_document(report_id, metadata, chapters)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # The file name is derived from the first non-empty of: query, title,
    # metadata reportId, and finally the report id from the manifest.
    slug_source = (
        metadata.get("query")
        or metadata.get("title")
        or metadata.get("reportId")
        or report_id
    )
    base_name = build_slug(slug_source)

    ir_path = save_document_ir(document_ir, base_name, timestamp)
    md_path = render_markdown(document_ir, base_name, timestamp)

    logger.info("")
    logger.info("🎉 Markdown装订与渲染完成")
    logger.info(f"IR文件: {ir_path.resolve()}")
    logger.info(f"Markdown文件: {md_path.resolve()}")
    return 0
| 325 | + | ||
| 326 | + | ||
# Script entry point: main()'s return value becomes the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())
-
Please register or login to post a comment