Showing 3 changed files with 660 additions and 0 deletions.
| @@ -16,10 +16,12 @@ from .pdf_layout_optimizer import ( | @@ -16,10 +16,12 @@ from .pdf_layout_optimizer import ( | ||
| 16 | ChartLayout, | 16 | ChartLayout, |
| 17 | GridLayout, | 17 | GridLayout, |
| 18 | ) | 18 | ) |
| 19 | +from .markdown_renderer import MarkdownRenderer | ||
| 19 | 20 | ||
| 20 | __all__ = [ | 21 | __all__ = [ |
| 21 | "HTMLRenderer", | 22 | "HTMLRenderer", |
| 22 | "PDFRenderer", | 23 | "PDFRenderer", |
| 24 | + "MarkdownRenderer", | ||
| 23 | "PDFLayoutOptimizer", | 25 | "PDFLayoutOptimizer", |
| 24 | "PDFLayoutConfig", | 26 | "PDFLayoutConfig", |
| 25 | "PageLayout", | 27 | "PageLayout", |
ReportEngine/renderers/markdown_renderer.py
0 → 100644
| 1 | +from __future__ import annotations | ||
| 2 | + | ||
| 3 | +import json | ||
| 4 | +from typing import Any, Dict, List | ||
| 5 | + | ||
| 6 | +from loguru import logger | ||
| 7 | + | ||
| 8 | + | ||
class MarkdownRenderer:
    """Render a Document IR dictionary as Markdown text.

    - Charts and word clouds are degraded to plain data tables so that the
      underlying numbers are not lost.
    - Common features (headings, lists, code, tables, quotes, ...) are kept
      as close to their Markdown equivalents as possible.
    - Less common features (callout / kpiGrid / engineQuote, ...) are
      rendered with the nearest Markdown approximation.
    """

    def __init__(self) -> None:
        # Both attributes are refreshed on every call to ``render``.
        self.document: Dict[str, Any] = {}
        self.metadata: Dict[str, Any] = {}

    def render(self, document_ir: Dict[str, Any]) -> str:
        """Convert a whole IR document into one Markdown string.

        Args:
            document_ir: IR dictionary; ``metadata`` and ``chapters`` are the
                only top-level keys read here. ``None`` is treated as empty.

        Returns:
            The Markdown text: a level-1 title followed by every non-empty
            chapter, each separated by a blank line.
        """
        self.document = document_ir or {}
        self.metadata = self.document.get("metadata", {}) or {}

        title = self.metadata.get("title") or self.metadata.get("query") or "报告"
        parts: List[str] = [f"# {self._escape_text(title)}"]
        for chapter in self.document.get("chapters", []) or []:
            chapter_md = self._render_chapter(chapter)
            if chapter_md:
                parts.append(chapter_md)
        # A blank line between chapters keeps a chapter heading from being
        # glued to the last line of the previous chapter.
        return "\n\n".join(parts).strip()

    # ===== Chapter and block-level rendering =====

    def _render_chapter(self, chapter: Dict[str, Any]) -> str:
        """Render one chapter as a level-2 heading plus its blocks."""
        if not isinstance(chapter, dict):
            return ""
        sections: List[str] = []
        title = chapter.get("title") or chapter.get("chapterId")
        if title:
            sections.append(f"## {self._escape_text(title)}")
        body = self._render_blocks(chapter.get("blocks", []))
        if body:
            sections.append(body)
        return "\n\n".join(sections)

    @staticmethod
    def _as_block_list(blocks: Any) -> List[Any]:
        """Normalize a ``blocks`` value to a list.

        A single block given as a dict or string is wrapped in a list so it
        is not iterated key-by-key or character-by-character.
        """
        if blocks is None:
            return []
        if isinstance(blocks, (dict, str)):
            return [blocks]
        if isinstance(blocks, (list, tuple)):
            return list(blocks)
        return []

    def _render_blocks(self, blocks: List[Dict[str, Any]] | None, join_with_blank: bool = True) -> str:
        """Render a sequence of blocks and join the non-empty results.

        Args:
            blocks: Blocks to render (``None`` yields an empty string).
            join_with_blank: Separate blocks by a blank line when true,
                otherwise by a single newline.
        """
        rendered: List[str] = []
        for block in self._as_block_list(blocks):
            md = self._render_block(block)
            if md is None:
                continue
            md = md.strip()
            if md:
                rendered.append(md)
        separator = "\n\n" if join_with_blank else "\n"
        return separator.join(rendered)

    def _render_block(self, block: Any) -> str:
        """Dispatch one block to its type-specific renderer.

        Strings are emitted as text, unknown containers with a ``blocks``
        list are rendered recursively, and anything else falls back to a
        JSON dump.
        """
        if block is None:
            return ""
        if isinstance(block, str):
            return self._escape_text(block)
        if not isinstance(block, dict):
            return ""

        # A block without an explicit type but with inlines is a paragraph.
        block_type = block.get("type") or ("paragraph" if block.get("inlines") else None)
        handlers = {
            "heading": self._render_heading,
            "paragraph": self._render_paragraph,
            "list": self._render_list,
            "table": self._render_table,
            "swotTable": self._render_swot_table,
            "pestTable": self._render_pest_table,
            "blockquote": self._render_blockquote,
            "engineQuote": self._render_engine_quote,
            "hr": lambda b: "---",
            "code": self._render_code,
            "math": self._render_math,
            "figure": self._render_figure,
            "callout": self._render_callout,
            "kpiGrid": self._render_kpi_grid,
            "widget": self._render_widget,
            "toc": lambda b: "",
        }
        # isinstance guard: an unhashable ``type`` value must not raise here.
        if isinstance(block_type, str) and block_type in handlers:
            return handlers[block_type](block)

        if isinstance(block.get("blocks"), list):
            return self._render_blocks(block["blocks"])

        return self._fallback_unknown(block)

    def _render_heading(self, block: Dict[str, Any]) -> str:
        """Render a heading; the level is clamped to 1..6 (default 2)."""
        try:
            level = int(block.get("level", 2))
        except (TypeError, ValueError):
            # ``None`` or a non-numeric level falls back to the default.
            level = 2
        level = max(1, min(6, level))
        hashes = "#" * level
        text = block.get("text") or ""
        subtitle = block.get("subtitle")
        subtitle_text = f" _{self._escape_text(subtitle)}_" if subtitle else ""
        return f"{hashes} {self._escape_text(text)}{subtitle_text}"

    def _render_paragraph(self, block: Dict[str, Any]) -> str:
        """Render a paragraph from its inline runs."""
        return self._render_inlines(block.get("inlines", []))

    def _render_list(self, block: Dict[str, Any]) -> str:
        """Render a bullet, ordered or task list.

        Continuation lines are indented to the width of the list marker so
        that they stay inside the item ("1. " needs three spaces, "- " two).
        """
        list_type = block.get("listType", "bullet")
        lines: List[str] = []
        for idx, item_blocks in enumerate(self._as_block_list(block.get("items"))):
            content = self._render_blocks(item_blocks, join_with_blank=False)
            if not content:
                continue
            if list_type == "ordered":
                marker = f"{idx + 1}."
                prefix = marker
            else:
                marker = "-"
                prefix = "- [ ]" if list_type == "task" else marker
            indent = " " * (len(marker) + 1)
            first, *rest = content.splitlines()
            lines.append(f"{prefix} {first}")
            # Blank continuation lines stay empty to avoid trailing spaces.
            lines.extend(f"{indent}{cont}" if cont else "" for cont in rest)
        return "\n".join(lines)

    @staticmethod
    def _cell_span(cell: Any) -> int:
        """Return the cell's ``colspan`` as an int >= 1 (1 when invalid)."""
        if not isinstance(cell, dict):
            return 1
        try:
            return max(1, int(cell.get("colspan") or 1))
        except (TypeError, ValueError):
            return 1

    def _expand_row_cells(self, cells: Any) -> List[str]:
        """Render a row's cells, padding each colspan with empty cells."""
        expanded: List[str] = []
        for cell in cells if isinstance(cells, (list, tuple)) else []:
            expanded.append(self._render_cell_content(cell))
            expanded.extend([""] * (self._cell_span(cell) - 1))
        return expanded

    def _render_table(self, block: Dict[str, Any]) -> str:
        """Render a table block as a pipe table.

        The first row is used as the header when any of its cells is flagged
        ``header``/``isHeader``; otherwise generic column names are
        generated. Every row (header included) is padded to the widest row,
        so no body cell is dropped. ``rowspan`` is ignored.
        """
        rows = [row for row in self._as_block_list(block.get("rows")) if isinstance(row, dict)]
        if not rows:
            return ""

        expanded_rows = [self._expand_row_cells(row.get("cells")) for row in rows]
        col_count = max(len(cells) for cells in expanded_rows) or 1

        first_cells = rows[0].get("cells")
        if not isinstance(first_cells, (list, tuple)):
            first_cells = []
        has_header = any(
            isinstance(cell, dict) and (cell.get("header") or cell.get("isHeader"))
            for cell in first_cells
        )

        if has_header:
            header_cells = expanded_rows[0]
            body_rows = expanded_rows[1:]
        else:
            header_cells = [f"列{idx + 1}" for idx in range(col_count)]
            body_rows = expanded_rows

        def pad(cells: List[str]) -> List[str]:
            return cells + [""] * (col_count - len(cells))

        lines = [
            self._markdown_row(pad(header_cells)),
            self._markdown_separator(col_count),
        ]
        lines.extend(self._markdown_row(pad(row)) for row in body_rows)
        return "\n".join(lines)

    def _render_analysis_table(
        self,
        block: Dict[str, Any],
        default_title: str,
        sections: List[Any],
        normalize: Any,
        tag_keys: List[str],
        detail_keys: List[str],
    ) -> str:
        """Shared renderer for SWOT/PEST style blocks.

        Emits a level-3 title, an optional summary, and for every
        ``(key, label)`` section a level-4 heading followed by a four-column
        table (or a "no data" quote when the section is empty).
        """
        lines = [f"### {self._escape_text(block.get('title') or default_title)}"]
        summary = block.get("summary")
        if summary:
            lines.append(self._escape_text(summary))

        for key, label in sections:
            items = normalize(block.get(key))
            lines.append(f"#### {label}")
            if not items:
                lines.append("> 暂无数据")
                continue
            table_lines = [
                self._markdown_row(["序号", "要点", "详情", "标签"]),
                self._markdown_separator(4),
            ]
            for idx, item in enumerate(items, start=1):
                tags = [item.get(tag_key) for tag_key in tag_keys if item.get(tag_key)]
                tag_text = " / ".join(self._escape_text(tag) for tag in tags)
                # First non-empty detail field wins.
                detail = next(
                    (item.get(detail_key) for detail_key in detail_keys if item.get(detail_key)),
                    "",
                )
                table_lines.append(
                    self._markdown_row([
                        str(idx),
                        self._escape_text(item.get("title") or "未命名要点", for_table=True),
                        self._escape_text(detail, for_table=True),
                        self._escape_text(tag_text, for_table=True),
                    ])
                )
            lines.append("\n".join(table_lines))
        return "\n\n".join(lines)

    def _render_swot_table(self, block: Dict[str, Any]) -> str:
        """Render a SWOT block as four per-quadrant tables."""
        quadrants = [
            ("strengths", "S 优势"),
            ("weaknesses", "W 劣势"),
            ("opportunities", "O 机会"),
            ("threats", "T 威胁"),
        ]
        return self._render_analysis_table(
            block,
            "SWOT 分析",
            quadrants,
            self._normalize_swot_items,
            tag_keys=["impact", "priority"],
            detail_keys=["detail", "description", "evidence"],
        )

    def _render_pest_table(self, block: Dict[str, Any]) -> str:
        """Render a PEST block as four per-dimension tables."""
        dimensions = [
            ("political", "P 政治"),
            ("economic", "E 经济"),
            ("social", "S 社会"),
            ("technological", "T 技术"),
        ]
        return self._render_analysis_table(
            block,
            "PEST 分析",
            dimensions,
            self._normalize_pest_items,
            tag_keys=["impact", "weight", "priority"],
            detail_keys=["detail", "description"],
        )

    def _render_blockquote(self, block: Dict[str, Any]) -> str:
        """Render nested blocks as a Markdown blockquote."""
        inner = self._render_blocks(block.get("blocks", []))
        return self._quote_lines(inner)

    def _render_engine_quote(self, block: Dict[str, Any]) -> str:
        """Render an engine quote as a blockquote with a bold title line."""
        title = block.get("title") or block.get("engine") or "引用"
        inner = self._render_blocks(block.get("blocks", []))
        header = f"**{self._escape_text(title)}**"
        # Blank line so the title is not merged into the first paragraph.
        return self._quote_lines(f"{header}\n\n{inner}" if inner else header)

    @staticmethod
    def _fenced_block(content: Any, info: str = "") -> str:
        """Wrap ``content`` in a code fence that the content cannot close.

        The fence is one backtick longer than the longest backtick run in
        the content, with a minimum of three.
        """
        text = "" if content is None else str(content)
        longest = run = 0
        for ch in text:
            run = run + 1 if ch == "`" else 0
            longest = max(longest, run)
        fence = "`" * max(3, longest + 1)
        return f"{fence}{info}\n{text}\n{fence}"

    def _render_code(self, block: Dict[str, Any]) -> str:
        """Render a code block as a fenced code block."""
        lang = block.get("lang") or ""
        content = block.get("content") or ""
        return self._fenced_block(content, str(lang))

    def _render_math(self, block: Dict[str, Any]) -> str:
        """Render display math between ``$$`` lines; empty input yields ""."""
        latex = self._normalize_math(block.get("latex", ""))
        if not latex:
            return ""
        return f"$$\n{latex}\n$$"

    def _render_figure(self, block: Dict[str, Any]) -> str:
        """Render a figure as a quoted placeholder image plus its caption."""
        caption = block.get("caption") or "图像内容占位"
        return f">  {self._escape_text(caption)}"

    def _render_callout(self, block: Dict[str, Any]) -> str:
        """Render a callout as a blockquote headed by its title and tone."""
        tone = block.get("tone") or "info"
        title = block.get("title")
        inner = self._render_blocks(block.get("blocks", []))
        header = f"**{self._escape_text(title)}** [{tone}]" if title else f"[{tone}]"
        # Blank line so the header is not merged into the first paragraph.
        content = header if not inner else f"{header}\n\n{inner}"
        return self._quote_lines(content)

    def _render_kpi_grid(self, block: Dict[str, Any]) -> str:
        """Render KPI items as a three-column table (label, value, delta)."""
        items = block.get("items") or []
        if not items:
            return ""
        header = ["指标", "数值", "变化"]
        lines = [self._markdown_row(header), self._markdown_separator(len(header))]
        for item in items:
            if not isinstance(item, dict):
                continue
            label = item.get("label") or ""
            raw_value = item.get("value")
            # A missing value must not be printed as the text "None".
            value = "" if raw_value is None else f"{raw_value}{item.get('unit') or ''}"
            delta = self._format_delta(item.get("delta"), item.get("deltaTone"))
            lines.append(self._markdown_row([
                self._escape_text(label, for_table=True),
                self._escape_text(value, for_table=True),
                self._escape_text(delta, for_table=True),
            ]))
        return "\n".join(lines)

    def _render_widget(self, block: Dict[str, Any]) -> str:
        """Render a widget block.

        ``chart.js*`` widgets and word clouds become data tables; any other
        widget becomes a note plus a truncated JSON preview of its data
        (omitted when there is no data).
        """
        widget_type = str(block.get("widgetType") or "").lower()
        props = block.get("props")
        if not isinstance(props, dict):
            props = {}
        title = block.get("title") or props.get("title")
        title_prefix = f"**{self._escape_text(title)}**\n\n" if title else ""

        if widget_type.startswith("chart.js"):
            chart_table = self._render_chart_as_table(block)
            return f"{title_prefix}{chart_table}".strip()
        if "wordcloud" in widget_type:
            cloud_table = self._render_wordcloud_as_table(block)
            return f"{title_prefix}{cloud_table}".strip()

        note = "> 数据组件暂不支持Markdown渲染"
        data_preview = ""
        data = block.get("data")
        if data:
            try:
                data_preview = json.dumps(data, ensure_ascii=False)[:200]
            except (TypeError, ValueError):
                # Non-serializable data: keep the note, drop the preview.
                data_preview = ""
        if not data_preview:
            return f"{title_prefix}{note}"
        return f"{title_prefix}{note}\n\n{self._fenced_block(data_preview)}"

    # ===== Helpers =====

    def _render_chart_as_table(self, block: Dict[str, Any]) -> str:
        """Render chart data as a table: one row per label, one column per dataset."""
        data = self._coerce_chart_data(block.get("data") or {})
        labels = data.get("labels") or []
        datasets = [ds for ds in (data.get("datasets") or []) if isinstance(ds, dict)]
        if not labels or not datasets:
            return "> 图表数据缺失,无法转为表格"

        # Header labels are escaped like body cells so a "|" cannot split a column.
        headers = ["类别"] + [
            self._escape_text(ds.get("label") or f"系列{idx + 1}", for_table=True)
            for idx, ds in enumerate(datasets)
        ]
        lines = [self._markdown_row(headers), self._markdown_separator(len(headers))]
        for idx, label in enumerate(labels):
            row_cells = [self._escape_text(self._stringify_value(label), for_table=True)]
            for ds in datasets:
                series = ds.get("data")
                if not isinstance(series, (list, tuple)):
                    series = []
                value = series[idx] if idx < len(series) else ""
                row_cells.append(self._escape_text(self._stringify_value(value), for_table=True))
            lines.append(self._markdown_row(row_cells))
        return "\n".join(lines)

    def _render_wordcloud_as_table(self, block: Dict[str, Any]) -> str:
        """Render word-cloud entries as a keyword / weight / category table."""
        items = self._collect_wordcloud_items(block)
        if not items:
            return "> 词云数据缺失,无法转为表格"

        lines = [
            self._markdown_row(["关键词", "权重", "类别"]),
            self._markdown_separator(3),
        ]
        for item in items:
            lines.append(
                self._markdown_row([
                    self._escape_text(item.get("word", ""), for_table=True),
                    self._escape_text(self._stringify_value(item.get("weight")), for_table=True),
                    self._escape_text(item.get("category", "") or "-", for_table=True),
                ])
            )
        return "\n".join(lines)

    def _render_cell_content(self, cell: Dict[str, Any]) -> str:
        """Render a table cell (dict with ``blocks`` or a bare string) as one line."""
        if isinstance(cell, str):
            return self._escape_text(cell, for_table=True)
        if not isinstance(cell, dict):
            return ""
        return self._render_blocks_as_text(cell.get("blocks"))

    def _render_blocks_as_text(self, blocks: List[Dict[str, Any]] | None) -> str:
        """Flatten blocks into a single table-safe line, separated by spaces."""
        texts = [self._render_block_as_text(block) for block in self._as_block_list(blocks)]
        return " ".join(filter(None, texts))

    def _render_block_as_text(self, block: Any) -> str:
        """Flatten one block into table-safe single-line text."""
        if isinstance(block, str):
            return self._escape_text(block, for_table=True)
        if not isinstance(block, dict):
            return ""
        # Same paragraph inference as ``_render_block``.
        block_type = block.get("type") or ("paragraph" if block.get("inlines") else None)
        if block_type == "paragraph":
            return self._render_inlines(block.get("inlines", []), for_table=True)
        if block_type == "heading":
            return self._escape_text(block.get("text") or "", for_table=True)
        if block_type == "list":
            items = [self._render_blocks_as_text(sub) for sub in self._as_block_list(block.get("items"))]
            return "; ".join(filter(None, items))
        if block_type == "math":
            latex = self._normalize_math(block.get("latex", ""))
            # Newlines would terminate the table row; pipes are left alone
            # because escaping them would change the LaTeX source.
            latex = " ".join(latex.split())
            return f"${latex}$" if latex else ""
        if block_type == "code":
            # Code may contain "|" or newlines, either of which breaks a row.
            return self._escape_text(block.get("content", "") or "", for_table=True)
        if block_type == "widget":
            return self._escape_text(block.get("title") or "图表", for_table=True)
        if isinstance(block.get("blocks"), list):
            return self._render_blocks_as_text(block.get("blocks"))
        return self._escape_text(str(block), for_table=True)

    def _markdown_row(self, cells: List[str]) -> str:
        """Join already-escaped cells into one pipe-table row."""
        return "| " + " | ".join(cells) + " |"

    def _markdown_separator(self, count: int) -> str:
        """Return the header separator row for ``count`` columns (min 1)."""
        return "| " + " | ".join(["---"] * max(1, count)) + " |"

    def _render_inlines(self, inlines: List[Any], for_table: bool = False) -> str:
        """Concatenate inline runs, keeping the whitespace between them."""
        parts = [self._render_inline_run(run, for_table=for_table) for run in inlines or []]
        return "".join(parts).strip()

    def _render_inline_run(self, run: Any, for_table: bool = False) -> str:
        """Render one inline run with its marks applied.

        Leading/trailing whitespace of the run is preserved *outside* the
        mark delimiters: stripping it would glue adjacent runs together, and
        delimiters such as ``**`` do not work next to inner whitespace.
        Colour/font and other non-portable marks degrade to plain text.
        """
        if isinstance(run, dict):
            text = run.get("text", "")
            marks = run.get("marks") or []
        else:
            text = run if isinstance(run, str) else ""
            marks = []

        escaped = self._escape_text(text, for_table=for_table, strip=False)
        result = escaped.strip()
        if not result or not marks:
            # Nothing to decorate (plain or whitespace-only run).
            return escaped
        leading = escaped[: len(escaped) - len(escaped.lstrip())]
        trailing = escaped[len(escaped.rstrip()):]

        for mark in marks:
            if not isinstance(mark, dict):
                continue
            mtype = mark.get("type")
            if mtype == "bold":
                result = f"**{result}**"
            elif mtype == "italic":
                result = f"*{result}*"
            elif mtype == "underline":
                result = f"__{result}__"
            elif mtype == "strike":
                result = f"~~{result}~~"
            elif mtype == "code":
                result = f"`{result}`"
            elif mtype == "link":
                href = mark.get("href") or mark.get("value")
                href = str(href) if href else ""
                result = f"[{result}]({href})" if href else result
            elif mtype == "highlight":
                result = f"=={result}=="
            elif mtype == "subscript":
                result = f"~{result}~"
            elif mtype == "superscript":
                result = f"^{result}^"
            elif mtype == "math":
                latex = self._normalize_math(mark.get("value") or text)
                result = f"${latex}$" if latex else result
        return f"{leading}{result}{trailing}"

    def _quote_lines(self, text: str) -> str:
        """Prefix every line with ``>``, keeping the line's own indentation."""
        if not text:
            return ""
        lines = []
        for line in text.splitlines():
            # Only trailing whitespace is dropped; leading indentation carries
            # structure (nested lists, code) inside the quote.
            line = line.rstrip()
            lines.append(f"> {line}" if line else ">")
        return "\n".join(lines)

    @staticmethod
    def _normalize_analysis_items(raw: Any, extra_keys: List[str]) -> List[Dict[str, Any]]:
        """Normalize SWOT/PEST entries to dicts with ``title`` and ``detail``.

        Strings become ``{"title": ...}``; dict entries get ``title``,
        ``detail`` and every key in ``extra_keys``. A single string or dict
        is treated as a one-element list instead of being iterated.
        """
        if not raw:
            return []
        entries = [raw] if isinstance(raw, (str, dict)) else raw
        if not isinstance(entries, (list, tuple)):
            return []
        items: List[Dict[str, Any]] = []
        for entry in entries:
            if isinstance(entry, str):
                items.append({"title": entry})
            elif isinstance(entry, dict):
                title = entry.get("title") or entry.get("label") or entry.get("text")
                item: Dict[str, Any] = {
                    "title": title or "未命名要点",
                    "detail": entry.get("detail") or entry.get("description"),
                }
                for key in extra_keys:
                    item[key] = entry.get(key)
                items.append(item)
        return items

    def _normalize_swot_items(self, raw: Any) -> List[Dict[str, Any]]:
        """Normalize one SWOT quadrant (adds impact / priority / evidence)."""
        return self._normalize_analysis_items(raw, ["impact", "priority", "evidence"])

    def _normalize_pest_items(self, raw: Any) -> List[Dict[str, Any]]:
        """Normalize one PEST dimension (adds impact / priority / weight)."""
        return self._normalize_analysis_items(raw, ["impact", "priority", "weight"])

    def _coerce_chart_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Return the dict holding ``labels``/``datasets``.

        Looks at ``data`` itself first, then at its ``data``, ``chartData``
        and ``payload`` children; returns ``data`` unchanged when none match.
        """
        if not isinstance(data, dict):
            return {}
        if "labels" in data or "datasets" in data:
            return data
        for key in ("data", "chartData", "payload"):
            nested = data.get(key)
            if isinstance(nested, dict) and ("labels" in nested or "datasets" in nested):
                return nested
        return data

    def _collect_wordcloud_items(self, block: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Collect de-duplicated word-cloud entries from a widget block.

        Candidate lists are read from ``props`` and from ``data`` (either a
        list itself or a dict) under the keys ``data``/``words``/``items``.
        Entries may be dicts, ``(word, weight, category)`` sequences or bare
        strings. Duplicates (same word and category) keep the first entry.
        """
        list_keys = ("data", "words", "items")
        props = block.get("props")
        if not isinstance(props, dict):
            props = {}
        candidates: List[Any] = [props[key] for key in list_keys if isinstance(props.get(key), list)]
        data_field = block.get("data")
        if isinstance(data_field, list):
            candidates.append(data_field)
        elif isinstance(data_field, dict):
            candidates.extend(
                data_field[key] for key in list_keys if isinstance(data_field.get(key), list)
            )

        items: List[Dict[str, Any]] = []
        seen: set = set()

        def push(word: str, weight: Any, category: str) -> None:
            key = f"{word}::{category}"
            if key in seen:
                return
            seen.add(key)
            items.append({"word": word, "weight": weight, "category": category})

        for candidate in candidates:
            for entry in candidate or []:
                if isinstance(entry, dict):
                    word = entry.get("word") or entry.get("text") or entry.get("label")
                    if not word:
                        continue
                    # Explicit None check: a weight of 0 is a real value.
                    weight = entry.get("weight")
                    if weight is None:
                        weight = entry.get("value")
                    category = entry.get("category") or ""
                    push(str(word), weight, str(category))
                elif isinstance(entry, (list, tuple)) and entry:
                    word = entry[0]
                    weight = entry[1] if len(entry) > 1 else ""
                    category = entry[2] if len(entry) > 2 else ""
                    push(str(word), weight, str(category))
                elif isinstance(entry, str):
                    push(entry, "", "")
        return items

    def _escape_text(self, text: Any, for_table: bool = False, strip: bool = True) -> str:
        """Convert a value to text, optionally making it table-safe.

        Args:
            text: Any value; ``None`` becomes an empty string.
            for_table: Escape ``|`` and replace line breaks with spaces.
            strip: Trim surrounding whitespace (the default). Inline runs
                pass ``False`` so spacing between runs survives.
        """
        if text is None:
            return ""
        value = str(text)
        if for_table:
            value = (
                value.replace("|", r"\|")
                .replace("\r\n", " ")
                .replace("\n", " ")
                .replace("\r", " ")
            )
        return value.strip() if strip else value

    def _stringify_value(self, value: Any) -> str:
        """Turn a chart/word-cloud value into display text.

        Dicts prefer their ``y`` or ``value`` field and otherwise become
        JSON; lists are joined with ", ".
        """
        if value is None:
            return ""
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            return str(value)
        if isinstance(value, dict):
            # Prefer the common numeric fields.
            for key in ("y", "value"):
                if key in value:
                    return str(value[key])
            try:
                return json.dumps(value, ensure_ascii=False)
            except (TypeError, ValueError):
                return str(value)
        if isinstance(value, list):
            return ", ".join(self._stringify_value(v) for v in value)
        return str(value)

    def _normalize_math(self, raw: Any) -> str:
        """Strip one pair of math delimiters from a LaTeX string.

        Recognized pairs: ``$$..$$``, ``\\[..\\]``, ``\\(..\\)`` and
        ``$..$``. Non-string input yields an empty string.
        """
        if not isinstance(raw, str):
            return ""
        text = raw.strip()
        patterns = [
            ("$$", "$$"),
            ("\\[", "\\]"),
            ("\\(", "\\)"),
            ("$", "$"),
        ]
        for start, end in patterns:
            # The length check keeps the two delimiters from overlapping.
            if len(text) >= len(start) + len(end) and text.startswith(start) and text.endswith(end):
                return text[len(start): len(text) - len(end)].strip()
        return text

    def _format_delta(self, delta: Any, tone: Any) -> str:
        """Format a KPI delta with an up/down arrow chosen from ``tone``."""
        if delta is None:
            return ""
        prefix = ""
        tone_val = str(tone or "").lower()
        if tone_val in ("up", "increase", "positive"):
            prefix = "▲ "
        elif tone_val in ("down", "decrease", "negative"):
            prefix = "▼ "
        return f"{prefix}{delta}"

    def _fallback_unknown(self, block: Dict[str, Any]) -> str:
        """Dump an unrecognized block as a fenced JSON code block."""
        try:
            payload = json.dumps(block, ensure_ascii=False, indent=2)
        except (TypeError, ValueError):
            payload = str(block)
        logger.debug(f"未识别的区块类型,使用JSON兜底: {block}")
        return self._fenced_block(payload, "json")
| 622 | + | ||
| 623 | +__all__ = ["MarkdownRenderer"] |
| @@ -71,6 +71,7 @@ from .html_renderer import HTMLRenderer | @@ -71,6 +71,7 @@ from .html_renderer import HTMLRenderer | ||
| 71 | from .pdf_layout_optimizer import PDFLayoutOptimizer, PDFLayoutConfig | 71 | from .pdf_layout_optimizer import PDFLayoutOptimizer, PDFLayoutConfig |
| 72 | from .chart_to_svg import create_chart_converter | 72 | from .chart_to_svg import create_chart_converter |
| 73 | from .math_to_svg import MathToSVG | 73 | from .math_to_svg import MathToSVG |
| 74 | +from .markdown_renderer import MarkdownRenderer | ||
| 74 | try: | 75 | try: |
| 75 | from wordcloud import WordCloud | 76 | from wordcloud import WordCloud |
| 76 | WORDCLOUD_AVAILABLE = True | 77 | WORDCLOUD_AVAILABLE = True |
| @@ -102,6 +103,7 @@ class PDFRenderer: | @@ -102,6 +103,7 @@ class PDFRenderer: | ||
| 102 | """ | 103 | """ |
| 103 | self.config = config or {} | 104 | self.config = config or {} |
| 104 | self.html_renderer = HTMLRenderer(config) | 105 | self.html_renderer = HTMLRenderer(config) |
| 106 | + self.markdown_renderer = MarkdownRenderer() | ||
| 105 | self.layout_optimizer = layout_optimizer or PDFLayoutOptimizer() | 107 | self.layout_optimizer = layout_optimizer or PDFLayoutOptimizer() |
| 106 | 108 | ||
| 107 | if not WEASYPRINT_AVAILABLE: | 109 | if not WEASYPRINT_AVAILABLE: |
| @@ -886,6 +888,36 @@ class PDFRenderer: | @@ -886,6 +888,36 @@ class PDFRenderer: | ||
| 886 | 888 | ||
| 887 | return html | 889 | return html |
| 888 | 890 | ||
| 891 | + def _build_markdown_filename(self, document_ir: Dict[str, Any]) -> str: | ||
| 892 | + """根据元数据生成Markdown文件名""" | ||
| 893 | + metadata = document_ir.get("metadata") or {} | ||
| 894 | + title = metadata.get("title") or metadata.get("query") or metadata.get("reportId") or "report" | ||
| 895 | + safe = "".join(ch for ch in str(title) if ch.isalnum() or ch in (" ", "-", "_")).strip() | ||
| 896 | + safe = safe.replace(" ", "_")[:80] or "report" | ||
| 897 | + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") | ||
| 898 | + return f"{safe}_{timestamp}.md" | ||
| 899 | + | ||
def _export_markdown(self, document_ir: Dict[str, Any]) -> None:
    """Write a Markdown version of the (already checked/repaired) IR to disk.

    - Interactive widgets such as charts and word clouds are degraded to
      raw data tables by the Markdown renderer.
    - This is best-effort: any exception is logged as a warning and
      swallowed so the PDF/HTML pipeline is never interrupted.

    The file is written under ``final_reports/markdown`` (relative to the
    current working directory); nothing is written when the rendered text
    is blank.
    """
    try:
        rendered = self.markdown_renderer.render(document_ir)
        if rendered.strip():
            target_dir = Path("final_reports/markdown")
            target_dir.mkdir(parents=True, exist_ok=True)
            target = target_dir / self._build_markdown_filename(document_ir)
            target.write_text(rendered, encoding="utf-8")
            logger.info(f"已静默生成Markdown报告: {target}")
        else:
            logger.debug("Markdown渲染结果为空,跳过落盘")
    except Exception as exc:
        # Deliberately broad: the Markdown export must never break the PDF run.
        logger.warning(f"生成Markdown报告失败(已忽略,不影响PDF/HTML): {exc}")
| 920 | + | ||
| 889 | def _get_pdf_html( | 921 | def _get_pdf_html( |
| 890 | self, | 922 | self, |
| 891 | document_ir: Dict[str, Any], | 923 | document_ir: Dict[str, Any], |
| @@ -943,6 +975,9 @@ class PDFRenderer: | @@ -943,6 +975,9 @@ class PDFRenderer: | ||
| 943 | logger.info("开始转换数学公式为SVG矢量图形...") | 975 | logger.info("开始转换数学公式为SVG矢量图形...") |
| 944 | math_svg_map = self._convert_math_to_svg(preprocessed_ir) | 976 | math_svg_map = self._convert_math_to_svg(preprocessed_ir) |
| 945 | 977 | ||
| 978 | + # 在渲染HTML前静默导出Markdown版本 | ||
| 979 | + self._export_markdown(preprocessed_ir) | ||
| 980 | + | ||
| 946 | # 使用HTML渲染器生成基础HTML(使用预处理后的IR,以便复用mathId等标记) | 981 | # 使用HTML渲染器生成基础HTML(使用预处理后的IR,以便复用mathId等标记) |
| 947 | html = self.html_renderer.render(preprocessed_ir) | 982 | html = self.html_renderer.render(preprocessed_ir) |
| 948 | 983 |
-
Please register or login to post a comment