马一丁

Supports Markdown rendering

@@ -16,10 +16,12 @@ from .pdf_layout_optimizer import ( @@ -16,10 +16,12 @@ from .pdf_layout_optimizer import (
16 ChartLayout, 16 ChartLayout,
17 GridLayout, 17 GridLayout,
18 ) 18 )
  19 +from .markdown_renderer import MarkdownRenderer
19 20
20 __all__ = [ 21 __all__ = [
21 "HTMLRenderer", 22 "HTMLRenderer",
22 "PDFRenderer", 23 "PDFRenderer",
  24 + "MarkdownRenderer",
23 "PDFLayoutOptimizer", 25 "PDFLayoutOptimizer",
24 "PDFLayoutConfig", 26 "PDFLayoutConfig",
25 "PageLayout", 27 "PageLayout",
  1 +from __future__ import annotations
  2 +
  3 +import json
  4 +from typing import Any, Dict, List
  5 +
  6 +from loguru import logger
  7 +
  8 +
class MarkdownRenderer:
    """
    Convert a Document IR dictionary into Markdown text.

    - Charts and word clouds are downgraded to plain data tables so the
      underlying numbers are not lost.
    - Common features (headings, lists, code, tables, quotes, ...) are kept
      as faithfully as Markdown allows.
    - Less common blocks (callout / kpiGrid / engineQuote, ...) are
      approximated with quotes and tables.
    """

    # Block type -> name of the method that renders it. Looked up with
    # getattr so that subclasses overriding a renderer are still honoured.
    _BLOCK_HANDLERS: Dict[str, str] = {
        "heading": "_render_heading",
        "paragraph": "_render_paragraph",
        "list": "_render_list",
        "table": "_render_table",
        "swotTable": "_render_swot_table",
        "pestTable": "_render_pest_table",
        "blockquote": "_render_blockquote",
        "engineQuote": "_render_engine_quote",
        "hr": "_render_hr",
        "code": "_render_code",
        "math": "_render_math",
        "figure": "_render_figure",
        "callout": "_render_callout",
        "kpiGrid": "_render_kpi_grid",
        "widget": "_render_widget",
        "toc": "_render_toc",
    }

    def __init__(self) -> None:
        self.document: Dict[str, Any] = {}
        self.metadata: Dict[str, Any] = {}

    def render(self, document_ir: Dict[str, Any]) -> str:
        """Entry point: render the whole IR and return the Markdown string."""
        self.document = document_ir or {}
        self.metadata = self.document.get("metadata", {}) or {}

        title = self.metadata.get("title") or self.metadata.get("query") or "报告"
        sections: List[str] = [f"# {self._escape_text(title)}"]
        for chapter in self.document.get("chapters", []) or []:
            chapter_md = self._render_chapter(chapter)
            if chapter_md:
                sections.append(chapter_md)
        # A blank line between chapters keeps the last block of one chapter
        # from running into the next chapter heading.
        return "\n\n".join(sections).strip()

    # ===== Chapter and block-level rendering =====

    def _render_chapter(self, chapter: Dict[str, Any]) -> str:
        """Render one chapter as a level-2 heading followed by its blocks."""
        if not isinstance(chapter, dict):
            return ""
        lines: List[str] = []
        title = chapter.get("title") or chapter.get("chapterId")
        if title:
            lines.append(f"## {self._escape_text(title)}")
            lines.append("")
        body = self._render_blocks(chapter.get("blocks", []))
        if body:
            lines.append(body)
        return "\n".join(lines).strip()

    @staticmethod
    def _as_block_list(blocks: Any) -> List[Any]:
        """Normalize a block container: a lone str/dict becomes a one-item list."""
        if isinstance(blocks, (list, tuple)):
            return list(blocks)
        if isinstance(blocks, (str, dict)):
            # Iterating a bare string would otherwise yield single characters.
            return [blocks]
        return []

    def _render_blocks(self, blocks: List[Dict[str, Any]] | None, join_with_blank: bool = True) -> str:
        """Render blocks and join the non-empty results.

        ``join_with_blank`` selects a blank-line separator (default) or a
        single newline.
        """
        rendered: List[str] = []
        for block in self._as_block_list(blocks):
            md = self._render_block(block)
            if md is None:
                continue
            md = md.strip()
            if md:
                rendered.append(md)
        separator = "\n\n" if join_with_blank else "\n"
        return separator.join(rendered)

    def _render_block(self, block: Any) -> str:
        """Dispatch a single block to its renderer.

        Strings render as plain text; dicts without a known ``type`` fall
        back to their nested ``blocks`` or, failing that, to a JSON dump.
        """
        if block is None:
            return ""
        if isinstance(block, str):
            return self._escape_text(block)
        if not isinstance(block, dict):
            return ""

        block_type = block.get("type") or ("paragraph" if block.get("inlines") else None)
        handler_name = self._BLOCK_HANDLERS.get(block_type) if isinstance(block_type, str) else None
        if handler_name is not None:
            return getattr(self, handler_name)(block)

        if isinstance(block.get("blocks"), list):
            return self._render_blocks(block["blocks"])

        return self._fallback_unknown(block)

    def _render_hr(self, block: Dict[str, Any]) -> str:
        """Render a horizontal rule."""
        return "---"

    def _render_toc(self, block: Dict[str, Any]) -> str:
        """Tables of contents are omitted from the Markdown output."""
        return ""

    def _render_heading(self, block: Dict[str, Any]) -> str:
        """Render an ATX heading; the level is clamped to 1..6."""
        try:
            level = int(block.get("level", 2))
        except (TypeError, ValueError):
            # None or a non-numeric level falls back to the default.
            level = 2
        hashes = "#" * max(1, min(6, level))
        text = block.get("text") or ""
        subtitle = block.get("subtitle")
        subtitle_text = f" _{self._escape_text(subtitle)}_" if subtitle else ""
        return f"{hashes} {self._escape_text(text)}{subtitle_text}"

    def _render_paragraph(self, block: Dict[str, Any]) -> str:
        """Render a paragraph from its inline runs."""
        return self._render_inlines(block.get("inlines", []))

    def _render_list(self, block: Dict[str, Any]) -> str:
        """Render a bullet, ordered or task list.

        Each item is a list of blocks. Continuation lines are indented to
        the item's content column so they stay inside the list item.
        """
        list_type = block.get("listType", "bullet")
        lines: List[str] = []
        number = 0
        for item_blocks in block.get("items") or []:
            content = self._render_blocks(item_blocks, join_with_blank=False)
            if not content:
                continue
            number += 1
            if list_type == "ordered":
                marker = f"{number}."
                prefix = marker
            elif list_type == "task":
                marker = "-"
                prefix = "- [ ]"
            else:
                marker = prefix = "-"
            indent = " " * (len(marker) + 1)
            first, *rest = content.splitlines()
            lines.append(f"{prefix} {first}")
            lines.extend(f"{indent}{cont}".rstrip() for cont in rest)
        return "\n".join(lines)

    @staticmethod
    def _cell_span(cell: Any) -> int:
        """Return the cell's ``colspan`` as an int >= 1 (1 when absent/invalid)."""
        raw = cell.get("colspan") if isinstance(cell, dict) else None
        try:
            span = int(raw or 1)
        except (TypeError, ValueError):
            span = 1
        return max(1, span)

    def _expand_row(self, cells: Any) -> List[str]:
        """Render a row's cells, padding each colspan with empty cells."""
        expanded: List[str] = []
        for cell in cells if isinstance(cells, (list, tuple)) else []:
            expanded.append(self._render_cell_content(cell))
            expanded.extend([""] * (self._cell_span(cell) - 1))
        return expanded

    def _render_table(self, block: Dict[str, Any]) -> str:
        """Render a table block as a pipe table.

        The first row is the header when any of its cells is flagged
        ``header``/``isHeader``; otherwise generic column names are used.
        All rows are padded to the widest row (colspan included, rowspan
        ignored) so no body column is dropped.
        """
        rows = [row for row in (block.get("rows") or []) if isinstance(row, dict)]
        if not rows:
            return ""

        expanded_rows = [self._expand_row(row.get("cells")) for row in rows]
        col_count = max((len(cells) for cells in expanded_rows), default=0) or 1

        first_row_cells = rows[0].get("cells")
        if not isinstance(first_row_cells, (list, tuple)):
            first_row_cells = []
        has_header = any(
            isinstance(cell, dict) and (cell.get("header") or cell.get("isHeader"))
            for cell in first_row_cells
        )

        if has_header:
            header_cells = expanded_rows[0]
            body_rows = expanded_rows[1:]
        else:
            header_cells = [f"列{idx + 1}" for idx in range(col_count)]
            body_rows = expanded_rows

        def pad(cells: List[str]) -> List[str]:
            return cells + [""] * (col_count - len(cells))

        lines = [self._markdown_row(pad(header_cells)), self._markdown_separator(col_count)]
        lines.extend(self._markdown_row(pad(cells)) for cells in body_rows)
        return "\n".join(lines)

    def _render_analysis_table(
        self,
        block: Dict[str, Any],
        default_title: str,
        sections: List[Any],
        items_by_key: Dict[str, List[Dict[str, Any]]],
        tag_keys: List[str],
        detail_keys: List[str],
    ) -> str:
        """Shared layout for SWOT/PEST: a heading plus one table per section."""
        title = block.get("title") or default_title
        summary = block.get("summary")

        lines = [f"### {self._escape_text(title)}"]
        if summary:
            lines.append(self._escape_text(summary))

        for key, label in sections:
            items = items_by_key.get(key) or []
            lines.append(f"#### {label}")
            if not items:
                lines.append("> 暂无数据")
                continue
            table_lines = [
                self._markdown_row(["序号", "要点", "详情", "标签"]),
                self._markdown_separator(4),
            ]
            for idx, item in enumerate(items, start=1):
                tags = [item.get(tag_key) for tag_key in tag_keys if item.get(tag_key)]
                tag_text = " / ".join(self._escape_text(tag) for tag in tags)
                detail = next((item.get(k) for k in detail_keys if item.get(k)), "")
                table_lines.append(
                    self._markdown_row([
                        str(idx),
                        self._escape_text(item.get("title") or "未命名要点", for_table=True),
                        self._escape_text(detail, for_table=True),
                        self._escape_text(tag_text, for_table=True),
                    ])
                )
            lines.append("\n".join(table_lines))
        return "\n\n".join(lines)

    def _render_swot_table(self, block: Dict[str, Any]) -> str:
        """Render a SWOT block as four titled tables (S/W/O/T)."""
        quadrants = [
            ("strengths", "S 优势"),
            ("weaknesses", "W 劣势"),
            ("opportunities", "O 机会"),
            ("threats", "T 威胁"),
        ]
        items_by_key = {key: self._normalize_swot_items(block.get(key)) for key, _ in quadrants}
        return self._render_analysis_table(
            block,
            "SWOT 分析",
            quadrants,
            items_by_key,
            tag_keys=["impact", "priority"],
            detail_keys=["detail", "description", "evidence"],
        )

    def _render_pest_table(self, block: Dict[str, Any]) -> str:
        """Render a PEST block as four titled tables (P/E/S/T)."""
        dimensions = [
            ("political", "P 政治"),
            ("economic", "E 经济"),
            ("social", "S 社会"),
            ("technological", "T 技术"),
        ]
        items_by_key = {key: self._normalize_pest_items(block.get(key)) for key, _ in dimensions}
        return self._render_analysis_table(
            block,
            "PEST 分析",
            dimensions,
            items_by_key,
            tag_keys=["impact", "weight", "priority"],
            detail_keys=["detail", "description"],
        )

    def _render_blockquote(self, block: Dict[str, Any]) -> str:
        """Render nested blocks as a Markdown blockquote."""
        inner = self._render_blocks(block.get("blocks", []))
        return self._quote_lines(inner)

    def _render_engine_quote(self, block: Dict[str, Any]) -> str:
        """Render an engine quote: bold title line followed by quoted content."""
        title = block.get("title") or block.get("engine") or "引用"
        inner = self._render_blocks(block.get("blocks", []))
        header = f"**{self._escape_text(title)}**"
        return self._quote_lines(f"{header}\n{inner}" if inner else header)

    @staticmethod
    def _fence(content: Any, lang: Any = "") -> str:
        """Wrap ``content`` in a code fence longer than any backtick run inside it."""
        text = "" if content is None else str(content)
        longest = run = 0
        for ch in text:
            run = run + 1 if ch == "`" else 0
            longest = max(longest, run)
        fence = "`" * max(3, longest + 1)
        return f"{fence}{lang}\n{text}\n{fence}"

    def _render_code(self, block: Dict[str, Any]) -> str:
        """Render a fenced code block tagged with the block's ``lang``."""
        lang = block.get("lang") or ""
        content = block.get("content") or ""
        return self._fence(content, lang)

    def _render_math(self, block: Dict[str, Any]) -> str:
        """Render display math between ``$$`` lines; empty LaTeX yields ''."""
        latex = self._normalize_math(block.get("latex", ""))
        if not latex:
            return ""
        return f"$$\n{latex}\n$$"

    def _render_figure(self, block: Dict[str, Any]) -> str:
        """Render a figure as a quoted placeholder image followed by its caption."""
        caption = block.get("caption") or "图像内容占位"
        # Quoting via _quote_lines keeps a multi-line caption inside the quote.
        return self._quote_lines(f"![图示占位]() {self._escape_text(caption)}")

    def _render_callout(self, block: Dict[str, Any]) -> str:
        """Render a callout as a blockquote headed by its title and ``[tone]``."""
        tone = block.get("tone") or "info"
        title = block.get("title")
        inner = self._render_blocks(block.get("blocks", []))
        header = f"**{self._escape_text(title)}** [{tone}]" if title else f"[{tone}]"
        content = header if not inner else f"{header}\n{inner}"
        return self._quote_lines(content)

    def _render_kpi_grid(self, block: Dict[str, Any]) -> str:
        """Render KPI items as a three-column table (label / value / delta)."""
        items = [item for item in (block.get("items") or []) if isinstance(item, dict)]
        if not items:
            return ""
        header = ["指标", "数值", "变化"]
        lines = [self._markdown_row(header), self._markdown_separator(len(header))]
        for item in items:
            label = item.get("label") or ""
            raw_value = item.get("value")
            # An explicit None must not be printed as the string "None".
            value = f"{'' if raw_value is None else raw_value}{item.get('unit') or ''}"
            delta = self._format_delta(item.get("delta"), item.get("deltaTone"))
            lines.append(self._markdown_row([
                self._escape_text(label, for_table=True),
                self._escape_text(value, for_table=True),
                self._escape_text(delta, for_table=True),
            ]))
        return "\n".join(lines)

    def _render_widget(self, block: Dict[str, Any]) -> str:
        """Render a widget block.

        ``chart.js*`` widgets and word clouds become data tables; any other
        widget becomes a notice plus a JSON preview of its data (first 200
        characters) when data is present.
        """
        widget_type = str(block.get("widgetType") or "").lower()
        props = block.get("props")
        if not isinstance(props, dict):
            props = {}
        title = block.get("title") or props.get("title")
        title_prefix = f"**{self._escape_text(title)}**\n\n" if title else ""

        if widget_type.startswith("chart.js"):
            chart_table = self._render_chart_as_table(block)
            return f"{title_prefix}{chart_table}".strip()
        if "wordcloud" in widget_type:
            cloud_table = self._render_wordcloud_as_table(block)
            return f"{title_prefix}{cloud_table}".strip()

        data_preview = ""
        data = block.get("data")
        if data:
            try:
                data_preview = json.dumps(data, ensure_ascii=False)[:200]
            except (TypeError, ValueError):
                # Best effort: data that cannot be serialized is simply omitted.
                data_preview = ""
        note = "> 数据组件暂不支持Markdown渲染"
        if data_preview:
            return f"{title_prefix}{note}\n\n{self._fence(data_preview)}"
        return f"{title_prefix}{note}"

    # ===== Helpers =====

    def _render_chart_as_table(self, block: Dict[str, Any]) -> str:
        """Render chart ``labels``/``datasets`` as a table, one column per dataset."""
        data = self._coerce_chart_data(block.get("data") or {})
        labels = data.get("labels")
        labels = list(labels) if isinstance(labels, (list, tuple)) else []
        raw_datasets = data.get("datasets")
        datasets = [
            ds for ds in (raw_datasets if isinstance(raw_datasets, (list, tuple)) else [])
            if isinstance(ds, dict)
        ]
        if not labels or not datasets:
            return "> 图表数据缺失,无法转为表格"

        # Series labels are table cells too, so they need pipe escaping.
        headers = ["类别"] + [
            self._escape_text(ds.get("label") or f"系列{idx + 1}", for_table=True)
            for idx, ds in enumerate(datasets)
        ]
        lines = [self._markdown_row(headers), self._markdown_separator(len(headers))]
        for idx, label in enumerate(labels):
            row_cells = [self._escape_text(self._stringify_value(label), for_table=True)]
            for ds in datasets:
                series = ds.get("data")
                if not isinstance(series, (list, tuple)):
                    series = []
                value = series[idx] if idx < len(series) else ""
                row_cells.append(self._escape_text(self._stringify_value(value), for_table=True))
            lines.append(self._markdown_row(row_cells))
        return "\n".join(lines)

    def _render_wordcloud_as_table(self, block: Dict[str, Any]) -> str:
        """Render word-cloud entries as a keyword / weight / category table."""
        items = self._collect_wordcloud_items(block)
        if not items:
            return "> 词云数据缺失,无法转为表格"

        lines = [
            self._markdown_row(["关键词", "权重", "类别"]),
            self._markdown_separator(3),
        ]
        for item in items:
            lines.append(
                self._markdown_row([
                    self._escape_text(item.get("word", ""), for_table=True),
                    self._escape_text(self._stringify_value(item.get("weight")), for_table=True),
                    self._escape_text(item.get("category", "") or "-", for_table=True),
                ])
            )
        return "\n".join(lines)

    def _render_cell_content(self, cell: Dict[str, Any]) -> str:
        """Flatten a table cell's ``blocks`` into one line of text."""
        if isinstance(cell, str):
            return self._escape_text(cell, for_table=True)
        blocks = cell.get("blocks") if isinstance(cell, dict) else None
        return self._render_blocks_as_text(blocks)

    def _render_blocks_as_text(self, blocks: List[Dict[str, Any]] | None) -> str:
        """Render blocks as single-line text, joined with spaces."""
        texts = [self._render_block_as_text(block) for block in self._as_block_list(blocks)]
        return " ".join(filter(None, texts))

    def _render_block_as_text(self, block: Any) -> str:
        """Render one block as table-safe, single-line text."""
        if isinstance(block, str):
            return self._escape_text(block, for_table=True)
        if not isinstance(block, dict):
            return ""
        block_type = block.get("type")
        if block_type == "paragraph":
            return self._render_inlines(block.get("inlines", []), for_table=True)
        if block_type == "heading":
            return self._escape_text(block.get("text") or "", for_table=True)
        if block_type == "list":
            items = [self._render_blocks_as_text(sub) for sub in block.get("items") or []]
            return "; ".join(filter(None, items))
        if block_type == "math":
            latex = self._normalize_math(block.get("latex", ""))
            return f"${latex}$" if latex else ""
        if block_type == "code":
            # Raw code may contain pipes/newlines that would break the row.
            return self._escape_text(block.get("content", "") or "", for_table=True)
        if block_type == "widget":
            return self._escape_text(block.get("title") or "图表", for_table=True)
        if isinstance(block.get("blocks"), list):
            return self._render_blocks_as_text(block.get("blocks"))
        return self._escape_text(str(block), for_table=True)

    def _markdown_row(self, cells: List[str]) -> str:
        """Join cells into a pipe-table row."""
        return "| " + " | ".join(str(cell) for cell in cells) + " |"

    def _markdown_separator(self, count: int) -> str:
        """Build the header separator row for ``count`` columns (at least one)."""
        return "| " + " | ".join(["---"] * max(1, count)) + " |"

    def _render_inlines(self, inlines: List[Any], for_table: bool = False) -> str:
        """Concatenate inline runs; only the outer whitespace is trimmed."""
        parts = [self._render_inline_run(run, for_table=for_table) for run in inlines or []]
        return "".join(parts).strip()

    def _render_inline_run(self, run: Any, for_table: bool = False) -> str:
        """Render one inline run, applying its marks in order.

        Leading and trailing whitespace is kept outside the mark delimiters
        so adjacent runs are not glued together and emphasis stays valid.
        """
        if isinstance(run, dict):
            text = run.get("text", "")
            marks = run.get("marks") or []
        else:
            text = run if isinstance(run, str) else ""
            marks = []

        raw = self._escape_text(text, for_table=for_table, strip=False)
        result = raw.strip()
        if not result:
            # Whitespace-only run: keep the spacing, skip the marks.
            return raw
        leading = raw[: len(raw) - len(raw.lstrip())]
        trailing = raw[len(raw.rstrip()):]

        for mark in marks:
            if not isinstance(mark, dict):
                continue
            mtype = mark.get("type")
            if mtype == "bold":
                result = f"**{result}**"
            elif mtype == "italic":
                result = f"*{result}*"
            elif mtype == "underline":
                result = f"__{result}__"
            elif mtype == "strike":
                result = f"~~{result}~~"
            elif mtype == "code":
                result = f"`{result}`"
            elif mtype == "link":
                href = mark.get("href") or mark.get("value")
                href = str(href) if href else ""
                result = f"[{result}]({href})" if href else result
            elif mtype == "highlight":
                result = f"=={result}=="
            elif mtype == "subscript":
                result = f"~{result}~"
            elif mtype == "superscript":
                result = f"^{result}^"
            elif mtype == "math":
                latex = self._normalize_math(mark.get("value") or text)
                result = f"${latex}$" if latex else result
            # Colour/font and other non-portable marks degrade to plain text.
        return f"{leading}{result}{trailing}"

    def _quote_lines(self, text: str) -> str:
        """Prefix every line with ``>``, preserving leading indentation."""
        if not text:
            return ""
        quoted = []
        for line in text.splitlines():
            # Only trailing whitespace is dropped: leading indentation is
            # significant for code and nested list content.
            line = line.rstrip()
            quoted.append(f"> {line}" if line else ">")
        return "\n".join(quoted)

    def _normalize_analysis_items(self, raw: Any, passthrough_keys: List[str]) -> List[Dict[str, Any]]:
        """Normalize SWOT/PEST entries into dicts with ``title`` and ``detail``.

        Strings become ``{"title": ...}``; dict entries additionally carry
        over ``passthrough_keys``. Other entry types are ignored.
        """
        if not raw:
            return []
        if isinstance(raw, (str, dict)):
            # A lone entry is treated as a one-item list.
            raw = [raw]
        if not isinstance(raw, (list, tuple)):
            return []
        items: List[Dict[str, Any]] = []
        for entry in raw:
            if isinstance(entry, str):
                items.append({"title": entry})
            elif isinstance(entry, dict):
                title = entry.get("title") or entry.get("label") or entry.get("text")
                item: Dict[str, Any] = {
                    "title": title or "未命名要点",
                    "detail": entry.get("detail") or entry.get("description"),
                }
                for key in passthrough_keys:
                    item[key] = entry.get(key)
                items.append(item)
        return items

    def _normalize_swot_items(self, raw: Any) -> List[Dict[str, Any]]:
        """Normalize SWOT entries (keeps impact, priority, evidence)."""
        return self._normalize_analysis_items(raw, ["impact", "priority", "evidence"])

    def _normalize_pest_items(self, raw: Any) -> List[Dict[str, Any]]:
        """Normalize PEST entries (keeps impact, priority, weight)."""
        return self._normalize_analysis_items(raw, ["impact", "priority", "weight"])

    def _coerce_chart_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Return the dict holding ``labels``/``datasets``.

        Looks at ``data`` itself first, then at its ``data``, ``chartData``
        and ``payload`` entries; returns ``data`` unchanged when none match.
        """
        if not isinstance(data, dict):
            return {}
        if "labels" in data or "datasets" in data:
            return data
        for key in ("data", "chartData", "payload"):
            nested = data.get(key)
            if isinstance(nested, dict) and ("labels" in nested or "datasets" in nested):
                return nested
        return data

    def _collect_wordcloud_items(self, block: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Collect de-duplicated ``{word, weight, category}`` entries.

        Sources: list values under ``props.data`` / ``props.words`` /
        ``props.items``, then ``block["data"]`` (a list, or a dict with an
        ``items`` list). Entries may be dicts, sequences or plain strings.
        """
        props = block.get("props")
        if not isinstance(props, dict):
            props = {}
        candidates: List[Any] = [
            props[key] for key in ("data", "words", "items") if isinstance(props.get(key), list)
        ]
        data_field = block.get("data")
        if isinstance(data_field, list):
            candidates.append(data_field)
        elif isinstance(data_field, dict) and isinstance(data_field.get("items"), list):
            candidates.append(data_field.get("items"))

        items: List[Dict[str, Any]] = []
        seen: set[str] = set()

        def push(word: str, weight: Any, category: str) -> None:
            key = f"{word}::{category}"
            if key in seen:
                return
            seen.add(key)
            items.append({"word": word, "weight": weight, "category": category})

        for candidate in candidates:
            for entry in candidate or []:
                if isinstance(entry, dict):
                    word = entry.get("word") or entry.get("text") or entry.get("label")
                    if not word:
                        continue
                    # Explicit None check so a legitimate weight of 0 survives.
                    weight = entry.get("weight")
                    if weight is None:
                        weight = entry.get("value")
                    category = entry.get("category") or ""
                    push(str(word), weight, str(category))
                elif isinstance(entry, (list, tuple)) and entry:
                    word = entry[0]
                    weight = entry[1] if len(entry) > 1 else ""
                    category = entry[2] if len(entry) > 2 else ""
                    push(str(word), weight, str(category))
                elif isinstance(entry, str):
                    push(entry, "", "")
        return items

    def _escape_text(self, text: Any, for_table: bool = False, strip: bool = True) -> str:
        """Stringify ``text`` for output.

        With ``for_table`` pipes are escaped and line breaks become spaces.
        ``strip=False`` keeps surrounding whitespace (needed for inline
        runs, whose spacing separates them from their neighbours).
        """
        if text is None:
            return ""
        value = str(text)
        if for_table:
            value = value.replace("|", r"\|").replace("\n", " ").replace("\r", " ")
        return value.strip() if strip else value

    def _stringify_value(self, value: Any) -> str:
        """Convert a chart/word-cloud value to display text.

        Dicts prefer their ``y`` or ``value`` entry, otherwise a JSON dump;
        lists are joined with ", "; None becomes ''.
        """
        if value is None:
            return ""
        if isinstance(value, dict):
            # Prefer the usual numeric fields of a chart data point.
            for key in ("y", "value"):
                if key in value:
                    return str(value[key])
            try:
                return json.dumps(value, ensure_ascii=False)
            except (TypeError, ValueError):
                return str(value)
        if isinstance(value, list):
            return ", ".join(self._stringify_value(v) for v in value)
        return str(value)

    def _normalize_math(self, raw: Any) -> str:
        """Strip one layer of math delimiters (``$$``, ``\\[ \\]``, ``\\( \\)``, ``$``)."""
        if not isinstance(raw, str):
            return ""
        text = raw.strip()
        patterns = [
            ("$$", "$$"),
            ("\\[", "\\]"),
            ("\\(", "\\)"),
            ("$", "$"),
        ]
        for start, end in patterns:
            # The length check stops one delimiter being counted as both ends.
            if len(text) >= len(start) + len(end) and text.startswith(start) and text.endswith(end):
                return text[len(start): -len(end)].strip()
        return text

    def _format_delta(self, delta: Any, tone: Any) -> str:
        """Format a KPI delta, prefixed with an up/down arrow chosen by ``tone``."""
        if delta is None:
            return ""
        prefix = ""
        tone_val = str(tone or "").lower()
        if tone_val in ("up", "increase", "positive"):
            prefix = "▲ "
        elif tone_val in ("down", "decrease", "negative"):
            prefix = "▼ "
        return f"{prefix}{delta}"

    def _fallback_unknown(self, block: Dict[str, Any]) -> str:
        """Dump an unrecognized block as a fenced JSON code block and log it."""
        try:
            payload = json.dumps(block, ensure_ascii=False, indent=2)
        except (TypeError, ValueError):
            payload = str(block)
        logger.debug(f"未识别的区块类型,使用JSON兜底: {block}")
        return self._fence(payload, "json")
  622 +
  623 +__all__ = ["MarkdownRenderer"]
@@ -71,6 +71,7 @@ from .html_renderer import HTMLRenderer @@ -71,6 +71,7 @@ from .html_renderer import HTMLRenderer
71 from .pdf_layout_optimizer import PDFLayoutOptimizer, PDFLayoutConfig 71 from .pdf_layout_optimizer import PDFLayoutOptimizer, PDFLayoutConfig
72 from .chart_to_svg import create_chart_converter 72 from .chart_to_svg import create_chart_converter
73 from .math_to_svg import MathToSVG 73 from .math_to_svg import MathToSVG
  74 +from .markdown_renderer import MarkdownRenderer
74 try: 75 try:
75 from wordcloud import WordCloud 76 from wordcloud import WordCloud
76 WORDCLOUD_AVAILABLE = True 77 WORDCLOUD_AVAILABLE = True
@@ -102,6 +103,7 @@ class PDFRenderer: @@ -102,6 +103,7 @@ class PDFRenderer:
102 """ 103 """
103 self.config = config or {} 104 self.config = config or {}
104 self.html_renderer = HTMLRenderer(config) 105 self.html_renderer = HTMLRenderer(config)
  106 + self.markdown_renderer = MarkdownRenderer()
105 self.layout_optimizer = layout_optimizer or PDFLayoutOptimizer() 107 self.layout_optimizer = layout_optimizer or PDFLayoutOptimizer()
106 108
107 if not WEASYPRINT_AVAILABLE: 109 if not WEASYPRINT_AVAILABLE:
@@ -886,6 +888,36 @@ class PDFRenderer: @@ -886,6 +888,36 @@ class PDFRenderer:
886 888
887 return html 889 return html
888 890
  891 + def _build_markdown_filename(self, document_ir: Dict[str, Any]) -> str:
  892 + """根据元数据生成Markdown文件名"""
  893 + metadata = document_ir.get("metadata") or {}
  894 + title = metadata.get("title") or metadata.get("query") or metadata.get("reportId") or "report"
  895 + safe = "".join(ch for ch in str(title) if ch.isalnum() or ch in (" ", "-", "_")).strip()
  896 + safe = safe.replace(" ", "_")[:80] or "report"
  897 + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
  898 + return f"{safe}_{timestamp}.md"
  899 +
  900 + def _export_markdown(self, document_ir: Dict[str, Any]) -> None:
  901 + """
  902 + 使用检查/修复后的IR生成Markdown版本。
  903 +
  904 + - 图表/词云等交互组件降级为原始表格数据;
  905 + - 遇到异常时仅记录警告,不阻断PDF/HTML流程。
  906 + """
  907 + try:
  908 + markdown_content = self.markdown_renderer.render(document_ir)
  909 + if not markdown_content.strip():
  910 + logger.debug("Markdown渲染结果为空,跳过落盘")
  911 + return
  912 +
  913 + output_dir = Path("final_reports/markdown")
  914 + output_dir.mkdir(parents=True, exist_ok=True)
  915 + file_path = output_dir / self._build_markdown_filename(document_ir)
  916 + file_path.write_text(markdown_content, encoding="utf-8")
  917 + logger.info(f"已静默生成Markdown报告: {file_path}")
  918 + except Exception as exc:
  919 + logger.warning(f"生成Markdown报告失败(已忽略,不影响PDF/HTML): {exc}")
  920 +
889 def _get_pdf_html( 921 def _get_pdf_html(
890 self, 922 self,
891 document_ir: Dict[str, Any], 923 document_ir: Dict[str, Any],
@@ -943,6 +975,9 @@ class PDFRenderer: @@ -943,6 +975,9 @@ class PDFRenderer:
943 logger.info("开始转换数学公式为SVG矢量图形...") 975 logger.info("开始转换数学公式为SVG矢量图形...")
944 math_svg_map = self._convert_math_to_svg(preprocessed_ir) 976 math_svg_map = self._convert_math_to_svg(preprocessed_ir)
945 977
  978 + # 在渲染HTML前静默导出Markdown版本
  979 + self._export_markdown(preprocessed_ir)
  980 +
946 # 使用HTML渲染器生成基础HTML(使用预处理后的IR,以便复用mathId等标记) 981 # 使用HTML渲染器生成基础HTML(使用预处理后的IR,以便复用mathId等标记)
947 html = self.html_renderer.render(preprocessed_ir) 982 html = self.html_renderer.render(preprocessed_ir)
948 983