马一丁

Added support for rendering word clouds as images when generating PDFs

@@ -9,6 +9,7 @@ import base64 @@ -9,6 +9,7 @@ import base64
9 import copy 9 import copy
10 import os 10 import os
11 import sys 11 import sys
  12 +import io
12 from pathlib import Path 13 from pathlib import Path
13 from typing import Any, Dict 14 from typing import Any, Dict
14 from datetime import datetime 15 from datetime import datetime
@@ -69,6 +70,12 @@ from .html_renderer import HTMLRenderer @@ -69,6 +70,12 @@ from .html_renderer import HTMLRenderer
69 from .pdf_layout_optimizer import PDFLayoutOptimizer, PDFLayoutConfig 70 from .pdf_layout_optimizer import PDFLayoutOptimizer, PDFLayoutConfig
70 from .chart_to_svg import create_chart_converter 71 from .chart_to_svg import create_chart_converter
71 from .math_to_svg import MathToSVG 72 from .math_to_svg import MathToSVG
# wordcloud is an optional dependency. When it is missing, WORDCLOUD_AVAILABLE
# is False and WordCloud is bound to None so the name always exists at module
# level; callers must check the flag before using the class.
try:
    from wordcloud import WordCloud
    WORDCLOUD_AVAILABLE = True
except ImportError:
    WordCloud = None
    WORDCLOUD_AVAILABLE = False
72 79
73 80
74 class PDFRenderer: 81 class PDFRenderer:
@@ -272,6 +279,26 @@ class PDFRenderer: @@ -272,6 +279,26 @@ class PDFRenderer:
272 logger.info(f"成功转换 {len(svg_map)} 个图表为SVG") 279 logger.info(f"成功转换 {len(svg_map)} 个图表为SVG")
273 return svg_map 280 return svg_map
274 281
  282 + def _convert_wordclouds_to_images(self, document_ir: Dict[str, Any]) -> Dict[str, str]:
  283 + """
  284 + 将document_ir中的词云widget转换为PNG并返回data URI映射
  285 + """
  286 + img_map: Dict[str, str] = {}
  287 +
  288 + if not WORDCLOUD_AVAILABLE:
  289 + logger.debug("wordcloud库未安装,词云将使用表格兜底")
  290 + return img_map
  291 +
  292 + # 遍历所有章节
  293 + chapters = document_ir.get('chapters', [])
  294 + for chapter in chapters:
  295 + blocks = chapter.get('blocks', [])
  296 + self._extract_wordcloud_widgets(blocks, img_map)
  297 +
  298 + if img_map:
  299 + logger.info(f"成功转换 {len(img_map)} 个词云为图片")
  300 + return img_map
  301 +
275 def _extract_and_convert_widgets( 302 def _extract_and_convert_widgets(
276 self, 303 self,
277 blocks: list, 304 blocks: list,
@@ -334,6 +361,109 @@ class PDFRenderer: @@ -334,6 +361,109 @@ class PDFRenderer:
334 if isinstance(cell_blocks, list): 361 if isinstance(cell_blocks, list):
335 self._extract_and_convert_widgets(cell_blocks, svg_map) 362 self._extract_and_convert_widgets(cell_blocks, svg_map)
336 363
  364 + def _extract_wordcloud_widgets(
  365 + self,
  366 + blocks: list,
  367 + img_map: Dict[str, str]
  368 + ) -> None:
  369 + """
  370 + 递归遍历blocks,找到词云widget并生成图片
  371 + """
  372 + for block in blocks:
  373 + if not isinstance(block, dict):
  374 + continue
  375 +
  376 + block_type = block.get('type')
  377 + if block_type == 'widget':
  378 + widget_id = block.get('widgetId')
  379 + widget_type = block.get('widgetType', '')
  380 +
  381 + if widget_id and isinstance(widget_type, str) and 'wordcloud' in widget_type.lower():
  382 + try:
  383 + data_uri = self._generate_wordcloud_image(block)
  384 + if data_uri:
  385 + img_map[widget_id] = data_uri
  386 + logger.debug(f"词云 {widget_id} 转换为图片成功")
  387 + except Exception as exc:
  388 + logger.warning(f"生成词云图片失败 {widget_id}: {exc}")
  389 +
  390 + nested_blocks = block.get('blocks')
  391 + if isinstance(nested_blocks, list):
  392 + self._extract_wordcloud_widgets(nested_blocks, img_map)
  393 +
  394 + if block_type == 'list':
  395 + items = block.get('items', [])
  396 + for item in items:
  397 + if isinstance(item, list):
  398 + self._extract_wordcloud_widgets(item, img_map)
  399 +
  400 + if block_type == 'table':
  401 + rows = block.get('rows', [])
  402 + for row in rows:
  403 + cells = row.get('cells', [])
  404 + for cell in cells:
  405 + cell_blocks = cell.get('blocks', [])
  406 + if isinstance(cell_blocks, list):
  407 + self._extract_wordcloud_widgets(cell_blocks, img_map)
  408 +
  409 + def _normalize_wordcloud_items(self, block: Dict[str, Any]) -> list:
  410 + """
  411 + 从widget block中提取词云数据
  412 + """
  413 + props = block.get('props') or {}
  414 + raw_items = props.get('data')
  415 + if not isinstance(raw_items, list):
  416 + return []
  417 + normalized = []
  418 + for item in raw_items:
  419 + if not isinstance(item, dict):
  420 + continue
  421 + word = item.get('word') or item.get('text') or item.get('label')
  422 + if not word:
  423 + continue
  424 + weight = item.get('weight')
  425 + try:
  426 + weight_val = float(weight)
  427 + if weight_val <= 0:
  428 + weight_val = 1.0
  429 + except (TypeError, ValueError):
  430 + weight_val = 1.0
  431 + category = (item.get('category') or '').lower()
  432 + normalized.append({'word': str(word), 'weight': weight_val, 'category': category})
  433 + return normalized
  434 +
  435 + def _generate_wordcloud_image(self, block: Dict[str, Any]) -> str | None:
  436 + """
  437 + 生成词云PNG并返回data URI
  438 + """
  439 + items = self._normalize_wordcloud_items(block)
  440 + if not items:
  441 + return None
  442 +
  443 + # 使用频次形式馈入wordcloud库
  444 + frequencies = {}
  445 + for item in items:
  446 + weight = item['weight']
  447 + # 兼容权重为0-1的小数,放大以体现差异
  448 + freq = weight * 100 if 0 < weight <= 1.5 else weight
  449 + frequencies[item['word']] = max(1, freq)
  450 +
  451 + font_path = str(self._get_font_path())
  452 + wc = WordCloud(
  453 + width=900,
  454 + height=520,
  455 + background_color="white",
  456 + font_path=font_path,
  457 + prefer_horizontal=0.9,
  458 + random_state=42,
  459 + )
  460 + wc.generate_from_frequencies(frequencies)
  461 +
  462 + buffer = io.BytesIO()
  463 + wc.to_image().save(buffer, format='PNG')
  464 + encoded = base64.b64encode(buffer.getvalue()).decode('ascii')
  465 + return f"data:image/png;base64,{encoded}"
  466 +
337 def _convert_math_to_svg(self, document_ir: Dict[str, Any]) -> Dict[str, str]: 467 def _convert_math_to_svg(self, document_ir: Dict[str, Any]) -> Dict[str, str]:
338 """ 468 """
339 将document_ir中的所有数学公式转换为SVG 469 将document_ir中的所有数学公式转换为SVG
@@ -486,6 +616,46 @@ class PDFRenderer: @@ -486,6 +616,46 @@ class PDFRenderer:
486 616
487 return html 617 return html
488 618
  619 + def _inject_wordcloud_images(self, html: str, img_map: Dict[str, str]) -> str:
  620 + """
  621 + 将词云PNG data URI注入HTML,替换对应canvas
  622 + """
  623 + if not img_map:
  624 + return html
  625 +
  626 + import re
  627 +
  628 + for widget_id, data_uri in img_map.items():
  629 + img_html = (
  630 + f'<div class="chart-svg-container wordcloud-img">'
  631 + f'<img src="{data_uri}" alt="词云" />'
  632 + f'</div>'
  633 + )
  634 +
  635 + config_pattern = rf'<script[^>]+id="([^"]+)"[^>]*>\s*\{{[^}}]*"widgetId"\s*:\s*"{re.escape(widget_id)}"[^}}]*\}}'
  636 + match = re.search(config_pattern, html, re.DOTALL)
  637 + if not match:
  638 + logger.debug(f"未找到词云 {widget_id} 的配置脚本,跳过注入")
  639 + continue
  640 +
  641 + config_id = match.group(1)
  642 + canvas_pattern = rf'<canvas[^>]+data-config-id="{re.escape(config_id)}"[^>]*></canvas>'
  643 +
  644 + html = re.sub(canvas_pattern, lambda m: img_html, html)
  645 + logger.debug(f"已替换词云 {widget_id} 的canvas为PNG图片")
  646 +
  647 + fallback_pattern = rf'<div class="chart-fallback"([^>]*data-widget-id="{re.escape(widget_id)}"[^>]*)>'
  648 +
  649 + def _hide_fallback(m: re.Match) -> str:
  650 + tag = m.group(0)
  651 + if 'svg-hidden' in tag:
  652 + return tag
  653 + return tag.replace('chart-fallback"', 'chart-fallback svg-hidden"', 1)
  654 +
  655 + html = re.sub(fallback_pattern, _hide_fallback, html, count=1)
  656 +
  657 + return html
  658 +
489 def _inject_math_svg_into_html(self, html: str, svg_map: Dict[str, str]) -> str: 659 def _inject_math_svg_into_html(self, html: str, svg_map: Dict[str, str]) -> str:
490 """ 660 """
491 将数学公式SVG内容注入到HTML中 661 将数学公式SVG内容注入到HTML中
@@ -580,6 +750,10 @@ class PDFRenderer: @@ -580,6 +750,10 @@ class PDFRenderer:
580 logger.info("开始转换图表为SVG矢量图形...") 750 logger.info("开始转换图表为SVG矢量图形...")
581 svg_map = self._convert_charts_to_svg(preprocessed_ir) 751 svg_map = self._convert_charts_to_svg(preprocessed_ir)
582 752
  753 + # 转换词云为PNG
  754 + logger.info("开始转换词云为图片...")
  755 + wordcloud_map = self._convert_wordclouds_to_images(preprocessed_ir)
  756 +
583 # 转换数学公式为SVG 757 # 转换数学公式为SVG
584 logger.info("开始转换数学公式为SVG矢量图形...") 758 logger.info("开始转换数学公式为SVG矢量图形...")
585 math_svg_map = self._convert_math_to_svg(preprocessed_ir) 759 math_svg_map = self._convert_math_to_svg(preprocessed_ir)
@@ -594,6 +768,10 @@ class PDFRenderer: @@ -594,6 +768,10 @@ class PDFRenderer:
594 html = self._inject_svg_into_html(html, svg_map) 768 html = self._inject_svg_into_html(html, svg_map)
595 logger.info(f"已注入 {len(svg_map)} 个SVG图表") 769 logger.info(f"已注入 {len(svg_map)} 个SVG图表")
596 770
  771 + if wordcloud_map:
  772 + html = self._inject_wordcloud_images(html, wordcloud_map)
  773 + logger.info(f"已注入 {len(wordcloud_map)} 个词云图片")
  774 +
597 # 注入数学公式SVG 775 # 注入数学公式SVG
598 if math_svg_map: 776 if math_svg_map:
599 html = self._inject_math_svg_into_html(html, math_svg_map) 777 html = self._inject_math_svg_into_html(html, math_svg_map)
@@ -652,6 +830,10 @@ body {{ @@ -652,6 +830,10 @@ body {{
652 max-width: 100%; 830 max-width: 100%;
653 height: auto; 831 height: auto;
654 }} 832 }}
  833 +.chart-svg-container img {{
  834 + max-width: 100%;
  835 + height: auto;
  836 +}}
655 837
656 /* 数学公式SVG容器样式 */ 838 /* 数学公式SVG容器样式 */
657 .math-svg-container {{ 839 .math-svg-container {{