Added Support for Word Cloud Association When Generating PDFs
Showing
1 changed file
with
182 additions
and
0 deletions
| @@ -9,6 +9,7 @@ import base64 | @@ -9,6 +9,7 @@ import base64 | ||
| 9 | import copy | 9 | import copy |
| 10 | import os | 10 | import os |
| 11 | import sys | 11 | import sys |
| 12 | +import io | ||
| 12 | from pathlib import Path | 13 | from pathlib import Path |
| 13 | from typing import Any, Dict | 14 | from typing import Any, Dict |
| 14 | from datetime import datetime | 15 | from datetime import datetime |
| @@ -69,6 +70,12 @@ from .html_renderer import HTMLRenderer | @@ -69,6 +70,12 @@ from .html_renderer import HTMLRenderer | ||
| 69 | from .pdf_layout_optimizer import PDFLayoutOptimizer, PDFLayoutConfig | 70 | from .pdf_layout_optimizer import PDFLayoutOptimizer, PDFLayoutConfig |
| 70 | from .chart_to_svg import create_chart_converter | 71 | from .chart_to_svg import create_chart_converter |
| 71 | from .math_to_svg import MathToSVG | 72 | from .math_to_svg import MathToSVG |
# ``wordcloud`` is an optional dependency. When it is missing,
# WORDCLOUD_AVAILABLE is False and word-cloud widgets are not rendered to
# images.
try:
    from wordcloud import WordCloud
    WORDCLOUD_AVAILABLE = True
except ImportError:
    # Keep the name bound so a stray reference fails with a clear
    # "NoneType is not callable" rather than a NameError; callers are expected
    # to check WORDCLOUD_AVAILABLE first.
    WordCloud = None
    WORDCLOUD_AVAILABLE = False
| 72 | 79 | ||
| 73 | 80 | ||
| 74 | class PDFRenderer: | 81 | class PDFRenderer: |
| @@ -272,6 +279,26 @@ class PDFRenderer: | @@ -272,6 +279,26 @@ class PDFRenderer: | ||
| 272 | logger.info(f"成功转换 {len(svg_map)} 个图表为SVG") | 279 | logger.info(f"成功转换 {len(svg_map)} 个图表为SVG") |
| 273 | return svg_map | 280 | return svg_map |
| 274 | 281 | ||
def _convert_wordclouds_to_images(self, document_ir: Dict[str, Any]) -> Dict[str, str]:
    """
    Convert the word-cloud widgets found in ``document_ir`` to images.

    Args:
        document_ir: Document IR. The ``blocks`` list of every entry under
            ``chapters`` is scanned (recursively) for word-cloud widgets.

    Returns:
        Mapping of widget id to the image data URI produced for it. Empty
        when the ``wordcloud`` package is not installed or when no widget
        could be converted.
    """
    img_map: Dict[str, str] = {}

    if not WORDCLOUD_AVAILABLE:
        logger.debug("wordcloud库未安装,词云将使用表格兜底")
        return img_map

    # ``or []`` also covers an explicit ``"chapters": null`` in the IR.
    for chapter in document_ir.get('chapters') or []:
        if not isinstance(chapter, dict):
            continue
        blocks = chapter.get('blocks')
        # A chapter without a usable block list simply contributes nothing.
        if isinstance(blocks, list):
            self._extract_wordcloud_widgets(blocks, img_map)

    if img_map:
        logger.info(f"成功转换 {len(img_map)} 个词云为图片")
    return img_map
| 301 | + | ||
| 275 | def _extract_and_convert_widgets( | 302 | def _extract_and_convert_widgets( |
| 276 | self, | 303 | self, |
| 277 | blocks: list, | 304 | blocks: list, |
| @@ -334,6 +361,109 @@ class PDFRenderer: | @@ -334,6 +361,109 @@ class PDFRenderer: | ||
| 334 | if isinstance(cell_blocks, list): | 361 | if isinstance(cell_blocks, list): |
| 335 | self._extract_and_convert_widgets(cell_blocks, svg_map) | 362 | self._extract_and_convert_widgets(cell_blocks, svg_map) |
| 336 | 363 | ||
| 364 | + def _extract_wordcloud_widgets( | ||
| 365 | + self, | ||
| 366 | + blocks: list, | ||
| 367 | + img_map: Dict[str, str] | ||
| 368 | + ) -> None: | ||
| 369 | + """ | ||
| 370 | + 递归遍历blocks,找到词云widget并生成图片 | ||
| 371 | + """ | ||
| 372 | + for block in blocks: | ||
| 373 | + if not isinstance(block, dict): | ||
| 374 | + continue | ||
| 375 | + | ||
| 376 | + block_type = block.get('type') | ||
| 377 | + if block_type == 'widget': | ||
| 378 | + widget_id = block.get('widgetId') | ||
| 379 | + widget_type = block.get('widgetType', '') | ||
| 380 | + | ||
| 381 | + if widget_id and isinstance(widget_type, str) and 'wordcloud' in widget_type.lower(): | ||
| 382 | + try: | ||
| 383 | + data_uri = self._generate_wordcloud_image(block) | ||
| 384 | + if data_uri: | ||
| 385 | + img_map[widget_id] = data_uri | ||
| 386 | + logger.debug(f"词云 {widget_id} 转换为图片成功") | ||
| 387 | + except Exception as exc: | ||
| 388 | + logger.warning(f"生成词云图片失败 {widget_id}: {exc}") | ||
| 389 | + | ||
| 390 | + nested_blocks = block.get('blocks') | ||
| 391 | + if isinstance(nested_blocks, list): | ||
| 392 | + self._extract_wordcloud_widgets(nested_blocks, img_map) | ||
| 393 | + | ||
| 394 | + if block_type == 'list': | ||
| 395 | + items = block.get('items', []) | ||
| 396 | + for item in items: | ||
| 397 | + if isinstance(item, list): | ||
| 398 | + self._extract_wordcloud_widgets(item, img_map) | ||
| 399 | + | ||
| 400 | + if block_type == 'table': | ||
| 401 | + rows = block.get('rows', []) | ||
| 402 | + for row in rows: | ||
| 403 | + cells = row.get('cells', []) | ||
| 404 | + for cell in cells: | ||
| 405 | + cell_blocks = cell.get('blocks', []) | ||
| 406 | + if isinstance(cell_blocks, list): | ||
| 407 | + self._extract_wordcloud_widgets(cell_blocks, img_map) | ||
| 408 | + | ||
| 409 | + def _normalize_wordcloud_items(self, block: Dict[str, Any]) -> list: | ||
| 410 | + """ | ||
| 411 | + 从widget block中提取词云数据 | ||
| 412 | + """ | ||
| 413 | + props = block.get('props') or {} | ||
| 414 | + raw_items = props.get('data') | ||
| 415 | + if not isinstance(raw_items, list): | ||
| 416 | + return [] | ||
| 417 | + normalized = [] | ||
| 418 | + for item in raw_items: | ||
| 419 | + if not isinstance(item, dict): | ||
| 420 | + continue | ||
| 421 | + word = item.get('word') or item.get('text') or item.get('label') | ||
| 422 | + if not word: | ||
| 423 | + continue | ||
| 424 | + weight = item.get('weight') | ||
| 425 | + try: | ||
| 426 | + weight_val = float(weight) | ||
| 427 | + if weight_val <= 0: | ||
| 428 | + weight_val = 1.0 | ||
| 429 | + except (TypeError, ValueError): | ||
| 430 | + weight_val = 1.0 | ||
| 431 | + category = (item.get('category') or '').lower() | ||
| 432 | + normalized.append({'word': str(word), 'weight': weight_val, 'category': category}) | ||
| 433 | + return normalized | ||
| 434 | + | ||
# The return annotation is quoted so the ``|`` union is not evaluated at
# definition time, which would raise TypeError on Python < 3.10.
def _generate_wordcloud_image(self, block: Dict[str, Any]) -> "str | None":
    """
    Render a word-cloud widget block to a PNG and return it as a data URI.

    Args:
        block: Widget block; its entries are read through
            ``_normalize_wordcloud_items``.

    Returns:
        ``data:image/png;base64,...`` string, or ``None`` when the block has
        no usable entries.
    """
    items = self._normalize_wordcloud_items(block)
    if not items:
        return None

    # Weights may be given as small fractions (roughly 0-1) or as plain
    # counts. Decide the scaling ONCE for the whole data set: scaling each
    # item on its own turns e.g. weights 1, 2, 3 into 100, 2, 3 and inverts
    # the ranking.
    largest = max(item['weight'] for item in items)
    scale = 100 if largest <= 1.5 else 1

    # Feed the library a word -> frequency mapping, clamped to at least 1.
    frequencies = {}
    for item in items:
        frequencies[item['word']] = max(1, item['weight'] * scale)

    font_path = str(self._get_font_path())
    wc = WordCloud(
        width=900,
        height=520,
        background_color="white",
        font_path=font_path,
        prefer_horizontal=0.9,
        random_state=42,  # fixed seed: identical input gives identical layout
    )
    wc.generate_from_frequencies(frequencies)

    buffer = io.BytesIO()
    wc.to_image().save(buffer, format='PNG')
    encoded = base64.b64encode(buffer.getvalue()).decode('ascii')
    return f"data:image/png;base64,{encoded}"
| 466 | + | ||
| 337 | def _convert_math_to_svg(self, document_ir: Dict[str, Any]) -> Dict[str, str]: | 467 | def _convert_math_to_svg(self, document_ir: Dict[str, Any]) -> Dict[str, str]: |
| 338 | """ | 468 | """ |
| 339 | 将document_ir中的所有数学公式转换为SVG | 469 | 将document_ir中的所有数学公式转换为SVG |
| @@ -486,6 +616,46 @@ class PDFRenderer: | @@ -486,6 +616,46 @@ class PDFRenderer: | ||
| 486 | 616 | ||
| 487 | return html | 617 | return html |
| 488 | 618 | ||
| 619 | + def _inject_wordcloud_images(self, html: str, img_map: Dict[str, str]) -> str: | ||
| 620 | + """ | ||
| 621 | + 将词云PNG data URI注入HTML,替换对应canvas | ||
| 622 | + """ | ||
| 623 | + if not img_map: | ||
| 624 | + return html | ||
| 625 | + | ||
| 626 | + import re | ||
| 627 | + | ||
| 628 | + for widget_id, data_uri in img_map.items(): | ||
| 629 | + img_html = ( | ||
| 630 | + f'<div class="chart-svg-container wordcloud-img">' | ||
| 631 | + f'<img src="{data_uri}" alt="词云" />' | ||
| 632 | + f'</div>' | ||
| 633 | + ) | ||
| 634 | + | ||
| 635 | + config_pattern = rf'<script[^>]+id="([^"]+)"[^>]*>\s*\{{[^}}]*"widgetId"\s*:\s*"{re.escape(widget_id)}"[^}}]*\}}' | ||
| 636 | + match = re.search(config_pattern, html, re.DOTALL) | ||
| 637 | + if not match: | ||
| 638 | + logger.debug(f"未找到词云 {widget_id} 的配置脚本,跳过注入") | ||
| 639 | + continue | ||
| 640 | + | ||
| 641 | + config_id = match.group(1) | ||
| 642 | + canvas_pattern = rf'<canvas[^>]+data-config-id="{re.escape(config_id)}"[^>]*></canvas>' | ||
| 643 | + | ||
| 644 | + html = re.sub(canvas_pattern, lambda m: img_html, html) | ||
| 645 | + logger.debug(f"已替换词云 {widget_id} 的canvas为PNG图片") | ||
| 646 | + | ||
| 647 | + fallback_pattern = rf'<div class="chart-fallback"([^>]*data-widget-id="{re.escape(widget_id)}"[^>]*)>' | ||
| 648 | + | ||
| 649 | + def _hide_fallback(m: re.Match) -> str: | ||
| 650 | + tag = m.group(0) | ||
| 651 | + if 'svg-hidden' in tag: | ||
| 652 | + return tag | ||
| 653 | + return tag.replace('chart-fallback"', 'chart-fallback svg-hidden"', 1) | ||
| 654 | + | ||
| 655 | + html = re.sub(fallback_pattern, _hide_fallback, html, count=1) | ||
| 656 | + | ||
| 657 | + return html | ||
| 658 | + | ||
| 489 | def _inject_math_svg_into_html(self, html: str, svg_map: Dict[str, str]) -> str: | 659 | def _inject_math_svg_into_html(self, html: str, svg_map: Dict[str, str]) -> str: |
| 490 | """ | 660 | """ |
| 491 | 将数学公式SVG内容注入到HTML中 | 661 | 将数学公式SVG内容注入到HTML中 |
| @@ -580,6 +750,10 @@ class PDFRenderer: | @@ -580,6 +750,10 @@ class PDFRenderer: | ||
| 580 | logger.info("开始转换图表为SVG矢量图形...") | 750 | logger.info("开始转换图表为SVG矢量图形...") |
| 581 | svg_map = self._convert_charts_to_svg(preprocessed_ir) | 751 | svg_map = self._convert_charts_to_svg(preprocessed_ir) |
| 582 | 752 | ||
| 753 | + # 转换词云为PNG | ||
| 754 | + logger.info("开始转换词云为图片...") | ||
| 755 | + wordcloud_map = self._convert_wordclouds_to_images(preprocessed_ir) | ||
| 756 | + | ||
| 583 | # 转换数学公式为SVG | 757 | # 转换数学公式为SVG |
| 584 | logger.info("开始转换数学公式为SVG矢量图形...") | 758 | logger.info("开始转换数学公式为SVG矢量图形...") |
| 585 | math_svg_map = self._convert_math_to_svg(preprocessed_ir) | 759 | math_svg_map = self._convert_math_to_svg(preprocessed_ir) |
| @@ -594,6 +768,10 @@ class PDFRenderer: | @@ -594,6 +768,10 @@ class PDFRenderer: | ||
| 594 | html = self._inject_svg_into_html(html, svg_map) | 768 | html = self._inject_svg_into_html(html, svg_map) |
| 595 | logger.info(f"已注入 {len(svg_map)} 个SVG图表") | 769 | logger.info(f"已注入 {len(svg_map)} 个SVG图表") |
| 596 | 770 | ||
| 771 | + if wordcloud_map: | ||
| 772 | + html = self._inject_wordcloud_images(html, wordcloud_map) | ||
| 773 | + logger.info(f"已注入 {len(wordcloud_map)} 个词云图片") | ||
| 774 | + | ||
| 597 | # 注入数学公式SVG | 775 | # 注入数学公式SVG |
| 598 | if math_svg_map: | 776 | if math_svg_map: |
| 599 | html = self._inject_math_svg_into_html(html, math_svg_map) | 777 | html = self._inject_math_svg_into_html(html, math_svg_map) |
| @@ -652,6 +830,10 @@ body {{ | @@ -652,6 +830,10 @@ body {{ | ||
| 652 | max-width: 100%; | 830 | max-width: 100%; |
| 653 | height: auto; | 831 | height: auto; |
| 654 | }} | 832 | }} |
| 833 | +.chart-svg-container img {{ | ||
| 834 | + max-width: 100%; | ||
| 835 | + height: auto; | ||
| 836 | +}} | ||
| 655 | 837 | ||
| 656 | /* 数学公式SVG容器样式 */ | 838 | /* 数学公式SVG容器样式 */ |
| 657 | .math-svg-container {{ | 839 | .math-svg-container {{ |
-
Please register or login to post a comment