Add Support for Rendering Various Inline and Block-level Mathematical Formulas
Showing 3 changed files with 185 additions and 11 deletions
| @@ -1262,7 +1262,8 @@ class HTMLRenderer: | @@ -1262,7 +1262,8 @@ class HTMLRenderer: | ||
| 1262 | 1262 | ||
| 1263 | def _render_math(self, block: Dict[str, Any]) -> str: | 1263 | def _render_math(self, block: Dict[str, Any]) -> str: |
| 1264 | """渲染数学公式,占位符交给外部MathJax或后处理""" | 1264 | """渲染数学公式,占位符交给外部MathJax或后处理""" |
| 1265 | - latex = self._escape_html(block.get("latex", "")) | 1265 | + latex_raw = block.get("latex", "") |
| 1266 | + latex = self._escape_html(self._normalize_latex_string(latex_raw)) | ||
| 1266 | math_id = self._escape_attr(block.get("mathId", "")) if block.get("mathId") else "" | 1267 | math_id = self._escape_attr(block.get("mathId", "")) if block.get("mathId") else "" |
| 1267 | id_attr = f' data-math-id="{math_id}"' if math_id else "" | 1268 | id_attr = f' data-math-id="{math_id}"' if math_id else "" |
| 1268 | return f'<div class="math-block"{id_attr}>$$ {latex} $$</div>' | 1269 | return f'<div class="math-block"{id_attr}>$$ {latex} $$</div>' |
| @@ -1989,6 +1990,66 @@ class HTMLRenderer: | @@ -1989,6 +1990,66 @@ class HTMLRenderer: | ||
| 1989 | return text_value, marks | 1990 | return text_value, marks |
| 1990 | 1991 | ||
| 1991 | @staticmethod | 1992 | @staticmethod |
def _normalize_latex_string(raw: Any) -> str:
    """Strip one layer of outer math delimiters from a LaTeX string.

    Recognises ``$...$``, ``$$...$$``, ``\\(...\\)`` and ``\\[...\\]``. The
    delimiters are removed only when they wrap the whole string as a single
    formula. If the same delimiter occurs unescaped inside the string (for
    example ``$a$ and $b$``), the input holds several formulas and is
    returned as-is (whitespace-trimmed) instead of being cut into an
    unbalanced fragment such as ``a$ and $b``.

    Args:
        raw: Candidate LaTeX source; any non-string value is treated as empty.

    Returns:
        The trimmed formula body, or ``""`` when ``raw`` is not a string.
    """
    if not isinstance(raw, str):
        return ""
    latex = raw.strip()
    # Every body is a run of ordinary characters or backslash escapes, so an
    # escaped delimiter (\$) counts as content while a bare one makes the
    # wrapper fail to match. "$$" has to be tried before "$".
    wrappers = (
        r'\$\$((?:[^$\\]|\\.)*)\$\$',
        r'\$((?:[^$\\]|\\.)*)\$',
        r'\\\[((?:[^\\]|\\[^\[\]])*)\\\]',
        r'\\\(((?:[^\\]|\\[^()])*)\\\)',
    )
    for wrapper in wrappers:
        m = re.fullmatch(wrapper, latex, re.DOTALL)
        if m:
            return m.group(1).strip()
    return latex
| 2010 | + | ||
def _render_text_with_inline_math(self, text: Any, math_id: str | list | None = None) -> str | None:
    """Render math delimited inside plain text as math-inline/math-block HTML.

    Supports ``$...$``, ``$$...$$``, ``\\(...\\)`` and ``\\[...\\]``. Text
    outside the formulas is HTML-escaped and kept in place.

    Args:
        text: Plain text that may contain delimited formulas.
        math_id: Either a list of ids, consumed in order by the rendered
            formulas, or a single id, applied to the first rendered formula
            only so that distinct formulas never share a ``data-math-id``.
            Formulas without a supplied id get ``auto-math-<n>``.

    Returns:
        The HTML fragment, or ``None`` when ``text`` is not a non-empty
        string or contains no non-empty formula.
    """
    if not isinstance(text, str) or not text:
        return None

    # NOTE: keep this alternation in step with the scan the PDF renderer
    # uses to build the id list (PDFRenderer._find_all_math_in_text);
    # ids are matched to formulas purely by position.
    pattern = re.compile(r'(\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\])', re.S)
    id_iter = iter(math_id) if isinstance(math_id, list) else None
    single_id = math_id if (id_iter is None and math_id) else None
    parts: list[str] = []
    cursor = 0
    idx = 0
    for m in pattern.finditer(text):
        raw = next(g for g in m.groups()[1:] if g is not None)
        if not raw.strip():
            # The PDF pre-pass assigns no id to whitespace-only matches;
            # consuming one here would shift every later id onto the wrong
            # formula. Leave the span to be emitted as ordinary text.
            continue
        start, end = m.span()
        if start > cursor:
            parts.append(self._escape_html(text[cursor:start]))
        latex = self._normalize_latex_string(raw)
        idx += 1
        fallback = f"auto-math-{idx}"
        if id_iter is not None:
            mid = next(id_iter, fallback)
        elif single_id is not None and idx == 1:
            mid = single_id
        else:
            mid = fallback
        id_attr = f' data-math-id="{self._escape_attr(mid)}"'
        if m.group(1).startswith(('$$', '\\[')):
            parts.append(f'<div class="math-block"{id_attr}>$$ {self._escape_html(latex)} $$</div>')
        else:
            parts.append(f'<span class="math-inline"{id_attr}>\\( {self._escape_html(latex)} \\)</span>')
        cursor = end

    if idx == 0:
        return None
    if cursor < len(text):
        parts.append(self._escape_html(text[cursor:]))
    return "".join(parts)
| 2051 | + | ||
| 2052 | + @staticmethod | ||
| 1992 | def _coerce_inline_payload(payload: Dict[str, Any]) -> Dict[str, Any] | None: | 2053 | def _coerce_inline_payload(payload: Dict[str, Any]) -> Dict[str, Any] | None: |
| 1993 | """尽力将字符串里的内联节点恢复为dict,修复渲染遗漏""" | 2054 | """尽力将字符串里的内联节点恢复为dict,修复渲染遗漏""" |
| 1994 | if not isinstance(payload, dict): | 2055 | if not isinstance(payload, dict): |
| @@ -2013,12 +2074,19 @@ class HTMLRenderer: | @@ -2013,12 +2074,19 @@ class HTMLRenderer: | ||
| 2013 | text_value, marks = self._normalize_inline_payload(run) | 2074 | text_value, marks = self._normalize_inline_payload(run) |
| 2014 | math_mark = next((mark for mark in marks if mark.get("type") == "math"), None) | 2075 | math_mark = next((mark for mark in marks if mark.get("type") == "math"), None) |
| 2015 | if math_mark: | 2076 | if math_mark: |
| 2016 | - latex = math_mark.get("value") | 2077 | + latex = self._normalize_latex_string(math_mark.get("value")) |
| 2017 | if not isinstance(latex, str) or not latex.strip(): | 2078 | if not isinstance(latex, str) or not latex.strip(): |
| 2018 | - latex = text_value | 2079 | + latex = self._normalize_latex_string(text_value) |
| 2019 | math_id = self._escape_attr(run.get("mathId", "")) if run.get("mathId") else "" | 2080 | math_id = self._escape_attr(run.get("mathId", "")) if run.get("mathId") else "" |
| 2020 | id_attr = f' data-math-id="{math_id}"' if math_id else "" | 2081 | id_attr = f' data-math-id="{math_id}"' if math_id else "" |
| 2021 | return f'<span class="math-inline"{id_attr}>\\( {self._escape_html(latex)} \\)</span>' | 2082 | return f'<span class="math-inline"{id_attr}>\\( {self._escape_html(latex)} \\)</span>' |
| 2083 | + | ||
| 2084 | + # 尝试从纯文本中提取数学公式(即便没有math mark) | ||
| 2085 | + math_id_hint = run.get("mathIds") or run.get("mathId") | ||
| 2086 | + mathified = self._render_text_with_inline_math(text_value, math_id_hint) | ||
| 2087 | + if mathified is not None: | ||
| 2088 | + return mathified | ||
| 2089 | + | ||
| 2022 | text = self._escape_html(text_value) | 2090 | text = self._escape_html(text_value) |
| 2023 | styles: List[str] = [] | 2091 | styles: List[str] = [] |
| 2024 | prefix: List[str] = [] | 2092 | prefix: List[str] = [] |
| @@ -4,6 +4,7 @@ LaTeX 数学公式转 SVG 渲染器 | @@ -4,6 +4,7 @@ LaTeX 数学公式转 SVG 渲染器 | ||
| 4 | """ | 4 | """ |
| 5 | 5 | ||
| 6 | import io | 6 | import io |
| 7 | +import re | ||
| 7 | from typing import Optional | 8 | from typing import Optional |
| 8 | import matplotlib | 9 | import matplotlib |
| 9 | import matplotlib.pyplot as plt | 10 | import matplotlib.pyplot as plt |
| @@ -40,8 +41,22 @@ class MathToSVG: | @@ -40,8 +41,22 @@ class MathToSVG: | ||
| 40 | SVG 字符串,如果转换失败则返回 None | 41 | SVG 字符串,如果转换失败则返回 None |
| 41 | """ | 42 | """ |
| 42 | try: | 43 | try: |
| 43 | - # 清理 LaTeX 字符串 | ||
| 44 | - latex = latex.strip() | 44 | + # 清理 LaTeX 字符串,去除外层定界符,兼容 $...$ / $$...$$ / \\( \\) / \\[ \\] |
| 45 | + latex = (latex or "").strip() | ||
| 46 | + patterns = [ | ||
| 47 | + r'^\$\$(.*)\$\$$', | ||
| 48 | + r'^\$(.*)\$$', | ||
| 49 | + r'^\\\[(.*)\\\]$', | ||
| 50 | + r'^\\\((.*)\\\)$', | ||
| 51 | + ] | ||
| 52 | + for pat in patterns: | ||
| 53 | + m = re.match(pat, latex, re.DOTALL) | ||
| 54 | + if m: | ||
| 55 | + latex = m.group(1).strip() | ||
| 56 | + break | ||
| 57 | + # 清理控制字符并做常见兼容 | ||
| 58 | + latex = re.sub(r'[\x00-\x1f\x7f]', '', latex) | ||
| 59 | + latex = latex.replace(r'\\tfrac', r'\\frac').replace(r'\\dfrac', r'\\frac') | ||
| 45 | if not latex: | 60 | if not latex: |
| 46 | logger.warning("空的 LaTeX 公式") | 61 | logger.warning("空的 LaTeX 公式") |
| 47 | return None | 62 | return None |
| @@ -10,6 +10,7 @@ import copy | @@ -10,6 +10,7 @@ import copy | ||
| 10 | import os | 10 | import os |
| 11 | import sys | 11 | import sys |
| 12 | import io | 12 | import io |
| 13 | +import re | ||
| 13 | from pathlib import Path | 14 | from pathlib import Path |
| 14 | from typing import Any, Dict | 15 | from typing import Any, Dict |
| 15 | from datetime import datetime | 16 | from datetime import datetime |
| @@ -544,23 +545,62 @@ class PDFRenderer: | @@ -544,23 +545,62 @@ class PDFRenderer: | ||
| 544 | continue | 545 | continue |
| 545 | marks = run.get('marks') or [] | 546 | marks = run.get('marks') or [] |
| 546 | math_mark = next((m for m in marks if m.get('type') == 'math'), None) | 547 | math_mark = next((m for m in marks if m.get('type') == 'math'), None) |
| 547 | - if not math_mark: | 548 | + |
| 549 | + if math_mark: | ||
| 550 | + # 仅单个math mark | ||
| 551 | + raw = math_mark.get('value') or run.get('text') or '' | ||
| 552 | + latex = self._normalize_latex(raw) | ||
| 553 | + is_display = bool(re.match(r'^\s*(\$\$|\\\[)', str(raw))) | ||
| 554 | + if not latex: | ||
| 555 | + continue | ||
| 556 | + block_counter[0] += 1 | ||
| 557 | + math_id = run.get('mathId') or f"math-inline-{block_counter[0]}" | ||
| 558 | + run['mathId'] = math_id | ||
| 559 | + try: | ||
| 560 | + svg_content = ( | ||
| 561 | + self.math_converter.convert_display_to_svg(latex) | ||
| 562 | + if is_display else | ||
| 563 | + self.math_converter.convert_inline_to_svg(latex) | ||
| 564 | + ) | ||
| 565 | + if svg_content: | ||
| 566 | + svg_map[math_id] = svg_content | ||
| 567 | + logger.debug(f"公式 {math_id} 转换为SVG成功") | ||
| 568 | + else: | ||
| 569 | + logger.warning(f"公式 {math_id} 转换为SVG失败: {latex[:50]}...") | ||
| 570 | + except Exception as exc: | ||
| 571 | + logger.error(f"转换内联公式 {latex[:50]}... 时出错: {exc}") | ||
| 548 | continue | 572 | continue |
| 549 | - latex = (math_mark.get('value') or run.get('text') or '').strip() | 573 | + |
| 574 | + # 无math mark,尝试解析文本中的多个公式 | ||
| 575 | + text_val = run.get('text') | ||
| 576 | + if not isinstance(text_val, str): | ||
| 577 | + continue | ||
| 578 | + segments = self._find_all_math_in_text(text_val) | ||
| 579 | + if not segments: | ||
| 580 | + continue | ||
| 581 | + ids_for_html: list[str] = [] | ||
| 582 | + for idx, (latex, is_display) in enumerate(segments, start=1): | ||
| 550 | if not latex: | 583 | if not latex: |
| 551 | continue | 584 | continue |
| 552 | block_counter[0] += 1 | 585 | block_counter[0] += 1 |
| 553 | - math_id = f"math-inline-{block_counter[0]}" | 586 | + math_id = f"auto-math-{block_counter[0]}" |
| 587 | + ids_for_html.append(math_id) | ||
| 554 | try: | 588 | try: |
| 555 | - svg_content = self.math_converter.convert_inline_to_svg(latex) | 589 | + svg_content = ( |
| 590 | + self.math_converter.convert_display_to_svg(latex) | ||
| 591 | + if is_display else | ||
| 592 | + self.math_converter.convert_inline_to_svg(latex) | ||
| 593 | + ) | ||
| 556 | if svg_content: | 594 | if svg_content: |
| 557 | svg_map[math_id] = svg_content | 595 | svg_map[math_id] = svg_content |
| 558 | - run['mathId'] = math_id | ||
| 559 | logger.debug(f"公式 {math_id} 转换为SVG成功") | 596 | logger.debug(f"公式 {math_id} 转换为SVG成功") |
| 560 | else: | 597 | else: |
| 561 | logger.warning(f"公式 {math_id} 转换为SVG失败: {latex[:50]}...") | 598 | logger.warning(f"公式 {math_id} 转换为SVG失败: {latex[:50]}...") |
| 562 | except Exception as exc: | 599 | except Exception as exc: |
| 563 | logger.error(f"转换内联公式 {latex[:50]}... 时出错: {exc}") | 600 | logger.error(f"转换内联公式 {latex[:50]}... 时出错: {exc}") |
| 601 | + if ids_for_html: | ||
| 602 | + # 将ID列表写回run,便于HTML渲染时使用相同ID(顺序对应segments) | ||
| 603 | + run['mathIds'] = ids_for_html | ||
| 564 | 604 | ||
| 565 | for block in blocks: | 605 | for block in blocks: |
| 566 | if not isinstance(block, dict): | 606 | if not isinstance(block, dict): |
| @@ -570,7 +610,7 @@ class PDFRenderer: | @@ -570,7 +610,7 @@ class PDFRenderer: | ||
| 570 | 610 | ||
| 571 | # 处理math类型 | 611 | # 处理math类型 |
| 572 | if block_type == 'math': | 612 | if block_type == 'math': |
| 573 | - latex = block.get('latex', '').strip() | 613 | + latex = self._normalize_latex(block.get('latex', '')) |
| 574 | if latex: | 614 | if latex: |
| 575 | block_counter[0] += 1 | 615 | block_counter[0] += 1 |
| 576 | math_id = f"math-block-{block_counter[0]}" | 616 | math_id = f"math-block-{block_counter[0]}" |
| @@ -679,6 +719,57 @@ class PDFRenderer: | @@ -679,6 +719,57 @@ class PDFRenderer: | ||
| 679 | 719 | ||
| 680 | return html | 720 | return html |
| 681 | 721 | ||
| 722 | + @staticmethod | ||
def _normalize_latex(raw: Any) -> str:
    """Strip outer math delimiters and clean a LaTeX string for SVG rendering.

    Recognises ``$...$``, ``$$...$$``, ``\\(...\\)`` and ``\\[...\\]``. The
    delimiters are removed only when they wrap the whole string as a single
    formula; a string such as ``$a$ and $b$`` keeps its delimiters instead of
    being cut into the unbalanced fragment ``a$ and $b``.

    After that, whitespace control characters become a single space, the
    remaining control characters are dropped, and ``\\tfrac``/``\\dfrac`` are
    rewritten to ``\\frac``.

    Args:
        raw: Candidate LaTeX source; any non-string value is treated as empty.

    Returns:
        The cleaned formula body, or ``""`` when ``raw`` is not a string.
    """
    if not isinstance(raw, str):
        return ""
    latex = raw.strip()
    # Every body is a run of ordinary characters or backslash escapes, so an
    # escaped delimiter (\$) counts as content while a bare one makes the
    # wrapper fail to match. "$$" has to be tried before "$".
    wrappers = (
        r'\$\$((?:[^$\\]|\\.)*)\$\$',
        r'\$((?:[^$\\]|\\.)*)\$',
        r'\\\[((?:[^\\]|\\[^\[\]])*)\\\]',
        r'\\\(((?:[^\\]|\\[^()])*)\\\)',
    )
    for wrapper in wrappers:
        m = re.fullmatch(wrapper, latex, re.DOTALL)
        if m:
            latex = m.group(1).strip()
            break
    # Newlines and tabs separate tokens: deleting them outright would fuse
    # "\alpha<newline>x" into the unknown command "\alphax", so they become
    # a space. Other control characters are removed so they cannot break
    # the math parser.
    latex = re.sub(r'[\t\n\x0b\x0c\r]+', ' ', latex)
    latex = re.sub(r'[\x00-\x1f\x7f]', '', latex)
    # Common compatibility shim: \tfrac/\dfrac -> \frac.
    latex = latex.replace(r'\tfrac', r'\frac').replace(r'\dfrac', r'\frac')
    return latex
| 744 | + | ||
| 745 | + @staticmethod | ||
| 746 | + def _find_first_math_in_text(text: Any) -> tuple[str, bool] | None: | ||
| 747 | + """从纯文本中提取首个数学片段,返回(内容, 是否display)""" | ||
| 748 | + if not isinstance(text, str): | ||
| 749 | + return None | ||
| 750 | + pattern = re.compile(r'\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\]', re.S) | ||
| 751 | + m = pattern.search(text) | ||
| 752 | + if not m: | ||
| 753 | + return None | ||
| 754 | + raw = next(g for g in m.groups() if g is not None) | ||
| 755 | + latex = raw.strip() | ||
| 756 | + is_display = bool(m.group(1) or m.group(4)) # $$ or \[ \] | ||
| 757 | + return latex, is_display | ||
| 758 | + | ||
| 759 | + @staticmethod | ||
def _find_all_math_in_text(text: Any) -> list[tuple[str, bool]]:
    """Collect every non-empty math fragment found in plain text, in order.

    Recognises ``$...$``, ``$$...$$``, ``\\(...\\)`` and ``\\[...\\]``.
    Matches whose content is only whitespace (for example ``$ $``) are
    dropped here so that callers do not each have to filter them out.

    Args:
        text: Plain text to scan; any non-string value yields ``[]``.

    Returns:
        A list of ``(latex, is_display)`` tuples in order of appearance, with
        ``latex`` trimmed and ``is_display`` true for the ``$$...$$`` and
        ``\\[...\\]`` forms.
    """
    if not isinstance(text, str):
        return []
    pattern = re.compile(r'\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\]', re.S)
    results = []
    for m in pattern.finditer(text):
        # Exactly one alternative (hence one group) takes part in a match,
        # so lastindex identifies it.
        latex = m.group(m.lastindex).strip()
        if latex:
            # Groups 1 and 4 are the display forms: $$...$$ and \[...\].
            results.append((latex, m.lastindex in (1, 4)))
    return results
| 772 | + | ||
| 682 | def _inject_wordcloud_images(self, html: str, img_map: Dict[str, str]) -> str: | 773 | def _inject_wordcloud_images(self, html: str, img_map: Dict[str, str]) -> str: |
| 683 | """ | 774 | """ |
| 684 | 将词云PNG data URI注入HTML,替换对应canvas | 775 | 将词云PNG data URI注入HTML,替换对应canvas |
-
Please register or login to post a comment