马一丁

Add Support for Rendering Various Inline and Block-level Mathematical Formulas

@@ -1262,7 +1262,8 @@ class HTMLRenderer: @@ -1262,7 +1262,8 @@ class HTMLRenderer:
1262 1262
def _render_math(self, block: Dict[str, Any]) -> str:
    """Render a math block as a ``math-block`` div.

    The formula is written between ``$$`` markers as a placeholder that is
    left for external MathJax or a later post-processing step to typeset.
    Outer delimiters already present in ``block["latex"]`` are removed first
    so they are not doubled. A ``data-math-id`` attribute is emitted only
    when the block carries a truthy ``mathId``.
    """
    normalized = self._normalize_latex_string(block.get("latex", ""))
    latex = self._escape_html(normalized)

    id_attr = ""
    if block.get("mathId"):
        math_id = self._escape_attr(block.get("mathId", ""))
        if math_id:
            id_attr = f' data-math-id="{math_id}"'

    return f'<div class="math-block"{id_attr}>$$ {latex} $$</div>'
@@ -1989,6 +1990,66 @@ class HTMLRenderer: @@ -1989,6 +1990,66 @@ class HTMLRenderer:
1989 return text_value, marks 1990 return text_value, marks
1990 1991
1991 @staticmethod 1992 @staticmethod
  1993 + def _normalize_latex_string(raw: Any) -> str:
  1994 + """去除外层数学定界符,兼容 $...$、$$...$$、\\(\\)、\\[\\] 等格式"""
  1995 + if not isinstance(raw, str):
  1996 + return ""
  1997 + latex = raw.strip()
  1998 + patterns = [
  1999 + r'^\$\$(.*)\$\$$',
  2000 + r'^\$(.*)\$$',
  2001 + r'^\\\[(.*)\\\]$',
  2002 + r'^\\\((.*)\\\)$',
  2003 + ]
  2004 + for pat in patterns:
  2005 + m = re.match(pat, latex, re.DOTALL)
  2006 + if m:
  2007 + latex = m.group(1).strip()
  2008 + break
  2009 + return latex
  2010 +
  2011 + def _render_text_with_inline_math(self, text: Any, math_id: str | list | None = None) -> str | None:
  2012 + """
  2013 + 识别纯文本中的数学定界符并渲染为math-inline/math-block,提升兼容性。
  2014 +
  2015 + - 支持 $...$、$$...$$、\\(\\)、\\[\\]。
  2016 + - 若未检测到公式,返回None。
  2017 + """
  2018 + if not isinstance(text, str) or not text:
  2019 + return None
  2020 +
  2021 + pattern = re.compile(r'(\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\])', re.S)
  2022 + cursor = 0
  2023 + parts: List[str] = []
  2024 + idx = 0
  2025 + id_iter = iter(math_id) if isinstance(math_id, list) else None
  2026 + for m in pattern.finditer(text):
  2027 + start, end = m.span()
  2028 + if start > cursor:
  2029 + parts.append(self._escape_html(text[cursor:start]))
  2030 + raw = next(g for g in m.groups()[1:] if g is not None)
  2031 + latex = self._normalize_latex_string(raw)
  2032 + idx += 1
  2033 + # 若已有math_id,直接使用,避免与SVG注入ID不一致;否则按局部序号生成
  2034 + if id_iter:
  2035 + mid = next(id_iter, f"auto-math-{idx}")
  2036 + else:
  2037 + mid = math_id or f"auto-math-{idx}"
  2038 + id_attr = f' data-math-id="{self._escape_attr(mid)}"'
  2039 + is_display = m.group(1).startswith('$$') or m.group(1).startswith('\\[')
  2040 + if is_display:
  2041 + parts.append(f'<div class="math-block"{id_attr}>$$ {self._escape_html(latex)} $$</div>')
  2042 + else:
  2043 + parts.append(f'<span class="math-inline"{id_attr}>\\( {self._escape_html(latex)} \\)</span>')
  2044 + cursor = end
  2045 +
  2046 + if cursor == 0:
  2047 + return None
  2048 + if cursor < len(text):
  2049 + parts.append(self._escape_html(text[cursor:]))
  2050 + return "".join(parts)
  2051 +
  2052 + @staticmethod
1992 def _coerce_inline_payload(payload: Dict[str, Any]) -> Dict[str, Any] | None: 2053 def _coerce_inline_payload(payload: Dict[str, Any]) -> Dict[str, Any] | None:
1993 """尽力将字符串里的内联节点恢复为dict,修复渲染遗漏""" 2054 """尽力将字符串里的内联节点恢复为dict,修复渲染遗漏"""
1994 if not isinstance(payload, dict): 2055 if not isinstance(payload, dict):
@@ -2013,12 +2074,19 @@ class HTMLRenderer: @@ -2013,12 +2074,19 @@ class HTMLRenderer:
2013 text_value, marks = self._normalize_inline_payload(run) 2074 text_value, marks = self._normalize_inline_payload(run)
2014 math_mark = next((mark for mark in marks if mark.get("type") == "math"), None) 2075 math_mark = next((mark for mark in marks if mark.get("type") == "math"), None)
2015 if math_mark: 2076 if math_mark:
2016 - latex = math_mark.get("value") 2077 + latex = self._normalize_latex_string(math_mark.get("value"))
2017 if not isinstance(latex, str) or not latex.strip(): 2078 if not isinstance(latex, str) or not latex.strip():
2018 - latex = text_value 2079 + latex = self._normalize_latex_string(text_value)
2019 math_id = self._escape_attr(run.get("mathId", "")) if run.get("mathId") else "" 2080 math_id = self._escape_attr(run.get("mathId", "")) if run.get("mathId") else ""
2020 id_attr = f' data-math-id="{math_id}"' if math_id else "" 2081 id_attr = f' data-math-id="{math_id}"' if math_id else ""
2021 return f'<span class="math-inline"{id_attr}>\\( {self._escape_html(latex)} \\)</span>' 2082 return f'<span class="math-inline"{id_attr}>\\( {self._escape_html(latex)} \\)</span>'
  2083 +
  2084 + # 尝试从纯文本中提取数学公式(即便没有math mark)
  2085 + math_id_hint = run.get("mathIds") or run.get("mathId")
  2086 + mathified = self._render_text_with_inline_math(text_value, math_id_hint)
  2087 + if mathified is not None:
  2088 + return mathified
  2089 +
2022 text = self._escape_html(text_value) 2090 text = self._escape_html(text_value)
2023 styles: List[str] = [] 2091 styles: List[str] = []
2024 prefix: List[str] = [] 2092 prefix: List[str] = []
@@ -4,6 +4,7 @@ LaTeX 数学公式转 SVG 渲染器 @@ -4,6 +4,7 @@ LaTeX 数学公式转 SVG 渲染器
4 """ 4 """
5 5
6 import io 6 import io
  7 +import re
7 from typing import Optional 8 from typing import Optional
8 import matplotlib 9 import matplotlib
9 import matplotlib.pyplot as plt 10 import matplotlib.pyplot as plt
@@ -40,8 +41,22 @@ class MathToSVG: @@ -40,8 +41,22 @@ class MathToSVG:
40 SVG 字符串,如果转换失败则返回 None 41 SVG 字符串,如果转换失败则返回 None
41 """ 42 """
42 try: 43 try:
43 - # 清理 LaTeX 字符串  
44 - latex = latex.strip() 44 + # 清理 LaTeX 字符串,去除外层定界符,兼容 $...$ / $$...$$ / \\( \\) / \\[ \\]
  45 + latex = (latex or "").strip()
  46 + patterns = [
  47 + r'^\$\$(.*)\$\$$',
  48 + r'^\$(.*)\$$',
  49 + r'^\\\[(.*)\\\]$',
  50 + r'^\\\((.*)\\\)$',
  51 + ]
  52 + for pat in patterns:
  53 + m = re.match(pat, latex, re.DOTALL)
  54 + if m:
  55 + latex = m.group(1).strip()
  56 + break
  57 + # 清理控制字符并做常见兼容
  58 + latex = re.sub(r'[\x00-\x1f\x7f]', '', latex)
  59 + latex = latex.replace(r'\tfrac', r'\frac').replace(r'\dfrac', r'\frac')
45 if not latex: 60 if not latex:
46 logger.warning("空的 LaTeX 公式") 61 logger.warning("空的 LaTeX 公式")
47 return None 62 return None
@@ -10,6 +10,7 @@ import copy @@ -10,6 +10,7 @@ import copy
10 import os 10 import os
11 import sys 11 import sys
12 import io 12 import io
  13 +import re
13 from pathlib import Path 14 from pathlib import Path
14 from typing import Any, Dict 15 from typing import Any, Dict
15 from datetime import datetime 16 from datetime import datetime
@@ -544,23 +545,62 @@ class PDFRenderer: @@ -544,23 +545,62 @@ class PDFRenderer:
544 continue 545 continue
545 marks = run.get('marks') or [] 546 marks = run.get('marks') or []
546 math_mark = next((m for m in marks if m.get('type') == 'math'), None) 547 math_mark = next((m for m in marks if m.get('type') == 'math'), None)
547 - if not math_mark: 548 +
  549 + if math_mark:
  550 + # 仅单个math mark
  551 + raw = math_mark.get('value') or run.get('text') or ''
  552 + latex = self._normalize_latex(raw)
  553 + is_display = bool(re.match(r'^\s*(\$\$|\\\[)', str(raw)))
  554 + if not latex:
  555 + continue
  556 + block_counter[0] += 1
  557 + math_id = run.get('mathId') or f"math-inline-{block_counter[0]}"
  558 + run['mathId'] = math_id
  559 + try:
  560 + svg_content = (
  561 + self.math_converter.convert_display_to_svg(latex)
  562 + if is_display else
  563 + self.math_converter.convert_inline_to_svg(latex)
  564 + )
  565 + if svg_content:
  566 + svg_map[math_id] = svg_content
  567 + logger.debug(f"公式 {math_id} 转换为SVG成功")
  568 + else:
  569 + logger.warning(f"公式 {math_id} 转换为SVG失败: {latex[:50]}...")
  570 + except Exception as exc:
  571 + logger.error(f"转换内联公式 {latex[:50]}... 时出错: {exc}")
548 continue 572 continue
549 - latex = (math_mark.get('value') or run.get('text') or '').strip() 573 +
  574 + # 无math mark,尝试解析文本中的多个公式
  575 + text_val = run.get('text')
  576 + if not isinstance(text_val, str):
  577 + continue
  578 + segments = self._find_all_math_in_text(text_val)
  579 + if not segments:
  580 + continue
  581 + ids_for_html: list[str] = []
  582 + for idx, (latex, is_display) in enumerate(segments, start=1):
550 if not latex: 583 if not latex:
551 continue 584 continue
552 block_counter[0] += 1 585 block_counter[0] += 1
553 - math_id = f"math-inline-{block_counter[0]}" 586 + math_id = f"auto-math-{block_counter[0]}"
  587 + ids_for_html.append(math_id)
554 try: 588 try:
555 - svg_content = self.math_converter.convert_inline_to_svg(latex) 589 + svg_content = (
  590 + self.math_converter.convert_display_to_svg(latex)
  591 + if is_display else
  592 + self.math_converter.convert_inline_to_svg(latex)
  593 + )
556 if svg_content: 594 if svg_content:
557 svg_map[math_id] = svg_content 595 svg_map[math_id] = svg_content
558 - run['mathId'] = math_id  
559 logger.debug(f"公式 {math_id} 转换为SVG成功") 596 logger.debug(f"公式 {math_id} 转换为SVG成功")
560 else: 597 else:
561 logger.warning(f"公式 {math_id} 转换为SVG失败: {latex[:50]}...") 598 logger.warning(f"公式 {math_id} 转换为SVG失败: {latex[:50]}...")
562 except Exception as exc: 599 except Exception as exc:
563 logger.error(f"转换内联公式 {latex[:50]}... 时出错: {exc}") 600 logger.error(f"转换内联公式 {latex[:50]}... 时出错: {exc}")
  601 + if ids_for_html:
  602 + # 将ID列表写回run,便于HTML渲染时使用相同ID(顺序对应segments)
  603 + run['mathIds'] = ids_for_html
564 604
565 for block in blocks: 605 for block in blocks:
566 if not isinstance(block, dict): 606 if not isinstance(block, dict):
@@ -570,7 +610,7 @@ class PDFRenderer: @@ -570,7 +610,7 @@ class PDFRenderer:
570 610
571 # 处理math类型 611 # 处理math类型
572 if block_type == 'math': 612 if block_type == 'math':
573 - latex = block.get('latex', '').strip() 613 + latex = self._normalize_latex(block.get('latex', ''))
574 if latex: 614 if latex:
575 block_counter[0] += 1 615 block_counter[0] += 1
576 math_id = f"math-block-{block_counter[0]}" 616 math_id = f"math-block-{block_counter[0]}"
@@ -679,6 +719,57 @@ class PDFRenderer: @@ -679,6 +719,57 @@ class PDFRenderer:
679 719
680 return html 720 return html
681 721
  722 + @staticmethod
  723 + def _normalize_latex(raw: Any) -> str:
  724 + """去除外层数学定界符,兼容 $...$、$$...$$、\\(\\)、\\[\\] 等格式"""
  725 + if not isinstance(raw, str):
  726 + return ""
  727 + latex = raw.strip()
  728 + patterns = [
  729 + r'^\$\$(.*)\$\$$',
  730 + r'^\$(.*)\$$',
  731 + r'^\\\[(.*)\\\]$',
  732 + r'^\\\((.*)\\\)$',
  733 + ]
  734 + for pat in patterns:
  735 + m = re.match(pat, latex, re.DOTALL)
  736 + if m:
  737 + latex = m.group(1).strip()
  738 + break
  739 + # 清理控制字符、防止mathtext解析失败
  740 + latex = re.sub(r'[\x00-\x1f\x7f]', '', latex)
  741 + # 常见兼容:\tfrac/\dfrac -> \frac
  742 + latex = latex.replace(r'\tfrac', r'\frac').replace(r'\dfrac', r'\frac')
  743 + return latex
  744 +
  745 + @staticmethod
  746 + def _find_first_math_in_text(text: Any) -> tuple[str, bool] | None:
  747 + """从纯文本中提取首个数学片段,返回(内容, 是否display)"""
  748 + if not isinstance(text, str):
  749 + return None
  750 + pattern = re.compile(r'\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\]', re.S)
  751 + m = pattern.search(text)
  752 + if not m:
  753 + return None
  754 + raw = next(g for g in m.groups() if g is not None)
  755 + latex = raw.strip()
  756 + is_display = bool(m.group(1) or m.group(4)) # $$ or \[ \]
  757 + return latex, is_display
  758 +
  759 + @staticmethod
  760 + def _find_all_math_in_text(text: Any) -> list[tuple[str, bool]]:
  761 + """从纯文本中提取所有数学片段,返回[(内容, 是否display)]"""
  762 + if not isinstance(text, str):
  763 + return []
  764 + pattern = re.compile(r'\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\]', re.S)
  765 + results = []
  766 + for m in pattern.finditer(text):
  767 + raw = next(g for g in m.groups() if g is not None)
  768 + latex = raw.strip()
  769 + is_display = bool(m.group(1) or m.group(4))
  770 + results.append((latex, is_display))
  771 + return results
  772 +
682 def _inject_wordcloud_images(self, html: str, img_map: Dict[str, str]) -> str: 773 def _inject_wordcloud_images(self, html: str, img_map: Dict[str, str]) -> str:
683 """ 774 """
684 将词云PNG data URI注入HTML,替换对应canvas 775 将词云PNG data URI注入HTML,替换对应canvas