马一丁

Improve the Rendering of Inline Formulas

@@ -1094,9 +1094,37 @@ class HTMLRenderer: @@ -1094,9 +1094,37 @@ class HTMLRenderer:
1094 1094
1095 def _render_paragraph(self, block: Dict[str, Any]) -> str: 1095 def _render_paragraph(self, block: Dict[str, Any]) -> str:
1096 """渲染段落,内部通过inline run保持混排样式""" 1096 """渲染段落,内部通过inline run保持混排样式"""
1097 - inlines = "".join(self._render_inline(run) for run in block.get("inlines", [])) 1097 + inlines_data = block.get("inlines", [])
  1098 + # 仅包含单个display公式时直接渲染为块,避免<p>内嵌<div>
  1099 + if len(inlines_data) == 1:
  1100 + standalone = self._render_standalone_math_inline(inlines_data[0])
  1101 + if standalone:
  1102 + return standalone
  1103 +
  1104 + inlines = "".join(self._render_inline(run) for run in inlines_data)
1098 return f"<p>{inlines}</p>" 1105 return f"<p>{inlines}</p>"
1099 1106
  1107 + def _render_standalone_math_inline(self, run: Dict[str, Any] | str) -> str | None:
  1108 + """当段落只包含单个display公式时,转为math-block避免破坏行内布局"""
  1109 + if isinstance(run, dict):
  1110 + text_value, marks = self._normalize_inline_payload(run)
  1111 + if marks:
  1112 + return None
  1113 + math_id_hint = run.get("mathIds") or run.get("mathId")
  1114 + else:
  1115 + text_value = "" if run is None else str(run)
  1116 + math_id_hint = None
  1117 + marks = []
  1118 +
  1119 + rendered = self._render_text_with_inline_math(
  1120 + text_value,
  1121 + math_id_hint,
  1122 + allow_display_block=True
  1123 + )
  1124 + if rendered and rendered.strip().startswith('<div class="math-block"'):
  1125 + return rendered
  1126 + return None
  1127 +
1100 def _render_list(self, block: Dict[str, Any]) -> str: 1128 def _render_list(self, block: Dict[str, Any]) -> str:
1101 """渲染有序/无序/任务列表""" 1129 """渲染有序/无序/任务列表"""
1102 list_type = block.get("listType", "bullet") 1130 list_type = block.get("listType", "bullet")
@@ -2034,7 +2062,12 @@ class HTMLRenderer: @@ -2034,7 +2062,12 @@ class HTMLRenderer:
2034 break 2062 break
2035 return latex 2063 return latex
2036 2064
2037 - def _render_text_with_inline_math(self, text: Any, math_id: str | list | None = None) -> str | None: 2065 + def _render_text_with_inline_math(
  2066 + self,
  2067 + text: Any,
  2068 + math_id: str | list | None = None,
  2069 + allow_display_block: bool = False
  2070 + ) -> str | None:
2038 """ 2071 """
2039 识别纯文本中的数学定界符并渲染为math-inline/math-block,提升兼容性。 2072 识别纯文本中的数学定界符并渲染为math-inline/math-block,提升兼容性。
2040 2073
@@ -2045,17 +2078,19 @@ class HTMLRenderer: @@ -2045,17 +2078,19 @@ class HTMLRenderer:
2045 return None 2078 return None
2046 2079
2047 pattern = re.compile(r'(\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\])', re.S) 2080 pattern = re.compile(r'(\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\])', re.S)
  2081 + matches = list(pattern.finditer(text))
  2082 + if not matches:
  2083 + return None
  2084 +
2048 cursor = 0 2085 cursor = 0
2049 parts: List[str] = [] 2086 parts: List[str] = []
2050 - idx = 0  
2051 id_iter = iter(math_id) if isinstance(math_id, list) else None 2087 id_iter = iter(math_id) if isinstance(math_id, list) else None
2052 - for m in pattern.finditer(text): 2088 +
  2089 + for idx, m in enumerate(matches, start=1):
2053 start, end = m.span() 2090 start, end = m.span()
2054 - if start > cursor:  
2055 - parts.append(self._escape_html(text[cursor:start])) 2091 + prefix = text[cursor:start]
2056 raw = next(g for g in m.groups()[1:] if g is not None) 2092 raw = next(g for g in m.groups()[1:] if g is not None)
2057 latex = self._normalize_latex_string(raw) 2093 latex = self._normalize_latex_string(raw)
2058 - idx += 1  
2059 # 若已有math_id,直接使用,避免与SVG注入ID不一致;否则按局部序号生成 2094 # 若已有math_id,直接使用,避免与SVG注入ID不一致;否则按局部序号生成
2060 if id_iter: 2095 if id_iter:
2061 mid = next(id_iter, f"auto-math-{idx}") 2096 mid = next(id_iter, f"auto-math-{idx}")
@@ -2063,14 +2098,23 @@ class HTMLRenderer: @@ -2063,14 +2098,23 @@ class HTMLRenderer:
2063 mid = math_id or f"auto-math-{idx}" 2098 mid = math_id or f"auto-math-{idx}"
2064 id_attr = f' data-math-id="{self._escape_attr(mid)}"' 2099 id_attr = f' data-math-id="{self._escape_attr(mid)}"'
2065 is_display = m.group(1).startswith('$$') or m.group(1).startswith('\\[') 2100 is_display = m.group(1).startswith('$$') or m.group(1).startswith('\\[')
2066 - if is_display: 2101 + is_standalone = (
  2102 + len(matches) == 1 and
  2103 + not text[:start].strip() and
  2104 + not text[end:].strip()
  2105 + )
  2106 + use_block = allow_display_block and is_display and is_standalone
  2107 + if use_block:
  2108 + # 独立display公式,跳过两侧空白,直接渲染块级
2067 parts.append(f'<div class="math-block"{id_attr}>$$ {self._escape_html(latex)} $$</div>') 2109 parts.append(f'<div class="math-block"{id_attr}>$$ {self._escape_html(latex)} $$</div>')
  2110 + cursor = len(text)
  2111 + break
2068 else: 2112 else:
  2113 + if prefix:
  2114 + parts.append(self._escape_html(prefix))
2069 parts.append(f'<span class="math-inline"{id_attr}>\\( {self._escape_html(latex)} \\)</span>') 2115 parts.append(f'<span class="math-inline"{id_attr}>\\( {self._escape_html(latex)} \\)</span>')
2070 cursor = end 2116 cursor = end
2071 2117
2072 - if cursor == 0:  
2073 - return None  
2074 if cursor < len(text): 2118 if cursor < len(text):
2075 parts.append(self._escape_html(text[cursor:])) 2119 parts.append(self._escape_html(text[cursor:]))
2076 return "".join(parts) 2120 return "".join(parts)
@@ -550,7 +550,8 @@ class PDFRenderer: @@ -550,7 +550,8 @@ class PDFRenderer:
550 # 仅单个math mark 550 # 仅单个math mark
551 raw = math_mark.get('value') or run.get('text') or '' 551 raw = math_mark.get('value') or run.get('text') or ''
552 latex = self._normalize_latex(raw) 552 latex = self._normalize_latex(raw)
553 - is_display = bool(re.match(r'^\s*(\$\$|\\\[)', str(raw))) 553 + # 行内mark统一按inline处理,避免误将行内公式当成display
  554 + is_display = False
554 if not latex: 555 if not latex:
555 continue 556 continue
556 block_counter[0] += 1 557 block_counter[0] += 1
@@ -748,13 +749,19 @@ class PDFRenderer: @@ -748,13 +749,19 @@ class PDFRenderer:
748 if not isinstance(text, str): 749 if not isinstance(text, str):
749 return None 750 return None
750 pattern = re.compile(r'\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\]', re.S) 751 pattern = re.compile(r'\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\]', re.S)
751 - m = pattern.search(text)  
752 - if not m: 752 + matches = list(pattern.finditer(text))
  753 + if not matches:
753 return None 754 return None
  755 + m = matches[0]
754 raw = next(g for g in m.groups() if g is not None) 756 raw = next(g for g in m.groups() if g is not None)
755 latex = raw.strip() 757 latex = raw.strip()
756 - is_display = bool(m.group(1) or m.group(4)) # $$ or \[ \]  
757 - return latex, is_display 758 + is_display_raw = bool(m.group(1) or m.group(4)) # $$ or \[ \]
  759 + is_standalone = (
  760 + len(matches) == 1 and
  761 + not text[:m.start()].strip() and
  762 + not text[m.end():].strip()
  763 + )
  764 + return latex, bool(is_display_raw and is_standalone)
758 765
759 @staticmethod 766 @staticmethod
760 def _find_all_math_in_text(text: Any) -> list[tuple[str, bool]]: 767 def _find_all_math_in_text(text: Any) -> list[tuple[str, bool]]:
@@ -763,10 +770,21 @@ class PDFRenderer: @@ -763,10 +770,21 @@ class PDFRenderer:
763 return [] 770 return []
764 pattern = re.compile(r'\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\]', re.S) 771 pattern = re.compile(r'\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\]', re.S)
765 results = [] 772 results = []
766 - for m in pattern.finditer(text): 773 + matches = list(pattern.finditer(text))
  774 + if not matches:
  775 + return results
  776 + total = len(matches)
  777 +
  778 + for m in matches:
767 raw = next(g for g in m.groups() if g is not None) 779 raw = next(g for g in m.groups() if g is not None)
768 latex = raw.strip() 780 latex = raw.strip()
769 - is_display = bool(m.group(1) or m.group(4)) 781 + is_display_raw = bool(m.group(1) or m.group(4))
  782 + is_standalone = (
  783 + total == 1 and
  784 + not text[:m.start()].strip() and
  785 + not text[m.end():].strip()
  786 + )
  787 + is_display = is_display_raw and is_standalone
770 results.append((latex, is_display)) 788 results.append((latex, is_display))
771 return results 789 return results
772 790