马一丁

Improve the Rendering of Inline Formulas

... ... @@ -1094,9 +1094,37 @@ class HTMLRenderer:
def _render_paragraph(self, block: Dict[str, Any]) -> str:
"""渲染段落,内部通过inline run保持混排样式"""
inlines = "".join(self._render_inline(run) for run in block.get("inlines", []))
inlines_data = block.get("inlines", [])
# 仅包含单个display公式时直接渲染为块,避免<p>内嵌<div>
if len(inlines_data) == 1:
standalone = self._render_standalone_math_inline(inlines_data[0])
if standalone:
return standalone
inlines = "".join(self._render_inline(run) for run in inlines_data)
return f"<p>{inlines}</p>"
def _render_standalone_math_inline(self, run: Dict[str, Any] | str) -> str | None:
"""当段落只包含单个display公式时,转为math-block避免破坏行内布局"""
if isinstance(run, dict):
text_value, marks = self._normalize_inline_payload(run)
if marks:
return None
math_id_hint = run.get("mathIds") or run.get("mathId")
else:
text_value = "" if run is None else str(run)
math_id_hint = None
marks = []
rendered = self._render_text_with_inline_math(
text_value,
math_id_hint,
allow_display_block=True
)
if rendered and rendered.strip().startswith('<div class="math-block"'):
return rendered
return None
def _render_list(self, block: Dict[str, Any]) -> str:
"""渲染有序/无序/任务列表"""
list_type = block.get("listType", "bullet")
... ... @@ -2034,7 +2062,12 @@ class HTMLRenderer:
break
return latex
def _render_text_with_inline_math(self, text: Any, math_id: str | list | None = None) -> str | None:
def _render_text_with_inline_math(
self,
text: Any,
math_id: str | list | None = None,
allow_display_block: bool = False
) -> str | None:
"""
识别纯文本中的数学定界符并渲染为math-inline/math-block,提升兼容性。
... ... @@ -2045,17 +2078,19 @@ class HTMLRenderer:
return None
pattern = re.compile(r'(\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\])', re.S)
matches = list(pattern.finditer(text))
if not matches:
return None
cursor = 0
parts: List[str] = []
idx = 0
id_iter = iter(math_id) if isinstance(math_id, list) else None
for m in pattern.finditer(text):
for idx, m in enumerate(matches, start=1):
start, end = m.span()
if start > cursor:
parts.append(self._escape_html(text[cursor:start]))
prefix = text[cursor:start]
raw = next(g for g in m.groups()[1:] if g is not None)
latex = self._normalize_latex_string(raw)
idx += 1
# 若已有math_id,直接使用,避免与SVG注入ID不一致;否则按局部序号生成
if id_iter:
mid = next(id_iter, f"auto-math-{idx}")
... ... @@ -2063,14 +2098,23 @@ class HTMLRenderer:
mid = math_id or f"auto-math-{idx}"
id_attr = f' data-math-id="{self._escape_attr(mid)}"'
is_display = m.group(1).startswith('$$') or m.group(1).startswith('\\[')
if is_display:
is_standalone = (
len(matches) == 1 and
not text[:start].strip() and
not text[end:].strip()
)
use_block = allow_display_block and is_display and is_standalone
if use_block:
# 独立display公式,跳过两侧空白,直接渲染块级
parts.append(f'<div class="math-block"{id_attr}>$$ {self._escape_html(latex)} $$</div>')
cursor = len(text)
break
else:
if prefix:
parts.append(self._escape_html(prefix))
parts.append(f'<span class="math-inline"{id_attr}>\\( {self._escape_html(latex)} \\)</span>')
cursor = end
if cursor == 0:
return None
if cursor < len(text):
parts.append(self._escape_html(text[cursor:]))
return "".join(parts)
... ...
... ... @@ -550,7 +550,8 @@ class PDFRenderer:
# 仅单个math mark
raw = math_mark.get('value') or run.get('text') or ''
latex = self._normalize_latex(raw)
is_display = bool(re.match(r'^\s*(\$\$|\\\[)', str(raw)))
# 行内mark统一按inline处理,避免误将行内公式当成display
is_display = False
if not latex:
continue
block_counter[0] += 1
... ... @@ -748,13 +749,19 @@ class PDFRenderer:
if not isinstance(text, str):
return None
pattern = re.compile(r'\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\]', re.S)
m = pattern.search(text)
if not m:
matches = list(pattern.finditer(text))
if not matches:
return None
m = matches[0]
raw = next(g for g in m.groups() if g is not None)
latex = raw.strip()
is_display = bool(m.group(1) or m.group(4)) # $$ or \[ \]
return latex, is_display
is_display_raw = bool(m.group(1) or m.group(4)) # $$ or \[ \]
is_standalone = (
len(matches) == 1 and
not text[:m.start()].strip() and
not text[m.end():].strip()
)
return latex, bool(is_display_raw and is_standalone)
@staticmethod
def _find_all_math_in_text(text: Any) -> list[tuple[str, bool]]:
... ... @@ -763,10 +770,21 @@ class PDFRenderer:
return []
pattern = re.compile(r'\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\]', re.S)
results = []
for m in pattern.finditer(text):
matches = list(pattern.finditer(text))
if not matches:
return results
total = len(matches)
for m in matches:
raw = next(g for g in m.groups() if g is not None)
latex = raw.strip()
is_display = bool(m.group(1) or m.group(4))
is_display_raw = bool(m.group(1) or m.group(4))
is_standalone = (
total == 1 and
not text[:m.start()].strip() and
not text[m.end():].strip()
)
is_display = is_display_raw and is_standalone
results.append((latex, is_display))
return results
... ...