Showing 2 changed files with 79 additions and 17 deletions
| @@ -1094,9 +1094,37 @@ class HTMLRenderer: | @@ -1094,9 +1094,37 @@ class HTMLRenderer: | ||
| 1094 | 1094 | ||
| 1095 | def _render_paragraph(self, block: Dict[str, Any]) -> str: | 1095 | def _render_paragraph(self, block: Dict[str, Any]) -> str: |
| 1096 | """渲染段落,内部通过inline run保持混排样式""" | 1096 | """渲染段落,内部通过inline run保持混排样式""" |
| 1097 | - inlines = "".join(self._render_inline(run) for run in block.get("inlines", [])) | 1097 | + inlines_data = block.get("inlines", []) |
| 1098 | + # 仅包含单个display公式时直接渲染为块,避免<p>内嵌<div> | ||
| 1099 | + if len(inlines_data) == 1: | ||
| 1100 | + standalone = self._render_standalone_math_inline(inlines_data[0]) | ||
| 1101 | + if standalone: | ||
| 1102 | + return standalone | ||
| 1103 | + | ||
| 1104 | + inlines = "".join(self._render_inline(run) for run in inlines_data) | ||
| 1098 | return f"<p>{inlines}</p>" | 1105 | return f"<p>{inlines}</p>" |
| 1099 | 1106 | ||
| 1107 | + def _render_standalone_math_inline(self, run: Dict[str, Any] | str) -> str | None: | ||
| 1108 | + """当段落只包含单个display公式时,转为math-block避免破坏行内布局""" | ||
| 1109 | + if isinstance(run, dict): | ||
| 1110 | + text_value, marks = self._normalize_inline_payload(run) | ||
| 1111 | + if marks: | ||
| 1112 | + return None | ||
| 1113 | + math_id_hint = run.get("mathIds") or run.get("mathId") | ||
| 1114 | + else: | ||
| 1115 | + text_value = "" if run is None else str(run) | ||
| 1116 | + math_id_hint = None | ||
| 1117 | + marks = [] | ||
| 1118 | + | ||
| 1119 | + rendered = self._render_text_with_inline_math( | ||
| 1120 | + text_value, | ||
| 1121 | + math_id_hint, | ||
| 1122 | + allow_display_block=True | ||
| 1123 | + ) | ||
| 1124 | + if rendered and rendered.strip().startswith('<div class="math-block"'): | ||
| 1125 | + return rendered | ||
| 1126 | + return None | ||
| 1127 | + | ||
| 1100 | def _render_list(self, block: Dict[str, Any]) -> str: | 1128 | def _render_list(self, block: Dict[str, Any]) -> str: |
| 1101 | """渲染有序/无序/任务列表""" | 1129 | """渲染有序/无序/任务列表""" |
| 1102 | list_type = block.get("listType", "bullet") | 1130 | list_type = block.get("listType", "bullet") |
| @@ -2034,7 +2062,12 @@ class HTMLRenderer: | @@ -2034,7 +2062,12 @@ class HTMLRenderer: | ||
| 2034 | break | 2062 | break |
| 2035 | return latex | 2063 | return latex |
| 2036 | 2064 | ||
| 2037 | - def _render_text_with_inline_math(self, text: Any, math_id: str | list | None = None) -> str | None: | 2065 | + def _render_text_with_inline_math( |
| 2066 | + self, | ||
| 2067 | + text: Any, | ||
| 2068 | + math_id: str | list | None = None, | ||
| 2069 | + allow_display_block: bool = False | ||
| 2070 | + ) -> str | None: | ||
| 2038 | """ | 2071 | """ |
| 2039 | 识别纯文本中的数学定界符并渲染为math-inline/math-block,提升兼容性。 | 2072 | 识别纯文本中的数学定界符并渲染为math-inline/math-block,提升兼容性。 |
| 2040 | 2073 | ||
| @@ -2045,17 +2078,19 @@ class HTMLRenderer: | @@ -2045,17 +2078,19 @@ class HTMLRenderer: | ||
| 2045 | return None | 2078 | return None |
| 2046 | 2079 | ||
| 2047 | pattern = re.compile(r'(\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\])', re.S) | 2080 | pattern = re.compile(r'(\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\])', re.S) |
| 2081 | + matches = list(pattern.finditer(text)) | ||
| 2082 | + if not matches: | ||
| 2083 | + return None | ||
| 2084 | + | ||
| 2048 | cursor = 0 | 2085 | cursor = 0 |
| 2049 | parts: List[str] = [] | 2086 | parts: List[str] = [] |
| 2050 | - idx = 0 | ||
| 2051 | id_iter = iter(math_id) if isinstance(math_id, list) else None | 2087 | id_iter = iter(math_id) if isinstance(math_id, list) else None |
| 2052 | - for m in pattern.finditer(text): | 2088 | + |
| 2089 | + for idx, m in enumerate(matches, start=1): | ||
| 2053 | start, end = m.span() | 2090 | start, end = m.span() |
| 2054 | - if start > cursor: | ||
| 2055 | - parts.append(self._escape_html(text[cursor:start])) | 2091 | + prefix = text[cursor:start] |
| 2056 | raw = next(g for g in m.groups()[1:] if g is not None) | 2092 | raw = next(g for g in m.groups()[1:] if g is not None) |
| 2057 | latex = self._normalize_latex_string(raw) | 2093 | latex = self._normalize_latex_string(raw) |
| 2058 | - idx += 1 | ||
| 2059 | # 若已有math_id,直接使用,避免与SVG注入ID不一致;否则按局部序号生成 | 2094 | # 若已有math_id,直接使用,避免与SVG注入ID不一致;否则按局部序号生成 |
| 2060 | if id_iter: | 2095 | if id_iter: |
| 2061 | mid = next(id_iter, f"auto-math-{idx}") | 2096 | mid = next(id_iter, f"auto-math-{idx}") |
| @@ -2063,14 +2098,23 @@ class HTMLRenderer: | @@ -2063,14 +2098,23 @@ class HTMLRenderer: | ||
| 2063 | mid = math_id or f"auto-math-{idx}" | 2098 | mid = math_id or f"auto-math-{idx}" |
| 2064 | id_attr = f' data-math-id="{self._escape_attr(mid)}"' | 2099 | id_attr = f' data-math-id="{self._escape_attr(mid)}"' |
| 2065 | is_display = m.group(1).startswith('$$') or m.group(1).startswith('\\[') | 2100 | is_display = m.group(1).startswith('$$') or m.group(1).startswith('\\[') |
| 2066 | - if is_display: | 2101 | + is_standalone = ( |
| 2102 | + len(matches) == 1 and | ||
| 2103 | + not text[:start].strip() and | ||
| 2104 | + not text[end:].strip() | ||
| 2105 | + ) | ||
| 2106 | + use_block = allow_display_block and is_display and is_standalone | ||
| 2107 | + if use_block: | ||
| 2108 | + # 独立display公式,跳过两侧空白,直接渲染块级 | ||
| 2067 | parts.append(f'<div class="math-block"{id_attr}>$$ {self._escape_html(latex)} $$</div>') | 2109 | parts.append(f'<div class="math-block"{id_attr}>$$ {self._escape_html(latex)} $$</div>') |
| 2110 | + cursor = len(text) | ||
| 2111 | + break | ||
| 2068 | else: | 2112 | else: |
| 2113 | + if prefix: | ||
| 2114 | + parts.append(self._escape_html(prefix)) | ||
| 2069 | parts.append(f'<span class="math-inline"{id_attr}>\\( {self._escape_html(latex)} \\)</span>') | 2115 | parts.append(f'<span class="math-inline"{id_attr}>\\( {self._escape_html(latex)} \\)</span>') |
| 2070 | cursor = end | 2116 | cursor = end |
| 2071 | 2117 | ||
| 2072 | - if cursor == 0: | ||
| 2073 | - return None | ||
| 2074 | if cursor < len(text): | 2118 | if cursor < len(text): |
| 2075 | parts.append(self._escape_html(text[cursor:])) | 2119 | parts.append(self._escape_html(text[cursor:])) |
| 2076 | return "".join(parts) | 2120 | return "".join(parts) |
| @@ -550,7 +550,8 @@ class PDFRenderer: | @@ -550,7 +550,8 @@ class PDFRenderer: | ||
| 550 | # 仅单个math mark | 550 | # 仅单个math mark |
| 551 | raw = math_mark.get('value') or run.get('text') or '' | 551 | raw = math_mark.get('value') or run.get('text') or '' |
| 552 | latex = self._normalize_latex(raw) | 552 | latex = self._normalize_latex(raw) |
| 553 | - is_display = bool(re.match(r'^\s*(\$\$|\\\[)', str(raw))) | 553 | + # 行内mark统一按inline处理,避免误将行内公式当成display |
| 554 | + is_display = False | ||
| 554 | if not latex: | 555 | if not latex: |
| 555 | continue | 556 | continue |
| 556 | block_counter[0] += 1 | 557 | block_counter[0] += 1 |
| @@ -748,13 +749,19 @@ class PDFRenderer: | @@ -748,13 +749,19 @@ class PDFRenderer: | ||
| 748 | if not isinstance(text, str): | 749 | if not isinstance(text, str): |
| 749 | return None | 750 | return None |
| 750 | pattern = re.compile(r'\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\]', re.S) | 751 | pattern = re.compile(r'\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\]', re.S) |
| 751 | - m = pattern.search(text) | ||
| 752 | - if not m: | 752 | + matches = list(pattern.finditer(text)) |
| 753 | + if not matches: | ||
| 753 | return None | 754 | return None |
| 755 | + m = matches[0] | ||
| 754 | raw = next(g for g in m.groups() if g is not None) | 756 | raw = next(g for g in m.groups() if g is not None) |
| 755 | latex = raw.strip() | 757 | latex = raw.strip() |
| 756 | - is_display = bool(m.group(1) or m.group(4)) # $$ or \[ \] | ||
| 757 | - return latex, is_display | 758 | + is_display_raw = bool(m.group(1) or m.group(4)) # $$ or \[ \] |
| 759 | + is_standalone = ( | ||
| 760 | + len(matches) == 1 and | ||
| 761 | + not text[:m.start()].strip() and | ||
| 762 | + not text[m.end():].strip() | ||
| 763 | + ) | ||
| 764 | + return latex, bool(is_display_raw and is_standalone) | ||
| 758 | 765 | ||
| 759 | @staticmethod | 766 | @staticmethod |
| 760 | def _find_all_math_in_text(text: Any) -> list[tuple[str, bool]]: | 767 | def _find_all_math_in_text(text: Any) -> list[tuple[str, bool]]: |
| @@ -763,10 +770,21 @@ class PDFRenderer: | @@ -763,10 +770,21 @@ class PDFRenderer: | ||
| 763 | return [] | 770 | return [] |
| 764 | pattern = re.compile(r'\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\]', re.S) | 771 | pattern = re.compile(r'\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\]', re.S) |
| 765 | results = [] | 772 | results = [] |
| 766 | - for m in pattern.finditer(text): | 773 | + matches = list(pattern.finditer(text)) |
| 774 | + if not matches: | ||
| 775 | + return results | ||
| 776 | + total = len(matches) | ||
| 777 | + | ||
| 778 | + for m in matches: | ||
| 767 | raw = next(g for g in m.groups() if g is not None) | 779 | raw = next(g for g in m.groups() if g is not None) |
| 768 | latex = raw.strip() | 780 | latex = raw.strip() |
| 769 | - is_display = bool(m.group(1) or m.group(4)) | 781 | + is_display_raw = bool(m.group(1) or m.group(4)) |
| 782 | + is_standalone = ( | ||
| 783 | + total == 1 and | ||
| 784 | + not text[:m.start()].strip() and | ||
| 785 | + not text[m.end():].strip() | ||
| 786 | + ) | ||
| 787 | + is_display = is_display_raw and is_standalone | ||
| 770 | results.append((latex, is_display)) | 788 | results.append((latex, is_display)) |
| 771 | return results | 789 | return results |
| 772 | 790 |
-
Please register or log in to post a comment