Showing
2 changed files
with
54 additions
and
1 deletion
| @@ -1199,6 +1199,11 @@ class HTMLRenderer: | @@ -1199,6 +1199,11 @@ class HTMLRenderer: | ||
| 1199 | def _render_paragraph(self, block: Dict[str, Any]) -> str: | 1199 | def _render_paragraph(self, block: Dict[str, Any]) -> str: |
| 1200 | """渲染段落,内部通过inline run保持混排样式""" | 1200 | """渲染段落,内部通过inline run保持混排样式""" |
| 1201 | inlines_data = block.get("inlines", []) | 1201 | inlines_data = block.get("inlines", []) |
| 1202 | + | ||
| 1203 | + # 检测并跳过包含文档元数据 JSON 的段落 | ||
| 1204 | + if self._is_metadata_paragraph(inlines_data): | ||
| 1205 | + return "" | ||
| 1206 | + | ||
| 1202 | # 仅包含单个display公式时直接渲染为块,避免<p>内嵌<div> | 1207 | # 仅包含单个display公式时直接渲染为块,避免<p>内嵌<div> |
| 1203 | if len(inlines_data) == 1: | 1208 | if len(inlines_data) == 1: |
| 1204 | standalone = self._render_standalone_math_inline(inlines_data[0]) | 1209 | standalone = self._render_standalone_math_inline(inlines_data[0]) |
| @@ -1208,6 +1213,28 @@ class HTMLRenderer: | @@ -1208,6 +1213,28 @@ class HTMLRenderer: | ||
| 1208 | inlines = "".join(self._render_inline(run) for run in inlines_data) | 1213 | inlines = "".join(self._render_inline(run) for run in inlines_data) |
| 1209 | return f"<p>{inlines}</p>" | 1214 | return f"<p>{inlines}</p>" |
| 1210 | 1215 | ||
| 1216 | + def _is_metadata_paragraph(self, inlines: List[Any]) -> bool: | ||
| 1217 | + """ | ||
| 1218 | + 检测段落是否只包含文档元数据 JSON。 | ||
| 1219 | + | ||
| 1220 | + 某些 LLM 生成的内容会将元数据(如 xrefs、widgets、footnotes、metadata) | ||
| 1221 | + 错误地作为段落内容输出,本方法识别并标记这种情况以便跳过渲染。 | ||
| 1222 | + """ | ||
| 1223 | + if not inlines or len(inlines) != 1: | ||
| 1224 | + return False | ||
| 1225 | + first = inlines[0] | ||
| 1226 | + if not isinstance(first, dict): | ||
| 1227 | + return False | ||
| 1228 | + text = first.get("text", "") | ||
| 1229 | + if not isinstance(text, str): | ||
| 1230 | + return False | ||
| 1231 | + text = text.strip() | ||
| 1232 | + if not text.startswith("{") or not text.endswith("}"): | ||
| 1233 | + return False | ||
| 1234 | + # 检测典型的元数据键 | ||
| 1235 | + metadata_indicators = ['"xrefs"', '"widgets"', '"footnotes"', '"metadata"', '"sectionBudgets"'] | ||
| 1236 | + return any(indicator in text for indicator in metadata_indicators) | ||
| 1237 | + | ||
| 1211 | def _render_standalone_math_inline(self, run: Dict[str, Any] | str) -> str | None: | 1238 | def _render_standalone_math_inline(self, run: Dict[str, Any] | str) -> str | None: |
| 1212 | """当段落只包含单个display公式时,转为math-block避免破坏行内布局""" | 1239 | """当段落只包含单个display公式时,转为math-block避免破坏行内布局""" |
| 1213 | if isinstance(run, dict): | 1240 | if isinstance(run, dict): |
| @@ -146,7 +146,33 @@ class MarkdownRenderer: | @@ -146,7 +146,33 @@ class MarkdownRenderer: | ||
| 146 | return heading_line | 146 | return heading_line |
| 147 | 147 | ||
def _render_paragraph(self, block: Dict[str, Any]) -> str:
    """
    Render a paragraph block from its inline runs.

    Reads the block's "inlines" entry (defaulting to an empty list) and
    returns an empty string when the paragraph is flagged as stray
    document-metadata JSON; otherwise returns the rendered inline runs.
    """
    runs = block.get("inlines", [])
    # Paragraphs that only carry metadata JSON are dropped from the output.
    return "" if self._is_metadata_paragraph(runs) else self._render_inlines(runs)
| 154 | + | ||
| 155 | + def _is_metadata_paragraph(self, inlines: List[Any]) -> bool: | ||
| 156 | + """ | ||
| 157 | + 检测段落是否只包含文档元数据 JSON。 | ||
| 158 | + | ||
| 159 | + 某些 LLM 生成的内容会将元数据(如 xrefs、widgets、footnotes、metadata) | ||
| 160 | + 错误地作为段落内容输出,本方法识别并标记这种情况以便跳过渲染。 | ||
| 161 | + """ | ||
| 162 | + if not inlines or len(inlines) != 1: | ||
| 163 | + return False | ||
| 164 | + first = inlines[0] | ||
| 165 | + if not isinstance(first, dict): | ||
| 166 | + return False | ||
| 167 | + text = first.get("text", "") | ||
| 168 | + if not isinstance(text, str): | ||
| 169 | + return False | ||
| 170 | + text = text.strip() | ||
| 171 | + if not text.startswith("{") or not text.endswith("}"): | ||
| 172 | + return False | ||
| 173 | + # 检测典型的元数据键 | ||
| 174 | + metadata_indicators = ['"xrefs"', '"widgets"', '"footnotes"', '"metadata"', '"sectionBudgets"'] | ||
| 175 | + return any(indicator in text for indicator in metadata_indicators) | ||
| 150 | 176 | ||
| 151 | def _render_list(self, block: Dict[str, Any]) -> str: | 177 | def _render_list(self, block: Dict[str, Any]) -> str: |
| 152 | list_type = block.get("listType", "bullet") | 178 | list_type = block.get("listType", "bullet") |
-
Please register or login to post a comment