Showing 2 changed files with 110 additions and 2 deletions
| @@ -885,7 +885,12 @@ class HTMLRenderer: | @@ -885,7 +885,12 @@ class HTMLRenderer: | ||
| 885 | """粗略判断dict是否符合block结构""" | 885 | """粗略判断dict是否符合block结构""" |
| 886 | if not isinstance(payload, dict): | 886 | if not isinstance(payload, dict): |
| 887 | return False | 887 | return False |
| 888 | - if "type" in payload and isinstance(payload["type"], str): | 888 | + block_type = payload.get("type") |
| 889 | + if block_type and isinstance(block_type, str): | ||
| 890 | + # 排除内联类型(inlineRun 等),它们不是块级元素 | ||
| 891 | + inline_types = {"inlineRun", "inline", "text"} | ||
| 892 | + if block_type in inline_types: | ||
| 893 | + return False | ||
| 889 | return True | 894 | return True |
| 890 | structural_keys = {"blocks", "rows", "items", "widgetId", "widgetType", "data"} | 895 | structural_keys = {"blocks", "rows", "items", "widgetId", "widgetType", "data"} |
| 891 | return any(key in payload for key in structural_keys) | 896 | return any(key in payload for key in structural_keys) |
| @@ -896,6 +901,12 @@ class HTMLRenderer: | @@ -896,6 +901,12 @@ class HTMLRenderer: | ||
| 896 | if isinstance(payload, dict): | 901 | if isinstance(payload, dict): |
| 897 | block_list = payload.get("blocks") | 902 | block_list = payload.get("blocks") |
| 898 | block_type = payload.get("type") | 903 | block_type = payload.get("type") |
| 904 | + | ||
| 905 | + # 排除内联类型,它们不是块级元素 | ||
| 906 | + inline_types = {"inlineRun", "inline", "text"} | ||
| 907 | + if block_type in inline_types: | ||
| 908 | + return collected | ||
| 909 | + | ||
| 899 | if isinstance(block_list, list) and not block_type: | 910 | if isinstance(block_list, list) and not block_type: |
| 900 | for candidate in block_list: | 911 | for candidate in block_list: |
| 901 | collected.extend(self._collect_blocks_from_payload(candidate)) | 912 | collected.extend(self._collect_blocks_from_payload(candidate)) |
| @@ -2933,6 +2944,19 @@ class HTMLRenderer: | @@ -2933,6 +2944,19 @@ class HTMLRenderer: | ||
| 2933 | if not isinstance(run, dict): | 2944 | if not isinstance(run, dict): |
| 2934 | return ("" if run is None else str(run)), [] | 2945 | return ("" if run is None else str(run)), [] |
| 2935 | 2946 | ||
| 2947 | + # 处理 inlineRun 类型:递归展开其 inlines 数组 | ||
| 2948 | + if run.get("type") == "inlineRun": | ||
| 2949 | + inner_inlines = run.get("inlines") or [] | ||
| 2950 | + outer_marks = run.get("marks") or [] | ||
| 2951 | + # 递归合并所有内部 inlines 的文本 | ||
| 2952 | + texts = [] | ||
| 2953 | + all_marks = list(outer_marks) | ||
| 2954 | + for inline in inner_inlines: | ||
| 2955 | + inner_text, inner_marks = self._normalize_inline_payload(inline) | ||
| 2956 | + texts.append(inner_text) | ||
| 2957 | + all_marks.extend(inner_marks) | ||
| 2958 | + return "".join(texts), all_marks | ||
| 2959 | + | ||
| 2936 | marks = list(run.get("marks") or []) | 2960 | marks = list(run.get("marks") or []) |
| 2937 | text_value: Any = run.get("text", "") | 2961 | text_value: Any = run.get("text", "") |
| 2938 | seen: set[int] = set() | 2962 | seen: set[int] = set() |
| @@ -2980,6 +3004,9 @@ class HTMLRenderer: | @@ -2980,6 +3004,9 @@ class HTMLRenderer: | ||
| 2980 | else: | 3004 | else: |
| 2981 | inline_payload = self._coerce_inline_payload(payload) | 3005 | inline_payload = self._coerce_inline_payload(payload) |
| 2982 | if inline_payload: | 3006 | if inline_payload: |
| 3007 | + # 处理 inlineRun 类型 | ||
| 3008 | + if inline_payload.get("type") == "inlineRun": | ||
| 3009 | + return self._normalize_inline_payload(inline_payload) | ||
| 2983 | nested_text = inline_payload.get("text") | 3010 | nested_text = inline_payload.get("text") |
| 2984 | if nested_text is not None: | 3011 | if nested_text is not None: |
| 2985 | text_value = nested_text | 3012 | text_value = nested_text |
| @@ -3073,9 +3100,12 @@ class HTMLRenderer: | @@ -3073,9 +3100,12 @@ class HTMLRenderer: | ||
| 3073 | if not isinstance(payload, dict): | 3100 | if not isinstance(payload, dict): |
| 3074 | return None | 3101 | return None |
| 3075 | inline_type = payload.get("type") | 3102 | inline_type = payload.get("type") |
| 3103 | + # 支持 inlineRun 类型:包含嵌套的 inlines 数组 | ||
| 3104 | + if inline_type == "inlineRun": | ||
| 3105 | + return payload | ||
| 3076 | if inline_type and inline_type not in {"inline", "text"}: | 3106 | if inline_type and inline_type not in {"inline", "text"}: |
| 3077 | return None | 3107 | return None |
| 3078 | - if "text" not in payload and "marks" not in payload: | 3108 | + if "text" not in payload and "marks" not in payload and "inlines" not in payload: |
| 3079 | return None | 3109 | return None |
| 3080 | return payload | 3110 | return payload |
| 3081 | 3111 |
| @@ -647,11 +647,29 @@ class MarkdownRenderer: | @@ -647,11 +647,29 @@ class MarkdownRenderer: | ||
| 647 | 647 | ||
| 648 | def _render_inline_run(self, run: Any, for_table: bool = False) -> str: | 648 | def _render_inline_run(self, run: Any, for_table: bool = False) -> str: |
| 649 | if isinstance(run, dict): | 649 | if isinstance(run, dict): |
| 650 | + # 处理 inlineRun 类型:嵌套的 inlines 数组 | ||
| 651 | + if run.get("type") == "inlineRun": | ||
| 652 | + inner_inlines = run.get("inlines") or [] | ||
| 653 | + outer_marks = run.get("marks") or [] | ||
| 654 | + # 递归渲染内部的 inlines | ||
| 655 | + inner_text = self._render_inlines(inner_inlines, for_table=for_table) | ||
| 656 | + # 应用外层的 marks | ||
| 657 | + result = inner_text | ||
| 658 | + for mark in outer_marks: | ||
| 659 | + result = self._apply_mark(result, mark) | ||
| 660 | + return result | ||
| 650 | text = run.get("text", "") | 661 | text = run.get("text", "") |
| 651 | marks = run.get("marks") or [] | 662 | marks = run.get("marks") or [] |
| 652 | else: | 663 | else: |
| 653 | text = run if isinstance(run, str) else "" | 664 | text = run if isinstance(run, str) else "" |
| 654 | marks = [] | 665 | marks = [] |
| 666 | + | ||
| 667 | + # 尝试检测并解析被错误序列化为字符串的 inlineRun JSON | ||
| 668 | + if isinstance(text, str) and text.startswith('{"type": "inlineRun"'): | ||
| 669 | + parsed = self._try_parse_inline_run_string(text) | ||
| 670 | + if parsed: | ||
| 671 | + return self._render_inline_run(parsed, for_table=for_table) | ||
| 672 | + | ||
| 655 | result = self._escape_text(text, for_table=for_table) | 673 | result = self._escape_text(text, for_table=for_table) |
| 656 | for mark in marks: | 674 | for mark in marks: |
| 657 | if not isinstance(mark, dict): | 675 | if not isinstance(mark, dict): |
| @@ -683,6 +701,66 @@ class MarkdownRenderer: | @@ -683,6 +701,66 @@ class MarkdownRenderer: | ||
| 683 | # 颜色/字体等非通用标记直接降级为纯文本 | 701 | # 颜色/字体等非通用标记直接降级为纯文本 |
| 684 | return result | 702 | return result |
| 685 | 703 | ||
| 704 | + def _apply_mark(self, text: str, mark: Any) -> str: | ||
| 705 | + """ | ||
| 706 | + 对文本应用单个 mark 格式。 | ||
| 707 | + | ||
| 708 | + 用于处理 inlineRun 类型的外层 marks。 | ||
| 709 | + """ | ||
| 710 | + if not isinstance(mark, dict): | ||
| 711 | + return text | ||
| 712 | + mtype = mark.get("type") | ||
| 713 | + if mtype == "bold": | ||
| 714 | + return f"**{text}**" | ||
| 715 | + elif mtype == "italic": | ||
| 716 | + return f"*{text}*" | ||
| 717 | + elif mtype == "underline": | ||
| 718 | + return f"__{text}__" | ||
| 719 | + elif mtype == "strike": | ||
| 720 | + return f"~~{text}~~" | ||
| 721 | + elif mtype == "code": | ||
| 722 | + return f"`{text}`" | ||
| 723 | + elif mtype == "link": | ||
| 724 | + href = mark.get("href") or mark.get("value") | ||
| 725 | + href = str(href) if href else "" | ||
| 726 | + return f"[{text}]({href})" if href else text | ||
| 727 | + elif mtype == "highlight": | ||
| 728 | + return f"=={text}==" | ||
| 729 | + elif mtype == "subscript": | ||
| 730 | + return f"~{text}~" | ||
| 731 | + elif mtype == "superscript": | ||
| 732 | + return f"^{text}^" | ||
| 733 | + elif mtype == "math": | ||
| 734 | + latex = self._normalize_math(mark.get("value") or text) | ||
| 735 | + return f"${latex}$" if latex else text | ||
| 736 | + return text | ||
| 737 | + | ||
| 738 | + def _try_parse_inline_run_string(self, text: str) -> dict | None: | ||
| 739 | + """ | ||
| 740 | + 尝试解析被错误序列化为字符串的 inlineRun JSON。 | ||
| 741 | + | ||
| 742 | + 某些 LLM 生成的内容会将 inlineRun 结构意外地作为字符串 | ||
| 743 | + 存入 text 字段,本方法尝试识别并解析这种情况。 | ||
| 744 | + | ||
| 745 | + 参数: | ||
| 746 | + text: 可能包含 JSON 的字符串 | ||
| 747 | + | ||
| 748 | + 返回: | ||
| 749 | + dict | None: 解析成功返回 inlineRun 字典,否则返回 None | ||
| 750 | + """ | ||
| 751 | + if not text or not isinstance(text, str): | ||
| 752 | + return None | ||
| 753 | + text = text.strip() | ||
| 754 | + if not text.startswith('{"type": "inlineRun"'): | ||
| 755 | + return None | ||
| 756 | + try: | ||
| 757 | + parsed = json.loads(text) | ||
| 758 | + if isinstance(parsed, dict) and parsed.get("type") == "inlineRun": | ||
| 759 | + return parsed | ||
| 760 | + except json.JSONDecodeError: | ||
| 761 | + pass | ||
| 762 | + return None | ||
| 763 | + | ||
| 686 | def _is_heading_duplicate(self, block: Dict[str, Any], chapter_title: str | None) -> bool: | 764 | def _is_heading_duplicate(self, block: Dict[str, Any], chapter_title: str | None) -> bool: |
| 687 | """判断首个heading是否与章节标题重复""" | 765 | """判断首个heading是否与章节标题重复""" |
| 688 | if not isinstance(block, dict) or block.get("type") != "heading": | 766 | if not isinstance(block, dict) or block.get("type") != "heading": |
-
Please register or login to post a comment