Showing 1 changed file with 61 additions and 16 deletions
| @@ -90,7 +90,8 @@ class ChapterGenerationNode(BaseNode): | @@ -90,7 +90,8 @@ class ChapterGenerationNode(BaseNode): | ||
| 90 | } | 90 | } |
| 91 | # 章节若仅包含标题或字符过少则视为失败,强制LLM重新生成 | 91 | # 章节若仅包含标题或字符过少则视为失败,强制LLM重新生成 |
| 92 | _MIN_NON_HEADING_BLOCKS = 2 | 92 | _MIN_NON_HEADING_BLOCKS = 2 |
| 93 | - _MIN_BODY_CHARACTERS = 400 | 93 | + _MIN_BODY_CHARACTERS = 600 |
| 94 | + _MIN_NARRATIVE_CHARACTERS = 300 | ||
| 94 | _PARAGRAPH_FRAGMENT_MAX_CHARS = 80 | 95 | _PARAGRAPH_FRAGMENT_MAX_CHARS = 80 |
| 95 | _PARAGRAPH_FRAGMENT_NO_TERMINATOR_MAX_CHARS = 240 | 96 | _PARAGRAPH_FRAGMENT_NO_TERMINATOR_MAX_CHARS = 240 |
| 96 | _TERMINATION_PUNCTUATION = set("。!?!?;;……") | 97 | _TERMINATION_PUNCTUATION = set("。!?!?;;……") |
| @@ -659,10 +660,15 @@ class ChapterGenerationNode(BaseNode): | @@ -659,10 +660,15 @@ class ChapterGenerationNode(BaseNode): | ||
| 659 | and block.get("type") not in {"heading", "divider", "toc"} | 660 | and block.get("type") not in {"heading", "divider", "toc"} |
| 660 | ] | 661 | ] |
| 661 | body_characters = self._count_body_characters(blocks) | 662 | body_characters = self._count_body_characters(blocks) |
| 663 | + narrative_characters = self._count_narrative_characters(blocks) | ||
| 662 | 664 | ||
| 663 | - if len(non_heading_blocks) < self._MIN_NON_HEADING_BLOCKS or body_characters < self._MIN_BODY_CHARACTERS: | 665 | + if ( |
| 666 | + len(non_heading_blocks) < self._MIN_NON_HEADING_BLOCKS | ||
| 667 | + or body_characters < self._MIN_BODY_CHARACTERS | ||
| 668 | + or narrative_characters < self._MIN_NARRATIVE_CHARACTERS | ||
| 669 | + ): | ||
| 664 | raise ChapterContentError( | 670 | raise ChapterContentError( |
| 665 | - f"{chapter.get('title') or '该章节'} 正文不足:有效区块 {len(non_heading_blocks)} 个,估算字符数 {body_characters}" | 671 | + f"{chapter.get('title') or '该章节'} 正文不足:有效区块 {len(non_heading_blocks)} 个,估算字符数 {body_characters},叙述性字符数 {narrative_characters}" |
| 666 | ) | 672 | ) |
| 667 | 673 | ||
| 668 | def _count_body_characters(self, blocks: Any) -> int: | 674 | def _count_body_characters(self, blocks: Any) -> int: |
| @@ -696,19 +702,7 @@ class ChapterGenerationNode(BaseNode): | @@ -696,19 +702,7 @@ class ChapterGenerationNode(BaseNode): | ||
| 696 | return 0 | 702 | return 0 |
| 697 | 703 | ||
| 698 | if block_type == "paragraph": | 704 | if block_type == "paragraph": |
| 699 | - inlines = node.get("inlines") | ||
| 700 | - if isinstance(inlines, list): | ||
| 701 | - total = 0 | ||
| 702 | - for run in inlines: | ||
| 703 | - if isinstance(run, dict): | ||
| 704 | - text = run.get("text") | ||
| 705 | - if isinstance(text, str): | ||
| 706 | - total += len(text.strip()) | ||
| 707 | - return total | ||
| 708 | - text_value = node.get("text") | ||
| 709 | - if isinstance(text_value, str): | ||
| 710 | - return len(text_value.strip()) | ||
| 711 | - return len(self._extract_block_text(node).strip()) | 705 | + return self._estimate_paragraph_characters(node) |
| 712 | 706 | ||
| 713 | if block_type == "list": | 707 | if block_type == "list": |
| 714 | total = 0 | 708 | total = 0 |
| @@ -735,6 +729,57 @@ class ChapterGenerationNode(BaseNode): | @@ -735,6 +729,57 @@ class ChapterGenerationNode(BaseNode): | ||
| 735 | 729 | ||
| 736 | return walk(blocks) | 730 | return walk(blocks) |
| 737 | 731 | ||
| 732 | + def _count_narrative_characters(self, blocks: Any) -> int: | ||
| 733 | + """ | ||
| 734 | + 统计paragraph/callout/list/blockquote等叙述性结构的字符数,避免被表格/图表“刷长”。 | ||
| 735 | + """ | ||
| 736 | + | ||
| 737 | + def walk(node: Any) -> int: | ||
| 738 | + if node is None: | ||
| 739 | + return 0 | ||
| 740 | + if isinstance(node, list): | ||
| 741 | + return sum(walk(item) for item in node) | ||
| 742 | + if isinstance(node, str): | ||
| 743 | + return len(node.strip()) | ||
| 744 | + if not isinstance(node, dict): | ||
| 745 | + return 0 | ||
| 746 | + | ||
| 747 | + block_type = node.get("type") | ||
| 748 | + if block_type == "paragraph": | ||
| 749 | + return self._estimate_paragraph_characters(node) | ||
| 750 | + if block_type == "list": | ||
| 751 | + total = 0 | ||
| 752 | + for item in node.get("items", []): | ||
| 753 | + total += walk(item) | ||
| 754 | + return total | ||
| 755 | + if block_type in {"callout", "blockquote"}: | ||
| 756 | + return walk(node.get("blocks")) | ||
| 757 | + | ||
| 758 | + # list项可能是匿名dict,兼容性遍历 | ||
| 759 | + if block_type is None: | ||
| 760 | + nested = node.get("blocks") | ||
| 761 | + if isinstance(nested, list): | ||
| 762 | + return walk(nested) | ||
| 763 | + return 0 | ||
| 764 | + | ||
| 765 | + return walk(blocks) | ||
| 766 | + | ||
| 767 | + def _estimate_paragraph_characters(self, block: Dict[str, Any]) -> int: | ||
| 768 | + """提取paragraph文本长度,复用在多种统计中。""" | ||
| 769 | + inlines = block.get("inlines") | ||
| 770 | + if isinstance(inlines, list): | ||
| 771 | + total = 0 | ||
| 772 | + for run in inlines: | ||
| 773 | + if isinstance(run, dict): | ||
| 774 | + text = run.get("text") | ||
| 775 | + if isinstance(text, str): | ||
| 776 | + total += len(text.strip()) | ||
| 777 | + return total | ||
| 778 | + text_value = block.get("text") | ||
| 779 | + if isinstance(text_value, str): | ||
| 780 | + return len(text_value.strip()) | ||
| 781 | + return len(self._extract_block_text(block).strip()) | ||
| 782 | + | ||
| 738 | def _sanitize_block_content(self, block: Dict[str, Any]): | 783 | def _sanitize_block_content(self, block: Dict[str, Any]): |
| 739 | """根据类型做精细化修复,例如清理paragraph内的非法inline mark""" | 784 | """根据类型做精细化修复,例如清理paragraph内的非法inline mark""" |
| 740 | block_type = block.get("type") | 785 | block_type = block.get("type") |
-
Please register or login to post a comment