马一丁

Optimize Re-output Logic

@@ -90,7 +90,8 @@ class ChapterGenerationNode(BaseNode): @@ -90,7 +90,8 @@ class ChapterGenerationNode(BaseNode):
90 } 90 }
91 # 章节若仅包含标题或字符过少则视为失败,强制LLM重新生成 91 # 章节若仅包含标题或字符过少则视为失败,强制LLM重新生成
92 _MIN_NON_HEADING_BLOCKS = 2 92 _MIN_NON_HEADING_BLOCKS = 2
93 - _MIN_BODY_CHARACTERS = 400 93 + _MIN_BODY_CHARACTERS = 600
  94 + _MIN_NARRATIVE_CHARACTERS = 300
94 _PARAGRAPH_FRAGMENT_MAX_CHARS = 80 95 _PARAGRAPH_FRAGMENT_MAX_CHARS = 80
95 _PARAGRAPH_FRAGMENT_NO_TERMINATOR_MAX_CHARS = 240 96 _PARAGRAPH_FRAGMENT_NO_TERMINATOR_MAX_CHARS = 240
96 _TERMINATION_PUNCTUATION = set("。!?!?;;……") 97 _TERMINATION_PUNCTUATION = set("。!?!?;;……")
@@ -659,10 +660,15 @@ class ChapterGenerationNode(BaseNode): @@ -659,10 +660,15 @@ class ChapterGenerationNode(BaseNode):
659 and block.get("type") not in {"heading", "divider", "toc"} 660 and block.get("type") not in {"heading", "divider", "toc"}
660 ] 661 ]
661 body_characters = self._count_body_characters(blocks) 662 body_characters = self._count_body_characters(blocks)
  663 + narrative_characters = self._count_narrative_characters(blocks)
662 664
663 - if len(non_heading_blocks) < self._MIN_NON_HEADING_BLOCKS or body_characters < self._MIN_BODY_CHARACTERS: 665 + if (
  666 + len(non_heading_blocks) < self._MIN_NON_HEADING_BLOCKS
  667 + or body_characters < self._MIN_BODY_CHARACTERS
  668 + or narrative_characters < self._MIN_NARRATIVE_CHARACTERS
  669 + ):
664 raise ChapterContentError( 670 raise ChapterContentError(
665 - f"{chapter.get('title') or '该章节'} 正文不足:有效区块 {len(non_heading_blocks)} 个,估算字符数 {body_characters}" 671 + f"{chapter.get('title') or '该章节'} 正文不足:有效区块 {len(non_heading_blocks)} 个,估算字符数 {body_characters},叙述性字符数 {narrative_characters}"
666 ) 672 )
667 673
668 def _count_body_characters(self, blocks: Any) -> int: 674 def _count_body_characters(self, blocks: Any) -> int:
@@ -696,19 +702,7 @@ class ChapterGenerationNode(BaseNode): @@ -696,19 +702,7 @@ class ChapterGenerationNode(BaseNode):
696 return 0 702 return 0
697 703
698 if block_type == "paragraph": 704 if block_type == "paragraph":
699 - inlines = node.get("inlines")  
700 - if isinstance(inlines, list):  
701 - total = 0  
702 - for run in inlines:  
703 - if isinstance(run, dict):  
704 - text = run.get("text")  
705 - if isinstance(text, str):  
706 - total += len(text.strip())  
707 - return total  
708 - text_value = node.get("text")  
709 - if isinstance(text_value, str):  
710 - return len(text_value.strip())  
711 - return len(self._extract_block_text(node).strip()) 705 + return self._estimate_paragraph_characters(node)
712 706
713 if block_type == "list": 707 if block_type == "list":
714 total = 0 708 total = 0
@@ -735,6 +729,57 @@ class ChapterGenerationNode(BaseNode): @@ -735,6 +729,57 @@ class ChapterGenerationNode(BaseNode):
735 729
736 return walk(blocks) 730 return walk(blocks)
737 731
  732 + def _count_narrative_characters(self, blocks: Any) -> int:
  733 + """
  734 + 统计paragraph/callout/list/blockquote等叙述性结构的字符数,避免被表格/图表“刷长”。
  735 + """
  736 +
  737 + def walk(node: Any) -> int:
  738 + if node is None:
  739 + return 0
  740 + if isinstance(node, list):
  741 + return sum(walk(item) for item in node)
  742 + if isinstance(node, str):
  743 + return len(node.strip())
  744 + if not isinstance(node, dict):
  745 + return 0
  746 +
  747 + block_type = node.get("type")
  748 + if block_type == "paragraph":
  749 + return self._estimate_paragraph_characters(node)
  750 + if block_type == "list":
  751 + total = 0
  752 + for item in node.get("items", []):
  753 + total += walk(item)
  754 + return total
  755 + if block_type in {"callout", "blockquote"}:
  756 + return walk(node.get("blocks"))
  757 +
  758 + # list项可能是匿名dict,兼容性遍历
  759 + if block_type is None:
  760 + nested = node.get("blocks")
  761 + if isinstance(nested, list):
  762 + return walk(nested)
  763 + return 0
  764 +
  765 + return walk(blocks)
  766 +
  767 + def _estimate_paragraph_characters(self, block: Dict[str, Any]) -> int:
  768 + """提取paragraph文本长度,复用在多种统计中。"""
  769 + inlines = block.get("inlines")
  770 + if isinstance(inlines, list):
  771 + total = 0
  772 + for run in inlines:
  773 + if isinstance(run, dict):
  774 + text = run.get("text")
  775 + if isinstance(text, str):
  776 + total += len(text.strip())
  777 + return total
  778 + text_value = block.get("text")
  779 + if isinstance(text_value, str):
  780 + return len(text_value.strip())
  781 + return len(self._extract_block_text(block).strip())
  782 +
738 def _sanitize_block_content(self, block: Dict[str, Any]): 783 def _sanitize_block_content(self, block: Dict[str, Any]):
739 """根据类型做精细化修复,例如清理paragraph内的非法inline mark""" 784 """根据类型做精细化修复,例如清理paragraph内的非法inline mark"""
740 block_type = block.get("type") 785 block_type = block.get("type")