马一丁

Increase DeepSeek Compatibility

@@ -29,6 +29,7 @@ from .nodes import (
TemplateSelectionNode,
ChapterGenerationNode,
ChapterJsonParseError,
ChapterContentError,
DocumentLayoutNode,
WordBudgetNode,
)
@@ -438,20 +439,26 @@ class ReportAgent:
stream_callback=chunk_callback
)
break
-except ChapterJsonParseError as parse_error:
+except (ChapterJsonParseError, ChapterContentError) as structured_error:
error_kind = (
"content_sparse" if isinstance(structured_error, ChapterContentError) else "json_parse"
)
readable_label = "内容密度异常" if error_kind == "content_sparse" else "JSON解析失败"
logger.warning(
"章节 %s JSON解析失败(第 %s/%s 次尝试): %s",
"章节 %s %s(第 %s/%s 次尝试): %s",
section.title,
readable_label,
attempt,
chapter_max_attempts,
-parse_error,
+structured_error,
)
emit('chapter_status', {
'chapterId': section.chapter_id,
'title': section.title,
'status': 'retrying' if attempt < chapter_max_attempts else 'error',
'attempt': attempt,
-'error': str(parse_error),
+'error': str(structured_error),
'reason': error_kind,
})
if attempt >= chapter_max_attempts:
raise
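Consumers of the `chapter_status` event can branch on the new `reason` field to tell the two failure modes apart. A minimal sketch of such a handler (the handler and any event fields beyond those emitted above are hypothetical):

def on_chapter_status(event: dict):
    # 'reason' is attached only alongside the 'retrying'/'error' statuses
    if event.get('status') in ('retrying', 'error'):
        if event.get('reason') == 'content_sparse':
            print(f"{event['title']}: body too sparse, regenerating")
        elif event.get('reason') == 'json_parse':
            print(f"{event['title']}: chapter JSON could not be parsed")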
@@ -6,7 +6,7 @@ Report Engine node processing module.
from .base_node import BaseNode, StateMutationNode
from .template_selection_node import TemplateSelectionNode
-from .chapter_generation_node import ChapterGenerationNode, ChapterJsonParseError
+from .chapter_generation_node import ChapterGenerationNode, ChapterJsonParseError, ChapterContentError
from .document_layout_node import DocumentLayoutNode
from .word_budget_node import WordBudgetNode
@@ -16,6 +16,7 @@ __all__ = [
"TemplateSelectionNode",
"ChapterGenerationNode",
"ChapterJsonParseError",
"ChapterContentError",
"DocumentLayoutNode",
"WordBudgetNode",
]
@@ -36,6 +36,14 @@ class ChapterJsonParseError(ValueError):
self.raw_text = raw_text
class ChapterContentError(ValueError):
"""
Sparse chapter content error.
Raised when the LLM outputs only headings, or too little body text to support a chapter; drives a retry to protect report quality.
"""
class ChapterGenerationNode(BaseNode):
"""
Calls the LLM chapter by chapter and validates the JSON structure.
@@ -71,6 +79,12 @@ class ChapterGenerationNode(BaseNode):
"sub": "subscript",
"sup": "superscript",
}
# A chapter containing only headings, or too few characters, is treated as a failure that forces the LLM to regenerate it
_MIN_NON_HEADING_BLOCKS = 2
_MIN_BODY_CHARACTERS = 400
_PARAGRAPH_FRAGMENT_MAX_CHARS = 80
_PARAGRAPH_FRAGMENT_NO_TERMINATOR_MAX_CHARS = 240
_TERMINATION_PUNCTUATION = set("。!?!?;;……")
def __init__(self, llm_client, validator: IRValidator, storage: ChapterStorage):
"""
@@ -121,17 +135,32 @@ class ChapterGenerationNode(BaseNode):
self._sanitize_chapter_blocks(chapter_json)
valid, errors = self.validator.validate_chapter(chapter_json)
content_error: ChapterContentError | None = None
if valid:
try:
self._ensure_content_density(chapter_json)
except ChapterContentError as exc:
content_error = exc
error_messages: List[str] = []
if not valid and errors:
error_messages.extend(errors)
if content_error:
error_messages.append(str(content_error))
self.storage.persist_chapter(
run_dir,
chapter_meta,
chapter_json,
-errors=None if valid else errors,
+errors=error_messages or None,
)
if not valid:
raise ValueError(
f"{section.title} 章节JSON校验失败: {'; '.join(errors[:5])}"
)
if content_error:
raise content_error
return chapter_json
... ... @@ -488,6 +517,97 @@ class ChapterGenerationNode(BaseNode):
walk(chapter.get("blocks"))
blocks = chapter.get("blocks")
if isinstance(blocks, list):
chapter["blocks"] = self._merge_fragment_sequences(blocks)
def _ensure_content_density(self, chapter: Dict[str, Any]):
"""
Validate chapter body density.
If blocks are missing, nothing but headings remains, or the body character
count falls below the threshold, the chapter is treated as abnormal and
ChapterContentError is raised so the caller can retry.
"""
blocks = chapter.get("blocks")
if not isinstance(blocks, list) or not blocks:
raise ChapterContentError("章节缺少正文区块,无法输出内容")
non_heading_blocks = [
block
for block in blocks
if isinstance(block, dict)
and block.get("type") not in {"heading", "divider", "toc"}
]
body_characters = self._count_body_characters(blocks)
if len(non_heading_blocks) < self._MIN_NON_HEADING_BLOCKS or body_characters < self._MIN_BODY_CHARACTERS:
raise ChapterContentError(
f"{chapter.get('title') or '该章节'} 正文不足:有效区块 {len(non_heading_blocks)} 个,估算字符数 {body_characters}"
)
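To illustrate the thresholds, a chapter that comes back as a heading plus one stub paragraph fails both checks (hypothetical input):

sparse_chapter = {
    "title": "Overview",
    "blocks": [
        {"type": "heading", "level": 2, "text": "Overview"},
        {"type": "paragraph", "inlines": [{"text": "TBD."}]},
    ],
}
# One non-heading block (< _MIN_NON_HEADING_BLOCKS == 2) and about 4 body
# characters (< _MIN_BODY_CHARACTERS == 400), so _ensure_content_density()
# raises ChapterContentError and the agent retries the chapter.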
def _count_body_characters(self, blocks: Any) -> int:
"""
Recursively count body characters.
- Ignores non-body types such as heading/divider/widget;
- Extracts nested text from paragraph/list/table/callout structures;
- Intended only as a coarse check that the length is reasonable.
"""
def walk(node: Any) -> int:
if node is None:
return 0
if isinstance(node, list):
return sum(walk(item) for item in node)
if isinstance(node, str):
return len(node.strip())
if not isinstance(node, dict):
return 0
block_type = node.get("type")
if block_type in {"heading", "divider", "toc", "widget"}:
return 0
if block_type == "paragraph":
inlines = node.get("inlines")
if isinstance(inlines, list):
total = 0
for run in inlines:
if isinstance(run, dict):
text = run.get("text")
if isinstance(text, str):
total += len(text.strip())
return total
text_value = node.get("text")
if isinstance(text_value, str):
return len(text_value.strip())
return len(self._extract_block_text(node).strip())
if block_type == "list":
total = 0
for item in node.get("items", []):
total += walk(item)
return total
if block_type in {"blockquote", "callout"}:
return walk(node.get("blocks"))
if block_type == "table":
total = 0
for row in node.get("rows", []):
cells = row.get("cells") or []
for cell in cells:
total += walk(cell.get("blocks"))
return total
nested = node.get("blocks")
if isinstance(nested, list):
return walk(nested)
return len(self._extract_block_text(node).strip())
return walk(blocks)
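A quick sanity check of the counter on nested input (hypothetical blocks):

blocks = [
    {"type": "heading", "text": "ignored"},
    {"type": "paragraph", "inlines": [{"text": "hello "}, {"text": "world"}]},
    {"type": "list", "items": [[{"type": "paragraph", "inlines": [{"text": "item"}]}]]},
]
# The heading contributes 0; "hello " strips to 5 characters, "world" adds 5,
# and the list item adds 4, so _count_body_characters(blocks) == 14.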
def _sanitize_block_content(self, block: Dict[str, Any]):
"""根据类型做精细化修复,例如清理paragraph内的非法inline mark"""
block_type = block.get("type")
@@ -505,7 +625,134 @@
normalized_runs = [self._as_inline_run(self._extract_block_text(block))]
if not normalized_runs:
normalized_runs = [self._as_inline_run("")]
block["inlines"] = normalized_runs
block["inlines"] = self._strip_inline_artifacts(normalized_runs)
def _strip_inline_artifacts(self, inlines: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""移除被LLM误写入的JSON哨兵文本,防止渲染出`{\"type\": \"\"}`等垃圾字符"""
cleaned: List[Dict[str, Any]] = []
for run in inlines or []:
if not isinstance(run, dict):
continue
text = run.get("text")
if isinstance(text, str):
stripped = text.strip()
if stripped.startswith("{") and stripped.endswith("}"):
try:
payload = json.loads(stripped)
except json.JSONDecodeError:
payload = None
if isinstance(payload, dict) and set(payload.keys()).issubset({"type", "value"}):
continue
cleaned.append(run)
return cleaned or [self._as_inline_run("")]
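For example, a run whose text is nothing but a JSON sentinel is dropped, while real sentences and JSON-looking payloads with other keys survive (hypothetical runs):

runs = [
    {"text": '{"type": ""}'},      # parses to a {type}-only dict -> dropped
    {"text": "An actual sentence."},
    {"text": '{"name": "x"}'},     # keys outside {type, value} -> kept
]
# node._strip_inline_artifacts(runs) returns the last two runs.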
def _merge_fragment_sequences(self, blocks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""合并被LLM拆成多段的句子片段,避免HTML出现大量孤立<p>"""
if not isinstance(blocks, list):
return blocks
merged: List[Dict[str, Any]] = []
fragment_buffer: List[Dict[str, Any]] = []
def flush_buffer():
nonlocal fragment_buffer
if not fragment_buffer:
return
if len(fragment_buffer) == 1:
merged.append(fragment_buffer[0])
else:
merged.append(self._combine_paragraph_fragments(fragment_buffer))
fragment_buffer = []
for block in blocks:
if self._is_paragraph_fragment(block):
fragment_buffer.append(block)
continue
flush_buffer()
merged.append(self._merge_nested_fragments(block))
flush_buffer()
return merged
def _merge_nested_fragments(self, block: Dict[str, Any]) -> Dict[str, Any]:
"""对嵌套结构(callout/list/table)递归处理片段合并"""
block_type = block.get("type")
if block_type in {"callout", "blockquote"}:
nested = block.get("blocks")
if isinstance(nested, list):
block["blocks"] = self._merge_fragment_sequences(nested)
elif block_type == "list":
items = block.get("items")
if isinstance(items, list):
for entry in items:
if isinstance(entry, list):
merged_entry = self._merge_fragment_sequences(entry)
entry[:] = merged_entry
elif block_type == "table":
for row in block.get("rows", []):
cells = row.get("cells") or []
for cell in cells:
nested_blocks = cell.get("blocks")
if isinstance(nested_blocks, list):
cell["blocks"] = self._merge_fragment_sequences(nested_blocks)
return block
def _combine_paragraph_fragments(self, fragments: List[Dict[str, Any]]) -> Dict[str, Any]:
"""将多个句子片段合并为单个paragraph block"""
template = dict(fragments[0])
combined_inlines: List[Dict[str, Any]] = []
for fragment in fragments:
runs = fragment.get("inlines")
if isinstance(runs, list) and runs:
combined_inlines.extend(runs)
else:
fallback_text = self._extract_block_text(fragment)
combined_inlines.append(self._as_inline_run(fallback_text))
if not combined_inlines:
combined_inlines.append(self._as_inline_run(""))
template["inlines"] = combined_inlines
return template
def _is_paragraph_fragment(self, block: Dict[str, Any]) -> bool:
"""判断paragraph是否为被错误拆分的短片段"""
if not isinstance(block, dict) or block.get("type") != "paragraph":
return False
inlines = block.get("inlines")
text = ""
has_marks = False
if isinstance(inlines, list) and inlines:
parts: List[str] = []
for run in inlines:
if not isinstance(run, dict):
continue
parts.append(str(run.get("text") or ""))
marks = run.get("marks")
if isinstance(marks, list) and any(marks):
has_marks = True
text = "".join(parts)
else:
text = self._extract_block_text(block)
stripped = (text or "").strip()
if not stripped:
return True
if has_marks:
return False
if "\n" in stripped:
return False
short_limit = self._PARAGRAPH_FRAGMENT_MAX_CHARS
long_limit = getattr(
self,
"_PARAGRAPH_FRAGMENT_NO_TERMINATOR_MAX_CHARS",
short_limit * 3,
)
if stripped[-1] in self._TERMINATION_PUNCTUATION:
return len(stripped) <= short_limit
if len(stripped) > long_limit:
return False
return True
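Putting the two helpers together, a sentence the model emits as two short paragraphs collapses back into one (hypothetical blocks):

blocks = [
    {"type": "paragraph", "inlines": [{"text": "Revenue grew"}]},
    {"type": "paragraph", "inlines": [{"text": " 12% year over year."}]},
    {"type": "heading", "level": 2, "text": "Next section"},
]
# Both paragraphs are short, carry no marks and contain no line break, so
# _is_paragraph_fragment() accepts each; _merge_fragment_sequences() buffers
# the pair, emits one paragraph with the concatenated inlines, and appends
# the heading untouched.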
def _coerce_inline_run(self, run: Any) -> List[Dict[str, Any]]:
"""将任意inline写法规整为合法run"""
@@ -5,6 +5,7 @@
from __future__ import annotations
import ast
import copy
import html
import json
from typing import Any, Dict, List
@@ -19,6 +20,31 @@ class HTMLRenderer:
- Provides helpers such as theme variables and numbering maps.
"""
CALLOUT_ALLOWED_TYPES = {
"paragraph",
"list",
"table",
"blockquote",
"code",
"math",
"figure",
"kpiGrid",
}
INLINE_ARTIFACT_KEYS = {
"props",
"widgetId",
"widgetType",
"data",
"dataRef",
"datasets",
"labels",
"config",
"options",
}
TABLE_COMPLEX_CHARS = set(
"@%%()(),,。;;::、??!!·…-—_+<>[]{}|\\/\"'`~$^&*#"
)
def __init__(self, config: Dict[str, Any] | None = None):
"""初始化渲染器缓存并允许注入额外配置(如主题覆盖)"""
self.config = config or {}
@@ -72,6 +98,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>{self._escape_html(title)}</title>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<script src="https://cdn.jsdelivr.net/npm/chartjs-chart-sankey@4"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html2canvas/1.4.1/html2canvas.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/jspdf/2.5.1/jspdf.umd.min.js"></script>
<script>
@@ -442,8 +469,9 @@
def _render_table(self, block: Dict[str, Any]) -> str:
"""渲染表格,同时保留caption与单元格属性"""
rows = self._normalize_table_rows(block.get("rows") or [])
rows_html = ""
for row in block.get("rows", []):
for row in rows:
row_cells = ""
for cell in row.get("cells", []):
cell_tag = "th" if cell.get("header") or cell.get("isHeader") else "td"
@@ -462,6 +490,105 @@
caption_html = f"<caption>{self._escape_html(caption)}</caption>" if caption else ""
return f'<div class="table-wrap"><table>{caption_html}<tbody>{rows_html}</tbody></table></div>'
def _normalize_table_rows(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""检测并修正仅有单列的竖排表,转换为标准网格"""
if not rows:
return []
if not all(len((row.get("cells") or [])) == 1 for row in rows):
return rows
texts = [self._extract_row_text(row) for row in rows]
header_span = self._detect_transposed_header_span(rows, texts)
if not header_span:
return rows
normalized = self._transpose_single_cell_table(rows, header_span)
return normalized or rows
def _detect_transposed_header_span(self, rows: List[Dict[str, Any]], texts: List[str]) -> int:
"""推断竖排表头的行数,用于后续转置"""
max_fields = min(8, len(rows) // 2)
header_span = 0
for idx, text in enumerate(texts):
if idx >= max_fields:
break
if self._is_potential_table_header(text):
header_span += 1
else:
break
if header_span < 2:
return 0
remainder = texts[header_span:]
if not remainder or (len(rows) - header_span) % header_span != 0:
return 0
if not any(self._looks_like_table_value(txt) for txt in remainder):
return 0
return header_span
def _is_potential_table_header(self, text: str) -> bool:
"""根据长度与字符特征判断是否像表头字段"""
if not text:
return False
stripped = text.strip()
if not stripped or len(stripped) > 12:
return False
return not any(ch.isdigit() or ch in self.TABLE_COMPLEX_CHARS for ch in stripped)
def _looks_like_table_value(self, text: str) -> bool:
"""判断该文本是否更像数据值,用于辅助判断转置"""
if not text:
return False
stripped = text.strip()
if len(stripped) >= 12:
return True
return any(ch.isdigit() or ch in self.TABLE_COMPLEX_CHARS for ch in stripped)
def _transpose_single_cell_table(self, rows: List[Dict[str, Any]], span: int) -> List[Dict[str, Any]]:
"""将单列多行的表格转换为标准表头 + 若干数据行"""
total = len(rows)
if total <= span or (total - span) % span != 0:
return []
header_rows = rows[:span]
data_rows = rows[span:]
normalized: List[Dict[str, Any]] = []
header_cells = []
for row in header_rows:
cell = copy.deepcopy((row.get("cells") or [{}])[0])
cell["header"] = True
header_cells.append(cell)
normalized.append({"cells": header_cells})
for start in range(0, len(data_rows), span):
group = data_rows[start : start + span]
if len(group) < span:
break
normalized.append(
{
"cells": [
copy.deepcopy((item.get("cells") or [{}])[0])
for item in group
]
}
)
return normalized
def _extract_row_text(self, row: Dict[str, Any]) -> str:
"""提取表格行中的纯文本,方便启发式分析"""
cells = row.get("cells") or []
if not cells:
return ""
cell = cells[0]
texts: List[str] = []
for block in cell.get("blocks", []):
if isinstance(block, dict):
if block.get("type") == "paragraph":
for inline in block.get("inlines") or []:
if isinstance(inline, dict):
value = inline.get("text")
else:
value = inline
if value is None:
continue
texts.append(str(value))
return "".join(texts)
def _render_blockquote(self, block: Dict[str, Any]) -> str:
"""渲染引用块,可嵌套其他block"""
inner = self._render_blocks(block.get("blocks", []))
@@ -487,9 +614,63 @@
"""渲染高亮提示盒,tone决定颜色"""
tone = block.get("tone", "info")
title = block.get("title")
inner = self._render_blocks(block.get("blocks", []))
safe_blocks, trailing_blocks = self._split_callout_content(block.get("blocks"))
inner = self._render_blocks(safe_blocks)
title_html = f"<strong>{self._escape_html(title)}</strong>" if title else ""
return f'<div class="callout tone-{tone}">{title_html}{inner}</div>'
callout_html = f'<div class="callout tone-{tone}">{title_html}{inner}</div>'
trailing_html = self._render_blocks(trailing_blocks) if trailing_blocks else ""
return callout_html + trailing_html
def _split_callout_content(
self, blocks: List[Dict[str, Any]] | None
) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
"""限定callout内部仅包含轻量内容,其余块剥离到外层"""
if not blocks:
return [], []
safe: List[Dict[str, Any]] = []
trailing: List[Dict[str, Any]] = []
for idx, child in enumerate(blocks):
child_type = child.get("type")
if child_type == "list":
sanitized, overflow = self._sanitize_callout_list(child)
if sanitized:
safe.append(sanitized)
if overflow:
trailing.extend(overflow)
trailing.extend(copy.deepcopy(blocks[idx + 1 :]))
break
elif child_type in self.CALLOUT_ALLOWED_TYPES:
safe.append(child)
else:
trailing.extend(copy.deepcopy(blocks[idx:]))
break
else:
return safe, []
return safe, trailing
def _sanitize_callout_list(
self, block: Dict[str, Any]
) -> tuple[Dict[str, Any] | None, List[Dict[str, Any]]]:
"""当列表项包含结构型block时,将其截断移出callout"""
items = block.get("items") or []
if not items:
return block, []
sanitized_items: List[List[Dict[str, Any]]] = []
trailing: List[Dict[str, Any]] = []
for idx, item in enumerate(items):
safe, overflow = self._split_callout_content(item)
if safe:
sanitized_items.append(safe)
if overflow:
trailing.extend(overflow)
for rest in items[idx + 1 :]:
trailing.extend(copy.deepcopy(rest))
break
if not sanitized_items:
return None, trailing
new_block = copy.deepcopy(block)
new_block["items"] = sanitized_items
return new_block, trailing
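For instance, a callout that mixes a paragraph with a chart widget is split so the widget renders as a sibling after the callout (hypothetical blocks):

callout_blocks = [
    {"type": "paragraph", "inlines": [{"text": "Key takeaway."}]},
    {"type": "widget", "widgetType": "chart.js/bar", "data": {}},
]
# "widget" is not in CALLOUT_ALLOWED_TYPES, so _split_callout_content()
# keeps the paragraph as safe content and returns the widget (and anything
# after it) as trailing blocks, which _render_callout() emits after the
# closing </div>.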
def _render_kpi_grid(self, block: Dict[str, Any]) -> str:
"""渲染KPI卡片栅格,包含指标值与涨跌幅"""
@@ -631,6 +812,8 @@
nested_marks = inline_payload.get("marks")
if isinstance(nested_marks, list):
marks.extend(nested_marks)
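# Payloads carrying widget/config keys are chart definitions the LLM leaked
# into inline text; blank them instead of rendering raw JSON.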
elif any(key in payload for key in self.INLINE_ARTIFACT_KEYS):
text_value = ""
return text_value, marks
@@ -1281,10 +1464,11 @@ function mergeOptions(base, override) {
}
function resolveChartTypes(payload) {
const explicit = payload && payload.props && payload.props.type;
const widgetType = payload && payload.widgetType ? payload.widgetType : 'chart.js/bar';
-const primary = widgetType.includes('/') ? widgetType.split('/').pop() : widgetType;
+const derived = widgetType && widgetType.includes('/') ? widgetType.split('/').pop() : widgetType;
const extra = Array.isArray(payload && payload.preferredTypes) ? payload.preferredTypes : [];
-const pipeline = [primary, ...extra, ...STABLE_CHART_TYPES];
+const pipeline = [explicit, derived, ...extra, ...STABLE_CHART_TYPES].filter(Boolean);
const result = [];
pipeline.forEach(type => {
if (type && !result.includes(type)) {
@@ -1456,6 +1640,15 @@ function buildChartOptions(payload) {
}
function instantiateChart(ctx, payload, optionsTemplate, type) {
if (!ctx) {
return null;
}
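// Chart.js registers each instance against its canvas; without destroying
// the stale chart first, a retry with a fallback type would throw
// "Canvas is already in use".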
if (ctx.canvas && typeof Chart !== 'undefined' && typeof Chart.getChart === 'function') {
const existing = Chart.getChart(ctx.canvas);
if (existing) {
existing.destroy();
}
}
const data = cloneDeep(payload && payload.data ? payload.data : {});
const config = {
type,