Toggle navigation
Toggle navigation
This project
Loading...
Sign in
万朱浩
/
Venue-Ops
Go to a project
Toggle navigation
Projects
Groups
Snippets
Help
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
马一丁
2025-11-17 21:05:00 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
b31be56297761473ab690f4a6782ad599c14b81e
b31be562
1 parent
f6714a35
Fixed Directory Parsing Issues and Optimized Directory Rendering
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
215 additions
and
8 deletions
ReportEngine/nodes/document_layout_node.py
ReportEngine/prompts/prompts.py
ReportEngine/renderers/html_renderer.py
ReportEngine/nodes/document_layout_node.py
View file @
b31be56
...
...
@@ -114,16 +114,94 @@ class DocumentLayoutNode(BaseNode):
if
not
isinstance
(
result
.
get
(
"title"
),
str
):
logger
.
warning
(
"文档设计缺少title字段或类型错误,使用默认值"
)
result
.
setdefault
(
"title"
,
"未命名报告"
)
if
not
isinstance
(
result
.
get
(
"toc"
),
(
list
,
dict
)):
logger
.
warning
(
"文档设计缺少toc字段或类型错误,使用空列表"
)
result
.
setdefault
(
"toc"
,
[])
# 处理tocPlan字段
toc_plan
=
result
.
get
(
"tocPlan"
,
[])
if
not
isinstance
(
toc_plan
,
list
):
logger
.
warning
(
"文档设计缺少tocPlan字段或类型错误,使用空列表"
)
result
[
"tocPlan"
]
=
[]
else
:
# 清理tocPlan中的description字段
result
[
"tocPlan"
]
=
self
.
_clean_toc_plan_descriptions
(
toc_plan
)
if
not
isinstance
(
result
.
get
(
"hero"
),
dict
):
logger
.
warning
(
"文档设计缺少hero字段或类型错误,使用空对象"
)
result
.
setdefault
(
"hero"
,
{})
return
result
except
JSONParseError
as
exc
:
# 转换为原有的异常类型以保持向后兼容
raise
ValueError
(
f
"文档设计JSON解析失败: {exc}"
)
from
exc
def
_clean_toc_plan_descriptions
(
self
,
toc_plan
:
List
[
Dict
[
str
,
Any
]])
->
List
[
Dict
[
str
,
Any
]]:
"""
清理tocPlan中每个条目的description字段,移除可能的JSON片段。
参数:
toc_plan: 原始的目录计划列表
返回:
List[Dict[str, Any]]: 清理后的目录计划列表
"""
import
re
def
clean_text
(
text
:
Any
)
->
str
:
"""清理文本中的JSON片段"""
if
not
text
or
not
isinstance
(
text
,
str
):
return
""
cleaned
=
text
# 移除以逗号+空白+{开头的不完整JSON对象
cleaned
=
re
.
sub
(
r',
\
s*
\
{[^}]*$'
,
''
,
cleaned
)
# 移除以逗号+空白+[开头的不完整JSON数组
cleaned
=
re
.
sub
(
r',
\
s*
\
[[^
\
]]*$'
,
''
,
cleaned
)
# 移除孤立的 { 加上后续内容(如果没有匹配的 })
open_brace_pos
=
cleaned
.
rfind
(
'{'
)
if
open_brace_pos
!=
-
1
:
close_brace_pos
=
cleaned
.
rfind
(
'}'
)
if
close_brace_pos
<
open_brace_pos
:
cleaned
=
cleaned
[:
open_brace_pos
]
.
rstrip
(
',,、
\t\n
'
)
# 移除孤立的 [ 加上后续内容(如果没有匹配的 ])
open_bracket_pos
=
cleaned
.
rfind
(
'['
)
if
open_bracket_pos
!=
-
1
:
close_bracket_pos
=
cleaned
.
rfind
(
']'
)
if
close_bracket_pos
<
open_bracket_pos
:
cleaned
=
cleaned
[:
open_bracket_pos
]
.
rstrip
(
',,、
\t\n
'
)
# 移除看起来像JSON键值对的片段
cleaned
=
re
.
sub
(
r',?
\
s*"[^"]+"
\
s*:
\
s*"[^"]*$'
,
''
,
cleaned
)
cleaned
=
re
.
sub
(
r',?
\
s*"[^"]+"
\
s*:
\
s*[^,}
\
]]*$'
,
''
,
cleaned
)
# 清理末尾的逗号和空白
cleaned
=
cleaned
.
rstrip
(
',,、
\t\n
'
)
return
cleaned
.
strip
()
cleaned_plan
=
[]
for
entry
in
toc_plan
:
if
not
isinstance
(
entry
,
dict
):
continue
# 清理description字段
if
"description"
in
entry
:
original_desc
=
entry
[
"description"
]
cleaned_desc
=
clean_text
(
original_desc
)
if
cleaned_desc
!=
original_desc
:
logger
.
warning
(
f
"清理目录项 '{entry.get('display', 'unknown')}' 的description字段中的JSON片段:
\n
"
f
" 原文: {original_desc[:100]}...
\n
"
f
" 清理后: {cleaned_desc[:100]}..."
)
entry
[
"description"
]
=
cleaned_desc
cleaned_plan
.
append
(
entry
)
return
cleaned_plan
__all__
=
[
"DocumentLayoutNode"
]
...
...
ReportEngine/prompts/prompts.py
View file @
b31be56
...
...
@@ -369,13 +369,21 @@ SYSTEM_PROMPT_DOCUMENT_LAYOUT = f"""
输入包含 templateOverview(模板标题+目录整体)、sections 列表以及多源报告,请先把模板标题和目录当成一个整体,与多引擎内容对照后设计标题与目录,再延伸出可直接渲染的视觉主题。你的输出会被独立存储以便后续拼接,请确保字段齐备。
目标:
1. 生成具有中文叙事风格的 title/subtitle/tagline,并确保可直接放在封面中央,文案中需自然提到
“文章总览”
;
1. 生成具有中文叙事风格的 title/subtitle/tagline,并确保可直接放在封面中央,文案中需自然提到
"文章总览"
;
2. 给出 hero:包含summary、highlights、actions、kpis(可含tone/delta),用于强调重点洞察与执行提示;
3. 输出 tocPlan,一级目录固定用中文数字(
“一、二、三”),二级目录用“1.1/1.2”
,可在description里说明详略;如需定制目录标题,请填写 tocTitle;
3. 输出 tocPlan,一级目录固定用中文数字(
"一、二、三"),二级目录用"1.1/1.2"
,可在description里说明详略;如需定制目录标题,请填写 tocTitle;
4. 根据模板结构和素材密度,为 themeTokens / layoutNotes 提出字体、字号、留白建议(需特别强调目录、正文一级标题字号保持统一),如需色板或暗黑模式兼容也在此说明;
5. 严禁要求外部图片或AI生图,推荐Chart.js图表、表格、色块、KPI卡等可直接渲染的原生组件;
6. 不随意增删章节,仅优化命名或描述;若有排版或章节合并提示,请放入 layoutNotes,渲染层会严格遵循。
**tocPlan的description字段特别要求:**
- description字段必须是纯文本描述,用于在目录中展示章节简介
- 严禁在description字段中嵌套JSON结构、对象、数组或任何特殊标记
- description应该是简洁的一句话或一小段话,描述该章节的核心内容
- 错误示例:{{"description": "描述内容,{{
\"
chapterId
\"
:
\"
S3
\"
}}"}}
- 正确示例:{{"description": "描述内容,详细分析章节要点"}}
- 如果需要关联chapterId,请使用tocPlan对象的chapterId字段,不要写在description中
输出必须满足下述JSON Schema:
<OUTPUT JSON SCHEMA>
{json.dumps(document_layout_output_schema, ensure_ascii=False, indent=2)}
...
...
@@ -391,7 +399,9 @@ SYSTEM_PROMPT_DOCUMENT_LAYOUT = f"""
- 括号必须成对且正确嵌套
- 不要使用尾随逗号(最后一个元素后不加逗号)
- 不要在JSON中添加注释
- description等文本字段中不得包含JSON结构
5. 所有字符串值使用双引号,数值不使用引号
6. 再次强调:tocPlan中每个条目的description必须是纯文本,不能包含任何JSON片段
"""
# 篇幅规划提示词
...
...
ReportEngine/renderers/html_renderer.py
View file @
b31be56
...
...
@@ -9,6 +9,7 @@ import copy
import
html
import
json
import
os
import
re
from
pathlib
import
Path
from
typing
import
Any
,
Dict
,
List
from
loguru
import
logger
...
...
@@ -451,23 +452,44 @@ class HTMLRenderer:
chapters: Document IR中的章节数组。
返回:
list[dict]: 规范化后的目录条目,包含level/text/anchor。
list[dict]: 规范化后的目录条目,包含level/text/anchor
/description
。
"""
metadata
=
self
.
metadata
toc_config
=
metadata
.
get
(
"toc"
)
or
{}
custom_entries
=
toc_config
.
get
(
"customEntries"
)
entries
:
List
[
Dict
[
str
,
Any
]]
=
[]
if
custom_entries
:
for
entry
in
custom_entries
:
anchor
=
entry
.
get
(
"anchor"
)
or
self
.
chapter_anchor_map
.
get
(
entry
.
get
(
"chapterId"
))
# 验证anchor是否有效
if
not
anchor
:
logger
.
warning
(
f
"目录项 '{entry.get('display') or entry.get('title')}' "
f
"缺少有效的anchor,已跳过"
)
continue
# 验证anchor是否在chapter_anchor_map中或在chapters的blocks中
anchor_valid
=
self
.
_validate_toc_anchor
(
anchor
,
chapters
)
if
not
anchor_valid
:
logger
.
warning
(
f
"目录项 '{entry.get('display') or entry.get('title')}' "
f
"的anchor '{anchor}' 在文档中未找到对应的章节"
)
# 清理描述文本
description
=
entry
.
get
(
"description"
)
if
description
:
description
=
self
.
_clean_text_from_json_artifacts
(
description
)
entries
.
append
(
{
"level"
:
entry
.
get
(
"level"
,
2
),
"text"
:
entry
.
get
(
"display"
)
or
entry
.
get
(
"title"
)
or
""
,
"anchor"
:
anchor
,
"description"
:
entry
.
get
(
"description"
)
,
"description"
:
description
,
}
)
return
entries
...
...
@@ -479,16 +501,52 @@ class HTMLRenderer:
if
not
anchor
:
continue
mapped
=
self
.
heading_label_map
.
get
(
anchor
,
{})
# 清理描述文本
description
=
mapped
.
get
(
"description"
)
if
description
:
description
=
self
.
_clean_text_from_json_artifacts
(
description
)
entries
.
append
(
{
"level"
:
block
.
get
(
"level"
,
2
),
"text"
:
mapped
.
get
(
"display"
)
or
block
.
get
(
"text"
,
""
),
"anchor"
:
anchor
,
"description"
:
mapped
.
get
(
"description"
)
,
"description"
:
description
,
}
)
return
entries
def
_validate_toc_anchor
(
self
,
anchor
:
str
,
chapters
:
List
[
Dict
[
str
,
Any
]])
->
bool
:
"""
验证目录anchor是否在文档中存在对应的章节或heading。
参数:
anchor: 需要验证的anchor
chapters: Document IR中的章节数组
返回:
bool: anchor是否有效
"""
# 检查是否是章节anchor
if
anchor
in
self
.
chapter_anchor_map
.
values
():
return
True
# 检查是否在heading_label_map中
if
anchor
in
self
.
heading_label_map
:
return
True
# 检查章节的blocks中是否有这个anchor
for
chapter
in
chapters
or
[]:
chapter_anchor
=
chapter
.
get
(
"anchor"
)
if
chapter_anchor
==
anchor
:
return
True
for
block
in
chapter
.
get
(
"blocks"
,
[]):
block_anchor
=
block
.
get
(
"anchor"
)
if
block_anchor
==
anchor
:
return
True
return
False
def
_prepare_chapters
(
self
,
chapters
:
List
[
Dict
[
str
,
Any
]])
->
List
[
Dict
[
str
,
Any
]]:
"""复制章节并展开其中序列化的block,避免渲染缺失"""
prepared
:
List
[
Dict
[
str
,
Any
]]
=
[]
...
...
@@ -640,6 +698,9 @@ class HTMLRenderer:
str: `<li>` 形式的HTML。
"""
desc
=
entry
.
get
(
"description"
)
# 清理描述文本中的JSON片段
if
desc
:
desc
=
self
.
_clean_text_from_json_artifacts
(
desc
)
desc_html
=
f
'<p class="toc-desc">{self._escape_html(desc)}</p>'
if
desc
else
""
level
=
entry
.
get
(
"level"
,
2
)
css_level
=
1
if
level
<=
2
else
min
(
level
,
4
)
...
...
@@ -1576,6 +1637,64 @@ class HTMLRenderer:
# ====== 文本 / 安全工具 ======
def
_clean_text_from_json_artifacts
(
self
,
text
:
Any
)
->
str
:
"""
清理文本中的JSON片段和伪造的结构标记。
LLM有时会在文本字段中混入未完成的JSON片段,如:
"描述文本,{
\"
chapterId
\"
:
\"
S3" 或 "描述文本,{
\"
level
\"
: 2"
此方法会:
1. 移除不完整的JSON对象(以 { 开头但未正确闭合的)
2. 移除不完整的JSON数组(以 [ 开头但未正确闭合的)
3. 移除孤立的JSON键值对片段
参数:
text: 可能包含JSON片段的文本
返回:
str: 清理后的纯文本
"""
if
not
text
:
return
""
text_str
=
self
.
_safe_text
(
text
)
# 模式1: 移除以逗号+空白+{开头的不完整JSON对象
# 例如: "文本,{ \"key\": \"value\"" 或 "文本,{\\n \"key\""
text_str
=
re
.
sub
(
r',
\
s*
\
{[^}]*$'
,
''
,
text_str
)
# 模式2: 移除以逗号+空白+[开头的不完整JSON数组
text_str
=
re
.
sub
(
r',
\
s*
\
[[^
\
]]*$'
,
''
,
text_str
)
# 模式3: 移除孤立的 { 加上后续内容(如果没有匹配的 })
# 检查是否有未闭合的 {
open_brace_pos
=
text_str
.
rfind
(
'{'
)
if
open_brace_pos
!=
-
1
:
close_brace_pos
=
text_str
.
rfind
(
'}'
)
if
close_brace_pos
<
open_brace_pos
:
# { 在 } 后面或没有 },说明是未闭合的
# 截断到 { 之前
text_str
=
text_str
[:
open_brace_pos
]
.
rstrip
(
',,、
\t\n
'
)
# 模式4: 类似处理 [
open_bracket_pos
=
text_str
.
rfind
(
'['
)
if
open_bracket_pos
!=
-
1
:
close_bracket_pos
=
text_str
.
rfind
(
']'
)
if
close_bracket_pos
<
open_bracket_pos
:
# [ 在 ] 后面或没有 ],说明是未闭合的
text_str
=
text_str
[:
open_bracket_pos
]
.
rstrip
(
',,、
\t\n
'
)
# 模式5: 移除看起来像JSON键值对的片段,如 "chapterId": "S3
# 这种情况通常出现在上面的模式之后
text_str
=
re
.
sub
(
r',?
\
s*"[^"]+"
\
s*:
\
s*"[^"]*$'
,
''
,
text_str
)
text_str
=
re
.
sub
(
r',?
\
s*"[^"]+"
\
s*:
\
s*[^,}
\
]]*$'
,
''
,
text_str
)
# 清理末尾的逗号和空白
text_str
=
text_str
.
rstrip
(
',,、
\t\n
'
)
return
text_str
.
strip
()
def
_safe_text
(
self
,
value
:
Any
)
->
str
:
"""将任意值安全转换为字符串,None与复杂对象容错"""
if
value
is
None
:
...
...
Please
register
or
login
to post a comment