Toggle navigation
Toggle navigation
This project
Loading...
Sign in
万朱浩
/
Venue-Ops
Go to a project
Toggle navigation
Projects
Groups
Snippets
Help
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
马一丁
2025-11-17 15:39:02 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
50b6ab403e6e7aa58ed23ec63104034a066fc3c6
50b6ab40
1 parent
26c133c9
Cleaning Data Returned by Report Engine's LLM
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
985 additions
and
64 deletions
ReportEngine/nodes/document_layout_node.py
ReportEngine/nodes/template_selection_node.py
ReportEngine/nodes/word_budget_node.py
ReportEngine/prompts/prompts.py
ReportEngine/utils/json_parser.py
ReportEngine/utils/test_json_parser.py
ReportEngine/nodes/document_layout_node.py
View file @
50b6ab4
...
...
@@ -14,6 +14,7 @@ from ..prompts import (
SYSTEM_PROMPT_DOCUMENT_LAYOUT
,
build_document_layout_prompt
,
)
from
..utils.json_parser
import
RobustJSONParser
,
JSONParseError
from
.base_node
import
BaseNode
...
...
@@ -27,6 +28,12 @@ class DocumentLayoutNode(BaseNode):
def __init__(self, llm_client):
    """Pass the LLM client and the node name "DocumentLayoutNode" to BaseNode."""
    super().__init__(llm_client, "DocumentLayoutNode")
    # Robust JSON parser: local fixes plus the json_repair library are on;
    # LLM-assisted repair stays off and can be enabled when needed.
    parser_options = {
        "enable_json_repair": True,
        "enable_llm_repair": False,
        "max_repair_attempts": 3,
    }
    self.json_parser = RobustJSONParser(**parser_options)
def
run
(
self
,
...
...
@@ -82,8 +89,14 @@ class DocumentLayoutNode(BaseNode):
"""
解析LLM返回的JSON文本,若失败则抛出友好错误。
使用鲁棒JSON解析器进行多重修复尝试:
1. 清理markdown标记和思考内容
2. 本地语法修复(括号平衡、逗号补全、控制字符转义等)
3. 使用json_repair库进行高级修复
4. 可选的LLM辅助修复
参数:
raw: LLM原始返回字符串,允许带```包裹。
raw: LLM原始返回字符串,允许带```包裹
、思考内容等
。
返回:
dict: 结构化的设计稿。
...
...
@@ -91,19 +104,25 @@ class DocumentLayoutNode(BaseNode):
异常:
ValueError: 当响应为空或JSON解析失败时抛出。
"""
cleaned
=
raw
.
strip
()
if
cleaned
.
startswith
(
"```json"
):
cleaned
=
cleaned
[
7
:]
if
cleaned
.
startswith
(
"```"
):
cleaned
=
cleaned
[
3
:]
if
cleaned
.
endswith
(
"```"
):
cleaned
=
cleaned
[:
-
3
]
cleaned
=
cleaned
.
strip
()
if
not
cleaned
:
raise
ValueError
(
"文档设计LLM返回空内容"
)
try
:
return
json
.
loads
(
cleaned
)
except
json
.
JSONDecodeError
as
exc
:
result
=
self
.
json_parser
.
parse
(
raw
,
context_name
=
"文档设计"
,
expected_keys
=
[
"title"
,
"toc"
,
"hero"
],
)
# 验证关键字段的类型
if
not
isinstance
(
result
.
get
(
"title"
),
str
):
logger
.
warning
(
"文档设计缺少title字段或类型错误,使用默认值"
)
result
.
setdefault
(
"title"
,
"未命名报告"
)
if
not
isinstance
(
result
.
get
(
"toc"
),
(
list
,
dict
)):
logger
.
warning
(
"文档设计缺少toc字段或类型错误,使用空列表"
)
result
.
setdefault
(
"toc"
,
[])
if
not
isinstance
(
result
.
get
(
"hero"
),
dict
):
logger
.
warning
(
"文档设计缺少hero字段或类型错误,使用空对象"
)
result
.
setdefault
(
"hero"
,
{})
return
result
except
JSONParseError
as
exc
:
# 转换为原有的异常类型以保持向后兼容
raise
ValueError
(
f
"文档设计JSON解析失败: {exc}"
)
from
exc
...
...
ReportEngine/nodes/template_selection_node.py
View file @
50b6ab4
...
...
@@ -12,6 +12,7 @@ from loguru import logger
from
.base_node
import
BaseNode
from
..prompts
import
SYSTEM_PROMPT_TEMPLATE_SELECTION
from
..utils.json_parser
import
RobustJSONParser
,
JSONParseError
class
TemplateSelectionNode
(
BaseNode
):
...
...
@@ -25,13 +26,19 @@ class TemplateSelectionNode(BaseNode):
def __init__(self, llm_client, template_dir: str = "ReportEngine/report_template"):
    """
    Set up the template-selection node.

    Args:
        llm_client: LLM client, forwarded to BaseNode together with the
            node name "TemplateSelectionNode".
        template_dir: Path of the directory holding the report templates.
    """
    super().__init__(llm_client, "TemplateSelectionNode")
    self.template_dir = template_dir
    # Robust JSON parser: local fixes plus the json_repair library are on;
    # LLM-assisted repair stays off.
    parser_options = {
        "enable_json_repair": True,
        "enable_llm_repair": False,
        "max_repair_attempts": 3,
    }
    self.json_parser = RobustJSONParser(**parser_options)
def
run
(
self
,
input_data
:
Dict
[
str
,
Any
],
**
kwargs
)
->
Dict
[
str
,
Any
]:
"""
...
...
@@ -137,20 +144,22 @@ class TemplateSelectionNode(BaseNode):
# 调用LLM
response
=
self
.
llm_client
.
stream_invoke_to_string
(
SYSTEM_PROMPT_TEMPLATE_SELECTION
,
user_message
)
# 检查响应是否为空
if
not
response
or
not
response
.
strip
():
logger
.
error
(
"LLM返回空响应"
)
return
None
logger
.
info
(
f
"LLM原始响应: {response}"
)
# 尝试解析JSON响应
# 尝试解析JSON响应,使用鲁棒解析器
try
:
# 清理响应文本
cleaned_response
=
self
.
_clean_llm_response
(
response
)
result
=
json
.
loads
(
cleaned_response
)
result
=
self
.
json_parser
.
parse
(
response
,
context_name
=
"模板选择"
,
expected_keys
=
[
"template_name"
,
"selection_reason"
],
)
# 验证选择的模板是否存在
selected_template_name
=
result
.
get
(
'template_name'
,
''
)
for
template
in
available_templates
:
...
...
@@ -161,38 +170,16 @@ class TemplateSelectionNode(BaseNode):
'template_content'
:
template
[
'content'
],
'selection_reason'
:
result
.
get
(
'selection_reason'
,
'LLM智能选择'
)
}
logger
.
error
(
f
"LLM选择的模板不存在: {selected_template_name}"
)
return
None
except
json
.
JSONDecodeError
as
e
:
except
JSONParseError
as
e
:
logger
.
error
(
f
"JSON解析失败: {str(e)}"
)
# 尝试从文本响应中提取模板信息
return
self
.
_extract_template_from_text
(
response
,
available_templates
)
def
_clean_llm_response
(
self
,
response
:
str
)
->
str
:
"""
清理LLM响应。
去掉 ```json``` 包裹以及前后空白,方便 `json.loads`。
参数:
response: LLM原始响应。
返回:
str: 适合直接做JSON解析的纯文本。
"""
# 移除可能的markdown代码块标记
if
'```json'
in
response
:
response
=
response
.
split
(
'```json'
)[
1
]
.
split
(
'```'
)[
0
]
elif
'```'
in
response
:
response
=
response
.
split
(
'```'
)[
1
]
.
split
(
'```'
)[
0
]
# 移除前后空白
response
=
response
.
strip
()
return
response
def
_extract_template_from_text
(
self
,
response
:
str
,
available_templates
:
List
[
Dict
[
str
,
Any
]])
->
Optional
[
Dict
[
str
,
Any
]]:
"""
从文本响应中提取模板信息。
...
...
ReportEngine/nodes/word_budget_node.py
View file @
50b6ab4
...
...
@@ -14,6 +14,7 @@ from ..prompts import (
SYSTEM_PROMPT_WORD_BUDGET
,
build_word_budget_prompt
,
)
from
..utils.json_parser
import
RobustJSONParser
,
JSONParseError
from
.base_node
import
BaseNode
...
...
@@ -27,6 +28,12 @@ class WordBudgetNode(BaseNode):
def __init__(self, llm_client):
    """Pass the LLM client and the node name "WordBudgetNode" to BaseNode."""
    super().__init__(llm_client, "WordBudgetNode")
    # Robust JSON parser: local fixes plus the json_repair library are on;
    # LLM-assisted repair stays off and can be enabled when needed.
    parser_options = {
        "enable_json_repair": True,
        "enable_llm_repair": False,
        "max_repair_attempts": 3,
    }
    self.json_parser = RobustJSONParser(**parser_options)
def
run
(
self
,
...
...
@@ -79,8 +86,14 @@ class WordBudgetNode(BaseNode):
"""
将LLM输出的JSON文本转为字典,失败时提示规划异常。
使用鲁棒JSON解析器进行多重修复尝试:
1. 清理markdown标记和思考内容
2. 本地语法修复(括号平衡、逗号补全、控制字符转义等)
3. 使用json_repair库进行高级修复
4. 可选的LLM辅助修复
参数:
raw: LLM返回值,可能包含```包裹。
raw: LLM返回值,可能包含```包裹
、思考内容等
。
返回:
dict: 合法的篇幅规划JSON。
...
...
@@ -88,19 +101,25 @@ class WordBudgetNode(BaseNode):
异常:
ValueError: 当响应为空或JSON解析失败时抛出。
"""
cleaned
=
raw
.
strip
()
if
cleaned
.
startswith
(
"```json"
):
cleaned
=
cleaned
[
7
:]
if
cleaned
.
startswith
(
"```"
):
cleaned
=
cleaned
[
3
:]
if
cleaned
.
endswith
(
"```"
):
cleaned
=
cleaned
[:
-
3
]
cleaned
=
cleaned
.
strip
()
if
not
cleaned
:
raise
ValueError
(
"篇幅规划LLM返回空内容"
)
try
:
return
json
.
loads
(
cleaned
)
except
json
.
JSONDecodeError
as
exc
:
result
=
self
.
json_parser
.
parse
(
raw
,
context_name
=
"篇幅规划"
,
expected_keys
=
[
"totalWords"
,
"globalGuidelines"
,
"chapters"
],
)
# 验证关键字段的类型
if
not
isinstance
(
result
.
get
(
"totalWords"
),
(
int
,
float
)):
logger
.
warning
(
"篇幅规划缺少totalWords字段或类型错误,使用默认值"
)
result
.
setdefault
(
"totalWords"
,
10000
)
if
not
isinstance
(
result
.
get
(
"globalGuidelines"
),
list
):
logger
.
warning
(
"篇幅规划缺少globalGuidelines字段或类型错误,使用空列表"
)
result
.
setdefault
(
"globalGuidelines"
,
[])
if
not
isinstance
(
result
.
get
(
"chapters"
),
(
list
,
dict
)):
logger
.
warning
(
"篇幅规划缺少chapters字段或类型错误,使用空列表"
)
result
.
setdefault
(
"chapters"
,
[])
return
result
except
JSONParseError
as
exc
:
# 转换为原有的异常类型以保持向后兼容
raise
ValueError
(
f
"篇幅规划JSON解析失败: {exc}"
)
from
exc
...
...
ReportEngine/prompts/prompts.py
View file @
50b6ab4
...
...
@@ -216,8 +216,17 @@ SYSTEM_PROMPT_TEMPLATE_SELECTION = f"""
{json.dumps(output_schema_template_selection, indent=2, ensure_ascii=False)}
</OUTPUT JSON SCHEMA>
确保输出是一个符合上述输出JSON模式定义的JSON对象。
只返回JSON对象,不要有解释或额外文本。
**重要的输出格式要求:**
1. 只返回符合上述Schema的纯JSON对象
2. 严禁在JSON外添加任何思考过程、说明文字或解释
3. 可以使用```json和```标记包裹JSON,但不要添加其他内容
4. 确保JSON语法完全正确:
- 对象和数组元素之间必须有逗号分隔
- 字符串中的特殊字符必须正确转义(
\n
,
\t
,
\"
等)
- 括号必须成对且正确嵌套
- 不要使用尾随逗号(最后一个元素后不加逗号)
- 不要在JSON中添加注释
5. 所有字符串值使用双引号,数值不使用引号
"""
# HTML报告生成的系统提示词
...
...
@@ -372,7 +381,17 @@ SYSTEM_PROMPT_DOCUMENT_LAYOUT = f"""
{json.dumps(document_layout_output_schema, ensure_ascii=False, indent=2)}
</OUTPUT JSON SCHEMA>
只返回JSON,勿附加额外文本。
**重要的输出格式要求:**
1. 只返回符合上述Schema的纯JSON对象
2. 严禁在JSON外添加任何思考过程、说明文字或解释
3. 可以使用```json和```标记包裹JSON,但不要添加其他内容
4. 确保JSON语法完全正确:
- 对象和数组元素之间必须有逗号分隔
- 字符串中的特殊字符必须正确转义(
\n
,
\t
,
\"
等)
- 括号必须成对且正确嵌套
- 不要使用尾随逗号(最后一个元素后不加逗号)
- 不要在JSON中添加注释
5. 所有字符串值使用双引号,数值不使用引号
"""
# 篇幅规划提示词
...
...
@@ -390,7 +409,17 @@ SYSTEM_PROMPT_WORD_BUDGET = f"""
{json.dumps(word_budget_output_schema, ensure_ascii=False, indent=2)}
</OUTPUT JSON SCHEMA>
只返回JSON,无额外说明。
**重要的输出格式要求:**
1. 只返回符合上述Schema的纯JSON对象
2. 严禁在JSON外添加任何思考过程、说明文字或解释
3. 可以使用```json和```标记包裹JSON,但不要添加其他内容
4. 确保JSON语法完全正确:
- 对象和数组元素之间必须有逗号分隔
- 字符串中的特殊字符必须正确转义(
\n
,
\t
,
\"
等)
- 括号必须成对且正确嵌套
- 不要使用尾随逗号(最后一个元素后不加逗号)
- 不要在JSON中添加注释
5. 所有字符串值使用双引号,数值不使用引号
"""
...
...
ReportEngine/utils/json_parser.py
0 → 100644
View file @
50b6ab4
"""
统一的JSON解析和修复工具。
提供鲁棒的JSON解析能力,支持:
1. 自动清理markdown代码块标记和思考内容
2. 本地语法修复(括号平衡、逗号补全、控制字符转义等)
3. 使用json_repair库进行高级修复
4. LLM辅助修复(可选)
5. 详细的错误日志和调试信息
"""
from
__future__
import
annotations
import
json
import
re
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
,
Callable
from
loguru
import
logger
try
:
from
json_repair
import
repair_json
as
_json_repair_fn
except
ImportError
:
_json_repair_fn
=
None
class JSONParseError(ValueError):
    """Raised when JSON parsing fails; keeps the offending text for debugging."""

    def __init__(self, message: str, raw_text: Optional[str] = None):
        """
        Build the exception and remember the text that triggered it.

        Args:
            message: Human-readable description of the failure.
            raw_text: The complete LLM output that could not be parsed
                (``None`` when it is not available).
        """
        ValueError.__init__(self, message)
        # Exposed so callers and log handlers can inspect the original payload.
        self.raw_text = raw_text
class RobustJSONParser:
    """
    Fault-tolerant parser for JSON emitted by an LLM.

    ``parse`` tries, in order:

    1. the stripped raw text (so already-valid JSON is never altered),
    2. the cleaned text (reasoning tags, lead-in chatter and markdown fences
       removed, first JSON structure extracted),
    3. a locally repaired variant (``:=`` typos, raw control characters,
       missing commas, unbalanced brackets, trailing commas),
    4. the optional ``json_repair`` library,
    5. an optional caller-supplied LLM repair callback.

    ``max_repair_attempts`` is stored on the instance, but no method of this
    class currently reads it.
    """

    # Explicit reasoning tags. Their content may itself contain braces, so
    # they are removed everywhere before the JSON payload is located.
    _THINKING_TAG_PATTERNS = [
        r"<thinking>.*?</thinking>",
        r"<thought>.*?</thought>",
    ]
    # Free-text lead-ins. These words are common inside report text, so they
    # are only applied to the text that precedes the first "{" / "[";
    # applying them to the whole payload would eat JSON string values.
    _THINKING_PREAMBLE_PATTERNS = [
        r"让我想想.*?(?=\{|\[|$)",
        r"首先.*?(?=\{|\[|$)",
        r"分析.*?(?=\{|\[|$)",
        r"根据.*?(?=\{|\[|$)",
    ]
    # Combined list, kept under its original name for backward compatibility.
    _THINKING_PATTERNS = _THINKING_TAG_PATTERNS + _THINKING_PREAMBLE_PATTERNS

    # `"key":= value` typo that LLMs occasionally produce.
    _COLON_EQUALS_PATTERN = re.compile(r'(":\s*)=')

    def __init__(
        self,
        llm_repair_fn: Optional[Callable[[str, str], Optional[str]]] = None,
        enable_json_repair: bool = True,
        enable_llm_repair: bool = False,
        max_repair_attempts: int = 3,
    ):
        """
        Configure the parser.

        Args:
            llm_repair_fn: Optional callback taking (broken JSON, error
                message) and returning repaired JSON text or None.
            enable_json_repair: Use the ``json_repair`` library; silently
                disabled when that library could not be imported.
            enable_llm_repair: Use ``llm_repair_fn`` as the last resort.
            max_repair_attempts: Stored on the instance only.
        """
        self.llm_repair_fn = llm_repair_fn
        self.enable_json_repair = enable_json_repair and _json_repair_fn is not None
        self.enable_llm_repair = enable_llm_repair
        self.max_repair_attempts = max_repair_attempts

    def parse(
        self,
        raw_text: str,
        context_name: str = "JSON",
        expected_keys: Optional[List[str]] = None,
        extract_wrapper_key: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Parse JSON text returned by an LLM.

        Args:
            raw_text: Raw LLM output (may carry ``` fences, reasoning, etc.).
            context_name: Label used in log and error messages.
            expected_keys: Keys whose absence is logged as a warning.
            extract_wrapper_key: If set and present in the parsed object,
                the value under this key is returned instead.

        Returns:
            dict: The parsed JSON object.

        Raises:
            JSONParseError: The input is empty, is not a JSON object, or no
                repair strategy produced parseable JSON.
        """
        if not raw_text or not raw_text.strip():
            raise JSONParseError(f"{context_name}返回空内容")

        cleaned = self._clean_response(raw_text)

        # Ordered, de-duplicated candidates. The untouched text goes first so
        # that cleaning heuristics can never damage a valid payload.
        candidates: List[str] = []
        for option in (
            raw_text.strip(),
            cleaned,
            self._apply_local_repairs(cleaned),
        ):
            if option not in candidates:
                candidates.append(option)

        last_error: Optional[json.JSONDecodeError] = None
        for i, candidate in enumerate(candidates):
            try:
                data = json.loads(candidate)
                logger.debug(
                    f"{context_name} JSON解析成功(候选{i + 1}/{len(candidates)})"
                )
                return self._extract_and_validate(
                    data, expected_keys, extract_wrapper_key, context_name
                )
            except json.JSONDecodeError as exc:
                last_error = exc
                logger.debug(f"{context_name} 候选{i + 1}解析失败: {exc}")

        # Third-party repair library.
        if self.enable_json_repair:
            repaired = self._attempt_json_repair(cleaned, context_name)
            if repaired:
                try:
                    data = json.loads(repaired)
                    logger.info(f"{context_name} JSON通过json_repair库修复成功")
                    return self._extract_and_validate(
                        data, expected_keys, extract_wrapper_key, context_name
                    )
                except json.JSONDecodeError as exc:
                    last_error = exc
                    logger.debug(
                        f"{context_name} json_repair修复后仍无法解析: {exc}"
                    )

        # LLM-assisted repair (only when enabled and a callback was given).
        if self.enable_llm_repair and self.llm_repair_fn:
            llm_repaired = self._attempt_llm_repair(
                cleaned, str(last_error), context_name
            )
            if llm_repaired:
                try:
                    data = json.loads(llm_repaired)
                    logger.info(f"{context_name} JSON通过LLM修复成功")
                    return self._extract_and_validate(
                        data, expected_keys, extract_wrapper_key, context_name
                    )
                except json.JSONDecodeError as exc:
                    last_error = exc
                    logger.warning(f"{context_name} LLM修复后仍无法解析: {exc}")

        # Every strategy failed.
        error_msg = f"{context_name} JSON解析失败: {last_error}"
        logger.error(error_msg)
        logger.debug(f"原始文本前500字符: {raw_text[:500]}")
        raise JSONParseError(error_msg, raw_text=raw_text) from last_error

    def _clean_response(self, raw: str) -> str:
        """
        Strip reasoning content and markdown fences, then isolate the JSON.

        Reasoning tags are removed everywhere. The free-text lead-in patterns
        are applied only to the text in front of the first "{" or "[", so
        they cannot delete content inside JSON strings.

        Args:
            raw: Raw LLM output.

        Returns:
            str: The cleaned text.
        """
        flags = re.DOTALL | re.IGNORECASE
        cleaned = raw.strip()

        for pattern in self._THINKING_TAG_PATTERNS:
            cleaned = re.sub(pattern, "", cleaned, flags=flags)

        first_bracket = re.search(r"[{\[]", cleaned)
        split_at = first_bracket.start() if first_bracket else len(cleaned)
        preamble, payload = cleaned[:split_at], cleaned[split_at:]
        for pattern in self._THINKING_PREAMBLE_PATTERNS:
            preamble = re.sub(pattern, "", preamble, flags=flags)
        cleaned = (preamble + payload).strip()

        # Drop markdown code-fence markers.
        if cleaned.startswith("```json"):
            cleaned = cleaned[7:]
        elif cleaned.startswith("```"):
            cleaned = cleaned[3:]
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
        cleaned = cleaned.strip()

        return self._extract_first_json_structure(cleaned)

    def _extract_first_json_structure(self, text: str) -> str:
        """
        Return the first complete JSON object or array found in ``text``.

        Useful when the model wraps the JSON in explanatory prose.

        Args:
            text: Text that may contain JSON.

        Returns:
            str: The extracted structure; the tail starting at the first
            opener when it is never closed; ``text`` itself when it holds
            neither "{" nor "[".
        """
        openers = [pos for pos in (text.find("{"), text.find("[")) if pos != -1]
        if not openers:
            return text
        start = min(openers)

        depth = 0
        in_string = False
        escaped = False
        for i in range(start, len(text)):
            ch = text[i]
            if escaped:
                escaped = False
                continue
            if ch == "\\":
                escaped = True
                continue
            if ch == '"':
                in_string = not in_string
                continue
            if in_string:
                continue
            if ch in "{[":
                depth += 1
            elif ch in "}]":
                depth -= 1
                if depth == 0:
                    return text[start : i + 1]

        # Structure never closed: hand back everything from the opener on.
        return text[start:]

    def _apply_local_repairs(self, text: str) -> str:
        """
        Run the local repair passes in a fixed order.

        Order: ``:=`` typo, control characters, missing commas, bracket
        balance, trailing commas.

        Args:
            text: JSON text to repair.

        Returns:
            str: The repaired text, or ``text`` when no pass changed anything.
        """
        repaired = text
        mutated = False

        new_text = self._COLON_EQUALS_PATTERN.sub(r"\1", repaired)
        if new_text != repaired:
            logger.warning("检测到\":=\"字符,已自动移除多余的'='号")
            repaired = new_text
            mutated = True

        repaired, escaped = self._escape_control_characters(repaired)
        if escaped:
            logger.warning("检测到未转义的控制字符,已自动转换为转义序列")
            mutated = True

        repaired, commas_fixed = self._fix_missing_commas(repaired)
        if commas_fixed:
            logger.warning("检测到对象/数组之间缺少逗号,已自动补齐")
            mutated = True

        repaired, balanced = self._balance_brackets(repaired)
        if balanced:
            logger.warning("检测到括号不平衡,已自动补齐/剔除异常括号")
            mutated = True

        repaired, trailing_removed = self._remove_trailing_commas(repaired)
        if trailing_removed:
            logger.warning("检测到尾随逗号,已自动移除")
            mutated = True

        return repaired if mutated else text

    def _escape_control_characters(self, text: str) -> Tuple[str, bool]:
        """
        Escape raw control characters that appear inside string literals.

        Newline, carriage return and tab become ``\\n``/``\\r``/``\\t``; any
        other character below 0x20 becomes a ``\\uXXXX`` escape.

        Args:
            text: JSON text.

        Returns:
            Tuple[str, bool]: (repaired text, whether anything changed).
        """
        if not text:
            return text, False

        result: List[str] = []
        in_string = False
        escaped = False
        mutated = False
        control_map = {"\n": "\\n", "\r": "\\r", "\t": "\\t"}

        for ch in text:
            if escaped:
                result.append(ch)
                escaped = False
                continue
            if ch == "\\":
                result.append(ch)
                escaped = True
                continue
            if ch == '"':
                result.append(ch)
                in_string = not in_string
                continue
            if in_string and ch in control_map:
                result.append(control_map[ch])
                mutated = True
                continue
            if in_string and ord(ch) < 0x20:
                result.append(f"\\u{ord(ch):04x}")
                mutated = True
                continue
            result.append(ch)

        return "".join(result), mutated

    @staticmethod
    def _starts_new_value(text: str, index: int) -> bool:
        """Tell whether the first non-blank char at/after ``index`` opens a value."""
        length = len(text)
        while index < length and text[index] in " \t\r\n":
            index += 1
        if index >= length:
            return False
        return text[index] in '"{[' or text[index].isdigit()

    def _fix_missing_commas(self, text: str) -> Tuple[str, bool]:
        """
        Insert commas that are missing between object/array elements.

        A comma is added after a closing quote, "}" or "]" when the next
        non-blank character starts a new value (quote, "{", "[" or a digit)
        and the position is still inside an enclosing container. Container
        depth is tracked in a single forward pass, so elements that follow a
        closed nested structure are handled and brackets inside strings are
        ignored.

        Args:
            text: JSON text.

        Returns:
            Tuple[str, bool]: (repaired text, whether anything changed).
        """
        if not text:
            return text, False

        chars: List[str] = []
        mutated = False
        in_string = False
        escaped = False
        depth = 0  # number of currently open containers

        for i, ch in enumerate(text):
            chars.append(ch)

            if escaped:
                escaped = False
                continue
            if ch == "\\":
                escaped = True
                continue

            if ch == '"':
                closing = in_string
                in_string = not in_string
                if closing and depth > 0 and self._starts_new_value(text, i + 1):
                    chars.append(",")
                    mutated = True
                continue

            if in_string:
                continue

            if ch in "{[":
                depth += 1
            elif ch in "}]":
                depth = max(depth - 1, 0)
                # Only meaningful while an outer container is still open.
                if depth > 0 and self._starts_new_value(text, i + 1):
                    chars.append(",")
                    mutated = True

        return "".join(chars), mutated

    def _balance_brackets(self, text: str) -> Tuple[str, bool]:
        """
        Repair unbalanced brackets caused by extra or missing closers.

        Closers that do not match the innermost open bracket are dropped, a
        string left open at the end of the text (truncated output) is closed,
        and every bracket still open is closed in reverse order.

        Args:
            text: JSON text.

        Returns:
            Tuple[str, bool]: (repaired text, whether anything changed).
        """
        if not text:
            return text, False

        result: List[str] = []
        stack: List[str] = []
        mutated = False
        in_string = False
        escaped = False
        opener_map = {"{": "}", "[": "]"}

        for ch in text:
            if escaped:
                result.append(ch)
                escaped = False
                continue
            if ch == "\\":
                result.append(ch)
                escaped = True
                continue
            if ch == '"':
                result.append(ch)
                in_string = not in_string
                continue
            if in_string:
                result.append(ch)
                continue
            if ch in "{[":
                stack.append(ch)
                result.append(ch)
                continue
            if ch in "}]":
                if stack and opener_map[stack[-1]] == ch:
                    stack.pop()
                    result.append(ch)
                else:
                    # Closer without a matching opener: drop it.
                    mutated = True
                continue
            result.append(ch)

        if in_string:
            # Truncated inside a string: remove a dangling backslash (it would
            # escape the quote we add) and terminate the string.
            if escaped and result:
                result.pop()
            result.append('"')
            mutated = True

        # Close whatever is still open, innermost first.
        while stack:
            result.append(opener_map[stack.pop()])
            mutated = True

        return "".join(result), mutated

    def _remove_trailing_commas(self, text: str) -> Tuple[str, bool]:
        """
        Remove commas that directly precede "}" or "]" (ignoring whitespace).

        The scan is string-aware: a ",}" or ",]" sequence inside a string
        literal is left untouched.

        Args:
            text: JSON text.

        Returns:
            Tuple[str, bool]: (repaired text, whether anything changed).
        """
        if not text:
            return text, False

        kept: List[str] = []
        mutated = False
        in_string = False
        escaped = False
        length = len(text)

        for i, ch in enumerate(text):
            if in_string:
                kept.append(ch)
                if escaped:
                    escaped = False
                elif ch == "\\":
                    escaped = True
                elif ch == '"':
                    in_string = False
                continue
            if ch == '"':
                in_string = True
                kept.append(ch)
                continue
            if ch == ",":
                j = i + 1
                while j < length and text[j].isspace():
                    j += 1
                if j < length and text[j] in "}]":
                    mutated = True
                    continue  # skip the trailing comma
            kept.append(ch)

        return "".join(kept), mutated

    def _attempt_json_repair(self, text: str, context_name: str) -> Optional[str]:
        """
        Try to repair ``text`` with the ``json_repair`` library.

        Args:
            text: JSON text.
            context_name: Label used in log messages.

        Returns:
            Optional[str]: Repaired text, or None when the library is
            unavailable, raised, or returned nothing new.
        """
        if not _json_repair_fn:
            return None

        try:
            fixed = _json_repair_fn(text)
            if fixed and fixed != text:
                logger.info(f"{context_name} 使用json_repair库自动修复JSON")
                return fixed
        except Exception as exc:
            # Best effort: a failing repair library must not abort parsing.
            logger.debug(f"{context_name} json_repair修复失败: {exc}")

        return None

    def _attempt_llm_repair(
        self, text: str, error_msg: str, context_name: str
    ) -> Optional[str]:
        """
        Ask the configured LLM callback to repair ``text``.

        Args:
            text: JSON text.
            error_msg: Message of the last parse error.
            context_name: Label used in log messages.

        Returns:
            Optional[str]: Repaired text, or None when no callback is set,
            the callback raised, or it returned nothing new.
        """
        if not self.llm_repair_fn:
            return None

        try:
            logger.info(f"{context_name} 尝试使用LLM修复JSON")
            repaired = self.llm_repair_fn(text, error_msg)
            if repaired and repaired != text:
                return repaired
        except Exception as exc:
            # Best effort: the callback is external code and may fail freely.
            logger.warning(f"{context_name} LLM修复失败: {exc}")

        return None

    def _extract_and_validate(
        self,
        data: Any,
        expected_keys: Optional[List[str]],
        extract_wrapper_key: Optional[str],
        context_name: str,
    ) -> Dict[str, Any]:
        """
        Unwrap and validate parsed JSON data.

        Args:
            data: Result of ``json.loads``.
            expected_keys: Keys whose absence is logged (not an error).
            extract_wrapper_key: Key to unwrap when ``data`` is a dict.
            context_name: Label used in log and error messages.

        Returns:
            Dict[str, Any]: The resulting object. A non-empty list whose
            first element is a dict is reduced to that element.

        Raises:
            JSONParseError: The data is neither a dict nor such a list.
        """
        if extract_wrapper_key and isinstance(data, dict):
            if extract_wrapper_key in data:
                data = data[extract_wrapper_key]
            else:
                logger.warning(
                    f"{context_name} 未找到包裹键'{extract_wrapper_key}',使用原始数据"
                )

        if not isinstance(data, dict):
            if isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
                logger.warning(f"{context_name} 返回数组,自动提取第一个元素")
                data = data[0]
            else:
                raise JSONParseError(
                    f"{context_name} 返回的不是JSON对象: {type(data).__name__}"
                )

        if expected_keys:
            missing_keys = [key for key in expected_keys if key not in data]
            if missing_keys:
                logger.warning(
                    f"{context_name} 缺少预期的键: {', '.join(missing_keys)}"
                )

        return data
__all__
=
[
"RobustJSONParser"
,
"JSONParseError"
]
...
...
ReportEngine/utils/test_json_parser.py
0 → 100644
View file @
50b6ab4
"""
测试RobustJSONParser的各种修复能力。
验证解析器能够处理:
1. 基本的markdown包裹
2. 思考内容清理
3. 缺少逗号的修复
4. 括号不平衡的修复
5. 控制字符转义
6. 尾随逗号移除
"""
import
json
import
unittest
from
json_parser
import
RobustJSONParser
,
JSONParseError
class TestRobustJSONParser(unittest.TestCase):
    """Exercise each repair strategy of the robust JSON parser."""

    def setUp(self):
        """Create a parser that relies on the local repairs only."""
        # json_repair is switched off so the local fixes are what gets tested.
        self.parser = RobustJSONParser(
            enable_json_repair=False,
            enable_llm_repair=False,
        )

    def test_basic_json(self):
        """Plain, valid JSON parses unchanged."""
        parsed = self.parser.parse('{"name": "test", "value": 123}', "基本测试")
        self.assertEqual(parsed["name"], "test")
        self.assertEqual(parsed["value"], 123)

    def test_markdown_wrapped(self):
        """JSON wrapped in a ```json fence is unwrapped."""
        payload = (
            '```json\n'
            '{\n'
            '"name": "test",\n'
            '"value": 123\n'
            '}\n'
            '```'
        )
        parsed = self.parser.parse(payload, "Markdown包裹测试")
        self.assertEqual(parsed["name"], "test")
        self.assertEqual(parsed["value"], 123)

    def test_thinking_content_removal(self):
        """A <thinking> block in front of the JSON is stripped."""
        payload = (
            '<thinking>让我想想如何构造这个JSON</thinking>\n'
            '{\n'
            '"name": "test",\n'
            '"value": 123\n'
            '}'
        )
        parsed = self.parser.parse(payload, "思考内容清理测试")
        self.assertEqual(parsed["name"], "test")
        self.assertEqual(parsed["value"], 123)

    def test_missing_comma_fix(self):
        """A comma missing between array elements is inserted."""
        # Common real-world failure: array items separated only by a newline.
        payload = (
            '{\n'
            '"totalWords": 40000,\n'
            '"globalGuidelines": [\n'
            '"重点突出技术红利分配失衡"\n'
            '"详略策略:技术创新"\n'
            '],\n'
            '"chapters": []\n'
            '}'
        )
        parsed = self.parser.parse(payload, "缺少逗号修复测试")
        self.assertEqual(len(parsed["globalGuidelines"]), 2)

    def test_unbalanced_brackets(self):
        """A missing closing bracket is appended."""
        # The outermost "}" is missing.
        payload = (
            '{\n'
            '"name": "test",\n'
            '"nested": {\n'
            '"value": 123\n'
            '}\n'
        )
        parsed = self.parser.parse(payload, "括号不平衡测试")
        self.assertEqual(parsed["name"], "test")
        self.assertEqual(parsed["nested"]["value"], 123)

    def test_control_character_escape(self):
        """A raw newline inside a JSON string is escaped."""
        payload = (
            '{\n'
            '"text": "这是第一行\n'
            '这是第二行",\n'
            '"value": 123\n'
            '}'
        )
        parsed = self.parser.parse(payload, "控制字符转义测试")
        # Both halves of the string must survive the repair.
        self.assertIn("第一行", parsed["text"])
        self.assertIn("第二行", parsed["text"])

    def test_trailing_comma_removal(self):
        """Trailing commas in objects and arrays are removed."""
        payload = (
            '{\n'
            '"name": "test",\n'
            '"value": 123,\n'
            '"items": [1, 2, 3,],\n'
            '}'
        )
        parsed = self.parser.parse(payload, "尾随逗号测试")
        self.assertEqual(parsed["name"], "test")
        self.assertEqual(len(parsed["items"]), 3)

    def test_colon_equals_fix(self):
        """The `":=` typo is reduced to a plain colon."""
        payload = (
            '{\n'
            '"name":= "test",\n'
            '"value": 123\n'
            '}'
        )
        parsed = self.parser.parse(payload, "冒号等号测试")
        self.assertEqual(parsed["name"], "test")

    def test_extract_first_json(self):
        """The first JSON structure is pulled out of surrounding prose."""
        payload = (
            '这是一些说明文字,下面是JSON:\n'
            '{\n'
            '"name": "test",\n'
            '"value": 123\n'
            '}\n'
            '后面还有一些其他文字'
        )
        parsed = self.parser.parse(payload, "提取JSON测试")
        self.assertEqual(parsed["name"], "test")
        self.assertEqual(parsed["value"], 123)

    def test_complex_real_world_case(self):
        """Thinking block + markdown fence + missing commas, all at once."""
        payload = (
            '<thinking>我需要构造一个篇幅规划</thinking>\n'
            '```json\n'
            '{\n'
            '"totalWords": 40000,\n'
            '"tolerance": 2000,\n'
            '"globalGuidelines": [\n'
            '"重点突出技术红利分配失衡、人才流失与职业认同危机等结构性矛盾"\n'
            '"详略策略:技术创新与传统技艺的碰撞"\n'
            '"案例导向:优先引用真实数据和调研"\n'
            '],\n'
            '"chapters": [\n'
            '{\n'
            '"chapterId": "ch1",\n'
            '"targetWords": 5000\n'
            '}\n'
            ']\n'
            '}\n'
            '```'
        )
        parsed = self.parser.parse(payload, "复杂真实案例测试")
        self.assertEqual(parsed["totalWords"], 40000)
        self.assertEqual(parsed["tolerance"], 2000)
        self.assertEqual(len(parsed["globalGuidelines"]), 3)
        self.assertEqual(len(parsed["chapters"]), 1)

    def test_expected_keys_validation(self):
        """Missing expected keys only produce a warning, not a failure."""
        parsed = self.parser.parse(
            '{"name": "test"}',
            "键验证测试",
            expected_keys=["name", "value"],
        )
        self.assertIn("name", parsed)

    def test_wrapper_key_extraction(self):
        """Data nested under a wrapper key is returned directly."""
        payload = (
            '{\n'
            '"wrapper": {\n'
            '"name": "test",\n'
            '"value": 123\n'
            '}\n'
            '}'
        )
        parsed = self.parser.parse(
            payload,
            "包裹键测试",
            extract_wrapper_key="wrapper",
        )
        self.assertEqual(parsed["name"], "test")
        self.assertEqual(parsed["value"], 123)

    def test_empty_input(self):
        """Empty input raises JSONParseError."""
        with self.assertRaises(JSONParseError):
            self.parser.parse("", "空输入测试")

    def test_invalid_json_after_all_repairs(self):
        """Input that no repair can rescue raises JSONParseError."""
        hopeless = "{完全不是JSON格式的内容###"
        with self.assertRaises(JSONParseError):
            self.parser.parse(hopeless, "无法修复测试")
def run_manual_test():
    """Parse one known-bad sample by hand and print what happens."""
    rule = "=" * 60
    print(rule)
    print("开始测试RobustJSONParser")
    print(rule)

    manual_parser = RobustJSONParser(enable_json_repair=False, enable_llm_repair=False)

    # Sample modelled on a real failure: fenced JSON with a missing comma
    # between two array elements.
    sample = (
        '```json\n'
        '{\n'
        '"totalWords": 40000,\n'
        '"tolerance": 2000,\n'
        '"globalGuidelines": [\n'
        '"重点突出技术红利分配失衡、人才流失与职业认同危机等结构性矛盾"\n'
        '"详略策略:技术创新与传统技艺的碰撞"\n'
        '],\n'
        '"chapters": []\n'
        '}\n'
        '```'
    )

    print("\n测试案例:")
    print(sample)
    print("\n" + rule)

    try:
        parsed = manual_parser.parse(sample, "手动测试")
        print("\n✓ 解析成功!")
        print("\n解析结果:")
        print(json.dumps(parsed, ensure_ascii=False, indent=2))
    except Exception as e:
        # Demo helper: report any failure instead of aborting the script.
        print(f"\n✗ 解析失败: {e}")

    print("\n" + rule)
if __name__ == "__main__":
    # Manual demonstration first, so its output precedes the test report.
    run_manual_test()

    # Then the unittest suite, with per-test output.
    print("\n\n运行单元测试...")
    unittest.main(verbosity=2)
...
...
Please
register
or
login
to post a comment