Showing
6 changed files
with
978 additions
and
57 deletions
| @@ -14,6 +14,7 @@ from ..prompts import ( | @@ -14,6 +14,7 @@ from ..prompts import ( | ||
| 14 | SYSTEM_PROMPT_DOCUMENT_LAYOUT, | 14 | SYSTEM_PROMPT_DOCUMENT_LAYOUT, |
| 15 | build_document_layout_prompt, | 15 | build_document_layout_prompt, |
| 16 | ) | 16 | ) |
| 17 | +from ..utils.json_parser import RobustJSONParser, JSONParseError | ||
| 17 | from .base_node import BaseNode | 18 | from .base_node import BaseNode |
| 18 | 19 | ||
| 19 | 20 | ||
| @@ -27,6 +28,12 @@ class DocumentLayoutNode(BaseNode): | @@ -27,6 +28,12 @@ class DocumentLayoutNode(BaseNode): | ||
| 27 | def __init__(self, llm_client): | 28 | def __init__(self, llm_client): |
| 28 | """记录LLM客户端并设置节点名字,供BaseNode日志使用""" | 29 | """记录LLM客户端并设置节点名字,供BaseNode日志使用""" |
| 29 | super().__init__(llm_client, "DocumentLayoutNode") | 30 | super().__init__(llm_client, "DocumentLayoutNode") |
| 31 | + # 初始化鲁棒JSON解析器,启用所有修复策略 | ||
| 32 | + self.json_parser = RobustJSONParser( | ||
| 33 | + enable_json_repair=True, | ||
| 34 | + enable_llm_repair=False, # 可以根据需要启用LLM修复 | ||
| 35 | + max_repair_attempts=3, | ||
| 36 | + ) | ||
| 30 | 37 | ||
| 31 | def run( | 38 | def run( |
| 32 | self, | 39 | self, |
| @@ -82,8 +89,14 @@ class DocumentLayoutNode(BaseNode): | @@ -82,8 +89,14 @@ class DocumentLayoutNode(BaseNode): | ||
| 82 | """ | 89 | """ |
| 83 | 解析LLM返回的JSON文本,若失败则抛出友好错误。 | 90 | 解析LLM返回的JSON文本,若失败则抛出友好错误。 |
| 84 | 91 | ||
| 92 | + 使用鲁棒JSON解析器进行多重修复尝试: | ||
| 93 | + 1. 清理markdown标记和思考内容 | ||
| 94 | + 2. 本地语法修复(括号平衡、逗号补全、控制字符转义等) | ||
| 95 | + 3. 使用json_repair库进行高级修复 | ||
| 96 | + 4. 可选的LLM辅助修复 | ||
| 97 | + | ||
| 85 | 参数: | 98 | 参数: |
| 86 | - raw: LLM原始返回字符串,允许带```包裹。 | 99 | + raw: LLM原始返回字符串,允许带```包裹、思考内容等。 |
| 87 | 100 | ||
| 88 | 返回: | 101 | 返回: |
| 89 | dict: 结构化的设计稿。 | 102 | dict: 结构化的设计稿。 |
| @@ -91,19 +104,25 @@ class DocumentLayoutNode(BaseNode): | @@ -91,19 +104,25 @@ class DocumentLayoutNode(BaseNode): | ||
| 91 | 异常: | 104 | 异常: |
| 92 | ValueError: 当响应为空或JSON解析失败时抛出。 | 105 | ValueError: 当响应为空或JSON解析失败时抛出。 |
| 93 | """ | 106 | """ |
| 94 | - cleaned = raw.strip() | ||
| 95 | - if cleaned.startswith("```json"): | ||
| 96 | - cleaned = cleaned[7:] | ||
| 97 | - if cleaned.startswith("```"): | ||
| 98 | - cleaned = cleaned[3:] | ||
| 99 | - if cleaned.endswith("```"): | ||
| 100 | - cleaned = cleaned[:-3] | ||
| 101 | - cleaned = cleaned.strip() | ||
| 102 | - if not cleaned: | ||
| 103 | - raise ValueError("文档设计LLM返回空内容") | ||
| 104 | try: | 107 | try: |
| 105 | - return json.loads(cleaned) | ||
| 106 | - except json.JSONDecodeError as exc: | 108 | + result = self.json_parser.parse( |
| 109 | + raw, | ||
| 110 | + context_name="文档设计", | ||
| 111 | + expected_keys=["title", "toc", "hero"], | ||
| 112 | + ) | ||
| 113 | + # 验证关键字段的类型 | ||
| 114 | + if not isinstance(result.get("title"), str): | ||
| 115 | + logger.warning("文档设计缺少title字段或类型错误,使用默认值") | ||
| 116 | + result.setdefault("title", "未命名报告") | ||
| 117 | + if not isinstance(result.get("toc"), (list, dict)): | ||
| 118 | + logger.warning("文档设计缺少toc字段或类型错误,使用空列表") | ||
| 119 | + result.setdefault("toc", []) | ||
| 120 | + if not isinstance(result.get("hero"), dict): | ||
| 121 | + logger.warning("文档设计缺少hero字段或类型错误,使用空对象") | ||
| 122 | + result.setdefault("hero", {}) | ||
| 123 | + return result | ||
| 124 | + except JSONParseError as exc: | ||
| 125 | + # 转换为原有的异常类型以保持向后兼容 | ||
| 107 | raise ValueError(f"文档设计JSON解析失败: {exc}") from exc | 126 | raise ValueError(f"文档设计JSON解析失败: {exc}") from exc |
| 108 | 127 | ||
| 109 | 128 |
| @@ -12,6 +12,7 @@ from loguru import logger | @@ -12,6 +12,7 @@ from loguru import logger | ||
| 12 | 12 | ||
| 13 | from .base_node import BaseNode | 13 | from .base_node import BaseNode |
| 14 | from ..prompts import SYSTEM_PROMPT_TEMPLATE_SELECTION | 14 | from ..prompts import SYSTEM_PROMPT_TEMPLATE_SELECTION |
| 15 | +from ..utils.json_parser import RobustJSONParser, JSONParseError | ||
| 15 | 16 | ||
| 16 | 17 | ||
| 17 | class TemplateSelectionNode(BaseNode): | 18 | class TemplateSelectionNode(BaseNode): |
| @@ -32,6 +33,12 @@ class TemplateSelectionNode(BaseNode): | @@ -32,6 +33,12 @@ class TemplateSelectionNode(BaseNode): | ||
| 32 | """ | 33 | """ |
| 33 | super().__init__(llm_client, "TemplateSelectionNode") | 34 | super().__init__(llm_client, "TemplateSelectionNode") |
| 34 | self.template_dir = template_dir | 35 | self.template_dir = template_dir |
| 36 | + # 初始化鲁棒JSON解析器,启用所有修复策略 | ||
| 37 | + self.json_parser = RobustJSONParser( | ||
| 38 | + enable_json_repair=True, | ||
| 39 | + enable_llm_repair=False, | ||
| 40 | + max_repair_attempts=3, | ||
| 41 | + ) | ||
| 35 | 42 | ||
| 36 | def run(self, input_data: Dict[str, Any], **kwargs) -> Dict[str, Any]: | 43 | def run(self, input_data: Dict[str, Any], **kwargs) -> Dict[str, Any]: |
| 37 | """ | 44 | """ |
| @@ -145,11 +152,13 @@ class TemplateSelectionNode(BaseNode): | @@ -145,11 +152,13 @@ class TemplateSelectionNode(BaseNode): | ||
| 145 | 152 | ||
| 146 | logger.info(f"LLM原始响应: {response}") | 153 | logger.info(f"LLM原始响应: {response}") |
| 147 | 154 | ||
| 148 | - # 尝试解析JSON响应 | 155 | + # 尝试解析JSON响应,使用鲁棒解析器 |
| 149 | try: | 156 | try: |
| 150 | - # 清理响应文本 | ||
| 151 | - cleaned_response = self._clean_llm_response(response) | ||
| 152 | - result = json.loads(cleaned_response) | 157 | + result = self.json_parser.parse( |
| 158 | + response, | ||
| 159 | + context_name="模板选择", | ||
| 160 | + expected_keys=["template_name", "selection_reason"], | ||
| 161 | + ) | ||
| 153 | 162 | ||
| 154 | # 验证选择的模板是否存在 | 163 | # 验证选择的模板是否存在 |
| 155 | selected_template_name = result.get('template_name', '') | 164 | selected_template_name = result.get('template_name', '') |
| @@ -165,33 +174,11 @@ class TemplateSelectionNode(BaseNode): | @@ -165,33 +174,11 @@ class TemplateSelectionNode(BaseNode): | ||
| 165 | logger.error(f"LLM选择的模板不存在: {selected_template_name}") | 174 | logger.error(f"LLM选择的模板不存在: {selected_template_name}") |
| 166 | return None | 175 | return None |
| 167 | 176 | ||
| 168 | - except json.JSONDecodeError as e: | 177 | + except JSONParseError as e: |
| 169 | logger.error(f"JSON解析失败: {str(e)}") | 178 | logger.error(f"JSON解析失败: {str(e)}") |
| 170 | # 尝试从文本响应中提取模板信息 | 179 | # 尝试从文本响应中提取模板信息 |
| 171 | return self._extract_template_from_text(response, available_templates) | 180 | return self._extract_template_from_text(response, available_templates) |
| 172 | 181 | ||
| 173 | - def _clean_llm_response(self, response: str) -> str: | ||
| 174 | - """ | ||
| 175 | - 清理LLM响应。 | ||
| 176 | - | ||
| 177 | - 去掉 ```json``` 包裹以及前后空白,方便 `json.loads`。 | ||
| 178 | - | ||
| 179 | - 参数: | ||
| 180 | - response: LLM原始响应。 | ||
| 181 | - | ||
| 182 | - 返回: | ||
| 183 | - str: 适合直接做JSON解析的纯文本。 | ||
| 184 | - """ | ||
| 185 | - # 移除可能的markdown代码块标记 | ||
| 186 | - if '```json' in response: | ||
| 187 | - response = response.split('```json')[1].split('```')[0] | ||
| 188 | - elif '```' in response: | ||
| 189 | - response = response.split('```')[1].split('```')[0] | ||
| 190 | - | ||
| 191 | - # 移除前后空白 | ||
| 192 | - response = response.strip() | ||
| 193 | - | ||
| 194 | - return response | ||
| 195 | 182 | ||
| 196 | def _extract_template_from_text(self, response: str, available_templates: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]: | 183 | def _extract_template_from_text(self, response: str, available_templates: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]: |
| 197 | """ | 184 | """ |
| @@ -14,6 +14,7 @@ from ..prompts import ( | @@ -14,6 +14,7 @@ from ..prompts import ( | ||
| 14 | SYSTEM_PROMPT_WORD_BUDGET, | 14 | SYSTEM_PROMPT_WORD_BUDGET, |
| 15 | build_word_budget_prompt, | 15 | build_word_budget_prompt, |
| 16 | ) | 16 | ) |
| 17 | +from ..utils.json_parser import RobustJSONParser, JSONParseError | ||
| 17 | from .base_node import BaseNode | 18 | from .base_node import BaseNode |
| 18 | 19 | ||
| 19 | 20 | ||
| @@ -27,6 +28,12 @@ class WordBudgetNode(BaseNode): | @@ -27,6 +28,12 @@ class WordBudgetNode(BaseNode): | ||
| 27 | def __init__(self, llm_client): | 28 | def __init__(self, llm_client): |
| 28 | """仅记录LLM客户端引用,方便run阶段发起请求""" | 29 | """仅记录LLM客户端引用,方便run阶段发起请求""" |
| 29 | super().__init__(llm_client, "WordBudgetNode") | 30 | super().__init__(llm_client, "WordBudgetNode") |
| 31 | + # 初始化鲁棒JSON解析器,启用所有修复策略 | ||
| 32 | + self.json_parser = RobustJSONParser( | ||
| 33 | + enable_json_repair=True, | ||
| 34 | + enable_llm_repair=False, # 可以根据需要启用LLM修复 | ||
| 35 | + max_repair_attempts=3, | ||
| 36 | + ) | ||
| 30 | 37 | ||
| 31 | def run( | 38 | def run( |
| 32 | self, | 39 | self, |
| @@ -79,8 +86,14 @@ class WordBudgetNode(BaseNode): | @@ -79,8 +86,14 @@ class WordBudgetNode(BaseNode): | ||
| 79 | """ | 86 | """ |
| 80 | 将LLM输出的JSON文本转为字典,失败时提示规划异常。 | 87 | 将LLM输出的JSON文本转为字典,失败时提示规划异常。 |
| 81 | 88 | ||
| 89 | + 使用鲁棒JSON解析器进行多重修复尝试: | ||
| 90 | + 1. 清理markdown标记和思考内容 | ||
| 91 | + 2. 本地语法修复(括号平衡、逗号补全、控制字符转义等) | ||
| 92 | + 3. 使用json_repair库进行高级修复 | ||
| 93 | + 4. 可选的LLM辅助修复 | ||
| 94 | + | ||
| 82 | 参数: | 95 | 参数: |
| 83 | - raw: LLM返回值,可能包含```包裹。 | 96 | + raw: LLM返回值,可能包含```包裹、思考内容等。 |
| 84 | 97 | ||
| 85 | 返回: | 98 | 返回: |
| 86 | dict: 合法的篇幅规划JSON。 | 99 | dict: 合法的篇幅规划JSON。 |
| @@ -88,19 +101,25 @@ class WordBudgetNode(BaseNode): | @@ -88,19 +101,25 @@ class WordBudgetNode(BaseNode): | ||
| 88 | 异常: | 101 | 异常: |
| 89 | ValueError: 当响应为空或JSON解析失败时抛出。 | 102 | ValueError: 当响应为空或JSON解析失败时抛出。 |
| 90 | """ | 103 | """ |
| 91 | - cleaned = raw.strip() | ||
| 92 | - if cleaned.startswith("```json"): | ||
| 93 | - cleaned = cleaned[7:] | ||
| 94 | - if cleaned.startswith("```"): | ||
| 95 | - cleaned = cleaned[3:] | ||
| 96 | - if cleaned.endswith("```"): | ||
| 97 | - cleaned = cleaned[:-3] | ||
| 98 | - cleaned = cleaned.strip() | ||
| 99 | - if not cleaned: | ||
| 100 | - raise ValueError("篇幅规划LLM返回空内容") | ||
| 101 | try: | 104 | try: |
| 102 | - return json.loads(cleaned) | ||
| 103 | - except json.JSONDecodeError as exc: | 105 | + result = self.json_parser.parse( |
| 106 | + raw, | ||
| 107 | + context_name="篇幅规划", | ||
| 108 | + expected_keys=["totalWords", "globalGuidelines", "chapters"], | ||
| 109 | + ) | ||
| 110 | + # 验证关键字段的类型 | ||
| 111 | + if not isinstance(result.get("totalWords"), (int, float)): | ||
| 112 | + logger.warning("篇幅规划缺少totalWords字段或类型错误,使用默认值") | ||
| 113 | + result.setdefault("totalWords", 10000) | ||
| 114 | + if not isinstance(result.get("globalGuidelines"), list): | ||
| 115 | + logger.warning("篇幅规划缺少globalGuidelines字段或类型错误,使用空列表") | ||
| 116 | + result.setdefault("globalGuidelines", []) | ||
| 117 | + if not isinstance(result.get("chapters"), (list, dict)): | ||
| 118 | + logger.warning("篇幅规划缺少chapters字段或类型错误,使用空列表") | ||
| 119 | + result.setdefault("chapters", []) | ||
| 120 | + return result | ||
| 121 | + except JSONParseError as exc: | ||
| 122 | + # 转换为原有的异常类型以保持向后兼容 | ||
| 104 | raise ValueError(f"篇幅规划JSON解析失败: {exc}") from exc | 123 | raise ValueError(f"篇幅规划JSON解析失败: {exc}") from exc |
| 105 | 124 | ||
| 106 | 125 |
| @@ -216,8 +216,17 @@ SYSTEM_PROMPT_TEMPLATE_SELECTION = f""" | @@ -216,8 +216,17 @@ SYSTEM_PROMPT_TEMPLATE_SELECTION = f""" | ||
| 216 | {json.dumps(output_schema_template_selection, indent=2, ensure_ascii=False)} | 216 | {json.dumps(output_schema_template_selection, indent=2, ensure_ascii=False)} |
| 217 | </OUTPUT JSON SCHEMA> | 217 | </OUTPUT JSON SCHEMA> |
| 218 | 218 | ||
| 219 | -确保输出是一个符合上述输出JSON模式定义的JSON对象。 | ||
| 220 | -只返回JSON对象,不要有解释或额外文本。 | 219 | +**重要的输出格式要求:** |
| 220 | +1. 只返回符合上述Schema的纯JSON对象 | ||
| 221 | +2. 严禁在JSON外添加任何思考过程、说明文字或解释 | ||
| 222 | +3. 可以使用```json和```标记包裹JSON,但不要添加其他内容 | ||
| 223 | +4. 确保JSON语法完全正确: | ||
| 224 | + - 对象和数组元素之间必须有逗号分隔 | ||
| 225 | + - 字符串中的特殊字符必须正确转义(\n, \t, \"等) | ||
| 226 | + - 括号必须成对且正确嵌套 | ||
| 227 | + - 不要使用尾随逗号(最后一个元素后不加逗号) | ||
| 228 | + - 不要在JSON中添加注释 | ||
| 229 | +5. 所有字符串值使用双引号,数值不使用引号 | ||
| 221 | """ | 230 | """ |
| 222 | 231 | ||
| 223 | # HTML报告生成的系统提示词 | 232 | # HTML报告生成的系统提示词 |
| @@ -372,7 +381,17 @@ SYSTEM_PROMPT_DOCUMENT_LAYOUT = f""" | @@ -372,7 +381,17 @@ SYSTEM_PROMPT_DOCUMENT_LAYOUT = f""" | ||
| 372 | {json.dumps(document_layout_output_schema, ensure_ascii=False, indent=2)} | 381 | {json.dumps(document_layout_output_schema, ensure_ascii=False, indent=2)} |
| 373 | </OUTPUT JSON SCHEMA> | 382 | </OUTPUT JSON SCHEMA> |
| 374 | 383 | ||
| 375 | -只返回JSON,勿附加额外文本。 | 384 | +**重要的输出格式要求:** |
| 385 | +1. 只返回符合上述Schema的纯JSON对象 | ||
| 386 | +2. 严禁在JSON外添加任何思考过程、说明文字或解释 | ||
| 387 | +3. 可以使用```json和```标记包裹JSON,但不要添加其他内容 | ||
| 388 | +4. 确保JSON语法完全正确: | ||
| 389 | + - 对象和数组元素之间必须有逗号分隔 | ||
| 390 | + - 字符串中的特殊字符必须正确转义(\n, \t, \"等) | ||
| 391 | + - 括号必须成对且正确嵌套 | ||
| 392 | + - 不要使用尾随逗号(最后一个元素后不加逗号) | ||
| 393 | + - 不要在JSON中添加注释 | ||
| 394 | +5. 所有字符串值使用双引号,数值不使用引号 | ||
| 376 | """ | 395 | """ |
| 377 | 396 | ||
| 378 | # 篇幅规划提示词 | 397 | # 篇幅规划提示词 |
| @@ -390,7 +409,17 @@ SYSTEM_PROMPT_WORD_BUDGET = f""" | @@ -390,7 +409,17 @@ SYSTEM_PROMPT_WORD_BUDGET = f""" | ||
| 390 | {json.dumps(word_budget_output_schema, ensure_ascii=False, indent=2)} | 409 | {json.dumps(word_budget_output_schema, ensure_ascii=False, indent=2)} |
| 391 | </OUTPUT JSON SCHEMA> | 410 | </OUTPUT JSON SCHEMA> |
| 392 | 411 | ||
| 393 | -只返回JSON,无额外说明。 | 412 | +**重要的输出格式要求:** |
| 413 | +1. 只返回符合上述Schema的纯JSON对象 | ||
| 414 | +2. 严禁在JSON外添加任何思考过程、说明文字或解释 | ||
| 415 | +3. 可以使用```json和```标记包裹JSON,但不要添加其他内容 | ||
| 416 | +4. 确保JSON语法完全正确: | ||
| 417 | + - 对象和数组元素之间必须有逗号分隔 | ||
| 418 | + - 字符串中的特殊字符必须正确转义(\n, \t, \"等) | ||
| 419 | + - 括号必须成对且正确嵌套 | ||
| 420 | + - 不要使用尾随逗号(最后一个元素后不加逗号) | ||
| 421 | + - 不要在JSON中添加注释 | ||
| 422 | +5. 所有字符串值使用双引号,数值不使用引号 | ||
| 394 | """ | 423 | """ |
| 395 | 424 | ||
| 396 | 425 |
ReportEngine/utils/json_parser.py
0 → 100644
| 1 | +""" | ||
| 2 | +统一的JSON解析和修复工具。 | ||
| 3 | + | ||
| 4 | +提供鲁棒的JSON解析能力,支持: | ||
| 5 | +1. 自动清理markdown代码块标记和思考内容 | ||
| 6 | +2. 本地语法修复(括号平衡、逗号补全、控制字符转义等) | ||
| 7 | +3. 使用json_repair库进行高级修复 | ||
| 8 | +4. LLM辅助修复(可选) | ||
| 9 | +5. 详细的错误日志和调试信息 | ||
| 10 | +""" | ||
| 11 | + | ||
| 12 | +from __future__ import annotations | ||
| 13 | + | ||
| 14 | +import json | ||
| 15 | +import re | ||
| 16 | +from typing import Any, Dict, List, Optional, Tuple, Callable | ||
| 17 | +from loguru import logger | ||
| 18 | + | ||
| 19 | +try: | ||
| 20 | + from json_repair import repair_json as _json_repair_fn | ||
| 21 | +except ImportError: | ||
| 22 | + _json_repair_fn = None | ||
| 23 | + | ||
| 24 | + | ||
class JSONParseError(ValueError):
    """Raised when JSON parsing fails; carries the raw text to ease debugging."""

    def __init__(self, message: str, raw_text: Optional[str] = None):
        """
        Build the exception and remember the output that caused it.

        Args:
            message: Human-readable description of the failure.
            raw_text: The LLM output that triggered the error, stored
                unchanged on the instance as ``raw_text``.
        """
        self.raw_text = raw_text
        super().__init__(message)
| 38 | + | ||
| 39 | + | ||
| 40 | +class RobustJSONParser: | ||
| 41 | + """ | ||
| 42 | + 鲁棒的JSON解析器。 | ||
| 43 | + | ||
| 44 | + 集成多种修复策略,确保LLM返回的内容能够被正确解析: | ||
| 45 | + - 清理markdown包裹、思考内容等额外信息 | ||
| 46 | + - 修复常见语法错误(缺少逗号、括号不平衡等) | ||
| 47 | + - 转义未转义的控制字符 | ||
| 48 | + - 使用第三方库进行高级修复 | ||
| 49 | + - 可选的LLM辅助修复 | ||
| 50 | + """ | ||
| 51 | + | ||
| 52 | + # 常见的LLM思考内容模式 | ||
| 53 | + _THINKING_PATTERNS = [ | ||
| 54 | + r"<thinking>.*?</thinking>", | ||
| 55 | + r"<thought>.*?</thought>", | ||
| 56 | + r"让我想想.*?(?=\{|\[|$)", | ||
| 57 | + r"首先.*?(?=\{|\[|$)", | ||
| 58 | + r"分析.*?(?=\{|\[|$)", | ||
| 59 | + r"根据.*?(?=\{|\[|$)", | ||
| 60 | + ] | ||
| 61 | + | ||
| 62 | + # 冒号等号模式(LLM常见错误) | ||
| 63 | + _COLON_EQUALS_PATTERN = re.compile(r'(":\s*)=') | ||
| 64 | + | ||
def __init__(
    self,
    llm_repair_fn: Optional[Callable[[str, str], Optional[str]]] = None,
    enable_json_repair: bool = True,
    enable_llm_repair: bool = False,
    max_repair_attempts: int = 3,
):
    """
    Configure the parser's repair strategies.

    Args:
        llm_repair_fn: Optional callback taking (broken JSON, error message)
            and returning repaired JSON text (or None).
        enable_json_repair: Whether to use the ``json_repair`` library. The
            stored flag is only truthy when the library import succeeded.
        enable_llm_repair: Whether the LLM-assisted repair stage may run.
        max_repair_attempts: Maximum number of repair attempts (stored on
            the instance).
    """
    self.max_repair_attempts = max_repair_attempts
    self.enable_llm_repair = enable_llm_repair
    # The library stage is silently disabled when json_repair is not installed.
    self.enable_json_repair = enable_json_repair and _json_repair_fn is not None
    self.llm_repair_fn = llm_repair_fn
| 85 | + | ||
def parse(
    self,
    raw_text: str,
    context_name: str = "JSON",
    expected_keys: Optional[List[str]] = None,
    extract_wrapper_key: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Parse JSON text returned by an LLM, escalating through repair stages.

    Stages, in order: clean the response, try the cleaned text and (if it
    differs) the locally repaired text, then the ``json_repair`` library,
    then the optional LLM repair callback.

    Args:
        raw_text: Raw LLM output (may contain code fences, thinking text, ...).
        context_name: Label used in log and error messages.
        expected_keys: Keys forwarded to ``_extract_and_validate``.
        extract_wrapper_key: Wrapper key forwarded to ``_extract_and_validate``.

    Returns:
        dict: Whatever ``_extract_and_validate`` returns for the decoded data.

    Raises:
        JSONParseError: When the input is empty/blank or no stage produced
            decodable JSON.
    """
    if not (raw_text and raw_text.strip()):
        raise JSONParseError(f"{context_name}返回空内容")

    def finalize(payload: Any) -> Dict[str, Any]:
        # Single exit point shared by every successful stage.
        return self._extract_and_validate(
            payload, expected_keys, extract_wrapper_key, context_name
        )

    cleaned = self._clean_response(raw_text)
    locally_fixed = self._apply_local_repairs(cleaned)

    attempts = [cleaned]
    if locally_fixed != cleaned:
        attempts.append(locally_fixed)

    failure: Optional[json.JSONDecodeError] = None
    total = len(attempts)
    for ordinal, attempt in enumerate(attempts, start=1):
        try:
            decoded = json.loads(attempt)
            logger.debug(f"{context_name} JSON解析成功(候选{ordinal}/{total})")
            return finalize(decoded)
        except json.JSONDecodeError as exc:
            failure = exc
            logger.debug(f"{context_name} 候选{ordinal}解析失败: {exc}")

    # Library-based repair works on the cleaned text, not the local repair.
    library_fix = None
    if self.enable_json_repair:
        library_fix = self._attempt_json_repair(cleaned, context_name)
    if library_fix:
        try:
            decoded = json.loads(library_fix)
            logger.info(f"{context_name} JSON通过json_repair库修复成功")
            return finalize(decoded)
        except json.JSONDecodeError as exc:
            failure = exc
            logger.debug(f"{context_name} json_repair修复后仍无法解析: {exc}")

    # Last resort: ask the LLM callback, passing the most recent decode error.
    llm_fix = None
    if self.enable_llm_repair and self.llm_repair_fn:
        llm_fix = self._attempt_llm_repair(cleaned, str(failure), context_name)
    if llm_fix:
        try:
            decoded = json.loads(llm_fix)
            logger.info(f"{context_name} JSON通过LLM修复成功")
            return finalize(decoded)
        except json.JSONDecodeError as exc:
            failure = exc
            logger.warning(f"{context_name} LLM修复后仍无法解析: {exc}")

    message = f"{context_name} JSON解析失败: {failure}"
    logger.error(message)
    logger.debug(f"原始文本前500字符: {raw_text[:500]}")
    raise JSONParseError(message, raw_text=raw_text) from failure
| 168 | + | ||
| 169 | + def _clean_response(self, raw: str) -> str: | ||
| 170 | + """ | ||
| 171 | + 清理LLM响应,去除markdown标记和思考内容。 | ||
| 172 | + | ||
| 173 | + 参数: | ||
| 174 | + raw: LLM原始输出 | ||
| 175 | + | ||
| 176 | + 返回: | ||
| 177 | + str: 清理后的文本 | ||
| 178 | + """ | ||
| 179 | + cleaned = raw.strip() | ||
| 180 | + | ||
| 181 | + # 移除思考内容(多语言支持) | ||
| 182 | + for pattern in self._THINKING_PATTERNS: | ||
| 183 | + cleaned = re.sub(pattern, "", cleaned, flags=re.DOTALL | re.IGNORECASE) | ||
| 184 | + | ||
| 185 | + # 移除markdown代码块标记 | ||
| 186 | + if cleaned.startswith("```json"): | ||
| 187 | + cleaned = cleaned[7:] | ||
| 188 | + elif cleaned.startswith("```"): | ||
| 189 | + cleaned = cleaned[3:] | ||
| 190 | + | ||
| 191 | + if cleaned.endswith("```"): | ||
| 192 | + cleaned = cleaned[:-3] | ||
| 193 | + | ||
| 194 | + cleaned = cleaned.strip() | ||
| 195 | + | ||
| 196 | + # 尝试提取第一个完整的JSON对象或数组 | ||
| 197 | + cleaned = self._extract_first_json_structure(cleaned) | ||
| 198 | + | ||
| 199 | + return cleaned | ||
| 200 | + | ||
| 201 | + def _extract_first_json_structure(self, text: str) -> str: | ||
| 202 | + """ | ||
| 203 | + 从文本中提取第一个完整的JSON对象或数组。 | ||
| 204 | + | ||
| 205 | + 这对于处理LLM在JSON前后添加说明文字的情况很有用。 | ||
| 206 | + | ||
| 207 | + 参数: | ||
| 208 | + text: 可能包含JSON的文本 | ||
| 209 | + | ||
| 210 | + 返回: | ||
| 211 | + str: 提取的JSON文本,如果找不到则返回原文本 | ||
| 212 | + """ | ||
| 213 | + # 查找第一个 { 或 [ | ||
| 214 | + start_brace = text.find("{") | ||
| 215 | + start_bracket = text.find("[") | ||
| 216 | + | ||
| 217 | + if start_brace == -1 and start_bracket == -1: | ||
| 218 | + return text | ||
| 219 | + | ||
| 220 | + # 确定起始位置 | ||
| 221 | + if start_brace == -1: | ||
| 222 | + start = start_bracket | ||
| 223 | + opener = "[" | ||
| 224 | + closer = "]" | ||
| 225 | + elif start_bracket == -1: | ||
| 226 | + start = start_brace | ||
| 227 | + opener = "{" | ||
| 228 | + closer = "}" | ||
| 229 | + else: | ||
| 230 | + start = min(start_brace, start_bracket) | ||
| 231 | + opener = text[start] | ||
| 232 | + closer = "}" if opener == "{" else "]" | ||
| 233 | + | ||
| 234 | + # 查找对应的结束位置 | ||
| 235 | + depth = 0 | ||
| 236 | + in_string = False | ||
| 237 | + escaped = False | ||
| 238 | + | ||
| 239 | + for i in range(start, len(text)): | ||
| 240 | + ch = text[i] | ||
| 241 | + | ||
| 242 | + if escaped: | ||
| 243 | + escaped = False | ||
| 244 | + continue | ||
| 245 | + | ||
| 246 | + if ch == "\\": | ||
| 247 | + escaped = True | ||
| 248 | + continue | ||
| 249 | + | ||
| 250 | + if ch == '"': | ||
| 251 | + in_string = not in_string | ||
| 252 | + continue | ||
| 253 | + | ||
| 254 | + if in_string: | ||
| 255 | + continue | ||
| 256 | + | ||
| 257 | + if ch in "{[": | ||
| 258 | + depth += 1 | ||
| 259 | + elif ch in "}]": | ||
| 260 | + depth -= 1 | ||
| 261 | + if depth == 0: | ||
| 262 | + return text[start : i + 1] | ||
| 263 | + | ||
| 264 | + # 如果没找到完整的结构,返回从起始位置到结尾 | ||
| 265 | + return text[start:] if start < len(text) else text | ||
| 266 | + | ||
| 267 | + def _apply_local_repairs(self, text: str) -> str: | ||
| 268 | + """ | ||
| 269 | + 应用本地修复策略。 | ||
| 270 | + | ||
| 271 | + 参数: | ||
| 272 | + text: 原始JSON文本 | ||
| 273 | + | ||
| 274 | + 返回: | ||
| 275 | + str: 修复后的文本 | ||
| 276 | + """ | ||
| 277 | + repaired = text | ||
| 278 | + mutated = False | ||
| 279 | + | ||
| 280 | + # 修复 ":=" 错误 | ||
| 281 | + new_text = self._COLON_EQUALS_PATTERN.sub(r"\1", repaired) | ||
| 282 | + if new_text != repaired: | ||
| 283 | + logger.warning("检测到\":=\"字符,已自动移除多余的'='号") | ||
| 284 | + repaired = new_text | ||
| 285 | + mutated = True | ||
| 286 | + | ||
| 287 | + # 转义控制字符 | ||
| 288 | + repaired, escaped = self._escape_control_characters(repaired) | ||
| 289 | + if escaped: | ||
| 290 | + logger.warning("检测到未转义的控制字符,已自动转换为转义序列") | ||
| 291 | + mutated = True | ||
| 292 | + | ||
| 293 | + # 修复缺少的逗号 | ||
| 294 | + repaired, commas_fixed = self._fix_missing_commas(repaired) | ||
| 295 | + if commas_fixed: | ||
| 296 | + logger.warning("检测到对象/数组之间缺少逗号,已自动补齐") | ||
| 297 | + mutated = True | ||
| 298 | + | ||
| 299 | + # 平衡括号 | ||
| 300 | + repaired, balanced = self._balance_brackets(repaired) | ||
| 301 | + if balanced: | ||
| 302 | + logger.warning("检测到括号不平衡,已自动补齐/剔除异常括号") | ||
| 303 | + mutated = True | ||
| 304 | + | ||
| 305 | + # 移除尾随逗号 | ||
| 306 | + repaired, trailing_removed = self._remove_trailing_commas(repaired) | ||
| 307 | + if trailing_removed: | ||
| 308 | + logger.warning("检测到尾随逗号,已自动移除") | ||
| 309 | + mutated = True | ||
| 310 | + | ||
| 311 | + return repaired if mutated else text | ||
| 312 | + | ||
| 313 | + def _escape_control_characters(self, text: str) -> Tuple[str, bool]: | ||
| 314 | + """ | ||
| 315 | + 将字符串字面量中的裸换行/制表符/控制字符替换为JSON合法的转义序列。 | ||
| 316 | + | ||
| 317 | + 参数: | ||
| 318 | + text: 原始JSON文本 | ||
| 319 | + | ||
| 320 | + 返回: | ||
| 321 | + Tuple[str, bool]: (修复后的文本, 是否有修改) | ||
| 322 | + """ | ||
| 323 | + if not text: | ||
| 324 | + return text, False | ||
| 325 | + | ||
| 326 | + result: List[str] = [] | ||
| 327 | + in_string = False | ||
| 328 | + escaped = False | ||
| 329 | + mutated = False | ||
| 330 | + control_map = {"\n": "\\n", "\r": "\\r", "\t": "\\t"} | ||
| 331 | + | ||
| 332 | + for ch in text: | ||
| 333 | + if escaped: | ||
| 334 | + result.append(ch) | ||
| 335 | + escaped = False | ||
| 336 | + continue | ||
| 337 | + | ||
| 338 | + if ch == "\\": | ||
| 339 | + result.append(ch) | ||
| 340 | + escaped = True | ||
| 341 | + continue | ||
| 342 | + | ||
| 343 | + if ch == '"': | ||
| 344 | + result.append(ch) | ||
| 345 | + in_string = not in_string | ||
| 346 | + continue | ||
| 347 | + | ||
| 348 | + if in_string and ch in control_map: | ||
| 349 | + result.append(control_map[ch]) | ||
| 350 | + mutated = True | ||
| 351 | + continue | ||
| 352 | + | ||
| 353 | + if in_string and ord(ch) < 0x20: | ||
| 354 | + result.append(f"\\u{ord(ch):04x}") | ||
| 355 | + mutated = True | ||
| 356 | + continue | ||
| 357 | + | ||
| 358 | + result.append(ch) | ||
| 359 | + | ||
| 360 | + return "".join(result), mutated | ||
| 361 | + | ||
| 362 | + def _fix_missing_commas(self, text: str) -> Tuple[str, bool]: | ||
| 363 | + """ | ||
| 364 | + 在对象/数组元素之间自动补逗号。 | ||
| 365 | + | ||
| 366 | + 参数: | ||
| 367 | + text: 原始JSON文本 | ||
| 368 | + | ||
| 369 | + 返回: | ||
| 370 | + Tuple[str, bool]: (修复后的文本, 是否有修改) | ||
| 371 | + """ | ||
| 372 | + if not text: | ||
| 373 | + return text, False | ||
| 374 | + | ||
| 375 | + chars: List[str] = [] | ||
| 376 | + mutated = False | ||
| 377 | + in_string = False | ||
| 378 | + escaped = False | ||
| 379 | + length = len(text) | ||
| 380 | + i = 0 | ||
| 381 | + | ||
| 382 | + while i < length: | ||
| 383 | + ch = text[i] | ||
| 384 | + chars.append(ch) | ||
| 385 | + | ||
| 386 | + if escaped: | ||
| 387 | + escaped = False | ||
| 388 | + i += 1 | ||
| 389 | + continue | ||
| 390 | + | ||
| 391 | + if ch == "\\": | ||
| 392 | + escaped = True | ||
| 393 | + i += 1 | ||
| 394 | + continue | ||
| 395 | + | ||
| 396 | + if ch == '"': | ||
| 397 | + # 如果我们正在退出字符串,检查后面是否需要逗号 | ||
| 398 | + if in_string: | ||
| 399 | + # 查找下一个非空白字符 | ||
| 400 | + j = i + 1 | ||
| 401 | + while j < length and text[j] in " \t\r\n": | ||
| 402 | + j += 1 | ||
| 403 | + # 如果下一个字符是 " { [ 或数字,可能需要逗号 | ||
| 404 | + if j < length: | ||
| 405 | + next_ch = text[j] | ||
| 406 | + if next_ch in "\"[{" or next_ch.isdigit(): | ||
| 407 | + # 检查是否已经在对象或数组中 | ||
| 408 | + # 通过检查前面是否有未闭合的 { 或 [ | ||
| 409 | + has_opener = False | ||
| 410 | + for k in range(len(chars) - 1, -1, -1): | ||
| 411 | + if chars[k] in "{[": | ||
| 412 | + has_opener = True | ||
| 413 | + break | ||
| 414 | + elif chars[k] in "]}": | ||
| 415 | + break | ||
| 416 | + | ||
| 417 | + if has_opener: | ||
| 418 | + chars.append(",") | ||
| 419 | + mutated = True | ||
| 420 | + | ||
| 421 | + in_string = not in_string | ||
| 422 | + i += 1 | ||
| 423 | + continue | ||
| 424 | + | ||
| 425 | + # 在 } 或 ] 后面检查是否需要逗号 | ||
| 426 | + if not in_string and ch in "}]": | ||
| 427 | + j = i + 1 | ||
| 428 | + # 跳过空白 | ||
| 429 | + while j < length and text[j] in " \t\r\n": | ||
| 430 | + j += 1 | ||
| 431 | + # 如果下一个非空白字符是 { [ " 或数字,添加逗号 | ||
| 432 | + if j < length: | ||
| 433 | + next_ch = text[j] | ||
| 434 | + if next_ch in "{[\"" or next_ch.isdigit(): | ||
| 435 | + chars.append(",") | ||
| 436 | + mutated = True | ||
| 437 | + | ||
| 438 | + i += 1 | ||
| 439 | + | ||
| 440 | + return "".join(chars), mutated | ||
| 441 | + | ||
| 442 | + def _balance_brackets(self, text: str) -> Tuple[str, bool]: | ||
| 443 | + """ | ||
| 444 | + 尝试修复因LLM多写/少写括号导致的不平衡结构。 | ||
| 445 | + | ||
| 446 | + 参数: | ||
| 447 | + text: 原始JSON文本 | ||
| 448 | + | ||
| 449 | + 返回: | ||
| 450 | + Tuple[str, bool]: (修复后的文本, 是否有修改) | ||
| 451 | + """ | ||
| 452 | + if not text: | ||
| 453 | + return text, False | ||
| 454 | + | ||
| 455 | + result: List[str] = [] | ||
| 456 | + stack: List[str] = [] | ||
| 457 | + mutated = False | ||
| 458 | + in_string = False | ||
| 459 | + escaped = False | ||
| 460 | + | ||
| 461 | + opener_map = {"{": "}", "[": "]"} | ||
| 462 | + | ||
| 463 | + for ch in text: | ||
| 464 | + if escaped: | ||
| 465 | + result.append(ch) | ||
| 466 | + escaped = False | ||
| 467 | + continue | ||
| 468 | + | ||
| 469 | + if ch == "\\": | ||
| 470 | + result.append(ch) | ||
| 471 | + escaped = True | ||
| 472 | + continue | ||
| 473 | + | ||
| 474 | + if ch == '"': | ||
| 475 | + result.append(ch) | ||
| 476 | + in_string = not in_string | ||
| 477 | + continue | ||
| 478 | + | ||
| 479 | + if in_string: | ||
| 480 | + result.append(ch) | ||
| 481 | + continue | ||
| 482 | + | ||
| 483 | + if ch in "{[": | ||
| 484 | + stack.append(ch) | ||
| 485 | + result.append(ch) | ||
| 486 | + continue | ||
| 487 | + | ||
| 488 | + if ch in "}]": | ||
| 489 | + if stack and ( | ||
| 490 | + (ch == "}" and stack[-1] == "{") or (ch == "]" and stack[-1] == "[") | ||
| 491 | + ): | ||
| 492 | + stack.pop() | ||
| 493 | + result.append(ch) | ||
| 494 | + else: | ||
| 495 | + # 不匹配的闭括号,忽略 | ||
| 496 | + mutated = True | ||
| 497 | + continue | ||
| 498 | + | ||
| 499 | + result.append(ch) | ||
| 500 | + | ||
| 501 | + # 补齐未闭合的括号 | ||
| 502 | + while stack: | ||
| 503 | + opener = stack.pop() | ||
| 504 | + result.append(opener_map[opener]) | ||
| 505 | + mutated = True | ||
| 506 | + | ||
| 507 | + return "".join(result), mutated | ||
| 508 | + | ||
| 509 | + def _remove_trailing_commas(self, text: str) -> Tuple[str, bool]: | ||
| 510 | + """ | ||
| 511 | + 移除JSON对象和数组中的尾随逗号。 | ||
| 512 | + | ||
| 513 | + 参数: | ||
| 514 | + text: 原始JSON文本 | ||
| 515 | + | ||
| 516 | + 返回: | ||
| 517 | + Tuple[str, bool]: (修复后的文本, 是否有修改) | ||
| 518 | + """ | ||
| 519 | + if not text: | ||
| 520 | + return text, False | ||
| 521 | + | ||
| 522 | + # 使用正则表达式移除尾随逗号 | ||
| 523 | + # 匹配 , 后面跟着空白和 } 或 ] 的情况 | ||
| 524 | + pattern = r",(\s*[}\]])" | ||
| 525 | + new_text = re.sub(pattern, r"\1", text) | ||
| 526 | + | ||
| 527 | + return new_text, new_text != text | ||
| 528 | + | ||
def _attempt_json_repair(self, text: str, context_name: str) -> Optional[str]:
    """
    Try to repair ``text`` with the ``json_repair`` library.

    Args:
        text: JSON text to repair.
        context_name: Label used in log messages.

    Returns:
        Optional[str]: The library's output when it is non-empty and differs
        from ``text``; None when the library is unavailable, raised, or
        produced nothing new.
    """
    if not _json_repair_fn:
        return None

    try:
        candidate = _json_repair_fn(text)
        if candidate and candidate != text:
            logger.info(f"{context_name} 使用json_repair库自动修复JSON")
            return candidate
    except Exception as exc:
        # Best-effort stage: a library failure just falls through to None.
        logger.debug(f"{context_name} json_repair修复失败: {exc}")

    return None
| 552 | + | ||
def _attempt_llm_repair(
    self, text: str, error_msg: str, context_name: str
) -> Optional[str]:
    """
    Ask the configured LLM callback to repair the JSON text.

    Parameters:
        text: JSON text that failed to parse.
        error_msg: Parser error message handed to the callback.
        context_name: Label prepended to log messages.

    Returns:
        Optional[str]: The callback's output when it is non-empty and differs
        from the input; otherwise None (no callback set, nothing changed, or
        the callback raised).
    """
    outcome: Optional[str] = None

    if self.llm_repair_fn:
        try:
            logger.info(f"{context_name} 尝试使用LLM修复JSON")
            candidate = self.llm_repair_fn(text, error_msg)
            if candidate and candidate != text:
                outcome = candidate
        except Exception as exc:
            # Best-effort step: a callback failure just means "no repair".
            logger.warning(f"{context_name} LLM修复失败: {exc}")

    return outcome
| 579 | + | ||
def _extract_and_validate(
    self,
    data: Any,
    expected_keys: Optional[List[str]],
    extract_wrapper_key: Optional[str],
    context_name: str,
) -> Dict[str, Any]:
    """
    Unwrap the parsed value and check that it is a JSON object.

    Parameters:
        data: Value produced by the JSON parser.
        expected_keys: Keys the object should contain; missing ones are only
            logged as a warning.
        extract_wrapper_key: When set and present in a dict payload, the
            value stored under this key replaces the payload.
        context_name: Label prepended to log and error messages.

    Returns:
        Dict[str, Any]: The unwrapped object.

    Raises:
        JSONParseError: If the payload is neither a dict nor a non-empty list
            whose first element is a dict.
    """
    payload = data

    # Step 1: peel off the wrapper key if one was requested.
    if extract_wrapper_key and isinstance(payload, dict):
        if extract_wrapper_key not in payload:
            logger.warning(
                f"{context_name} 未找到包裹键'{extract_wrapper_key}',使用原始数据"
            )
        else:
            payload = payload[extract_wrapper_key]

    # Step 2: insist on an object, tolerating a list that starts with one.
    if not isinstance(payload, dict):
        starts_with_object = (
            isinstance(payload, list)
            and len(payload) > 0
            and isinstance(payload[0], dict)
        )
        if not starts_with_object:
            raise JSONParseError(
                f"{context_name} 返回的不是JSON对象: {type(payload).__name__}"
            )
        logger.warning(f"{context_name} 返回数组,自动提取第一个元素")
        payload = payload[0]

    # Step 3: report (but do not reject) missing expected keys.
    if expected_keys:
        absent = [name for name in expected_keys if name not in payload]
        if absent:
            logger.warning(
                f"{context_name} 缺少预期的键: {', '.join(absent)}"
            )

    return payload
| 630 | + | ||
| 631 | + | ||
| 632 | +__all__ = ["RobustJSONParser", "JSONParseError"] |
ReportEngine/utils/test_json_parser.py
0 → 100644
| 1 | +""" | ||
| 2 | +测试RobustJSONParser的各种修复能力。 | ||
| 3 | + | ||
| 4 | +验证解析器能够处理: | ||
| 5 | +1. 基本的markdown包裹 | ||
| 6 | +2. 思考内容清理 | ||
| 7 | +3. 缺少逗号的修复 | ||
| 8 | +4. 括号不平衡的修复 | ||
| 9 | +5. 控制字符转义 | ||
| 10 | +6. 尾随逗号移除 | ||
| 11 | +""" | ||
| 12 | + | ||
| 13 | +import json | ||
| 14 | +import unittest | ||
| 15 | +from json_parser import RobustJSONParser, JSONParseError | ||
| 16 | + | ||
| 17 | + | ||
class TestRobustJSONParser(unittest.TestCase):
    """Exercise the individual repair strategies of RobustJSONParser."""

    def setUp(self):
        """Create a parser limited to its built-in (local) repairs."""
        # json_repair and LLM repair are switched off so that only the
        # local fix-ups are under test.
        self.parser = RobustJSONParser(
            enable_llm_repair=False,
            enable_json_repair=False,
        )

    def _assert_name_and_value(self, parsed):
        """Check the name/value pair shared by most fixtures."""
        self.assertEqual(parsed["name"], "test")
        self.assertEqual(parsed["value"], 123)

    def test_basic_json(self):
        """Well-formed JSON is parsed as-is."""
        parsed = self.parser.parse('{"name": "test", "value": 123}', "基本测试")
        self._assert_name_and_value(parsed)

    def test_markdown_wrapped(self):
        """JSON wrapped in a ```json fence is unwrapped and parsed."""
        payload = """```json
{
    "name": "test",
    "value": 123
}
```"""
        parsed = self.parser.parse(payload, "Markdown包裹测试")
        self._assert_name_and_value(parsed)

    def test_thinking_content_removal(self):
        """A leading <thinking> section is stripped before parsing."""
        payload = """<thinking>让我想想如何构造这个JSON</thinking>
{
    "name": "test",
    "value": 123
}"""
        parsed = self.parser.parse(payload, "思考内容清理测试")
        self._assert_name_and_value(parsed)

    def test_missing_comma_fix(self):
        """A missing comma between array elements is repaired."""
        # Common failure seen in practice: adjacent array items without a comma.
        payload = """{
    "totalWords": 40000,
    "globalGuidelines": [
        "重点突出技术红利分配失衡"
        "详略策略:技术创新"
    ],
    "chapters": []
}"""
        parsed = self.parser.parse(payload, "缺少逗号修复测试")
        self.assertEqual(len(parsed["globalGuidelines"]), 2)

    def test_unbalanced_brackets(self):
        """A missing closing brace is appended."""
        # The outermost } is deliberately left out.
        payload = """{
    "name": "test",
    "nested": {
        "value": 123
    }
"""
        parsed = self.parser.parse(payload, "括号不平衡测试")
        self.assertEqual(parsed["name"], "test")
        self.assertEqual(parsed["nested"]["value"], 123)

    def test_control_character_escape(self):
        """A raw newline inside a string value is escaped."""
        payload = """{
    "text": "这是第一行
这是第二行",
    "value": 123
}"""
        parsed = self.parser.parse(payload, "控制字符转义测试")
        # Both halves of the broken line must survive in the value.
        for fragment in ("第一行", "第二行"):
            self.assertIn(fragment, parsed["text"])

    def test_trailing_comma_removal(self):
        """Trailing commas in objects and arrays are dropped."""
        payload = """{
    "name": "test",
    "value": 123,
    "items": [1, 2, 3,],
}"""
        parsed = self.parser.parse(payload, "尾随逗号测试")
        self.assertEqual(parsed["name"], "test")
        self.assertEqual(len(parsed["items"]), 3)

    def test_colon_equals_fix(self):
        """A stray ':=' separator is normalised to ':'."""
        payload = """{
    "name":= "test",
    "value": 123
}"""
        parsed = self.parser.parse(payload, "冒号等号测试")
        self.assertEqual(parsed["name"], "test")

    def test_extract_first_json(self):
        """The first JSON structure is pulled out of surrounding prose."""
        payload = """这是一些说明文字,下面是JSON:
{
    "name": "test",
    "value": 123
}
后面还有一些其他文字"""
        parsed = self.parser.parse(payload, "提取JSON测试")
        self._assert_name_and_value(parsed)

    def test_complex_real_world_case(self):
        """Thinking block, markdown fence and missing commas combined."""
        payload = """<thinking>我需要构造一个篇幅规划</thinking>
```json
{
    "totalWords": 40000,
    "tolerance": 2000,
    "globalGuidelines": [
        "重点突出技术红利分配失衡、人才流失与职业认同危机等结构性矛盾"
        "详略策略:技术创新与传统技艺的碰撞"
        "案例导向:优先引用真实数据和调研"
    ],
    "chapters": [
        {
            "chapterId": "ch1",
            "targetWords": 5000
        }
    ]
}
```"""
        parsed = self.parser.parse(payload, "复杂真实案例测试")
        self.assertEqual(parsed["totalWords"], 40000)
        self.assertEqual(parsed["tolerance"], 2000)
        self.assertEqual(len(parsed["globalGuidelines"]), 3)
        self.assertEqual(len(parsed["chapters"]), 1)

    def test_expected_keys_validation(self):
        """A missing expected key only warns; parsing still succeeds."""
        parsed = self.parser.parse(
            '{"name": "test"}',
            "键验证测试",
            expected_keys=["name", "value"],
        )
        self.assertIn("name", parsed)

    def test_wrapper_key_extraction(self):
        """The object stored under the wrapper key is returned."""
        payload = """{
    "wrapper": {
        "name": "test",
        "value": 123
    }
}"""
        parsed = self.parser.parse(
            payload,
            "包裹键测试",
            extract_wrapper_key="wrapper",
        )
        self._assert_name_and_value(parsed)

    def test_empty_input(self):
        """Empty input raises JSONParseError."""
        self.assertRaises(JSONParseError, self.parser.parse, "", "空输入测试")

    def test_invalid_json_after_all_repairs(self):
        """Input beyond repair raises JSONParseError."""
        # Badly damaged text that no local strategy can rescue.
        broken = "{完全不是JSON格式的内容###"
        self.assertRaises(JSONParseError, self.parser.parse, broken, "无法修复测试")
| 192 | + | ||
def run_manual_test():
    """Parse one real-world failing sample and print a verbose report."""
    banner = "=" * 60

    print(banner)
    print("开始测试RobustJSONParser")
    print(banner)

    local_only_parser = RobustJSONParser(
        enable_json_repair=False,
        enable_llm_repair=False,
    )

    # Sample taken from an actual failure: fenced JSON with a missing comma.
    sample = """```json
{
    "totalWords": 40000,
    "tolerance": 2000,
    "globalGuidelines": [
        "重点突出技术红利分配失衡、人才流失与职业认同危机等结构性矛盾"
        "详略策略:技术创新与传统技艺的碰撞"
    ],
    "chapters": []
}
```"""

    print("\n测试案例:")
    print(sample)
    print("\n" + banner)

    try:
        parsed = local_only_parser.parse(sample, "手动测试")
        print("\n✓ 解析成功!")
        print("\n解析结果:")
        print(json.dumps(parsed, ensure_ascii=False, indent=2))
    except Exception as e:
        # Manual demo: report any failure instead of aborting the script.
        print(f"\n✗ 解析失败: {e}")

    print("\n" + banner)
| 228 | + | ||
if __name__ == "__main__":
    # First the verbose walk-through of a single sample ...
    run_manual_test()

    # ... then the full unittest suite, one result line per test.
    print("\n\n运行单元测试...")
    unittest.main(verbosity=2)
-
Please register or login to post a comment