Showing 2 changed files with 80 additions and 15 deletions
| @@ -108,18 +108,13 @@ class RobustJSONParser: | @@ -108,18 +108,13 @@ class RobustJSONParser: | ||
| 108 | if not raw_text or not raw_text.strip(): | 108 | if not raw_text or not raw_text.strip(): |
| 109 | raise JSONParseError(f"{context_name}返回空内容") | 109 | raise JSONParseError(f"{context_name}返回空内容") |
| 110 | 110 | ||
| 111 | - # 步骤1: 清理markdown标记和思考内容 | ||
| 112 | - cleaned = self._clean_response(raw_text) | ||
| 113 | - | ||
| 114 | - # 步骤2: 收集候选payload | ||
| 115 | - candidates = [cleaned] | 111 | + # 原始文本用于后续日志 |
| 112 | + original_text = raw_text | ||
| 116 | 113 | ||
| 117 | - # 步骤3: 应用本地修复策略 | ||
| 118 | - local_repaired = self._apply_local_repairs(cleaned) | ||
| 119 | - if local_repaired != cleaned: | ||
| 120 | - candidates.append(local_repaired) | 114 | + # 步骤1: 构造候选集,包含不同清理策略 |
| 115 | + candidates = self._build_candidate_payloads(raw_text, context_name) | ||
| 121 | 116 | ||
| 122 | - # 步骤4: 尝试解析所有候选 | 117 | + # 步骤2: 尝试解析所有候选 |
| 123 | last_error: Optional[json.JSONDecodeError] = None | 118 | last_error: Optional[json.JSONDecodeError] = None |
| 124 | for i, candidate in enumerate(candidates): | 119 | for i, candidate in enumerate(candidates): |
| 125 | try: | 120 | try: |
| @@ -132,7 +127,9 @@ class RobustJSONParser: | @@ -132,7 +127,9 @@ class RobustJSONParser: | ||
| 132 | last_error = exc | 127 | last_error = exc |
| 133 | logger.debug(f"{context_name} 候选{i + 1}解析失败: {exc}") | 128 | logger.debug(f"{context_name} 候选{i + 1}解析失败: {exc}") |
| 134 | 129 | ||
| 135 | - # 步骤5: 使用json_repair库 | 130 | + cleaned = candidates[0] if candidates else original_text |
| 131 | + | ||
| 132 | + # 步骤3: 使用json_repair库 | ||
| 136 | if self.enable_json_repair: | 133 | if self.enable_json_repair: |
| 137 | repaired = self._attempt_json_repair(cleaned, context_name) | 134 | repaired = self._attempt_json_repair(cleaned, context_name) |
| 138 | if repaired: | 135 | if repaired: |
| @@ -146,7 +143,7 @@ class RobustJSONParser: | @@ -146,7 +143,7 @@ class RobustJSONParser: | ||
| 146 | last_error = exc | 143 | last_error = exc |
| 147 | logger.debug(f"{context_name} json_repair修复后仍无法解析: {exc}") | 144 | logger.debug(f"{context_name} json_repair修复后仍无法解析: {exc}") |
| 148 | 145 | ||
| 149 | - # 步骤6: 使用LLM修复(如果启用) | 146 | + # 步骤4: 使用LLM修复(如果启用) |
| 150 | if self.enable_llm_repair and self.llm_repair_fn: | 147 | if self.enable_llm_repair and self.llm_repair_fn: |
| 151 | llm_repaired = self._attempt_llm_repair(cleaned, str(last_error), context_name) | 148 | llm_repaired = self._attempt_llm_repair(cleaned, str(last_error), context_name) |
| 152 | if llm_repaired: | 149 | if llm_repaired: |
| @@ -163,8 +160,29 @@ class RobustJSONParser: | @@ -163,8 +160,29 @@ class RobustJSONParser: | ||
| 163 | # 所有策略都失败了 | 160 | # 所有策略都失败了 |
| 164 | error_msg = f"{context_name} JSON解析失败: {last_error}" | 161 | error_msg = f"{context_name} JSON解析失败: {last_error}" |
| 165 | logger.error(error_msg) | 162 | logger.error(error_msg) |
| 166 | - logger.debug(f"原始文本前500字符: {raw_text[:500]}") | ||
| 167 | - raise JSONParseError(error_msg, raw_text=raw_text) from last_error | 163 | + logger.debug(f"原始文本前500字符: {original_text[:500]}") |
| 164 | + raise JSONParseError(error_msg, raw_text=original_text) from last_error | ||
| 165 | + | ||
def _build_candidate_payloads(self, raw_text: str, context_name: str) -> List[str]:
    """
    Build the ordered list of candidate JSON strings to try parsing.

    The candidates are, in order:
      1. the cleaned response (output of ``_clean_response``),
      2. the locally repaired text, only when the repair changed something,
      3. the repaired text with bracket nesting flattened once, only when
         that differs from every candidate already collected.

    Args:
        raw_text: Raw text to derive the candidates from.
        context_name: Caller-supplied context label; not used by this method.

    Returns:
        List[str]: Candidate JSON texts, without duplicates.
    """
    base = self._clean_response(raw_text)
    patched = self._apply_local_repairs(base)

    # Keep the repaired variant only if the repair actually changed the text.
    payloads = [base] if patched == base else [base, patched]

    # Force one flattening pass for content carrying a three-level list structure.
    collapsed = self._flatten_nested_arrays(patched)
    if collapsed not in payloads:
        payloads.append(collapsed)

    return payloads
| 168 | 186 | ||
| 169 | def _clean_response(self, raw: str) -> str: | 187 | def _clean_response(self, raw: str) -> str: |
| 170 | """ | 188 | """ |
| @@ -301,6 +319,12 @@ class RobustJSONParser: | @@ -301,6 +319,12 @@ class RobustJSONParser: | ||
| 301 | logger.warning("检测到对象/数组之间缺少逗号,已自动补齐") | 319 | logger.warning("检测到对象/数组之间缺少逗号,已自动补齐") |
| 302 | mutated = True | 320 | mutated = True |
| 303 | 321 | ||
| 322 | + # 合并多余的方括号(LLM常见把二维列表层级写成三层) | ||
| 323 | + repaired, brackets_collapsed = self._collapse_redundant_brackets(repaired) | ||
| 324 | + if brackets_collapsed: | ||
| 325 | + logger.warning("检测到连续的方括号嵌套,已尝试折叠为二维结构") | ||
| 326 | + mutated = True | ||
| 327 | + | ||
| 304 | # 平衡括号 | 328 | # 平衡括号 |
| 305 | repaired, balanced = self._balance_brackets(repaired) | 329 | repaired, balanced = self._balance_brackets(repaired) |
| 306 | if balanced: | 330 | if balanced: |
| @@ -444,6 +468,46 @@ class RobustJSONParser: | @@ -444,6 +468,46 @@ class RobustJSONParser: | ||
| 444 | 468 | ||
| 445 | return "".join(chars), mutated | 469 | return "".join(chars), mutated |
| 446 | 470 | ||
| 471 | + def _collapse_redundant_brackets(self, text: str) -> Tuple[str, bool]: | ||
| 472 | + """ | ||
| 473 | + 针对LLM生成的三层或更多层数组(如]]], [[ / [[[)进行折叠,避免表格/列表写出额外维度。 | ||
| 474 | + | ||
| 475 | + 返回: | ||
| 476 | + Tuple[str, bool]: (修复后的文本, 是否有修改) | ||
| 477 | + """ | ||
| 478 | + if not text: | ||
| 479 | + return text, False | ||
| 480 | + | ||
| 481 | + mutated = False | ||
| 482 | + | ||
| 483 | + patterns = [ | ||
| 484 | + # 典型错误: "]]], [[{...}" -> "]], [{...}" | ||
| 485 | + (re.compile(r"\]\s*\]\s*\]\s*,\s*\[\s*\["), "]],["), | ||
| 486 | + # 极端情况: 连续三层开头 "[[[" -> "[[" | ||
| 487 | + (re.compile(r"\[\s*\[\s*\["), "[["), | ||
| 488 | + # 极端情况: 结尾 "]]]" -> "]]" | ||
| 489 | + (re.compile(r"\]\s*\]\s*\]"), "]]"), | ||
| 490 | + ] | ||
| 491 | + | ||
| 492 | + repaired = text | ||
| 493 | + for pattern, replacement in patterns: | ||
| 494 | + new_text, count = pattern.subn(replacement, repaired) | ||
| 495 | + if count > 0: | ||
| 496 | + mutated = True | ||
| 497 | + repaired = new_text | ||
| 498 | + | ||
| 499 | + return repaired, mutated | ||
| 500 | + | ||
| 501 | + def _flatten_nested_arrays(self, text: str) -> str: | ||
| 502 | + """ | ||
| 503 | + 对明显多余的一层列表进行折叠,例如 [[[x]]] -> [[x]]。 | ||
| 504 | + """ | ||
| 505 | + if not text: | ||
| 506 | + return text | ||
| 507 | + text = re.sub(r"\]\s*\]\s*\]", "]]", text) | ||
| 508 | + text = re.sub(r"\[\s*\[\s*\[", "[[", text) | ||
| 509 | + return text | ||
| 510 | + | ||
| 447 | def _balance_brackets(self, text: str) -> Tuple[str, bool]: | 511 | def _balance_brackets(self, text: str) -> Tuple[str, bool]: |
| 448 | """ | 512 | """ |
| 449 | 尝试修复因LLM多写/少写括号导致的不平衡结构。 | 513 | 尝试修复因LLM多写/少写括号导致的不平衡结构。 |
| @@ -69,6 +69,7 @@ tenacity==8.2.2 | @@ -69,6 +69,7 @@ tenacity==8.2.2 | ||
| 69 | loguru>=0.7.0 | 69 | loguru>=0.7.0 |
| 70 | pydantic==2.5.2 | 70 | pydantic==2.5.2 |
| 71 | pydantic-settings==2.2.1 | 71 | pydantic-settings==2.2.1 |
| 72 | +json-repair==0.53.0 | ||
| 72 | 73 | ||
| 73 | # ===== 开发工具(可选) ===== | 74 | # ===== 开发工具(可选) ===== |
| 74 | pytest>=7.4.0 | 75 | pytest>=7.4.0 |
| @@ -77,4 +78,4 @@ flake8>=6.0.0 | @@ -77,4 +78,4 @@ flake8>=6.0.0 | ||
| 77 | 78 | ||
| 78 | # ===== Web服务器 ===== | 79 | # ===== Web服务器 ===== |
| 79 | fastapi==0.110.2 | 80 | fastapi==0.110.2 |
| 80 | -uvicorn==0.29.0 | ||
| 81 | +uvicorn==0.29.0 |
-
Please register or login to post a comment