马一丁

Fix the Logic for Cleaning Data Returned by the LLM

@@ -51,12 +51,12 @@ class RobustJSONParser: @@ -51,12 +51,12 @@ class RobustJSONParser:
51 51
52 # 常见的LLM思考内容模式 52 # 常见的LLM思考内容模式
53 _THINKING_PATTERNS = [ 53 _THINKING_PATTERNS = [
54 - r"<thinking>.*?</thinking>",  
55 - r"<thought>.*?</thought>",  
56 - r"让我想想.*?(?=\{|\[|$)",  
57 - r"首先.*?(?=\{|\[|$)",  
58 - r"分析.*?(?=\{|\[|$)",  
59 - r"根据.*?(?=\{|\[|$)", 54 + r"^\s*<thinking>.*?</thinking>\s*",
  55 + r"^\s*<thought>.*?</thought>\s*",
  56 + r"^\s*让我想想.*?(?=\{|\[|$)",
  57 + r"^\s*首先.*?(?=\{|\[|$)",
  58 + r"^\s*分析.*?(?=\{|\[|$)",
  59 + r"^\s*根据.*?(?=\{|\[|$)",
60 ] 60 ]
61 61
62 # 冒号等号模式(LLM常见错误) 62 # 冒号等号模式(LLM常见错误)
@@ -182,16 +182,21 @@ class RobustJSONParser: @@ -182,16 +182,21 @@ class RobustJSONParser:
182 for pattern in self._THINKING_PATTERNS: 182 for pattern in self._THINKING_PATTERNS:
183 cleaned = re.sub(pattern, "", cleaned, flags=re.DOTALL | re.IGNORECASE) 183 cleaned = re.sub(pattern, "", cleaned, flags=re.DOTALL | re.IGNORECASE)
184 184
185 - # 移除markdown代码块标记  
186 - if cleaned.startswith("```json"):  
187 - cleaned = cleaned[7:]  
188 - elif cleaned.startswith("```"):  
189 - cleaned = cleaned[3:] 185 + # 优先提取任意位置的```json```包裹内容
  186 + fenced_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", cleaned)
  187 + if fenced_match:
  188 + cleaned = fenced_match.group(1).strip()
  189 + else:
  190 + # 如果没有找到完整代码块,再尝试移除前后缀
  191 + if cleaned.startswith("```json"):
  192 + cleaned = cleaned[7:]
  193 + elif cleaned.startswith("```"):
  194 + cleaned = cleaned[3:]
190 195
191 - if cleaned.endswith("```"):  
192 - cleaned = cleaned[:-3] 196 + if cleaned.endswith("```"):
  197 + cleaned = cleaned[:-3]
193 198
194 - cleaned = cleaned.strip() 199 + cleaned = cleaned.strip()
195 200
196 # 尝试提取第一个完整的JSON对象或数组 201 # 尝试提取第一个完整的JSON对象或数组
197 cleaned = self._extract_first_json_structure(cleaned) 202 cleaned = self._extract_first_json_structure(cleaned)