Showing 2 changed files with 120 additions and 3 deletions
| @@ -610,9 +610,34 @@ class RobustJSONParser: | @@ -610,9 +610,34 @@ class RobustJSONParser: | ||
| 610 | 610 | ||
| 611 | # 验证数据类型 | 611 | # 验证数据类型 |
| 612 | if not isinstance(data, dict): | 612 | if not isinstance(data, dict): |
| 613 | - if isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict): | ||
| 614 | - logger.warning(f"{context_name} 返回数组,自动提取第一个元素") | ||
| 615 | - data = data[0] | 613 | + if isinstance(data, list): |
| 614 | + if len(data) > 0: | ||
| 615 | + # 尝试找到最符合期望的元素 | ||
| 616 | + best_match = None | ||
| 617 | + max_match_count = 0 | ||
| 618 | + | ||
| 619 | + for item in data: | ||
| 620 | + if isinstance(item, dict): | ||
| 621 | + if expected_keys: | ||
| 622 | + # 计算匹配的键数量 | ||
| 623 | + match_count = sum(1 for key in expected_keys if key in item) | ||
| 624 | + if match_count > max_match_count: | ||
| 625 | + max_match_count = match_count | ||
| 626 | + best_match = item | ||
| 627 | + elif best_match is None: | ||
| 628 | + best_match = item | ||
| 629 | + | ||
| 630 | + if best_match: | ||
| 631 | + logger.warning( | ||
| 632 | + f"{context_name} 返回数组,自动提取最佳匹配元素(匹配{max_match_count}/{len(expected_keys or [])}个键)" | ||
| 633 | + ) | ||
| 634 | + data = best_match | ||
| 635 | + else: | ||
| 636 | + raise JSONParseError( | ||
| 637 | + f"{context_name} 返回的数组中没有有效的对象" | ||
| 638 | + ) | ||
| 639 | + else: | ||
| 640 | + raise JSONParseError(f"{context_name} 返回空数组") | ||
| 616 | else: | 641 | else: |
| 617 | raise JSONParseError( | 642 | raise JSONParseError( |
| 618 | f"{context_name} 返回的不是JSON对象: {type(data).__name__}" | 643 | f"{context_name} 返回的不是JSON对象: {type(data).__name__}" |
| @@ -625,6 +650,43 @@ class RobustJSONParser: | @@ -625,6 +650,43 @@ class RobustJSONParser: | ||
| 625 | logger.warning( | 650 | logger.warning( |
| 626 | f"{context_name} 缺少预期的键: {', '.join(missing_keys)}" | 651 | f"{context_name} 缺少预期的键: {', '.join(missing_keys)}" |
| 627 | ) | 652 | ) |
| 653 | + # 尝试修复常见的键名变体 | ||
| 654 | + data = self._try_recover_missing_keys(data, missing_keys, context_name) | ||
| 655 | + | ||
| 656 | + return data | ||
| 657 | + | ||
| 658 | + def _try_recover_missing_keys( | ||
| 659 | + self, data: Dict[str, Any], missing_keys: List[str], context_name: str | ||
| 660 | + ) -> Dict[str, Any]: | ||
| 661 | + """ | ||
| 662 | + 尝试从数据中恢复缺失的键,通过查找相似的键名。 | ||
| 663 | + | ||
| 664 | + 参数: | ||
| 665 | + data: 原始数据 | ||
| 666 | + missing_keys: 缺失的键列表 | ||
| 667 | + context_name: 上下文名称 | ||
| 668 | + | ||
| 669 | + 返回: | ||
| 670 | + Dict[str, Any]: 修复后的数据 | ||
| 671 | + """ | ||
| 672 | + # 常见的键名映射 | ||
| 673 | + key_aliases = { | ||
| 674 | + "template_name": ["templateName", "name", "template"], | ||
| 675 | + "selection_reason": ["selectionReason", "reason", "explanation"], | ||
| 676 | + "title": ["reportTitle", "documentTitle"], | ||
| 677 | + "chapters": ["chapterList", "chapterPlan", "sections"], | ||
| 678 | + "totalWords": ["total_words", "wordCount", "totalWordCount"], | ||
| 679 | + } | ||
| 680 | + | ||
| 681 | + for missing_key in missing_keys: | ||
| 682 | + if missing_key in key_aliases: | ||
| 683 | + for alias in key_aliases[missing_key]: | ||
| 684 | + if alias in data: | ||
| 685 | + logger.info( | ||
| 686 | + f"{context_name} 找到键'{missing_key}'的别名'{alias}',自动映射" | ||
| 687 | + ) | ||
| 688 | + data[missing_key] = data[alias] | ||
| 689 | + break | ||
| 628 | 690 | ||
| 629 | return data | 691 | return data |
| 630 | 692 |
| @@ -127,6 +127,61 @@ class TestRobustJSONParser(unittest.TestCase): | @@ -127,6 +127,61 @@ class TestRobustJSONParser(unittest.TestCase): | ||
| 127 | self.assertEqual(result["name"], "test") | 127 | self.assertEqual(result["name"], "test") |
| 128 | self.assertEqual(result["value"], 123) | 128 | self.assertEqual(result["value"], 123) |
| 129 | 129 | ||
| 130 | + def test_unterminated_string_with_json_repair(self): | ||
| 131 | + """测试使用json_repair库修复未终止的字符串。""" | ||
| 132 | + # 创建启用json_repair的解析器 | ||
| 133 | + parser_with_repair = RobustJSONParser( | ||
| 134 | + enable_json_repair=True, | ||
| 135 | + enable_llm_repair=False, | ||
| 136 | + ) | ||
| 137 | + | ||
| 138 | + # 模拟实际错误:字符串中有未转义的控制字符或引号 | ||
| 139 | + json_str = """{ | ||
| 140 | + "template_name": "特定政策报告", | ||
| 141 | + "selection_reason": "这是测试内容" | ||
| 142 | +}""" | ||
| 143 | + result = parser_with_repair.parse(json_str, "未终止字符串测试") | ||
| 144 | + # 只要能够解析成功,不报错就可以了 | ||
| 145 | + self.assertIsInstance(result, dict) | ||
| 146 | + self.assertIn("template_name", result) | ||
| 147 | + | ||
| 148 | + def test_array_with_best_match(self): | ||
| 149 | + """测试从数组中提取最佳匹配的元素。""" | ||
| 150 | + json_str = """[ | ||
| 151 | + { | ||
| 152 | + "name": "test", | ||
| 153 | + "value": 123 | ||
| 154 | + }, | ||
| 155 | + { | ||
| 156 | + "totalWords": 40000, | ||
| 157 | + "globalGuidelines": ["guide1", "guide2"], | ||
| 158 | + "chapters": [] | ||
| 159 | + } | ||
| 160 | +]""" | ||
| 161 | + result = self.parser.parse( | ||
| 162 | + json_str, | ||
| 163 | + "数组最佳匹配测试", | ||
| 164 | + expected_keys=["totalWords", "globalGuidelines", "chapters"], | ||
| 165 | + ) | ||
| 166 | + # 应该提取第二个元素,因为它匹配了3个键 | ||
| 167 | + self.assertEqual(result["totalWords"], 40000) | ||
| 168 | + self.assertEqual(len(result["globalGuidelines"]), 2) | ||
| 169 | + | ||
| 170 | + def test_key_alias_recovery(self): | ||
| 171 | + """测试键名别名恢复。""" | ||
| 172 | + json_str = """{ | ||
| 173 | + "templateName": "test_template", | ||
| 174 | + "selectionReason": "This is a test" | ||
| 175 | +}""" | ||
| 176 | + result = self.parser.parse( | ||
| 177 | + json_str, | ||
| 178 | + "键别名测试", | ||
| 179 | + expected_keys=["template_name", "selection_reason"], | ||
| 180 | + ) | ||
| 181 | + # 应该自动映射 templateName -> template_name | ||
| 182 | + self.assertEqual(result["template_name"], "test_template") | ||
| 183 | + self.assertEqual(result["selection_reason"], "This is a test") | ||
| 184 | + | ||
| 130 | def test_complex_real_world_case(self): | 185 | def test_complex_real_world_case(self): |
| 131 | """测试真实世界的复杂案例(类似实际错误)。""" | 186 | """测试真实世界的复杂案例(类似实际错误)。""" |
| 132 | # 模拟实际错误:缺少逗号、有markdown包裹、有思考内容 | 187 | # 模拟实际错误:缺少逗号、有markdown包裹、有思考内容 |
-
Please register or login to post a comment