# state_parser.py
"""
State JSON 解析器
解析 Insight/Media/Query 三引擎的 State JSON 文件,
提取结构化数据用于构建知识图谱。
"""
import json
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional
@dataclass
class SearchRecord:
    """A single search record.

    Every field has a default, so a record can be created from partial data.
    """

    # Query string associated with this record.
    query: str = ""
    # URL of the result.
    url: str = ""
    # Title of the result.
    title: str = ""
    # Text content of the result.
    content: str = ""
    # Score of the result; None when no score is available.
    score: Optional[float] = None
    # Timestamp, kept as the raw string.
    timestamp: str = ""
@dataclass
class ParsedSection:
    """A parsed paragraph / section of a report."""

    # Section title.
    title: str = ""
    # Ordering value of the section.
    order: int = 0
    # Summary text of the section.
    summary: str = ""
    # Search records attached to this section; each instance gets its own list.
    search_history: List[SearchRecord] = field(default_factory=list)
@dataclass
class ParsedState:
    """The parsed state of one engine."""

    # Name of the engine the state belongs to.
    engine: str = ""
    # Query stored in the state.
    query: str = ""
    # Title of the report.
    report_title: str = ""
    # Parsed sections; each instance gets its own list.
    sections: List[ParsedSection] = field(default_factory=list)
class StateParser:
    """Parser for engine State JSON.

    Parses the State JSON of the three engines (insight / media / query) and
    extracts the structured data used to build a knowledge graph.  A JSON
    ``null`` is handled the same way as a missing key.
    """

    # Maximum number of characters kept from a search record's content.
    CONTENT_PREVIEW_CHARS = 200
    # Maximum number of characters kept from a section summary.
    SUMMARY_MAX_CHARS = 300

    def parse(self, engine_name: str, state_json: Dict[str, Any]) -> "ParsedState":
        """Parse the State JSON of a single engine.

        Args:
            engine_name: Engine name (insight/media/query).
            state_json: State JSON dictionary.

        Returns:
            A ParsedState with one section per entry of ``paragraphs``.
        """
        return ParsedState(
            engine=engine_name,
            query=state_json.get('query') or '',
            report_title=state_json.get('report_title') or '',
            sections=[
                self._parse_paragraph(paragraph)
                # ``or []`` also covers an explicit JSON null.
                for paragraph in state_json.get('paragraphs') or []
            ],
        )

    def _parse_paragraph(self, para: Dict[str, Any]) -> "ParsedSection":
        """Parse a single paragraph dictionary into a ParsedSection."""
        # ``research`` may be absent or null; both mean "no research data".
        research = para.get('research') or {}

        # Collect the search history, keeping only a short content preview.
        search_history = [
            SearchRecord(
                query=search.get('query') or '',
                url=search.get('url') or '',
                title=search.get('title') or '',
                content=(search.get('content') or '')[:self.CONTENT_PREVIEW_CHARS],
                score=search.get('score'),
                timestamp=search.get('timestamp') or '',
            )
            for search in research.get('search_history') or []
        ]

        # Prefer latest_summary; fall back to the paragraph content.
        summary = research.get('latest_summary') or para.get('content') or ''

        return ParsedSection(
            title=para.get('title') or '',
            order=para.get('order') or 0,
            summary=summary[:self.SUMMARY_MAX_CHARS],
            search_history=search_history,
        )

    def parse_from_file(self, engine_name: str, file_path: str) -> Optional["ParsedState"]:
        """Parse a State JSON file.

        This is best-effort: no exception escapes.  A missing file returns
        None silently; any other failure is logged as a warning first.

        Args:
            engine_name: Engine name.
            file_path: Path of the JSON file.

        Returns:
            A ParsedState, or None on failure.
        """
        try:
            path = Path(file_path)
            if not path.exists():
                return None
            with path.open('r', encoding='utf-8') as f:
                state_json = json.load(f)
            if not isinstance(state_json, dict):
                logging.getLogger(__name__).warning(
                    "State JSON %s is not a JSON object (got %s)",
                    file_path, type(state_json).__name__,
                )
                return None
            return self.parse(engine_name, state_json)
        except Exception as exc:
            # Keep the "return None on failure" contract, but leave a trace so
            # a corrupt file can be told apart from a missing one.
            logging.getLogger(__name__).warning(
                "Failed to parse State JSON %s: %s", file_path, exc
            )
            return None

    def find_state_json(self, md_path: str) -> Optional[str]:
        """Find the State JSON file that belongs to a Markdown report.

        State JSON files are looked up in the directory of the Markdown file
        using the pattern ``state_*.json``.  Candidates are chosen in this
        order:

        1. the file whose stem is exactly ``state_<markdown stem>``;
        2. the most recently modified file whose stem contains the Markdown
           stem;
        3. the most recently modified ``state_*.json`` file.

        Args:
            md_path: Path of the Markdown file.

        Returns:
            Path of the State JSON file, or None if the Markdown file does
            not exist or the directory has no ``state_*.json`` file.
        """
        md_file = Path(md_path)
        if not md_file.exists():
            return None

        state_files = list(md_file.parent.glob('state_*.json'))
        if not state_files:
            return None

        md_stem = md_file.stem  # e.g., "武汉大学_20250825_180214"

        # An exact match wins, so the result does not depend on the order in
        # which glob happens to yield several similar names.
        expected_stem = 'state_' + md_stem
        for state_file in state_files:
            if state_file.stem == expected_stem:
                return str(state_file)

        # Otherwise prefer files mentioning the report stem, and within the
        # chosen pool return the newest one.
        related = [f for f in state_files if md_stem in f.stem]
        pool = related or state_files
        return str(max(pool, key=lambda f: f.stat().st_mtime))