template_parser.py
6.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
"""
Markdown模板切片工具。
LLM需要“按章调用”,因此必须把Markdown模板解析为结构化章节队列。
这里通过轻量正则和缩进启发式,兼容“# 标题”与
“- **1.0 标题** / - 1.1 子标题”等多种写法。
"""
from __future__ import annotations
import re
import unicodedata
from dataclasses import dataclass, field
from typing import List, Optional
# Step between the `order` values of consecutive sections; parse_template_sections
# also uses it as the `order` of the first section.
SECTION_ORDER_STEP = 10
@dataclass
class TemplateSection:
    """One chapter extracted from a Markdown template."""

    title: str
    slug: str
    order: int
    depth: int
    raw_title: str
    number: str = ""
    chapter_id: str = ""
    outline: List[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return the section as a plain dict, e.g. for an LLM prompt or for persisting.

        ``raw_title`` is not included, and ``chapter_id`` is exposed under the
        camel-case key ``chapterId``. The ``outline`` list is shared with the
        instance, not copied.
        """
        # (output key, attribute name) pairs, in the order the keys are emitted.
        exported = (
            ("title", "title"),
            ("slug", "slug"),
            ("order", "order"),
            ("depth", "depth"),
            ("number", "number"),
            ("chapterId", "chapter_id"),
            ("outline", "outline"),
        )
        return {key: getattr(self, attr) for key, attr in exported}
# Markdown heading: one to six '#' characters, whitespace, then the heading text.
heading_pattern = re.compile(r"^(#{1,6})\s+(.*)$")
# Bullet list item: a '-', '*' or '+' marker, whitespace, then the item text.
bullet_pattern = re.compile(r"^[-*+]\s+(.*)$")
# Leading dotted number such as "1" or "1.2.3" (group `num`), optionally followed by
# separator characters and the remaining text (group `label`).
number_pattern = re.compile(r"^(?P<num>\d+(?:\.\d+)*)(?:[\s、::.-]+(?P<label>.*))?$")
def parse_template_sections(template_md: str) -> List[TemplateSection]:
    """
    Split a Markdown template into an ordered list of top-level sections.

    Each non-blank line is classified by ``_classify_line``. Lines classified
    as sections start a new ``TemplateSection``; every other classified line
    is appended to the ``outline`` of the most recent section. Lines that
    cannot be classified, and outline lines seen before the first section,
    are ignored.

    Args:
        template_md: The Markdown template text.

    Returns:
        The sections in document order. ``order`` starts at
        ``SECTION_ORDER_STEP`` and grows by that step, slugs are made unique
        within the result, and ``chapter_id`` is ``"S1"``, ``"S2"``, ...
    """
    sections: List[TemplateSection] = []
    current: Optional[TemplateSection] = None
    order = SECTION_ORDER_STEP
    used_slugs: set = set()

    for raw_line in template_md.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue

        # Expand tabs before measuring the indent. Counting only leading
        # spaces gave tab-indented nested bullets an indent of 0, which
        # promoted them to top-level sections.
        expanded = raw_line.expandtabs(4)
        indent = len(expanded) - len(expanded.lstrip(" "))

        meta = _classify_line(stripped, indent)
        if not meta:
            continue

        if not meta["is_section"]:
            # Outline entry: attach it to the section currently being built.
            if current is not None:
                current.outline.append(meta["title"])
            continue

        current = TemplateSection(
            title=meta["title"],
            slug=_ensure_unique_slug(meta["slug"], used_slugs),
            order=order,
            depth=meta["depth"],
            raw_title=meta["raw"],
            number=meta["number"],
        )
        sections.append(current)
        order += SECTION_ORDER_STEP

    # Assign a stable, position-based chapter_id so later stages can refer to sections.
    for idx, section in enumerate(sections, start=1):
        section.chapter_id = f"S{idx}"
    return sections
def _classify_line(stripped: str, indent: int) -> Optional[dict]:
    """Classify one template line using its leading symbol and indentation.

    Args:
        stripped: The line with surrounding whitespace removed.
        indent: The indentation width of the original line.

    Returns:
        A dict with the keys ``is_section``, ``depth``, ``title``, ``raw``,
        ``number`` and ``slug``, or ``None`` when the line is neither a
        heading, a bullet item, nor a bare "1.1 ..." numbered line.
    """

    def _marked_entry(text: str, is_section: bool, depth: int) -> dict:
        # Shared path for headings and bullets: strip emphasis, split the number.
        cleaned = _strip_markup(text.strip())
        parts = _split_number(cleaned)
        anchor = _build_slug(parts["number"], parts["title"])
        return {
            "is_section": is_section,
            "depth": depth,
            "title": parts["display"],
            "raw": cleaned,
            "number": parts["number"],
            "slug": anchor,
        }

    as_heading = heading_pattern.match(stripped)
    if as_heading is not None:
        hashes = len(as_heading.group(1))
        # Only "#" and "##" headings open a new section.
        return _marked_entry(as_heading.group(2), hashes <= 2, hashes)

    as_bullet = bullet_pattern.match(stripped)
    if as_bullet is not None:
        # Bullets indented by at most one column count as sections.
        top_level = indent <= 1
        return _marked_entry(as_bullet.group(1), top_level, 1 if top_level else 2)

    # Lines such as "1.1 ..." that carry no heading or bullet prefix.
    as_numbered = number_pattern.match(stripped)
    if as_numbered is None or not as_numbered.group("label"):
        return None

    number = as_numbered.group("num")
    label = as_numbered.group("label").strip()
    anchor = _build_slug(number, label)
    top_level = indent == 0 and number.count(".") <= 1
    return {
        "is_section": top_level,
        "depth": 1 if top_level else 2,
        "title": f"{number} {label}" if label else number,
        "raw": stripped,
        "number": number,
        "slug": anchor,
    }
def _strip_markup(text: str) -> str:
"""去除包裹的**、__等简单强调标记"""
if text.startswith(("**", "__")) and text.endswith(("**", "__")) and len(text) > 4:
return text[2:-2].strip()
return text
def _split_number(payload: str) -> dict:
    """Separate a leading section number from the rest of a title.

    Args:
        payload: The title text, possibly starting with a number like "1.2".

    Returns:
        A dict with ``number`` (empty when no leading number is recognised),
        ``title`` (the text after the number, or ``payload`` when there is
        none) and ``display`` (number and text joined by a single space).
    """
    found = number_pattern.match(payload)
    if found is None:
        number, label = "", payload
    else:
        number, label = found.group("num"), found.group("label")
    # The label group is None when the payload is a bare number.
    label = (label or "").strip()

    if number:
        display = f"{number} {label}".strip()
    else:
        display = label or payload

    return {
        "number": number,
        "title": label or payload,
        "display": display,
    }
def _build_slug(number: str, title: str) -> str:
"""根据编号/标题生成锚点"""
if number:
token = number.replace(".", "-")
else:
token = _slugify_text(title)
token = token or "section"
return f"section-{token}"
def _slugify_text(text: str) -> str:
"""对任意文本做降噪与转写,得到URL友好的slug片段"""
text = unicodedata.normalize("NFKD", text)
text = text.replace("·", "-").replace(" ", "-")
text = re.sub(r"[^0-9a-zA-Z\u4e00-\u9fff-]+", "-", text)
text = re.sub(r"-{2,}", "-", text)
return text.strip("-").lower()
def _ensure_unique_slug(slug: str, used: set) -> str:
"""若slug重复则自动追加序号,直到在used集合中唯一"""
if slug not in used:
used.add(slug)
return slug
base = slug
idx = 2
while slug in used:
slug = f"{base}-{idx}"
idx += 1
used.add(slug)
return slug
# Public API of this module; the underscore-prefixed helpers are internal.
__all__ = ["TemplateSection", "parse_template_sections"]