马一丁

Improve the Security of Regular Expression Matching

@@ -51,9 +51,37 @@ class TemplateSection: @@ -51,9 +51,37 @@ class TemplateSection:
51 } 51 }
52 52
53 53
54 -heading_pattern = re.compile(r"^(#{1,6})\s+(.*)$")  
55 -bullet_pattern = re.compile(r"^[-*+]\s+(.*)$")  
56 -number_pattern = re.compile(r"^(?P<num>\d+(?:\.\d+)*)(?:[\s、::.-]+(?P<label>.*))?$") 54 +# The parsing expressions intentionally avoid `.*` to keep matching cost predictable and
  55 +# to eliminate easy regular-expression denial-of-service (ReDoS) gadgets on untrusted template text.
  56 +heading_pattern = re.compile(
  57 + r"""
  58 + (?P<marker>\#{1,6}) # Markdown heading markers
  59 + [ \t]+ # required whitespace
  60 + (?P<title>[^\r\n]+) # heading text without newline characters
  61 + """,
  62 + re.VERBOSE,
  63 +)
  64 +bullet_pattern = re.compile(
  65 + r"""
  66 + (?P<marker>[-*+]) # list bullet symbol
  67 + [ \t]+
  68 + (?P<title>[^\r\n]+)
  69 + """,
  70 + re.VERBOSE,
  71 +)
  72 +number_pattern = re.compile(
  73 + r"""
  74 + (?P<num>
  75 + (?:0|[1-9]\d*)
  76 + (?:\.(?:0|[1-9]\d*))*
  77 + )
  78 + (?:
  79 + (?:[ \t\u00A0\u3000、::-]+|\.(?!\d))+
  80 + (?P<label>[^\r\n]*)
  81 + )?
  82 + """,
  83 + re.VERBOSE,
  84 +)
57 85
58 86
59 def parse_template_sections(template_md: str) -> List[TemplateSection]: 87 def parse_template_sections(template_md: str) -> List[TemplateSection]:
@@ -128,10 +156,10 @@ def _classify_line(stripped: str, indent: int) -> Optional[dict]: @@ -128,10 +156,10 @@ def _classify_line(stripped: str, indent: int) -> Optional[dict]:
128 dict | None: 识别后的元数据;无法识别时返回None。 156 dict | None: 识别后的元数据;无法识别时返回None。
129 """ 157 """
130 158
131 - heading_match = heading_pattern.match(stripped) 159 + heading_match = heading_pattern.fullmatch(stripped)
132 if heading_match: 160 if heading_match:
133 - level = len(heading_match.group(1))  
134 - payload = _strip_markup(heading_match.group(2).strip()) 161 + level = len(heading_match.group("marker"))
  162 + payload = _strip_markup(heading_match.group("title").strip())
135 title_info = _split_number(payload) 163 title_info = _split_number(payload)
136 slug = _build_slug(title_info["number"], title_info["title"]) 164 slug = _build_slug(title_info["number"], title_info["title"])
137 return { 165 return {
@@ -143,9 +171,9 @@ def _classify_line(stripped: str, indent: int) -> Optional[dict]: @@ -143,9 +171,9 @@ def _classify_line(stripped: str, indent: int) -> Optional[dict]:
143 "slug": slug, 171 "slug": slug,
144 } 172 }
145 173
146 - bullet_match = bullet_pattern.match(stripped) 174 + bullet_match = bullet_pattern.fullmatch(stripped)
147 if bullet_match: 175 if bullet_match:
148 - payload = _strip_markup(bullet_match.group(1).strip()) 176 + payload = _strip_markup(bullet_match.group("title").strip())
149 title_info = _split_number(payload) 177 title_info = _split_number(payload)
150 slug = _build_slug(title_info["number"], title_info["title"]) 178 slug = _build_slug(title_info["number"], title_info["title"])
151 is_section = indent <= 1 179 is_section = indent <= 1
@@ -160,7 +188,7 @@ def _classify_line(stripped: str, indent: int) -> Optional[dict]: @@ -160,7 +188,7 @@ def _classify_line(stripped: str, indent: int) -> Optional[dict]:
160 } 188 }
161 189
162 # 兼容“1.1 ...”没有前缀符号的行 190 # 兼容“1.1 ...”没有前缀符号的行
163 - number_match = number_pattern.match(stripped) 191 + number_match = number_pattern.fullmatch(stripped)
164 if number_match and number_match.group("label"): 192 if number_match and number_match.group("label"):
165 payload = stripped 193 payload = stripped
166 title = number_match.group("label").strip() 194 title = number_match.group("label").strip()
@@ -201,7 +229,7 @@ def _split_number(payload: str) -> dict: @@ -201,7 +229,7 @@ def _split_number(payload: str) -> dict:
201 返回: 229 返回:
202 dict: 包含 number/title/display。 230 dict: 包含 number/title/display。
203 """ 231 """
204 - match = number_pattern.match(payload) 232 + match = number_pattern.fullmatch(payload)
205 number = match.group("num") if match else "" 233 number = match.group("num") if match else ""
206 label = match.group("label") if match else payload 234 label = match.group("label") if match else payload
207 label = (label or "").strip() 235 label = (label or "").strip()