Improve the Security of Regular Expression Matching

马一丁
Commit 5ef63ece78840bd804b086a209d29ff66308fb90 5ef63ece 1 parent 79a015b7
Showing 1 changed file with 38 additions and 10 deletions
ReportEngine/core/template_parser.py
--- a/ReportEngine/core/template_parser.py
View file @5ef63ec
+++ b/ReportEngine/core/template_parser.py
View file @5ef63ec
@@ -51,9 +51,37 @@ class TemplateSection:
         }
 
 
- heading_pattern = re.compile(r"^(#{1,6})\s+(.*)$")
- bullet_pattern = re.compile(r"^[-*+]\s+(.*)$")
- number_pattern = re.compile(r"^(?P<num>\d+(?:\.\d+)*)(?:[\s、:：.-]+(?P<label>.*))?$")
+ # The parsing expressions use explicit character classes rather than `.*`, so captured text
+ # can never contain a line break; the aim is to reduce Regular-Expression-DoS exposure on untrusted template text.
+ heading_pattern = re.compile(
+     r"""
+     (?P<marker>\#{1,6})       # Markdown heading markers
+     [ \t]+                    # required whitespace
+     (?P<title>[^\r\n]+)       # heading text without newline characters
+     """,
+     re.VERBOSE,
+ )
+ bullet_pattern = re.compile(
+     r"""
+     (?P<marker>[-*+])         # list bullet symbol
+     [ \t]+
+     (?P<title>[^\r\n]+)
+     """,
+     re.VERBOSE,
+ )
+ number_pattern = re.compile(
+     r"""
+     (?P<num>                                  # dotted section number such as 1 or 2.3.1
+         (?:0|[1-9]\d*)                        # first component, no leading zeros
+         (?:\.(?:0|[1-9]\d*))*                 # further .N components
+     )
+     (?:                                       # optional separator run followed by the label
+         (?:[ \t\u00A0\u3000、:：-]|\.(?!\d))+  # one separator char per iteration: no nested quantifier, so no exponential backtracking
+         (?P<label>[^\r\n]*)                   # rest of the line; may be empty
+     )?
+     """,
+     re.VERBOSE,
+ )
 
 
 def parse_template_sections(template_md: str) -> List[TemplateSection]:
@@ -128,10 +156,10 @@ def _classify_line(stripped: str, indent: int) -> Optional[dict]:
         dict | None: 识别后的元数据；无法识别时返回None。
     """
 
-     heading_match = heading_pattern.match(stripped)
+     heading_match = heading_pattern.fullmatch(stripped)
     if heading_match:
-         level = len(heading_match.group(1))
-         payload = _strip_markup(heading_match.group(2).strip())
+         level = len(heading_match.group("marker"))
+         payload = _strip_markup(heading_match.group("title").strip())
         title_info = _split_number(payload)
         slug = _build_slug(title_info["number"], title_info["title"])
         return {
@@ -143,9 +171,9 @@ def _classify_line(stripped: str, indent: int) -> Optional[dict]:
             "slug": slug,
         }
 
-     bullet_match = bullet_pattern.match(stripped)
+     bullet_match = bullet_pattern.fullmatch(stripped)
     if bullet_match:
-         payload = _strip_markup(bullet_match.group(1).strip())
+         payload = _strip_markup(bullet_match.group("title").strip())
         title_info = _split_number(payload)
         slug = _build_slug(title_info["number"], title_info["title"])
         is_section = indent <= 1
@@ -160,7 +188,7 @@ def _classify_line(stripped: str, indent: int) -> Optional[dict]:
         }
 
     # 兼容“1.1 ...”没有前缀符号的行
-     number_match = number_pattern.match(stripped)
+     number_match = number_pattern.fullmatch(stripped)
     if number_match and number_match.group("label"):
         payload = stripped
         title = number_match.group("label").strip()
@@ -201,7 +229,7 @@ def _split_number(payload: str) -> dict:
     返回:
         dict: 包含 number/title/display。
     """
-     match = number_pattern.match(payload)
+     match = number_pattern.fullmatch(payload)
     number = match.group("num") if match else ""
     label = match.group("label") if match else payload
     label = (label or "").strip()