Toggle navigation
Toggle navigation
This project
Loading...
Sign in
万朱浩
/
Venue-Ops
Go to a project
Toggle navigation
Projects
Groups
Snippets
Help
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
马一丁
2025-12-15 19:54:18 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
2d646bf21d1ed46d5eff5953ce8f0407cf09a5dc
2d646bf2
1 parent
2327b13e
Allows for separate rendering of markdown
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
328 additions
and
0 deletions
regenerate_latest_md.py
regenerate_latest_md.py
0 → 100644
View file @
2d646bf
"""
使用最新的章节JSON重新装订并渲染Markdown报告。
"""
import
json
import
sys
from
datetime
import
datetime
from
pathlib
import
Path
from
loguru
import
logger
# 确保可以找到项目内模块
sys
.
path
.
insert
(
0
,
str
(
Path
(
__file__
)
.
parent
))
from
ReportEngine.core
import
ChapterStorage
,
DocumentComposer
from
ReportEngine.ir
import
IRValidator
from
ReportEngine.renderers
import
MarkdownRenderer
from
ReportEngine.utils.config
import
settings
def
find_latest_run_dir
(
chapter_root
:
Path
):
"""
定位章节根目录下最新一次运行的输出目录。
扫描 `chapter_root` 下所有子目录,筛选出包含 `manifest.json`
的候选,按修改时间倒序取最新一条。若目录不存在或没有有效
manifest,会记录错误并返回 None。
参数:
chapter_root: 章节输出的根目录(通常是 settings.CHAPTER_OUTPUT_DIR)
返回:
Path | None: 最新的 run 目录路径;若未找到则为 None。
"""
if
not
chapter_root
.
exists
():
logger
.
error
(
f
"章节目录不存在: {chapter_root}"
)
return
None
run_dirs
=
[]
for
candidate
in
chapter_root
.
iterdir
():
if
not
candidate
.
is_dir
():
continue
manifest_path
=
candidate
/
"manifest.json"
if
manifest_path
.
exists
():
run_dirs
.
append
((
candidate
,
manifest_path
.
stat
()
.
st_mtime
))
if
not
run_dirs
:
logger
.
error
(
"未找到带 manifest.json 的章节目录"
)
return
None
latest_dir
=
sorted
(
run_dirs
,
key
=
lambda
item
:
item
[
1
],
reverse
=
True
)[
0
][
0
]
logger
.
info
(
f
"找到最新run目录: {latest_dir.name}"
)
return
latest_dir
def
load_manifest
(
run_dir
:
Path
):
"""
读取单次运行目录内的 manifest.json。
成功时返回 reportId 以及元数据字典;读取或解析失败会记录错误
并返回 (None, None),以便上层提前终止流程。
参数:
run_dir: 包含 manifest.json 的章节输出目录
返回:
tuple[str | None, dict | None]: (report_id, metadata)
"""
manifest_path
=
run_dir
/
"manifest.json"
try
:
with
manifest_path
.
open
(
"r"
,
encoding
=
"utf-8"
)
as
f
:
manifest
=
json
.
load
(
f
)
report_id
=
manifest
.
get
(
"reportId"
)
or
run_dir
.
name
metadata
=
manifest
.
get
(
"metadata"
)
or
{}
logger
.
info
(
f
"报告ID: {report_id}"
)
if
manifest
.
get
(
"createdAt"
):
logger
.
info
(
f
"创建时间: {manifest['createdAt']}"
)
return
report_id
,
metadata
except
Exception
as
exc
:
logger
.
error
(
f
"读取manifest失败: {exc}"
)
return
None
,
None
def
load_chapters
(
run_dir
:
Path
):
"""
读取指定 run 目录下的所有章节 JSON。
会复用 ChapterStorage 的 load_chapters 能力,自动按 order 排序。
读取后打印章节数量,便于确认完整性。
参数:
run_dir: 单次报告的章节目录
返回:
list[dict]: 章节 JSON 列表(若目录为空则为空列表)
"""
storage
=
ChapterStorage
(
settings
.
CHAPTER_OUTPUT_DIR
)
chapters
=
storage
.
load_chapters
(
run_dir
)
logger
.
info
(
f
"加载章节数: {len(chapters)}"
)
return
chapters
def
validate_chapters
(
chapters
):
"""
使用 IRValidator 对章节结构做快速校验。
仅记录未通过的章节及前三条错误,不会中断流程;目的是在
重装订前发现潜在结构问题。
参数:
chapters: 章节 JSON 列表
"""
validator
=
IRValidator
()
invalid
=
[]
for
chapter
in
chapters
:
ok
,
errors
=
validator
.
validate_chapter
(
chapter
)
if
not
ok
:
invalid
.
append
((
chapter
.
get
(
"chapterId"
)
or
"unknown"
,
errors
))
if
invalid
:
logger
.
warning
(
f
"有 {len(invalid)} 个章节未通过结构校验,将继续装订:"
)
for
chapter_id
,
errors
in
invalid
:
preview
=
"; "
.
join
(
errors
[:
3
])
logger
.
warning
(
f
" - {chapter_id}: {preview}"
)
else
:
logger
.
info
(
"章节结构校验通过"
)
def
stitch_document
(
report_id
,
metadata
,
chapters
):
"""
将各章节与元数据装订为完整的 Document IR。
使用 DocumentComposer 统一处理章节顺序、全局元数据等,并打印
装订完成的章节与图表数量。
参数:
report_id: 报告 ID(来自 manifest 或目录名)
metadata: manifest 中的全局元数据
chapters: 已加载的章节列表
返回:
dict: 完整的 Document IR 对象
"""
composer
=
DocumentComposer
()
document_ir
=
composer
.
build_document
(
report_id
,
metadata
,
chapters
)
logger
.
info
(
f
"装订完成: {len(document_ir.get('chapters', []))} 个章节,"
f
"{count_charts(document_ir)} 个图表"
)
return
document_ir
def
count_charts
(
document_ir
):
"""
统计整本 Document IR 中的 Chart.js 图表数量。
会遍历每章的 blocks,递归查找 widget 类型中以 `chart.js`
开头的组件,便于快速感知图表规模。
参数:
document_ir: 完整的 Document IR
返回:
int: 图表总数
"""
chart_count
=
0
for
chapter
in
document_ir
.
get
(
"chapters"
,
[]):
blocks
=
chapter
.
get
(
"blocks"
,
[])
chart_count
+=
_count_chart_blocks
(
blocks
)
return
chart_count
def
_count_chart_blocks
(
blocks
):
"""
递归统计 block 列表中的 Chart.js 组件数量。
兼容嵌套的 blocks/list/table 结构,确保所有层级的图表都被计入。
参数:
blocks: 任意层级的 block 列表
返回:
int: 统计到的 chart.js 图表数量
"""
count
=
0
for
block
in
blocks
:
if
not
isinstance
(
block
,
dict
):
continue
if
block
.
get
(
"type"
)
==
"widget"
and
str
(
block
.
get
(
"widgetType"
,
""
))
.
startswith
(
"chart.js"
):
count
+=
1
nested
=
block
.
get
(
"blocks"
)
if
isinstance
(
nested
,
list
):
count
+=
_count_chart_blocks
(
nested
)
if
block
.
get
(
"type"
)
==
"list"
:
for
item
in
block
.
get
(
"items"
,
[]):
if
isinstance
(
item
,
list
):
count
+=
_count_chart_blocks
(
item
)
if
block
.
get
(
"type"
)
==
"table"
:
for
row
in
block
.
get
(
"rows"
,
[]):
for
cell
in
row
.
get
(
"cells"
,
[]):
if
isinstance
(
cell
,
dict
):
cell_blocks
=
cell
.
get
(
"blocks"
,
[])
if
isinstance
(
cell_blocks
,
list
):
count
+=
_count_chart_blocks
(
cell_blocks
)
return
count
def
save_document_ir
(
document_ir
,
base_name
,
timestamp
):
"""
将重新装订好的整本 Document IR 落盘。
按 `report_ir_{slug}_{timestamp}_regen.json` 命名写入
`settings.DOCUMENT_IR_OUTPUT_DIR`,确保目录存在并返回保存路径。
参数:
document_ir: 已装订完成的整本 IR
base_name: 由主题/标题生成的安全文件名片段
timestamp: 时间戳字符串,用于区分多次重生成
返回:
Path: 保存的 IR 文件路径
"""
output_dir
=
Path
(
settings
.
DOCUMENT_IR_OUTPUT_DIR
)
output_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
ir_filename
=
f
"report_ir_{base_name}_{timestamp}_regen.json"
ir_path
=
output_dir
/
ir_filename
ir_path
.
write_text
(
json
.
dumps
(
document_ir
,
ensure_ascii
=
False
,
indent
=
2
),
encoding
=
"utf-8"
)
logger
.
info
(
f
"IR已保存: {ir_path}"
)
return
ir_path
def
render_markdown
(
document_ir
,
base_name
,
timestamp
):
"""
使用 MarkdownRenderer 将 Document IR 渲染为 Markdown 并保存。
渲染后落盘到 `final_reports/md`,打印生成文件大小,便于确认
输出内容。
参数:
document_ir: 装订完成的整本 IR
base_name: 文件名片段(来源于报告主题/标题)
timestamp: 时间戳字符串
返回:
Path: 生成的 Markdown 文件路径
"""
renderer
=
MarkdownRenderer
()
markdown_content
=
renderer
.
render
(
document_ir
)
output_dir
=
Path
(
settings
.
OUTPUT_DIR
)
/
"md"
output_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
md_filename
=
f
"report_md_{base_name}_{timestamp}.md"
md_path
=
output_dir
/
md_filename
md_path
.
write_text
(
markdown_content
,
encoding
=
"utf-8"
)
file_size_kb
=
md_path
.
stat
()
.
st_size
/
1024
logger
.
info
(
f
"Markdown生成成功: {md_path} ({file_size_kb:.1f} KB)"
)
return
md_path
def
build_slug
(
text
):
"""
将主题/标题转换为文件系统安全的片段。
仅保留字母/数字/空格/下划线/连字符,空格统一为下划线,并限制
最长 60 字符,避免过长文件名。
参数:
text: 原始主题或标题
返回:
str: 清洗后的安全字符串
"""
text
=
str
(
text
or
"report"
)
sanitized
=
""
.
join
(
c
for
c
in
text
if
c
.
isalnum
()
or
c
in
(
" "
,
"-"
,
"_"
))
.
strip
()
sanitized
=
sanitized
.
replace
(
" "
,
"_"
)
return
sanitized
[:
60
]
or
"report"
def
main
():
"""
主入口:读取最新章节、装订 IR 并渲染 Markdown。
流程:
1) 找到最新的章节 run 目录并读取 manifest;
2) 加载章节并执行结构校验(仅警告);
3) 装订整本 IR,保存 IR 副本;
4) 渲染 Markdown 并输出路径。
返回:
int: 0 表示成功,其余表示失败。
"""
logger
.
info
(
"🚀 使用最新的LLM章节重新装订并渲染Markdown"
)
chapter_root
=
Path
(
settings
.
CHAPTER_OUTPUT_DIR
)
latest_run
=
find_latest_run_dir
(
chapter_root
)
if
not
latest_run
:
return
1
report_id
,
metadata
=
load_manifest
(
latest_run
)
if
not
report_id
or
metadata
is
None
:
return
1
chapters
=
load_chapters
(
latest_run
)
if
not
chapters
:
logger
.
error
(
"未找到章节JSON,无法装订"
)
return
1
validate_chapters
(
chapters
)
document_ir
=
stitch_document
(
report_id
,
metadata
,
chapters
)
timestamp
=
datetime
.
now
()
.
strftime
(
"
%
Y
%
m
%
d_
%
H
%
M
%
S"
)
base_name
=
build_slug
(
metadata
.
get
(
"query"
)
or
metadata
.
get
(
"title"
)
or
metadata
.
get
(
"reportId"
)
or
report_id
)
ir_path
=
save_document_ir
(
document_ir
,
base_name
,
timestamp
)
md_path
=
render_markdown
(
document_ir
,
base_name
,
timestamp
)
logger
.
info
(
""
)
logger
.
info
(
"🎉 Markdown装订与渲染完成"
)
logger
.
info
(
f
"IR文件: {ir_path.resolve()}"
)
logger
.
info
(
f
"Markdown文件: {md_path.resolve()}"
)
return
0
if
__name__
==
"__main__"
:
sys
.
exit
(
main
())
...
...
Please
register
or
login
to post a comment