Showing
5 changed files
with
365 additions
and
7 deletions
| @@ -13,7 +13,7 @@ import os | @@ -13,7 +13,7 @@ import os | ||
| 13 | from pathlib import Path | 13 | from pathlib import Path |
| 14 | from uuid import uuid4 | 14 | from uuid import uuid4 |
| 15 | from datetime import datetime | 15 | from datetime import datetime |
| 16 | -from typing import Optional, Dict, Any, List, Callable | 16 | +from typing import Optional, Dict, Any, List, Callable, Tuple |
| 17 | 17 | ||
| 18 | from loguru import logger | 18 | from loguru import logger |
| 19 | 19 | ||
| @@ -199,6 +199,7 @@ class ReportAgent: | @@ -199,6 +199,7 @@ class ReportAgent: | ||
| 199 | 199 | ||
| 200 | # 初始化LLM客户端 | 200 | # 初始化LLM客户端 |
| 201 | self.llm_client = self._initialize_llm() | 201 | self.llm_client = self._initialize_llm() |
| 202 | + self.json_rescue_clients = self._initialize_rescue_llms() | ||
| 202 | 203 | ||
| 203 | # 初始化章级存储/校验/渲染组件 | 204 | # 初始化章级存储/校验/渲染组件 |
| 204 | self.chapter_storage = ChapterStorage(self.config.CHAPTER_OUTPUT_DIR) | 205 | self.chapter_storage = ChapterStorage(self.config.CHAPTER_OUTPUT_DIR) |
| @@ -263,6 +264,46 @@ class ReportAgent: | @@ -263,6 +264,46 @@ class ReportAgent: | ||
| 263 | model_name=self.config.REPORT_ENGINE_MODEL_NAME, | 264 | model_name=self.config.REPORT_ENGINE_MODEL_NAME, |
| 264 | base_url=self.config.REPORT_ENGINE_BASE_URL, | 265 | base_url=self.config.REPORT_ENGINE_BASE_URL, |
| 265 | ) | 266 | ) |
| 267 | + | ||
def _initialize_rescue_llms(self) -> List[Tuple[str, LLMClient]]:
    """
    Build the ordered list of LLM clients used for cross-engine chapter repair.

    The order is Report -> Forum -> Insight -> Media. The already-initialised
    ``self.llm_client`` is registered first under the label ``report_engine``.
    A fallback engine is skipped when its API key or model name is missing,
    when its client cannot be constructed, or when its
    (api key, model name, base URL) triple duplicates a channel that is
    already registered.

    Returns:
        A list of ``(label, client)`` tuples; empty when no client is usable.
    """
    clients: List[Tuple[str, LLMClient]] = []
    # Signatures of channels already registered, so that engines sharing one
    # endpoint configuration are not queried twice with the same prompt.
    seen_endpoints = set()
    if self.llm_client:
        clients.append(("report_engine", self.llm_client))
        # NOTE(review): assumes the primary client was built from the
        # REPORT_ENGINE_* settings; getattr keeps this safe if one is absent.
        seen_endpoints.add(
            (
                getattr(self.config, "REPORT_ENGINE_API_KEY", None),
                getattr(self.config, "REPORT_ENGINE_MODEL_NAME", None),
                getattr(self.config, "REPORT_ENGINE_BASE_URL", None),
            )
        )

    fallback_prefixes = [
        ("forum_engine", "FORUM_HOST"),
        ("insight_engine", "INSIGHT_ENGINE"),
        ("media_engine", "MEDIA_ENGINE"),
    ]
    for label, prefix in fallback_prefixes:
        # getattr with a default: a settings object that lacks these fields
        # is treated the same as one where they are unset.
        api_key = getattr(self.config, f"{prefix}_API_KEY", None)
        model_name = getattr(self.config, f"{prefix}_MODEL_NAME", None)
        base_url = getattr(self.config, f"{prefix}_BASE_URL", None)
        if not api_key or not model_name:
            continue
        endpoint = (api_key, model_name, base_url)
        if endpoint in seen_endpoints:
            logger.info(f"{label} 与已注册的修复通道配置相同,跳过重复通道")
            continue
        try:
            client = LLMClient(api_key=api_key, model_name=model_name, base_url=base_url)
        except Exception as exc:
            # Best effort: one broken fallback must not block the others.
            logger.warning(f"{label} LLM初始化失败,跳过该修复通道: {exc}")
            continue
        seen_endpoints.add(endpoint)
        clients.append((label, client))
    return clients
| 266 | 307 | ||
| 267 | def _initialize_nodes(self): | 308 | def _initialize_nodes(self): |
| 268 | """ | 309 | """ |
| @@ -280,7 +321,9 @@ class ReportAgent: | @@ -280,7 +321,9 @@ class ReportAgent: | ||
| 280 | self.chapter_generation_node = ChapterGenerationNode( | 321 | self.chapter_generation_node = ChapterGenerationNode( |
| 281 | self.llm_client, | 322 | self.llm_client, |
| 282 | self.validator, | 323 | self.validator, |
| 283 | - self.chapter_storage | 324 | + self.chapter_storage, |
| 325 | + fallback_llm_clients=self.json_rescue_clients, | ||
| 326 | + error_log_dir=self.config.JSON_ERROR_LOG_DIR, | ||
| 284 | ) | 327 | ) |
| 285 | 328 | ||
| 286 | def generate_report(self, query: str, reports: List[Any], forum_logs: str = "", | 329 | def generate_report(self, query: str, reports: List[Any], forum_logs: str = "", |
| @@ -8,9 +8,10 @@ | @@ -8,9 +8,10 @@ | ||
| 8 | from __future__ import annotations | 8 | from __future__ import annotations |
| 9 | 9 | ||
| 10 | import json | 10 | import json |
| 11 | +from datetime import datetime | ||
| 11 | from pathlib import Path | 12 | from pathlib import Path |
| 12 | import re | 13 | import re |
| 13 | -from typing import Any, Dict, List, Tuple, Callable, Optional | 14 | +from typing import Any, Dict, List, Tuple, Callable, Optional, Set |
| 14 | 15 | ||
| 15 | from loguru import logger | 16 | from loguru import logger |
| 16 | 17 | ||
| @@ -19,7 +20,9 @@ from ..ir import ALLOWED_BLOCK_TYPES, ALLOWED_INLINE_MARKS, IRValidator | @@ -19,7 +20,9 @@ from ..ir import ALLOWED_BLOCK_TYPES, ALLOWED_INLINE_MARKS, IRValidator | ||
| 19 | from ..prompts import ( | 20 | from ..prompts import ( |
| 20 | SYSTEM_PROMPT_CHAPTER_JSON, | 21 | SYSTEM_PROMPT_CHAPTER_JSON, |
| 21 | SYSTEM_PROMPT_CHAPTER_JSON_REPAIR, | 22 | SYSTEM_PROMPT_CHAPTER_JSON_REPAIR, |
| 23 | + SYSTEM_PROMPT_CHAPTER_JSON_RECOVERY, | ||
| 22 | build_chapter_repair_prompt, | 24 | build_chapter_repair_prompt, |
| 25 | + build_chapter_recovery_payload, | ||
| 23 | build_chapter_user_prompt, | 26 | build_chapter_user_prompt, |
| 24 | ) | 27 | ) |
| 25 | from .base_node import BaseNode | 28 | from .base_node import BaseNode |
| @@ -96,7 +99,14 @@ class ChapterGenerationNode(BaseNode): | @@ -96,7 +99,14 @@ class ChapterGenerationNode(BaseNode): | ||
| 96 | _PARAGRAPH_FRAGMENT_NO_TERMINATOR_MAX_CHARS = 240 | 99 | _PARAGRAPH_FRAGMENT_NO_TERMINATOR_MAX_CHARS = 240 |
| 97 | _TERMINATION_PUNCTUATION = set("。!?!?;;……") | 100 | _TERMINATION_PUNCTUATION = set("。!?!?;;……") |
| 98 | 101 | ||
| 99 | - def __init__(self, llm_client, validator: IRValidator, storage: ChapterStorage): | 102 | + def __init__( |
| 103 | + self, | ||
| 104 | + llm_client, | ||
| 105 | + validator: IRValidator, | ||
| 106 | + storage: ChapterStorage, | ||
| 107 | + fallback_llm_clients: Optional[List[Tuple[str, Any]]] = None, | ||
| 108 | + error_log_dir: Optional[str | Path] = None, | ||
| 109 | + ): | ||
| 100 | """ | 110 | """ |
| 101 | 记录LLM客户端/校验器/章节存储器,便于run方法调度。 | 111 | 记录LLM客户端/校验器/章节存储器,便于run方法调度。 |
| 102 | 112 | ||
| @@ -108,6 +118,17 @@ class ChapterGenerationNode(BaseNode): | @@ -108,6 +118,17 @@ class ChapterGenerationNode(BaseNode): | ||
| 108 | super().__init__(llm_client, "ChapterGenerationNode") | 118 | super().__init__(llm_client, "ChapterGenerationNode") |
| 109 | self.validator = validator | 119 | self.validator = validator |
| 110 | self.storage = storage | 120 | self.storage = storage |
| 121 | + self.fallback_llm_clients: List[Tuple[str, Any]] = fallback_llm_clients or [ | ||
| 122 | + ("report_engine", llm_client) | ||
| 123 | + ] | ||
| 124 | + error_dir = Path(error_log_dir or "logs/json_repair_failures") | ||
| 125 | + error_dir.mkdir(parents=True, exist_ok=True) | ||
| 126 | + self.error_log_dir = error_dir | ||
| 127 | + self._failed_block_counter = 0 | ||
| 128 | + self._active_run_id: Optional[str] = None | ||
| 129 | + self._rescue_attempted_labels: Dict[str, Set[str]] = {} | ||
| 130 | + self._skipped_placeholder_chapters: Set[str] = set() | ||
| 131 | + self._archived_failed_json: Dict[str, str] = {} | ||
| 111 | 132 | ||
| 112 | def run( | 133 | def run( |
| 113 | self, | 134 | self, |
| @@ -141,6 +162,8 @@ class ChapterGenerationNode(BaseNode): | @@ -141,6 +162,8 @@ class ChapterGenerationNode(BaseNode): | ||
| 141 | "order": section.order, | 162 | "order": section.order, |
| 142 | } | 163 | } |
| 143 | chapter_dir = self.storage.begin_chapter(run_dir, chapter_meta) | 164 | chapter_dir = self.storage.begin_chapter(run_dir, chapter_meta) |
| 165 | + run_id = run_dir.name | ||
| 166 | + self._ensure_run_state(run_id) | ||
| 144 | llm_payload = self._build_payload(section, context) | 167 | llm_payload = self._build_payload(section, context) |
| 145 | user_message = build_chapter_user_prompt(llm_payload) | 168 | user_message = build_chapter_user_prompt(llm_payload) |
| 146 | 169 | ||
| @@ -151,7 +174,30 @@ class ChapterGenerationNode(BaseNode): | @@ -151,7 +174,30 @@ class ChapterGenerationNode(BaseNode): | ||
| 151 | section_meta=chapter_meta, | 174 | section_meta=chapter_meta, |
| 152 | **kwargs, | 175 | **kwargs, |
| 153 | ) | 176 | ) |
| 154 | - chapter_json = self._parse_chapter(raw_text) | 177 | + parse_context: List[str] = [] |
| 178 | + placeholder_created = False | ||
| 179 | + try: | ||
| 180 | + chapter_json = self._parse_chapter(raw_text) | ||
| 181 | + except ChapterJsonParseError as parse_error: | ||
| 182 | + logger.warning(f"{section.title} 章节JSON解析失败,尝试跨引擎修复: {parse_error}") | ||
| 183 | + parse_context.append(str(parse_error)) | ||
| 184 | + self._archive_failed_output(section, raw_text) | ||
| 185 | + recovered = self._attempt_cross_engine_json_rescue( | ||
| 186 | + section, | ||
| 187 | + llm_payload, | ||
| 188 | + raw_text, | ||
| 189 | + run_id, | ||
| 190 | + ) | ||
| 191 | + if recovered: | ||
| 192 | + chapter_json = recovered | ||
| 193 | + logger.info(f"{section.title} 章节JSON已通过跨引擎修复") | ||
| 194 | + else: | ||
| 195 | + placeholder = self._build_placeholder_chapter(section, raw_text, parse_error) | ||
| 196 | + if not placeholder: | ||
| 197 | + raise | ||
| 198 | + chapter_json, placeholder_notes = placeholder | ||
| 199 | + parse_context.extend(placeholder_notes) | ||
| 200 | + placeholder_created = True | ||
| 155 | 201 | ||
| 156 | # 自动补全关键字段后再校验 | 202 | # 自动补全关键字段后再校验 |
| 157 | chapter_json.setdefault("chapterId", section.chapter_id) | 203 | chapter_json.setdefault("chapterId", section.chapter_id) |
| @@ -176,13 +222,13 @@ class ChapterGenerationNode(BaseNode): | @@ -176,13 +222,13 @@ class ChapterGenerationNode(BaseNode): | ||
| 176 | self._sanitize_chapter_blocks(chapter_json) | 222 | self._sanitize_chapter_blocks(chapter_json) |
| 177 | valid, errors = self.validator.validate_chapter(chapter_json) | 223 | valid, errors = self.validator.validate_chapter(chapter_json) |
| 178 | content_error: ChapterContentError | None = None | 224 | content_error: ChapterContentError | None = None |
| 179 | - if valid: | 225 | + if valid and not placeholder_created: |
| 180 | try: | 226 | try: |
| 181 | self._ensure_content_density(chapter_json) | 227 | self._ensure_content_density(chapter_json) |
| 182 | except ChapterContentError as exc: | 228 | except ChapterContentError as exc: |
| 183 | content_error = exc | 229 | content_error = exc |
| 184 | 230 | ||
| 185 | - error_messages: List[str] = [] | 231 | + error_messages: List[str] = parse_context.copy() |
| 186 | if not valid and errors: | 232 | if not valid and errors: |
| 187 | error_messages.extend(errors) | 233 | error_messages.extend(errors) |
| 188 | if content_error: | 234 | if content_error: |
| @@ -314,6 +360,154 @@ class ChapterGenerationNode(BaseNode): | @@ -314,6 +360,154 @@ class ChapterGenerationNode(BaseNode): | ||
| 314 | logger.warning(f"章节流式回调失败: {callback_error}") | 360 | logger.warning(f"章节流式回调失败: {callback_error}") |
| 315 | return "".join(chunks) | 361 | return "".join(chunks) |
| 316 | 362 | ||
def _attempt_cross_engine_json_rescue(
    self,
    section: TemplateSection,
    generation_payload: Dict[str, Any],
    raw_text: str,
    run_id: str,
) -> Optional[Dict[str, Any]]:
    """
    Ask each configured fallback LLM in turn to repair unparseable chapter JSON.

    Every label in ``self.fallback_llm_clients`` is tried at most once per
    chapter: labels already recorded in ``self._rescue_attempted_labels`` for
    this chapter id are skipped. A chapter that has already been downgraded to
    a placeholder is not retried at all.

    Args:
        section: Template section whose chapter failed to parse.
        generation_payload: The payload originally sent to generate the chapter.
        raw_text: The raw LLM output that could not be parsed.
        run_id: Identifier of the current report run, used in log messages.

    Returns:
        The parsed chapter dict from the first engine whose output parses,
        otherwise ``None``.
    """
    if not self.fallback_llm_clients:
        return None
    if self._chapter_already_skipped(section):
        logger.info(f"[{run_id}] {section.title} 已标记为占位,不再触发跨引擎修复")
        return None
    section_payload = {
        "chapterId": section.chapter_id,
        "title": section.title,
        "slug": section.slug,
        "order": section.order,
        "number": section.number,
        "outline": section.outline,
    }
    repair_prompt = build_chapter_recovery_payload(
        section_payload,
        generation_payload,
        raw_text,
    )
    attempted_labels = self._rescue_attempted_labels.setdefault(section.chapter_id, set())
    for label, client in self.fallback_llm_clients:
        if label in attempted_labels:
            continue
        attempt_index = len(attempted_labels) + 1
        # Record the attempt before calling so a failing engine is never retried.
        attempted_labels.add(label)
        logger.info(
            f"[{run_id}] 章节 {section.title} 触发 {label} API JSON抢修(第{attempt_index}次尝试)"
        )
        try:
            response = client.invoke(
                SYSTEM_PROMPT_CHAPTER_JSON_RECOVERY,
                repair_prompt,
                temperature=0.0,
                top_p=0.05,
            )
        except Exception as exc:
            logger.warning(f"{label} JSON修复调用失败: {exc}")
            continue
        if not response:
            # Previously skipped silently; log it so an engine that returns
            # nothing can be told apart from one that was never called.
            logger.warning(f"{label} JSON修复返回空响应,跳过该修复通道")
            continue
        try:
            repaired = self._parse_chapter(response)
        except Exception as exc:
            logger.warning(f"{label} JSON修复输出仍无法解析: {exc}")
            continue
        logger.warning(f"[{run_id}] {label} API已修复章节JSON")
        # The archived broken output is no longer needed once repair succeeds.
        self._archived_failed_json.pop(section.chapter_id, None)
        return repaired
    return None
| 424 | + | ||
def _ensure_run_state(self, run_id: str):
    """Reset the per-run repair bookkeeping whenever a different run id is seen."""
    if self._active_run_id != run_id:
        # New run: forget attempted labels, placeholder marks and archived output.
        self._active_run_id = run_id
        self._rescue_attempted_labels = {}
        self._skipped_placeholder_chapters = set()
        self._archived_failed_json = {}
| 433 | + | ||
def _archive_failed_output(self, section: TemplateSection, raw_text: str):
    """Remember the chapter's raw failed output, keyed by chapter id; empty text is ignored."""
    if raw_text:
        self._archived_failed_json[section.chapter_id] = raw_text
| 439 | + | ||
def _get_archived_failed_output(self, section: TemplateSection) -> Optional[str]:
    """Return the raw failed output archived for this chapter id, or None if there is none."""
    archive = self._archived_failed_json
    return archive.get(section.chapter_id)
| 443 | + | ||
def _mark_chapter_skipped(self, section: TemplateSection):
    """Record that this chapter was downgraded to a placeholder."""
    skipped = self._skipped_placeholder_chapters
    skipped.add(section.chapter_id)
| 447 | + | ||
def _chapter_already_skipped(self, section: TemplateSection) -> bool:
    """Tell whether this chapter id has been recorded as a placeholder."""
    skipped = self._skipped_placeholder_chapters
    return section.chapter_id in skipped
| 451 | + | ||
def _build_placeholder_chapter(
    self,
    section: TemplateSection,
    raw_text: str,
    parse_error: Exception,
) -> Optional[Tuple[Dict[str, Any], List[str]]]:
    """
    Build a renderable stand-in chapter after every repair attempt has failed.

    The failed output (the archived copy if there is one, otherwise
    ``raw_text``) is written to the error log first. The placeholder consists
    of a heading plus a callout that points at that log entry.

    Returns:
        ``(placeholder_chapter, error_notes)``, or ``None`` when the error log
        entry could not be written.
    """
    failed_output = self._get_archived_failed_output(section) or raw_text
    log_ref = self._persist_error_payload(section, failed_output, parse_error)
    if not log_ref:
        logger.error(f"{section.title} 章节JSON完全损坏且无法写入日志")
        return None

    # Critical sections get a higher-level heading and a stronger callout tone.
    if self._is_section_critical(section):
        importance, heading_level, tone = "critical", 2, "danger"
    else:
        importance, heading_level, tone = "standard", 3, "warning"

    relative_file = log_ref["relativeFile"]
    entry_id = log_ref["entryId"]
    message = f"LLM返回块解析错误,详情请见 {relative_file} 的 {entry_id} 记录。"

    blocks = [
        {
            "type": "heading",
            "level": heading_level,
            "text": section.title,
            "anchor": section.slug,
        },
        {
            "type": "callout",
            "tone": tone,
            "title": "LLM返回块解析错误",
            "blocks": [{"type": "paragraph", "inlines": [{"text": message}]}],
            "meta": {
                "errorLogRef": log_ref,
                # Only the first 2000 characters of the failed output are embedded.
                "rawJsonPreview": (failed_output or "")[:2000],
                "errorMessage": message,
                "importance": importance,
            },
        },
    ]
    placeholder = {
        "chapterId": section.chapter_id,
        "title": section.title,
        "anchor": section.slug,
        "order": section.order,
        "blocks": blocks,
        "errorPlaceholder": True,
    }
    notes = [
        f"{section.title} 章节JSON解析失败,已降级为占位。参考 {relative_file}#{entry_id}"
    ]
    # Marked last, so the mark is only set when a placeholder is actually returned.
    self._mark_chapter_skipped(section)
    return placeholder, notes
| 510 | + | ||
| 317 | def _parse_chapter(self, raw_text: str) -> Dict[str, Any]: | 511 | def _parse_chapter(self, raw_text: str) -> Dict[str, Any]: |
| 318 | """ | 512 | """ |
| 319 | 清洗LLM输出并解析JSON。 | 513 | 清洗LLM输出并解析JSON。 |
| @@ -375,6 +569,58 @@ class ChapterGenerationNode(BaseNode): | @@ -375,6 +569,58 @@ class ChapterGenerationNode(BaseNode): | ||
| 375 | return item | 569 | return item |
| 376 | raise ValueError("章节JSON缺少chapter字段") | 570 | raise ValueError("章节JSON缺少chapter字段") |
| 377 | 571 | ||
def _persist_error_payload(
    self,
    section: TemplateSection,
    raw_text: str,
    parse_error: Exception,
) -> Optional[Dict[str, str]]:
    """
    Write the unparseable chapter output to a JSON file in ``self.error_log_dir``.

    The file name is ``<UTC timestamp>-<slug>-<entry id>.json``. The slug is
    reduced to filename-safe characters first; the unmodified slug is still
    stored inside the file.

    Args:
        section: Template section the failed output belongs to.
        raw_text: The raw output that could not be parsed.
        parse_error: The exception raised while parsing; stored as a string.

    Returns:
        A dict with ``file``, ``relativeFile``, ``entryId`` and ``timestamp``,
        or ``None`` when the entry could not be written (the failure is logged).
    """
    # Function-scope import so this method does not depend on a change to the
    # module's import block.
    from datetime import timezone

    try:
        next_counter = self._failed_block_counter + 1
        entry_id = f"E{next_counter:04d}"
        # Timezone-aware replacement for the deprecated datetime.utcnow();
        # the formatted value is the same UTC wall-clock time.
        timestamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
        # Path separators, spaces and similar characters in the slug would
        # otherwise produce an invalid or nested file path.
        safe_slug = re.sub(r"[^\w.-]+", "-", section.slug or "").strip("-.") or "section"
        filename = f"{timestamp}-{safe_slug}-{entry_id}.json"
        # Recreate the directory if it disappeared since construction.
        self.error_log_dir.mkdir(parents=True, exist_ok=True)
        file_path = self.error_log_dir / filename
        payload = {
            "chapterId": section.chapter_id,
            "title": section.title,
            "slug": section.slug,
            "order": section.order,
            "rawOutput": raw_text,
            "error": str(parse_error),
            "loggedAt": timestamp,
        }
        file_path.write_text(
            json.dumps(payload, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        # Advance the counter only once the entry really exists on disk.
        self._failed_block_counter = next_counter
        try:
            relative_path = str(file_path.relative_to(Path.cwd()))
        except ValueError:
            # Not located under the working directory (or already relative).
            relative_path = str(file_path)
        return {
            "file": str(file_path),
            "relativeFile": relative_path,
            "entryId": entry_id,
            "timestamp": timestamp,
        }
    except Exception as exc:
        # Deliberately best effort: logging a failure must never abort the run.
        logger.error(f"记录章节JSON错误日志失败: {exc}")
        return None
| 612 | + | ||
def _is_section_critical(self, section: TemplateSection) -> bool:
    """
    Decide whether a section counts as critical, which selects the stronger
    placeholder styling.

    A section is critical when its depth is at most 2, or when its number is
    non-empty and contains at most one dot. A falsy section is never critical.
    """
    if not section:
        return False
    if section.depth <= 2:
        return True
    numbering = section.number or ""
    return bool(numbering) and numbering.count(".") <= 1
| 623 | + | ||
| 378 | def _repair_llm_json(self, text: str) -> str: | 624 | def _repair_llm_json(self, text: str) -> str: |
| 379 | """ | 625 | """ |
| 380 | 处理常见的LLM错误(如":=导致的非法JSON)。 | 626 | 处理常见的LLM错误(如":=导致的非法JSON)。 |
| @@ -9,6 +9,7 @@ from .prompts import ( | @@ -9,6 +9,7 @@ from .prompts import ( | ||
| 9 | SYSTEM_PROMPT_HTML_GENERATION, | 9 | SYSTEM_PROMPT_HTML_GENERATION, |
| 10 | SYSTEM_PROMPT_CHAPTER_JSON, | 10 | SYSTEM_PROMPT_CHAPTER_JSON, |
| 11 | SYSTEM_PROMPT_CHAPTER_JSON_REPAIR, | 11 | SYSTEM_PROMPT_CHAPTER_JSON_REPAIR, |
| 12 | + SYSTEM_PROMPT_CHAPTER_JSON_RECOVERY, | ||
| 12 | SYSTEM_PROMPT_DOCUMENT_LAYOUT, | 13 | SYSTEM_PROMPT_DOCUMENT_LAYOUT, |
| 13 | SYSTEM_PROMPT_WORD_BUDGET, | 14 | SYSTEM_PROMPT_WORD_BUDGET, |
| 14 | output_schema_template_selection, | 15 | output_schema_template_selection, |
| @@ -16,6 +17,7 @@ from .prompts import ( | @@ -16,6 +17,7 @@ from .prompts import ( | ||
| 16 | chapter_generation_input_schema, | 17 | chapter_generation_input_schema, |
| 17 | build_chapter_user_prompt, | 18 | build_chapter_user_prompt, |
| 18 | build_chapter_repair_prompt, | 19 | build_chapter_repair_prompt, |
| 20 | + build_chapter_recovery_payload, | ||
| 19 | build_document_layout_prompt, | 21 | build_document_layout_prompt, |
| 20 | build_word_budget_prompt, | 22 | build_word_budget_prompt, |
| 21 | ) | 23 | ) |
| @@ -27,11 +29,13 @@ __all__ = [ | @@ -27,11 +29,13 @@ __all__ = [ | ||
| 27 | "SYSTEM_PROMPT_CHAPTER_JSON_REPAIR", | 29 | "SYSTEM_PROMPT_CHAPTER_JSON_REPAIR", |
| 28 | "SYSTEM_PROMPT_DOCUMENT_LAYOUT", | 30 | "SYSTEM_PROMPT_DOCUMENT_LAYOUT", |
| 29 | "SYSTEM_PROMPT_WORD_BUDGET", | 31 | "SYSTEM_PROMPT_WORD_BUDGET", |
| 32 | + "SYSTEM_PROMPT_CHAPTER_JSON_RECOVERY", | ||
| 30 | "output_schema_template_selection", | 33 | "output_schema_template_selection", |
| 31 | "input_schema_html_generation", | 34 | "input_schema_html_generation", |
| 32 | "chapter_generation_input_schema", | 35 | "chapter_generation_input_schema", |
| 33 | "build_chapter_user_prompt", | 36 | "build_chapter_user_prompt", |
| 34 | "build_chapter_repair_prompt", | 37 | "build_chapter_repair_prompt", |
| 38 | + "build_chapter_recovery_payload", | ||
| 35 | "build_document_layout_prompt", | 39 | "build_document_layout_prompt", |
| 36 | "build_word_budget_prompt", | 40 | "build_word_budget_prompt", |
| 37 | ] | 41 | ] |
| @@ -335,6 +335,24 @@ SYSTEM_PROMPT_CHAPTER_JSON_REPAIR = f""" | @@ -335,6 +335,24 @@ SYSTEM_PROMPT_CHAPTER_JSON_REPAIR = f""" | ||
| 335 | 只返回JSON,不要添加注释或自然语言。 | 335 | 只返回JSON,不要添加注释或自然语言。 |
| 336 | """ | 336 | """ |
| 337 | 337 | ||
| 338 | +SYSTEM_PROMPT_CHAPTER_JSON_RECOVERY = f""" | ||
| 339 | +你是Report/Forum/Insight/Media联合的“JSON抢修官”,会拿到章节生成时的全部约束(generationPayload)以及原始失败输出(rawChapterOutput)。 | ||
| 340 | + | ||
| 341 | +请遵守: | ||
| 342 | +1. 章节必须满足IR版本 {IR_VERSION} 规范,block.type 仅能使用:{', '.join(ALLOWED_BLOCK_TYPES)}; | ||
| 343 | +2. paragraph.inlines中的marks仅可出现:{', '.join(ALLOWED_INLINE_MARKS)},并保留原始文字顺序; | ||
| 344 | +3. 请以 generationPayload 中的 section 信息为主导,heading.text 与 anchor 必须与章节slug保持一致; | ||
| 345 | +4. 仅对JSON语法/字段/嵌套做最小必要修复,不改写事实与结论; | ||
| 346 | +5. 输出严格遵循 {{\"chapter\": {{...}}}} 格式,不添加说明。 | ||
| 347 | + | ||
| 348 | +输入字段: | ||
| 349 | +- generationPayload:章节原始需求与素材,请完整遵守; | ||
| 350 | +- rawChapterOutput:无法解析的JSON文本,请尽可能复用其中内容; | ||
| 351 | +- section:章节元信息,便于保持锚点/标题一致。 | ||
| 352 | + | ||
| 353 | +请直接返回修复后的JSON。 | ||
| 354 | +""" | ||
| 355 | + | ||
| 338 | # 文档标题/目录/主题设计提示词 | 356 | # 文档标题/目录/主题设计提示词 |
| 339 | SYSTEM_PROMPT_DOCUMENT_LAYOUT = f""" | 357 | SYSTEM_PROMPT_DOCUMENT_LAYOUT = f""" |
| 340 | 你是报告首席设计官,需要结合模板大纲与三个分析引擎的内容,为整本报告确定最终的标题、导语区、目录样式与美学要素。 | 358 | 你是报告首席设计官,需要结合模板大纲与三个分析引擎的内容,为整本报告确定最终的标题、导语区、目录样式与美学要素。 |
| @@ -399,6 +417,22 @@ def build_chapter_repair_prompt(chapter: dict, errors, original_text=None) -> st | @@ -399,6 +417,22 @@ def build_chapter_repair_prompt(chapter: dict, errors, original_text=None) -> st | ||
| 399 | return json.dumps(payload, ensure_ascii=False, indent=2) | 417 | return json.dumps(payload, ensure_ascii=False, indent=2) |
| 400 | 418 | ||
| 401 | 419 | ||
def build_chapter_recovery_payload(
    section: dict,
    generation_payload: dict,
    raw_output: str,
    max_raw_chars: "int | None" = 8000,
) -> str:
    """
    Serialise the input for cross-engine JSON rescue as a JSON string.

    The payload carries the chapter metadata (``section``), the original
    generation instructions (``generationPayload``) and the failed output
    (``rawChapterOutput``). To keep the prompt short, only the trailing part
    of a string ``raw_output`` is kept.

    Args:
        section: Chapter metadata.
        generation_payload: The payload originally used to generate the chapter.
        raw_output: The unparseable output; non-string values are passed
            through unchanged.
        max_raw_chars: Maximum number of trailing characters of ``raw_output``
            to keep. ``None`` disables truncation; values below 1 keep nothing.

    Returns:
        The payload as indented JSON with non-ASCII characters preserved.
    """
    if isinstance(raw_output, str) and max_raw_chars is not None:
        # Slice from an explicit start index: raw_output[-0:] would return the
        # whole string instead of an empty one.
        keep = max(0, max_raw_chars)
        raw_output = raw_output[max(0, len(raw_output) - keep):]
    payload = {
        "section": section,
        "generationPayload": generation_payload,
        "rawChapterOutput": raw_output,
    }
    return json.dumps(payload, ensure_ascii=False, indent=2)
| 434 | + | ||
| 435 | + | ||
| 402 | def build_document_layout_prompt(payload: dict) -> str: | 436 | def build_document_layout_prompt(payload: dict) -> str: |
| 403 | """将文档设计所需的上下文序列化为JSON字符串,供布局节点发送给LLM。""" | 437 | """将文档设计所需的上下文序列化为JSON字符串,供布局节点发送给LLM。""" |
| 404 | return json.dumps(payload, ensure_ascii=False, indent=2) | 438 | return json.dumps(payload, ensure_ascii=False, indent=2) |
| @@ -15,6 +15,34 @@ class Settings(BaseSettings): | @@ -15,6 +15,34 @@ class Settings(BaseSettings): | ||
| 15 | REPORT_ENGINE_BASE_URL: Optional[str] = Field(None, description="Report Engine LLM基础URL") | 15 | REPORT_ENGINE_BASE_URL: Optional[str] = Field(None, description="Report Engine LLM基础URL") |
| 16 | REPORT_ENGINE_MODEL_NAME: Optional[str] = Field(None, description="Report Engine LLM模型名称") | 16 | REPORT_ENGINE_MODEL_NAME: Optional[str] = Field(None, description="Report Engine LLM模型名称") |
| 17 | REPORT_ENGINE_PROVIDER: Optional[str] = Field(None, description="模型服务商,仅兼容保留") | 17 | REPORT_ENGINE_PROVIDER: Optional[str] = Field(None, description="模型服务商,仅兼容保留") |
| 18 | + # 其他引擎API(用于跨引擎修复) | ||
| 19 | + FORUM_HOST_API_KEY: Optional[str] = Field( | ||
| 20 | + None, description="Forum Engine / Forum Host 的LLM API密钥(用于章节修复兜底)" | ||
| 21 | + ) | ||
| 22 | + FORUM_HOST_BASE_URL: Optional[str] = Field( | ||
| 23 | + None, description="Forum Engine API Base URL(为空则使用LLM默认配置)" | ||
| 24 | + ) | ||
| 25 | + FORUM_HOST_MODEL_NAME: Optional[str] = Field( | ||
| 26 | + None, description="Forum Engine LLM模型名称" | ||
| 27 | + ) | ||
| 28 | + INSIGHT_ENGINE_API_KEY: Optional[str] = Field( | ||
| 29 | + None, description="Insight Engine LLM API密钥,用于跨引擎章节修复" | ||
| 30 | + ) | ||
| 31 | + INSIGHT_ENGINE_BASE_URL: Optional[str] = Field( | ||
| 32 | + None, description="Insight Engine API Base URL" | ||
| 33 | + ) | ||
| 34 | + INSIGHT_ENGINE_MODEL_NAME: Optional[str] = Field( | ||
| 35 | + None, description="Insight Engine LLM模型名称" | ||
| 36 | + ) | ||
| 37 | + MEDIA_ENGINE_API_KEY: Optional[str] = Field( | ||
| 38 | + None, description="Media Engine LLM API密钥,用于跨引擎章节修复" | ||
| 39 | + ) | ||
| 40 | + MEDIA_ENGINE_BASE_URL: Optional[str] = Field( | ||
| 41 | + None, description="Media Engine API Base URL" | ||
| 42 | + ) | ||
| 43 | + MEDIA_ENGINE_MODEL_NAME: Optional[str] = Field( | ||
| 44 | + None, description="Media Engine LLM模型名称" | ||
| 45 | + ) | ||
| 18 | MAX_CONTENT_LENGTH: int = Field(200000, description="最大内容长度") | 46 | MAX_CONTENT_LENGTH: int = Field(200000, description="最大内容长度") |
| 19 | OUTPUT_DIR: str = Field("final_reports", description="主输出目录") | 47 | OUTPUT_DIR: str = Field("final_reports", description="主输出目录") |
| 20 | # 章节分块JSON会存储在该目录,便于溯源与断点续传 | 48 | # 章节分块JSON会存储在该目录,便于溯源与断点续传 |
| @@ -35,6 +63,9 @@ class Settings(BaseSettings): | @@ -35,6 +63,9 @@ class Settings(BaseSettings): | ||
| 35 | LOG_FILE: str = Field("logs/report.log", description="日志输出文件") | 63 | LOG_FILE: str = Field("logs/report.log", description="日志输出文件") |
| 36 | ENABLE_PDF_EXPORT: bool = Field(True, description="是否允许导出PDF") | 64 | ENABLE_PDF_EXPORT: bool = Field(True, description="是否允许导出PDF") |
| 37 | CHART_STYLE: str = Field("modern", description="图表样式:modern/classic/") | 65 | CHART_STYLE: str = Field("modern", description="图表样式:modern/classic/") |
| 66 | + JSON_ERROR_LOG_DIR: str = Field( | ||
| 67 | + "logs/json_repair_failures", description="无法修复的JSON块落盘目录" | ||
| 68 | + ) | ||
| 38 | 69 | ||
| 39 | class Config: | 70 | class Config: |
| 40 | """Pydantic配置:允许从.env读取并兼容大小写""" | 71 | """Pydantic配置:允许从.env读取并兼容大小写""" |
-
Please register or login to post a comment