HKLHaoBin

feat: 优化健康检查机制

- 为进程信息添加 healthcheck_started_at 字段用于跟踪健康检查开始时间
- 实现健康检查宽限期机制,避免在应用启动期间产生误报
- 创建 _healthcheck_grace_active 函数来判断进程是否仍在宽限期内
- 创建 _log_healthcheck_failure 函数来处理健康检查失败记录,区分启动期间和运行期间的错误
- 在启动应用时记录 healthcheck_started_at 时间戳,并在停止应用或检测到进程已结束时将其重置为 None
- 改进 check_app_status 和 wait_for_app_startup 函数中的健康检查错误处理,统一改为调用 _log_healthcheck_failure

这些改进解决了在应用启动期间健康检查失败导致的误报问题,
提高了系统健康检查的准确性和稳定性。
Showing 1 changed file with 23 additions and 5 deletions
@@ -508,9 +508,9 @@ forum_monitor_thread.start() @@ -508,9 +508,9 @@ forum_monitor_thread.start()
508 508
509 # 全局变量存储进程信息 509 # 全局变量存储进程信息
510 processes = { 510 processes = {
511 - 'insight': {'process': None, 'port': 8501, 'status': 'stopped', 'output': [], 'log_file': None},  
512 - 'media': {'process': None, 'port': 8502, 'status': 'stopped', 'output': [], 'log_file': None},  
513 - 'query': {'process': None, 'port': 8503, 'status': 'stopped', 'output': [], 'log_file': None}, 511 + 'insight': {'process': None, 'port': 8501, 'status': 'stopped', 'output': [], 'log_file': None, 'healthcheck_started_at': None},
  512 + 'media': {'process': None, 'port': 8502, 'status': 'stopped', 'output': [], 'log_file': None, 'healthcheck_started_at': None},
  513 + 'query': {'process': None, 'port': 8503, 'status': 'stopped', 'output': [], 'log_file': None, 'healthcheck_started_at': None},
514 'forum': {'process': None, 'port': None, 'status': 'stopped', 'output': [], 'log_file': None} # 启动后标记为 running 514 'forum': {'process': None, 'port': None, 'status': 'stopped', 'output': [], 'log_file': None} # 启动后标记为 running
515 } 515 }
516 516
@@ -699,6 +699,7 @@ def start_streamlit_app(app_name, script_path, port): @@ -699,6 +699,7 @@ def start_streamlit_app(app_name, script_path, port):
699 processes[app_name]['process'] = process 699 processes[app_name]['process'] = process
700 processes[app_name]['status'] = 'starting' 700 processes[app_name]['status'] = 'starting'
701 processes[app_name]['output'] = [] 701 processes[app_name]['output'] = []
  702 + processes[app_name]['healthcheck_started_at'] = time.time()
702 703
703 # 启动输出读取线程 704 # 启动输出读取线程
704 output_thread = threading.Thread( 705 output_thread = threading.Thread(
@@ -743,6 +744,7 @@ def stop_streamlit_app(app_name): @@ -743,6 +744,7 @@ def stop_streamlit_app(app_name):
743 744
744 processes[app_name]['process'] = None 745 processes[app_name]['process'] = None
745 processes[app_name]['status'] = 'stopped' 746 processes[app_name]['status'] = 'stopped'
  747 + processes[app_name]['healthcheck_started_at'] = None
746 748
747 return True, f"{app_name} 应用已停止" 749 return True, f"{app_name} 应用已停止"
748 750
@@ -752,12 +754,27 @@ def stop_streamlit_app(app_name): @@ -752,12 +754,27 @@ def stop_streamlit_app(app_name):
752 754
753 HEALTHCHECK_PATH = "/_stcore/health" 755 HEALTHCHECK_PATH = "/_stcore/health"
754 HEALTHCHECK_PROXIES = {'http': None, 'https': None} 756 HEALTHCHECK_PROXIES = {'http': None, 'https': None}
  757 +HEALTHCHECK_GRACE_SECONDS = 15
755 758
756 759
def _build_healthcheck_url(port):
    """Return the loopback URL of the health-check endpoint on ``port``.

    The URL is always addressed to 127.0.0.1 and ends with the
    module-level ``HEALTHCHECK_PATH``.
    """
    endpoint = f"{port}{HEALTHCHECK_PATH}"
    return "http://127.0.0.1:" + endpoint
759 762
760 763
def _healthcheck_grace_active(app_name: str) -> bool:
    """Tell whether ``app_name`` is still inside its health-check grace window.

    The window opens at the ``healthcheck_started_at`` timestamp stored in
    the module-level ``processes`` table and lasts
    ``HEALTHCHECK_GRACE_SECONDS`` seconds, measured with ``time.time()``.

    Args:
        app_name: Key of the application in ``processes``.

    Returns:
        ``True`` while fewer than ``HEALTHCHECK_GRACE_SECONDS`` seconds have
        elapsed since the recorded timestamp; ``False`` when the application
        is unknown, has no ``healthcheck_started_at`` entry, or the stored
        value is falsy (e.g. ``None``).
    """
    launch_ts = processes.get(app_name, {}).get('healthcheck_started_at')
    if launch_ts:
        elapsed = time.time() - launch_ts
        return elapsed < HEALTHCHECK_GRACE_SECONDS
    # No usable start timestamp recorded: there is no grace window to honor.
    return False
  769 +
  770 +
def _log_healthcheck_failure(app_name: str, exc: Exception):
    """Log a failed health check at a level that depends on the grace window.

    Outside the grace window the failure is logged as a warning that
    includes ``exc``. Inside the window only a debug-level "still starting"
    message is emitted and ``exc`` is not included in it.

    Args:
        app_name: Key of the application whose health check failed.
        exc: The exception raised by the failed health check.
    """
    if not _healthcheck_grace_active(app_name):
        logger.warning(f"{app_name} 健康检查失败: {exc}")
        return
    # Still within the grace window: keep the failure out of warning logs.
    logger.debug(f"正在启动{app_name},请等待")
  776 +
  777 +
761 def check_app_status(): 778 def check_app_status():
762 """检查应用状态""" 779 """检查应用状态"""
763 for app_name, info in processes.items(): 780 for app_name, info in processes.items():
@@ -775,12 +792,13 @@ def check_app_status(): @@ -775,12 +792,13 @@ def check_app_status():
775 else: 792 else:
776 info['status'] = 'starting' 793 info['status'] = 'starting'
777 except Exception as exc: 794 except Exception as exc:
778 - logger.warning(f"{app_name} 健康检查失败: {exc}") 795 + _log_healthcheck_failure(app_name, exc)
779 info['status'] = 'starting' 796 info['status'] = 'starting'
780 else: 797 else:
781 # 进程已结束 798 # 进程已结束
782 info['process'] = None 799 info['process'] = None
783 info['status'] = 'stopped' 800 info['status'] = 'stopped'
  801 + info['healthcheck_started_at'] = None
784 802
785 def wait_for_app_startup(app_name, max_wait_time=90): 803 def wait_for_app_startup(app_name, max_wait_time=90):
786 """等待应用启动完成""" 804 """等待应用启动完成"""
@@ -804,7 +822,7 @@ def wait_for_app_startup(app_name, max_wait_time=90): @@ -804,7 +822,7 @@ def wait_for_app_startup(app_name, max_wait_time=90):
804 info['status'] = 'running' 822 info['status'] = 'running'
805 return True, "启动成功" 823 return True, "启动成功"
806 except Exception as exc: 824 except Exception as exc:
807 - logger.warning(f"{app_name} 健康检查失败: {exc}") 825 + _log_healthcheck_failure(app_name, exc)
808 826
809 time.sleep(1) 827 time.sleep(1)
810 828