Committed by GitHub
Merge pull request #531 from HKLHaoBin/feature/health-check-optimization
feat: 优化健康检查机制
Showing 2 changed files with 26 additions and 5 deletions.
| @@ -508,9 +508,9 @@ forum_monitor_thread.start() | @@ -508,9 +508,9 @@ forum_monitor_thread.start() | ||
| 508 | 508 | ||
| 509 | # 全局变量存储进程信息 | 509 | # 全局变量存储进程信息 |
| 510 | processes = { | 510 | processes = { |
| 511 | - 'insight': {'process': None, 'port': 8501, 'status': 'stopped', 'output': [], 'log_file': None}, | ||
| 512 | - 'media': {'process': None, 'port': 8502, 'status': 'stopped', 'output': [], 'log_file': None}, | ||
| 513 | - 'query': {'process': None, 'port': 8503, 'status': 'stopped', 'output': [], 'log_file': None}, | 511 | + 'insight': {'process': None, 'port': 8501, 'status': 'stopped', 'output': [], 'log_file': None, 'healthcheck_started_at': None}, |
| 512 | + 'media': {'process': None, 'port': 8502, 'status': 'stopped', 'output': [], 'log_file': None, 'healthcheck_started_at': None}, | ||
| 513 | + 'query': {'process': None, 'port': 8503, 'status': 'stopped', 'output': [], 'log_file': None, 'healthcheck_started_at': None}, | ||
| 514 | 'forum': {'process': None, 'port': None, 'status': 'stopped', 'output': [], 'log_file': None} # 启动后标记为 running | 514 | 'forum': {'process': None, 'port': None, 'status': 'stopped', 'output': [], 'log_file': None} # 启动后标记为 running |
| 515 | } | 515 | } |
| 516 | 516 | ||
| @@ -699,6 +699,7 @@ def start_streamlit_app(app_name, script_path, port): | @@ -699,6 +699,7 @@ def start_streamlit_app(app_name, script_path, port): | ||
| 699 | processes[app_name]['process'] = process | 699 | processes[app_name]['process'] = process |
| 700 | processes[app_name]['status'] = 'starting' | 700 | processes[app_name]['status'] = 'starting' |
| 701 | processes[app_name]['output'] = [] | 701 | processes[app_name]['output'] = [] |
| 702 | + processes[app_name]['healthcheck_started_at'] = time.time() | ||
| 702 | 703 | ||
| 703 | # 启动输出读取线程 | 704 | # 启动输出读取线程 |
| 704 | output_thread = threading.Thread( | 705 | output_thread = threading.Thread( |
| @@ -743,6 +744,7 @@ def stop_streamlit_app(app_name): | @@ -743,6 +744,7 @@ def stop_streamlit_app(app_name): | ||
| 743 | 744 | ||
| 744 | processes[app_name]['process'] = None | 745 | processes[app_name]['process'] = None |
| 745 | processes[app_name]['status'] = 'stopped' | 746 | processes[app_name]['status'] = 'stopped' |
| 747 | + processes[app_name]['healthcheck_started_at'] = None | ||
| 746 | 748 | ||
| 747 | return True, f"{app_name} 应用已停止" | 749 | return True, f"{app_name} 应用已停止" |
| 748 | 750 | ||
| @@ -752,12 +754,27 @@ def stop_streamlit_app(app_name): | @@ -752,12 +754,27 @@ def stop_streamlit_app(app_name): | ||
| 752 | 754 | ||
| 753 | HEALTHCHECK_PATH = "/_stcore/health" | 755 | HEALTHCHECK_PATH = "/_stcore/health" |
| 754 | HEALTHCHECK_PROXIES = {'http': None, 'https': None} | 756 | HEALTHCHECK_PROXIES = {'http': None, 'https': None} |
| 757 | +HEALTHCHECK_GRACE_SECONDS = 15 | ||
| 755 | 758 | ||
| 756 | 759 | ||
| 757 | def _build_healthcheck_url(port): | 760 | def _build_healthcheck_url(port): |
| 758 | return f"http://127.0.0.1:{port}{HEALTHCHECK_PATH}" | 761 | return f"http://127.0.0.1:{port}{HEALTHCHECK_PATH}" |
| 759 | 762 | ||
| 760 | 763 | ||
| 764 | +def _healthcheck_grace_active(app_name: str) -> bool: | ||
| 765 | + started_at = processes.get(app_name, {}).get('healthcheck_started_at') | ||
| 766 | + if not started_at: | ||
| 767 | + return False | ||
| 768 | + return (time.time() - started_at) < HEALTHCHECK_GRACE_SECONDS | ||
| 769 | + | ||
| 770 | + | ||
| 771 | +def _log_healthcheck_failure(app_name: str, exc: Exception): | ||
| 772 | + if _healthcheck_grace_active(app_name): | ||
| 773 | + logger.debug(f"正在启动{app_name},请等待") | ||
| 774 | + return | ||
| 775 | + logger.warning(f"{app_name} 健康检查失败: {exc}") | ||
| 776 | + | ||
| 777 | + | ||
| 761 | def check_app_status(): | 778 | def check_app_status(): |
| 762 | """检查应用状态""" | 779 | """检查应用状态""" |
| 763 | for app_name, info in processes.items(): | 780 | for app_name, info in processes.items(): |
| @@ -775,12 +792,13 @@ def check_app_status(): | @@ -775,12 +792,13 @@ def check_app_status(): | ||
| 775 | else: | 792 | else: |
| 776 | info['status'] = 'starting' | 793 | info['status'] = 'starting' |
| 777 | except Exception as exc: | 794 | except Exception as exc: |
| 778 | - logger.warning(f"{app_name} 健康检查失败: {exc}") | 795 | + _log_healthcheck_failure(app_name, exc) |
| 779 | info['status'] = 'starting' | 796 | info['status'] = 'starting' |
| 780 | else: | 797 | else: |
| 781 | # 进程已结束 | 798 | # 进程已结束 |
| 782 | info['process'] = None | 799 | info['process'] = None |
| 783 | info['status'] = 'stopped' | 800 | info['status'] = 'stopped' |
| 801 | + info['healthcheck_started_at'] = None | ||
| 784 | 802 | ||
| 785 | def wait_for_app_startup(app_name, max_wait_time=90): | 803 | def wait_for_app_startup(app_name, max_wait_time=90): |
| 786 | """等待应用启动完成""" | 804 | """等待应用启动完成""" |
| @@ -804,7 +822,7 @@ def wait_for_app_startup(app_name, max_wait_time=90): | @@ -804,7 +822,7 @@ def wait_for_app_startup(app_name, max_wait_time=90): | ||
| 804 | info['status'] = 'running' | 822 | info['status'] = 'running' |
| 805 | return True, "启动成功" | 823 | return True, "启动成功" |
| 806 | except Exception as exc: | 824 | except Exception as exc: |
| 807 | - logger.warning(f"{app_name} 健康检查失败: {exc}") | 825 | + _log_healthcheck_failure(app_name, exc) |
| 808 | 826 | ||
| 809 | time.sleep(1) | 827 | time.sleep(1) |
| 810 | 828 |
-
Please register or login to post a comment