Showing
13 changed files
with
4642 additions
and
0 deletions
Too many changes to show.
To preserve performance, only the first 13 changed files are displayed.
doc/dev/code_quality_maintenance_guide.md
0 → 100644
| 1 | +# 代码质量与可维护性增强指南 | ||
| 2 | + | ||
| 3 | +**AIfeng/2025-07-02 11:24:08** | ||
| 4 | + | ||
| 5 | +## 概述 | ||
| 6 | + | ||
| 7 | +基于对 `eman_one` 项目的深度分析,本文档提供了全面的代码质量和可维护性增强建议,涵盖架构设计、代码规范、测试策略、文档管理和持续集成等方面。 | ||
| 8 | + | ||
| 9 | +## 1. 架构设计优化 | ||
| 10 | + | ||
| 11 | +### 1.1 模块化重构建议 | ||
| 12 | + | ||
| 13 | +#### 当前状态分析 | ||
| 14 | +- ✅ 已实现同步架构重构 | ||
| 15 | +- ✅ 工具模块 `utils` 已建立 | ||
| 16 | +- ⚠️ 部分功能模块耦合度较高 | ||
| 17 | +- ⚠️ 缺少统一的接口抽象 | ||
| 18 | + | ||
| 19 | +#### 改进方案 | ||
| 20 | + | ||
| 21 | +**1. 建立分层架构** | ||
| 22 | +``` | ||
| 23 | +eman_one/ | ||
| 24 | +├── core/ # 核心业务逻辑 | ||
| 25 | +│ ├── asr/ # 语音识别模块 | ||
| 26 | +│ ├── recorder/ # 录音模块 | ||
| 27 | +│ └── api/ # API接口层 | ||
| 28 | +├── services/ # 服务层 | ||
| 29 | +│ ├── funasr_service.py | ||
| 30 | +│ ├── recording_service.py | ||
| 31 | +│ └── websocket_service.py | ||
| 32 | +├── interfaces/ # 接口定义 | ||
| 33 | +│ ├── asr_interface.py | ||
| 34 | +│ └── recorder_interface.py | ||
| 35 | +├── utils/ # 工具模块 | ||
| 36 | +└── config/ # 配置管理 | ||
| 37 | +``` | ||
| 38 | + | ||
| 39 | +**2. 接口抽象设计** | ||
| 40 | +```python | ||
| 41 | +# interfaces/asr_interface.py | ||
| 42 | +from abc import ABC, abstractmethod | ||
| 43 | + | ||
| 44 | +class ASRInterface(ABC): | ||
| 45 | + @abstractmethod | ||
| 46 | + def connect(self) -> bool: | ||
| 47 | + pass | ||
| 48 | + | ||
| 49 | + @abstractmethod | ||
| 50 | + def send_audio(self, audio_data: bytes) -> None: | ||
| 51 | + pass | ||
| 52 | + | ||
| 53 | + @abstractmethod | ||
| 54 | + def get_result(self) -> str: | ||
| 55 | + pass | ||
| 56 | +``` | ||
| 57 | + | ||
| 58 | +### 1.2 依赖注入模式 | ||
| 59 | + | ||
| 60 | +**实现依赖注入容器** | ||
| 61 | +```python | ||
| 62 | +# core/container.py | ||
| 63 | +class DIContainer: | ||
| 64 | + def __init__(self): | ||
| 65 | + self._services = {} | ||
| 66 | + self._singletons = {} | ||
| 67 | + | ||
| 68 | + def register(self, interface, implementation, singleton=False): | ||
| 69 | + self._services[interface] = (implementation, singleton) | ||
| 70 | + | ||
| 71 | + def resolve(self, interface): | ||
| 72 | + if interface in self._singletons: | ||
| 73 | + return self._singletons[interface] | ||
| 74 | + | ||
| 75 | + implementation, is_singleton = self._services[interface] | ||
| 76 | + instance = implementation() | ||
| 77 | + | ||
| 78 | + if is_singleton: | ||
| 79 | + self._singletons[interface] = instance | ||
| 80 | + | ||
| 81 | + return instance | ||
| 82 | +``` | ||
| 83 | + | ||
| 84 | +## 2. 代码规范与质量 | ||
| 85 | + | ||
| 86 | +### 2.1 代码风格统一 | ||
| 87 | + | ||
| 88 | +#### 配置文件设置 | ||
| 89 | + | ||
| 90 | +**pyproject.toml** | ||
| 91 | +```toml | ||
| 92 | +[tool.black] | ||
| 93 | +line-length = 88 | ||
| 94 | +target-version = ['py38'] | ||
| 95 | +include = '\.pyi?$' | ||
| 96 | + | ||
| 97 | +[tool.isort] | ||
| 98 | +profile = "black" | ||
| 99 | +multi_line_output = 3 | ||
| 100 | +line_length = 88 | ||
| 101 | + | ||
| 102 | +[tool.flake8] | ||
| 103 | +max-line-length = 88 | ||
| 104 | +extend-ignore = ["E203", "W503"] | ||
| 105 | +exclude = [".git", "__pycache__", "build", "dist"] | ||
| 106 | + | ||
| 107 | +[tool.mypy] | ||
| 108 | +python_version = "3.8" | ||
| 109 | +warn_return_any = true | ||
| 110 | +warn_unused_configs = true | ||
| 111 | +disallow_untyped_defs = true | ||
| 112 | +``` | ||
| 113 | + | ||
| 114 | +**pre-commit 配置** | ||
| 115 | +```yaml | ||
| 116 | +# .pre-commit-config.yaml | ||
| 117 | +repos: | ||
| 118 | + - repo: https://github.com/psf/black | ||
| 119 | + rev: 23.3.0 | ||
| 120 | + hooks: | ||
| 121 | + - id: black | ||
| 122 | + | ||
| 123 | + - repo: https://github.com/pycqa/isort | ||
| 124 | + rev: 5.12.0 | ||
| 125 | + hooks: | ||
| 126 | + - id: isort | ||
| 127 | + | ||
| 128 | + - repo: https://github.com/pycqa/flake8 | ||
| 129 | + rev: 6.0.0 | ||
| 130 | + hooks: | ||
| 131 | + - id: flake8 | ||
| 132 | + | ||
| 133 | + - repo: https://github.com/pre-commit/mirrors-mypy | ||
| 134 | + rev: v1.3.0 | ||
| 135 | + hooks: | ||
| 136 | + - id: mypy | ||
| 137 | +``` | ||
| 138 | + | ||
| 139 | +### 2.2 类型注解增强 | ||
| 140 | + | ||
| 141 | +**示例改进** | ||
| 142 | +```python | ||
| 143 | +# 改进前 | ||
| 144 | +def process_audio(data, sample_rate): | ||
| 145 | + return data | ||
| 146 | + | ||
| 147 | +# 改进后 | ||
| 148 | +from typing import Optional, Union | ||
| 149 | +import numpy as np | ||
| 150 | + | ||
| 151 | +def process_audio( | ||
| 152 | + data: Union[np.ndarray, bytes], | ||
| 153 | + sample_rate: int, | ||
| 154 | + channels: int = 1 | ||
| 155 | +) -> Optional[np.ndarray]: | ||
| 156 | + """处理音频数据 | ||
| 157 | + | ||
| 158 | + Args: | ||
| 159 | + data: 音频数据,支持numpy数组或字节流 | ||
| 160 | + sample_rate: 采样率 | ||
| 161 | + channels: 声道数,默认为1 | ||
| 162 | + | ||
| 163 | + Returns: | ||
| 164 | + 处理后的音频数据,失败时返回None | ||
| 165 | + | ||
| 166 | + Raises: | ||
| 167 | + ValueError: 当采样率无效时 | ||
| 168 | + """ | ||
| 169 | + if sample_rate <= 0: | ||
| 170 | + raise ValueError(f"Invalid sample rate: {sample_rate}") | ||
| 171 | + | ||
| 172 | + # 处理逻辑... | ||
| 173 | + return processed_data | ||
| 174 | +``` | ||
| 175 | + | ||
| 176 | +### 2.3 错误处理标准化 | ||
| 177 | + | ||
| 178 | +**自定义异常类** | ||
| 179 | +```python | ||
| 180 | +# utils/exceptions.py | ||
| 181 | +class EmanOneException(Exception): | ||
| 182 | + """项目基础异常类""" | ||
| 183 | + pass | ||
| 184 | + | ||
| 185 | +class ASRConnectionError(EmanOneException): | ||
| 186 | + """ASR连接异常""" | ||
| 187 | + pass | ||
| 188 | + | ||
| 189 | +class AudioProcessingError(EmanOneException): | ||
| 190 | + """音频处理异常""" | ||
| 191 | + pass | ||
| 192 | + | ||
| 193 | +class ConfigurationError(EmanOneException): | ||
| 194 | + """配置错误异常""" | ||
| 195 | + pass | ||
| 196 | +``` | ||
| 197 | + | ||
| 198 | +**统一错误处理装饰器** | ||
| 199 | +```python | ||
| 200 | +# utils/decorators.py | ||
| 201 | +from functools import wraps | ||
| 202 | +from typing import Callable, Any | ||
| 203 | +import logging | ||
| 204 | + | ||
| 205 | +def handle_exceptions(logger: logging.Logger = None): | ||
| 206 | + def decorator(func: Callable) -> Callable: | ||
| 207 | + @wraps(func) | ||
| 208 | + def wrapper(*args, **kwargs) -> Any: | ||
| 209 | + try: | ||
| 210 | + return func(*args, **kwargs) | ||
| 211 | + except EmanOneException as e: | ||
| 212 | + if logger: | ||
| 213 | + logger.error(f"{func.__name__} failed: {e}") | ||
| 214 | + raise | ||
| 215 | + except Exception as e: | ||
| 216 | + if logger: | ||
| 217 | + logger.error(f"Unexpected error in {func.__name__}: {e}") | ||
| 218 | + raise EmanOneException(f"Unexpected error: {e}") from e | ||
| 219 | + return wrapper | ||
| 220 | + return decorator | ||
| 221 | +``` | ||
| 222 | + | ||
| 223 | +## 3. 测试策略 | ||
| 224 | + | ||
| 225 | +### 3.1 测试金字塔实现 | ||
| 226 | + | ||
| 227 | +**目录结构** | ||
| 228 | +``` | ||
| 229 | +test/ | ||
| 230 | +├── unit/ # 单元测试 (70%) | ||
| 231 | +│ ├── test_asr.py | ||
| 232 | +│ ├── test_recorder.py | ||
| 233 | +│ └── test_utils.py | ||
| 234 | +├── integration/ # 集成测试 (20%) | ||
| 235 | +│ ├── test_asr_integration.py | ||
| 236 | +│ └── test_api_integration.py | ||
| 237 | +├── e2e/ # 端到端测试 (10%) | ||
| 238 | +│ └── test_voice_workflow.py | ||
| 239 | +├── fixtures/ # 测试数据 | ||
| 240 | +│ ├── audio_samples/ | ||
| 241 | +│ └── config_samples/ | ||
| 242 | +└── conftest.py # pytest配置 | ||
| 243 | +``` | ||
| 244 | + | ||
| 245 | +**pytest 配置示例** | ||
| 246 | +```python | ||
| 247 | +# test/conftest.py | ||
| 248 | +import pytest | ||
| 249 | +import tempfile | ||
| 250 | +import os | ||
| 251 | +from unittest.mock import Mock | ||
| 252 | + | ||
| 253 | +@pytest.fixture | ||
| 254 | +def temp_audio_file(): | ||
| 255 | + """临时音频文件fixture""" | ||
| 256 | + with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f: | ||
| 257 | + # 创建测试音频数据 | ||
| 258 | + yield f.name | ||
| 259 | + os.unlink(f.name) | ||
| 260 | + | ||
| 261 | +@pytest.fixture | ||
| 262 | +def mock_asr_client(): | ||
| 263 | + """模拟ASR客户端""" | ||
| 264 | + mock = Mock() | ||
| 265 | + mock.connect.return_value = True | ||
| 266 | + mock.send_audio.return_value = None | ||
| 267 | + mock.get_result.return_value = "测试识别结果" | ||
| 268 | + return mock | ||
| 269 | + | ||
| 270 | +@pytest.fixture(scope="session") | ||
| 271 | +def test_config(): | ||
| 272 | + """测试配置""" | ||
| 273 | + return { | ||
| 274 | + "asr": { | ||
| 275 | + "host": "localhost", | ||
| 276 | + "port": 10095, | ||
| 277 | + "timeout": 5 | ||
| 278 | + }, | ||
| 279 | + "audio": { | ||
| 280 | + "sample_rate": 16000, | ||
| 281 | + "channels": 1 | ||
| 282 | + } | ||
| 283 | + } | ||
| 284 | +``` | ||
| 285 | + | ||
| 286 | +### 3.2 性能测试 | ||
| 287 | + | ||
| 288 | +**基准测试示例** | ||
| 289 | +```python | ||
| 290 | +# test/performance/test_benchmarks.py | ||
| 291 | +import pytest | ||
| 292 | +import time | ||
| 293 | +from utils.util import process_audio_data | ||
| 294 | + | ||
| 295 | +class TestPerformance: | ||
| 296 | + def test_audio_processing_speed(self, benchmark): | ||
| 297 | + """测试音频处理性能""" | ||
| 298 | + audio_data = b'\x00' * 16000 # 1秒音频数据 | ||
| 299 | + | ||
| 300 | + result = benchmark(process_audio_data, audio_data) | ||
| 301 | + assert result is not None | ||
| 302 | + | ||
| 303 | + @pytest.mark.parametrize("data_size", [1000, 10000, 100000]) | ||
| 304 | + def test_memory_usage(self, data_size): | ||
| 305 | + """测试内存使用情况""" | ||
| 306 | + import psutil | ||
| 307 | + import os | ||
| 308 | + | ||
| 309 | + process = psutil.Process(os.getpid()) | ||
| 310 | + memory_before = process.memory_info().rss | ||
| 311 | + | ||
| 312 | + # 执行测试操作 | ||
| 313 | + large_data = b'\x00' * data_size | ||
| 314 | + process_audio_data(large_data) | ||
| 315 | + | ||
| 316 | + memory_after = process.memory_info().rss | ||
| 317 | + memory_diff = memory_after - memory_before | ||
| 318 | + | ||
| 319 | + # 确保内存增长在合理范围内 | ||
| 320 | + assert memory_diff < data_size * 2 | ||
| 321 | +``` | ||
| 322 | + | ||
| 323 | +## 4. 文档管理体系 | ||
| 324 | + | ||
| 325 | +### 4.1 文档分类标准 | ||
| 326 | + | ||
| 327 | +**Diátaxis 框架应用** | ||
| 328 | +``` | ||
| 329 | +doc/ | ||
| 330 | +├── tutorials/ # 教程 - 学习导向 | ||
| 331 | +│ ├── quick_start.md | ||
| 332 | +│ └── voice_setup_guide.md | ||
| 333 | +├── how-to/ # 指南 - 问题导向 | ||
| 334 | +│ ├── troubleshooting.md | ||
| 335 | +│ └── performance_tuning.md | ||
| 336 | +├── reference/ # 参考 - 信息导向 | ||
| 337 | +│ ├── api_reference.md | ||
| 338 | +│ ├── config_reference.md | ||
| 339 | +│ └── cli_reference.md | ||
| 340 | +├── explanation/ # 说明 - 理解导向 | ||
| 341 | +│ ├── architecture.md | ||
| 342 | +│ └── design_decisions.md | ||
| 343 | +└── process/ # 过程文档 | ||
| 344 | + ├── update.log | ||
| 345 | + └── meeting_notes/ | ||
| 346 | +``` | ||
| 347 | + | ||
| 348 | +### 4.2 自动化文档生成 | ||
| 349 | + | ||
| 350 | +**API文档生成** | ||
| 351 | +```python | ||
| 352 | +# scripts/generate_docs.py | ||
| 353 | +import inspect | ||
| 354 | +import ast | ||
| 355 | +from pathlib import Path | ||
| 356 | + | ||
| 357 | +def generate_api_docs(): | ||
| 358 | + """自动生成API文档""" | ||
| 359 | + modules = [ | ||
| 360 | + 'funasr_asr_sync', | ||
| 361 | + 'recorder_sync', | ||
| 362 | + 'server_recording_api_sync' | ||
| 363 | + ] | ||
| 364 | + | ||
| 365 | + for module_name in modules: | ||
| 366 | + module = __import__(module_name) | ||
| 367 | + doc_content = f"# {module_name} API Reference\n\n" | ||
| 368 | + | ||
| 369 | + for name, obj in inspect.getmembers(module): | ||
| 370 | + if inspect.isclass(obj) or inspect.isfunction(obj): | ||
| 371 | + doc_content += f"## {name}\n\n" | ||
| 372 | + doc_content += f"{inspect.getdoc(obj) or 'No documentation'}\n\n" | ||
| 373 | + | ||
| 374 | + with open(f"doc/reference/{module_name}_api.md", "w", encoding="utf-8") as f: | ||
| 375 | + f.write(doc_content) | ||
| 376 | +``` | ||
| 377 | + | ||
| 378 | +## 5. 持续集成与部署 | ||
| 379 | + | ||
| 380 | +### 5.1 GitHub Actions 配置 | ||
| 381 | + | ||
| 382 | +**.github/workflows/ci.yml** | ||
| 383 | +```yaml | ||
| 384 | +name: CI/CD Pipeline | ||
| 385 | + | ||
| 386 | +on: | ||
| 387 | + push: | ||
| 388 | + branches: [ main, develop ] | ||
| 389 | + pull_request: | ||
| 390 | + branches: [ main ] | ||
| 391 | + | ||
| 392 | +jobs: | ||
| 393 | + test: | ||
| 394 | + runs-on: ubuntu-latest | ||
| 395 | + strategy: | ||
| 396 | + matrix: | ||
| 397 | + python-version: [3.8, 3.9, '3.10'] | ||
| 398 | + | ||
| 399 | + steps: | ||
| 400 | + - uses: actions/checkout@v3 | ||
| 401 | + | ||
| 402 | + - name: Set up Python ${{ matrix.python-version }} | ||
| 403 | + uses: actions/setup-python@v4 | ||
| 404 | + with: | ||
| 405 | + python-version: ${{ matrix.python-version }} | ||
| 406 | + | ||
| 407 | + - name: Install dependencies | ||
| 408 | + run: | | ||
| 409 | + python -m pip install --upgrade pip | ||
| 410 | + pip install -r requirements.txt | ||
| 411 | + pip install pytest pytest-cov black isort flake8 | ||
| 412 | + | ||
| 413 | + - name: Run linting | ||
| 414 | + run: | | ||
| 415 | + black --check . | ||
| 416 | + isort --check-only . | ||
| 417 | + flake8 . | ||
| 418 | + | ||
| 419 | + - name: Run tests | ||
| 420 | + run: | | ||
| 421 | + pytest --cov=. --cov-report=xml | ||
| 422 | + | ||
| 423 | + - name: Upload coverage | ||
| 424 | + uses: codecov/codecov-action@v3 | ||
| 425 | + with: | ||
| 426 | + file: ./coverage.xml | ||
| 427 | + | ||
| 428 | + security: | ||
| 429 | + runs-on: ubuntu-latest | ||
| 430 | + steps: | ||
| 431 | + - uses: actions/checkout@v3 | ||
| 432 | + - name: Run security scan | ||
| 433 | + uses: pypa/gh-action-pip-audit@v1.0.8 | ||
| 434 | +``` | ||
| 435 | + | ||
| 436 | +### 5.2 代码质量监控 | ||
| 437 | + | ||
| 438 | +**SonarQube 配置** | ||
| 439 | +```properties | ||
| 440 | +# sonar-project.properties | ||
| 441 | +sonar.projectKey=eman_one | ||
| 442 | +sonar.projectName=Eman One Voice Processing | ||
| 443 | +sonar.projectVersion=1.0 | ||
| 444 | + | ||
| 445 | +sonar.sources=. | ||
| 446 | +sonar.exclusions=**/*_test.py,**/test_*.py,**/__pycache__/** | ||
| 447 | + | ||
| 448 | +sonar.python.coverage.reportPaths=coverage.xml | ||
| 449 | +sonar.python.xunit.reportPath=test-results.xml | ||
| 450 | + | ||
| 451 | +sonar.qualitygate.wait=true | ||
| 452 | +``` | ||
| 453 | + | ||
| 454 | +## 6. 监控与可观测性 | ||
| 455 | + | ||
| 456 | +### 6.1 结构化日志 | ||
| 457 | + | ||
| 458 | +**日志配置增强** | ||
| 459 | +```python | ||
| 460 | +# utils/logging_config.py | ||
| 461 | +import logging | ||
| 462 | +import json | ||
| 463 | +from datetime import datetime | ||
| 464 | + | ||
| 465 | +class StructuredFormatter(logging.Formatter): | ||
| 466 | + def format(self, record): | ||
| 467 | + log_entry = { | ||
| 468 | + 'timestamp': datetime.utcnow().isoformat(), | ||
| 469 | + 'level': record.levelname, | ||
| 470 | + 'logger': record.name, | ||
| 471 | + 'message': record.getMessage(), | ||
| 472 | + 'module': record.module, | ||
| 473 | + 'function': record.funcName, | ||
| 474 | + 'line': record.lineno | ||
| 475 | + } | ||
| 476 | + | ||
| 477 | + if hasattr(record, 'user_id'): | ||
| 478 | + log_entry['user_id'] = record.user_id | ||
| 479 | + | ||
| 480 | + if hasattr(record, 'request_id'): | ||
| 481 | + log_entry['request_id'] = record.request_id | ||
| 482 | + | ||
| 483 | + return json.dumps(log_entry, ensure_ascii=False) | ||
| 484 | + | ||
| 485 | +def setup_structured_logging(): | ||
| 486 | + formatter = StructuredFormatter() | ||
| 487 | + handler = logging.StreamHandler() | ||
| 488 | + handler.setFormatter(formatter) | ||
| 489 | + | ||
| 490 | + logger = logging.getLogger('eman_one') | ||
| 491 | + logger.addHandler(handler) | ||
| 492 | + logger.setLevel(logging.INFO) | ||
| 493 | + | ||
| 494 | + return logger | ||
| 495 | +``` | ||
| 496 | + | ||
| 497 | +### 6.2 性能指标收集 | ||
| 498 | + | ||
| 499 | +**指标收集器** | ||
| 500 | +```python | ||
| 501 | +# utils/metrics.py | ||
| 502 | +import time | ||
| 503 | +from collections import defaultdict | ||
| 504 | +from contextlib import contextmanager | ||
| 505 | +from typing import Dict, Any | ||
| 506 | + | ||
| 507 | +class MetricsCollector: | ||
| 508 | + def __init__(self): | ||
| 509 | + self.counters = defaultdict(int) | ||
| 510 | + self.timers = defaultdict(list) | ||
| 511 | + self.gauges = defaultdict(float) | ||
| 512 | + | ||
| 513 | + def increment(self, name: str, value: int = 1): | ||
| 514 | + """计数器递增""" | ||
| 515 | + self.counters[name] += value | ||
| 516 | + | ||
| 517 | + def set_gauge(self, name: str, value: float): | ||
| 518 | + """设置仪表值""" | ||
| 519 | + self.gauges[name] = value | ||
| 520 | + | ||
| 521 | + @contextmanager | ||
| 522 | + def timer(self, name: str): | ||
| 523 | + """计时器上下文管理器""" | ||
| 524 | + start_time = time.time() | ||
| 525 | + try: | ||
| 526 | + yield | ||
| 527 | + finally: | ||
| 528 | + duration = time.time() - start_time | ||
| 529 | + self.timers[name].append(duration) | ||
| 530 | + | ||
| 531 | + def get_metrics(self) -> Dict[str, Any]: | ||
| 532 | + """获取所有指标""" | ||
| 533 | + return { | ||
| 534 | + 'counters': dict(self.counters), | ||
| 535 | + 'timers': { | ||
| 536 | + name: { | ||
| 537 | + 'count': len(times), | ||
| 538 | + 'avg': sum(times) / len(times) if times else 0, | ||
| 539 | + 'min': min(times) if times else 0, | ||
| 540 | + 'max': max(times) if times else 0 | ||
| 541 | + } | ||
| 542 | + for name, times in self.timers.items() | ||
| 543 | + }, | ||
| 544 | + 'gauges': dict(self.gauges) | ||
| 545 | + } | ||
| 546 | + | ||
| 547 | +# 全局指标收集器 | ||
| 548 | +metrics = MetricsCollector() | ||
| 549 | +``` | ||
| 550 | + | ||
| 551 | +## 7. 安全性增强 | ||
| 552 | + | ||
| 553 | +### 7.1 配置安全 | ||
| 554 | + | ||
| 555 | +**敏感信息管理** | ||
| 556 | +```python | ||
| 557 | +# utils/security.py | ||
| 558 | +import os | ||
| 559 | +from cryptography.fernet import Fernet | ||
| 560 | +from typing import Optional | ||
| 561 | + | ||
| 562 | +class SecureConfig: | ||
| 563 | + def __init__(self): | ||
| 564 | + self.key = self._get_or_create_key() | ||
| 565 | + self.cipher = Fernet(self.key) | ||
| 566 | + | ||
| 567 | + def _get_or_create_key(self) -> bytes: | ||
| 568 | + key_file = '.encryption_key' | ||
| 569 | + if os.path.exists(key_file): | ||
| 570 | + with open(key_file, 'rb') as f: | ||
| 571 | + return f.read() | ||
| 572 | + else: | ||
| 573 | + key = Fernet.generate_key() | ||
| 574 | + with open(key_file, 'wb') as f: | ||
| 575 | + f.write(key) | ||
| 576 | + return key | ||
| 577 | + | ||
| 578 | + def encrypt_value(self, value: str) -> str: | ||
| 579 | + """加密敏感值""" | ||
| 580 | + return self.cipher.encrypt(value.encode()).decode() | ||
| 581 | + | ||
| 582 | + def decrypt_value(self, encrypted_value: str) -> str: | ||
| 583 | + """解密敏感值""" | ||
| 584 | + return self.cipher.decrypt(encrypted_value.encode()).decode() | ||
| 585 | + | ||
| 586 | + def get_env_or_encrypted(self, key: str, encrypted_fallback: Optional[str] = None) -> Optional[str]: | ||
| 587 | + """优先从环境变量获取,否则使用加密值""" | ||
| 588 | + env_value = os.getenv(key) | ||
| 589 | + if env_value: | ||
| 590 | + return env_value | ||
| 591 | + | ||
| 592 | + if encrypted_fallback: | ||
| 593 | + return self.decrypt_value(encrypted_fallback) | ||
| 594 | + | ||
| 595 | + return None | ||
| 596 | +``` | ||
| 597 | + | ||
| 598 | +### 7.2 输入验证 | ||
| 599 | + | ||
| 600 | +**数据验证器** | ||
| 601 | +```python | ||
| 602 | +# utils/validators.py | ||
| 603 | +import re | ||
| 604 | +from typing import Any, List, Optional | ||
| 605 | +from pydantic import BaseModel, validator | ||
| 606 | + | ||
| 607 | +class AudioConfig(BaseModel): | ||
| 608 | + sample_rate: int | ||
| 609 | + channels: int | ||
| 610 | + chunk_size: int | ||
| 611 | + | ||
| 612 | + @validator('sample_rate') | ||
| 613 | + def validate_sample_rate(cls, v): | ||
| 614 | + if v not in [8000, 16000, 22050, 44100, 48000]: | ||
| 615 | + raise ValueError('Invalid sample rate') | ||
| 616 | + return v | ||
| 617 | + | ||
| 618 | + @validator('channels') | ||
| 619 | + def validate_channels(cls, v): | ||
| 620 | + if v not in [1, 2]: | ||
| 621 | + raise ValueError('Channels must be 1 or 2') | ||
| 622 | + return v | ||
| 623 | + | ||
| 624 | +class ASRConfig(BaseModel): | ||
| 625 | + host: str | ||
| 626 | + port: int | ||
| 627 | + timeout: int | ||
| 628 | + | ||
| 629 | + @validator('host') | ||
| 630 | + def validate_host(cls, v): | ||
| 631 | + # 简单的主机名/IP验证 | ||
| 632 | + if not re.match(r'^[a-zA-Z0-9.-]+$', v): | ||
| 633 | + raise ValueError('Invalid host format') | ||
| 634 | + return v | ||
| 635 | + | ||
| 636 | + @validator('port') | ||
| 637 | + def validate_port(cls, v): | ||
| 638 | + if not 1 <= v <= 65535: | ||
| 639 | + raise ValueError('Port must be between 1 and 65535') | ||
| 640 | + return v | ||
| 641 | +``` | ||
| 642 | + | ||
| 643 | +## 8. 实施计划 | ||
| 644 | + | ||
| 645 | +### 8.1 优先级分级 | ||
| 646 | + | ||
| 647 | +**高优先级 (立即实施)** | ||
| 648 | +1. ✅ 依赖包管理 (已完成) | ||
| 649 | +2. 🔄 代码格式化工具配置 | ||
| 650 | +3. 🔄 基础测试框架搭建 | ||
| 651 | +4. 🔄 错误处理标准化 | ||
| 652 | + | ||
| 653 | +**中优先级 (1-2周内)** | ||
| 654 | +1. 接口抽象设计 | ||
| 655 | +2. 结构化日志实现 | ||
| 656 | +3. 性能监控集成 | ||
| 657 | +4. 安全性增强 | ||
| 658 | + | ||
| 659 | +**低优先级 (1个月内)** | ||
| 660 | +1. 完整CI/CD流水线 | ||
| 661 | +2. 自动化文档生成 | ||
| 662 | +3. 高级监控仪表盘 | ||
| 663 | +4. 性能基准测试 | ||
| 664 | + | ||
| 665 | +### 8.2 成功指标 | ||
| 666 | + | ||
| 667 | +**代码质量指标** | ||
| 668 | +- 代码覆盖率 > 80% | ||
| 669 | +- 代码重复率 < 5% | ||
| 670 | +- 技术债务评级 A | ||
| 671 | +- 安全漏洞数量 = 0 | ||
| 672 | + | ||
| 673 | +**可维护性指标** | ||
| 674 | +- 平均修复时间 < 2小时 | ||
| 675 | +- 新功能开发周期 < 1周 | ||
| 676 | +- 文档覆盖率 > 90% | ||
| 677 | +- 团队满意度 > 4.5/5 | ||
| 678 | + | ||
| 679 | +## 总结 | ||
| 680 | + | ||
| 681 | +本指南提供了全面的代码质量和可维护性增强方案,通过系统性的改进措施,将显著提升 `eman_one` 项目的代码质量、开发效率和长期可维护性。建议按照优先级逐步实施,并持续监控改进效果。 |
doc/dev/funasr_optimization_analysis.md
0 → 100644
| 1 | +# FunASR语音识别优化分析报告 | ||
| 2 | + | ||
| 3 | +**AIfeng/2025-07-01 16:51:01** | ||
| 4 | + | ||
| 5 | +## 概述 | ||
| 6 | + | ||
| 7 | +基于参考项目Fay-main的FunASR WebSocket架构分析,对当前eman_one项目的语音识别方案进行技术对比和优化建议。重点分析前端录音与服务端录音的技术差异,提出体验改进方案。 | ||
| 8 | + | ||
| 9 | +## 1. 技术架构对比分析 | ||
| 10 | + | ||
| 11 | +### 1.1 当前eman_one项目架构 | ||
| 12 | + | ||
| 13 | +**前端录音方案**: | ||
| 14 | +- **录音方式**:浏览器MediaRecorder API | ||
| 15 | +- **数据流**:前端采集 → WebM/Opus编码 → Base64传输 → 服务端解码 | ||
| 16 | +- **传输协议**:WebSocket (10197端口) | ||
| 17 | +- **处理模式**:分段录音 + 批量传输 | ||
| 18 | + | ||
| 19 | +**技术特点**: | ||
| 20 | +```javascript | ||
| 21 | +// 当前实现方式 | ||
| 22 | +mediaRecorder = new MediaRecorder(audioStream, { | ||
| 23 | + mimeType: 'audio/webm;codecs=opus' | ||
| 24 | +}); | ||
| 25 | +mediaRecorder.start(1000); // 每秒收集一次数据 | ||
| 26 | + | ||
| 27 | +// 数据处理 | ||
| 28 | +mediaRecorder.ondataavailable = function(event) { | ||
| 29 | + if (event.data.size > 0) { | ||
| 30 | + audioChunks.push(event.data); | ||
| 31 | + } | ||
| 32 | +}; | ||
| 33 | +``` | ||
| 34 | + | ||
| 35 | +### 1.2 参考项目Fay架构 | ||
| 36 | + | ||
| 37 | +**服务端录音方案**: | ||
| 38 | +- **录音方式**:Python pyaudio直接采集 | ||
| 39 | +- **数据流**:服务端采集 → PCM原始数据 → 实时流式传输 | ||
| 40 | +- **处理模式**:连续录音 + 实时VAD断句 | ||
| 41 | +- **音频参数**:16kHz, 单声道, 16bit PCM | ||
| 42 | + | ||
| 43 | +**技术特点**: | ||
| 44 | +```python | ||
| 45 | +# Fay实现方式 | ||
| 46 | +stream = pyaudio.PyAudio().open( | ||
| 47 | + format=pyaudio.paInt16, | ||
| 48 | + channels=1, | ||
| 49 | + rate=16000, | ||
| 50 | + input=True, | ||
| 51 | + frames_per_buffer=1024 | ||
| 52 | +) | ||
| 53 | + | ||
| 54 | +# 实时音频处理 | ||
| 55 | +while recording: | ||
| 56 | + audio_data = stream.read(1024) | ||
| 57 | + # 实时VAD检测和传输 | ||
| 58 | +``` | ||
| 59 | + | ||
| 60 | +## 2. 关键技术差异分析 | ||
| 61 | + | ||
| 62 | +### 2.1 音频采集方式 | ||
| 63 | + | ||
| 64 | +| 对比维度 | eman_one (前端录音) | Fay (服务端录音) | | ||
| 65 | +|---------|-------------------|------------------| | ||
| 66 | +| **延迟性** | 较高 (1秒批量) | 极低 (实时流) | | ||
| 67 | +| **音质** | 有损压缩 (Opus) | 无损 (PCM) | | ||
| 68 | +| **兼容性** | 浏览器依赖 | 系统级控制 | | ||
| 69 | +| **资源占用** | 客户端CPU | 服务端CPU | | ||
| 70 | +| **网络传输** | 压缩后较小 | 原始数据较大 | | ||
| 71 | + | ||
| 72 | +### 2.2 VAD语音活动检测 | ||
| 73 | + | ||
| 74 | +**eman_one现状**: | ||
| 75 | +- 前端简单音量检测 | ||
| 76 | +- 静音超时触发断句 | ||
| 77 | +- 缺乏智能语音边界检测 | ||
| 78 | + | ||
| 79 | +**Fay优势**: | ||
| 80 | +- 服务端专业VAD算法 | ||
| 81 | +- 动态阈值自适应 | ||
| 82 | +- 历史音频缓存机制 | ||
| 83 | +- 环境噪音适应 | ||
| 84 | + | ||
| 85 | +### 2.3 实时性对比 | ||
| 86 | + | ||
| 87 | +**延迟分析**: | ||
| 88 | +``` | ||
| 89 | +eman_one延迟链路: | ||
| 90 | +录音(1s) → 编码 → 传输 → 解码 → 识别 ≈ 1.2-1.5秒 | ||
| 91 | + | ||
| 92 | +Fay延迟链路: | ||
| 93 | +录音(20ms) → 传输 → 识别 ≈ 100-200毫秒 | ||
| 94 | +``` | ||
| 95 | + | ||
| 96 | +## 3. 体验问题识别 | ||
| 97 | + | ||
| 98 | +### 3.1 当前eman_one存在的问题 | ||
| 99 | + | ||
| 100 | +1. **响应延迟高** | ||
| 101 | + - 1秒批量传输导致明显延迟 | ||
| 102 | + - 用户体验不够流畅 | ||
| 103 | + | ||
| 104 | +2. **断句不准确** | ||
| 105 | + - 简单音量阈值容易误判 | ||
| 106 | + - 无法处理复杂语音场景 | ||
| 107 | + | ||
| 108 | +3. **音质损失** | ||
| 109 | + - Opus压缩影响识别准确率 | ||
| 110 | + - Base64传输增加数据量 | ||
| 111 | + | ||
| 112 | +4. **打断处理不完善** | ||
| 113 | + - 缺乏智能打断机制 | ||
| 114 | + - 回音消除不够完善 | ||
| 115 | + | ||
| 116 | +### 3.2 技术风险评估 | ||
| 117 | + | ||
| 118 | +**前端录音方案风险**: | ||
| 119 | +- 浏览器兼容性问题 | ||
| 120 | +- 移动端性能限制 | ||
| 121 | +- 网络不稳定影响 | ||
| 122 | +- 用户权限管理复杂 | ||
| 123 | + | ||
| 124 | +**服务端录音方案风险**: | ||
| 125 | +- 需要本地部署 | ||
| 126 | +- 硬件设备依赖 | ||
| 127 | +- 多用户并发处理 | ||
| 128 | +- 系统权限要求 | ||
| 129 | + | ||
| 130 | +## 4. 优化改进方案 | ||
| 131 | + | ||
| 132 | +### 4.1 短期优化(保持前端录音) | ||
| 133 | + | ||
| 134 | +#### 4.1.1 降低传输延迟 | ||
| 135 | +```javascript | ||
| 136 | +// 优化:减少批量间隔 | ||
| 137 | +mediaRecorder.start(100); // 改为100ms | ||
| 138 | + | ||
| 139 | +// 实时传输优化 | ||
| 140 | +mediaRecorder.ondataavailable = function(event) { | ||
| 141 | + if (event.data.size > 0) { | ||
| 142 | + // 立即发送,不等待批量 | ||
| 143 | + sendAudioToASR(event.data); | ||
| 144 | + } | ||
| 145 | +}; | ||
| 146 | +``` | ||
| 147 | + | ||
| 148 | +#### 4.1.2 改进VAD算法 | ||
| 149 | +```javascript | ||
| 150 | +// 增强的VAD检测 | ||
| 151 | +function enhancedVAD(audioData) { | ||
| 152 | + // 1. 音量检测 | ||
| 153 | + const volume = calculateRMS(audioData); | ||
| 154 | + | ||
| 155 | + // 2. 频谱分析 | ||
| 156 | + const spectrum = analyzeSpectrum(audioData); | ||
| 157 | + | ||
| 158 | + // 3. 动态阈值 | ||
| 159 | + updateDynamicThreshold(volume); | ||
| 160 | + | ||
| 161 | + // 4. 语音边界检测 | ||
| 162 | + return detectSpeechBoundary(volume, spectrum); | ||
| 163 | +} | ||
| 164 | +``` | ||
| 165 | + | ||
| 166 | +#### 4.1.3 音频质量优化 | ||
| 167 | +```javascript | ||
| 168 | +// 使用更高质量的编码参数 | ||
| 169 | +mediaRecorder = new MediaRecorder(audioStream, { | ||
| 170 | + mimeType: 'audio/webm;codecs=opus', | ||
| 171 | + audioBitsPerSecond: 128000 // 提高比特率 | ||
| 172 | +}); | ||
| 173 | + | ||
| 174 | +// 音频预处理 | ||
| 175 | +function preprocessAudio(audioData) { | ||
| 176 | + // 降噪处理 | ||
| 177 | + // 音量归一化 | ||
| 178 | + // 格式标准化 | ||
| 179 | + return processedAudio; | ||
| 180 | +} | ||
| 181 | +``` | ||
| 182 | + | ||
| 183 | +### 4.2 中期优化(混合方案) | ||
| 184 | + | ||
| 185 | +#### 4.2.1 双模式支持 | ||
| 186 | +```python | ||
| 187 | +# 服务端支持多种输入模式 | ||
| 188 | +class HybridASRServer: | ||
| 189 | + def __init__(self): | ||
| 190 | + self.web_mode = True # 前端录音模式 | ||
| 191 | + self.local_mode = False # 服务端录音模式 | ||
| 192 | + | ||
| 193 | + async def handle_web_audio(self, websocket, audio_data): | ||
| 194 | + """处理前端传输的音频""" | ||
| 195 | + # 解码和预处理 | ||
| 196 | + processed_audio = self.preprocess_web_audio(audio_data) | ||
| 197 | + return await self.recognize(processed_audio) | ||
| 198 | + | ||
| 199 | + async def handle_local_audio(self): | ||
| 200 | + """处理服务端录音""" | ||
| 201 | + # 直接录音和处理 | ||
| 202 | + audio_stream = self.capture_local_audio() | ||
| 203 | + return await self.recognize(audio_stream) | ||
| 204 | +``` | ||
| 205 | + | ||
| 206 | +#### 4.2.2 智能模式切换 | ||
| 207 | +```javascript | ||
| 208 | +// 根据环境自动选择最优模式 | ||
| 209 | +function selectOptimalMode() { | ||
| 210 | + const factors = { | ||
| 211 | + networkLatency: measureNetworkLatency(), | ||
| 212 | + devicePerformance: assessDevicePerformance(), | ||
| 213 | + audioQuality: testAudioQuality() | ||
| 214 | + }; | ||
| 215 | + | ||
| 216 | + return factors.networkLatency < 50 ? 'web' : 'local'; | ||
| 217 | +} | ||
| 218 | +``` | ||
| 219 | + | ||
| 220 | +### 4.3 长期优化(全面升级) | ||
| 221 | + | ||
| 222 | +#### 4.3.1 采用Fay架构模式 | ||
| 223 | +```python | ||
| 224 | +# 参考Fay实现服务端录音 | ||
| 225 | +class FayStyleASR: | ||
| 226 | + def __init__(self): | ||
| 227 | + self.recorder = AudioRecorder( | ||
| 228 | + sample_rate=16000, | ||
| 229 | + channels=1, | ||
| 230 | + chunk_size=1024 | ||
| 231 | + ) | ||
| 232 | + self.vad = VoiceActivityDetector() | ||
| 233 | + self.asr_client = FunASRClient() | ||
| 234 | + | ||
| 235 | + async def continuous_recording(self): | ||
| 236 | + """连续录音处理""" | ||
| 237 | + while self.recording: | ||
| 238 | + audio_chunk = await self.recorder.read_chunk() | ||
| 239 | + | ||
| 240 | + # VAD检测 | ||
| 241 | + if self.vad.is_speech(audio_chunk): | ||
| 242 | + await self.asr_client.send_audio(audio_chunk) | ||
| 243 | + | ||
| 244 | + # 断句检测 | ||
| 245 | + if self.vad.is_sentence_end(): | ||
| 246 | + await self.process_sentence_end() | ||
| 247 | +``` | ||
| 248 | + | ||
| 249 | +#### 4.3.2 完整的VAD系统 | ||
| 250 | +```python | ||
| 251 | +# 专业VAD实现 | ||
| 252 | +class AdvancedVAD: | ||
| 253 | + def __init__(self): | ||
| 254 | + self.volume_threshold = 0.01 | ||
| 255 | + self.silence_duration = 0 | ||
| 256 | + self.speech_duration = 0 | ||
| 257 | + self.history_buffer = [] | ||
| 258 | + | ||
| 259 | + def detect(self, audio_chunk): | ||
| 260 | + # 1. 音量计算 | ||
| 261 | + volume = self.calculate_volume(audio_chunk) | ||
| 262 | + | ||
| 263 | + # 2. 动态阈值调整 | ||
| 264 | + self.update_threshold(volume) | ||
| 265 | + | ||
| 266 | + # 3. 语音活动判断 | ||
| 267 | + is_speech = volume > self.volume_threshold | ||
| 268 | + | ||
| 269 | + # 4. 状态机处理 | ||
| 270 | + return self.state_machine(is_speech) | ||
| 271 | +``` | ||
| 272 | + | ||
| 273 | +## 5. 实施建议 | ||
| 274 | + | ||
| 275 | +### 5.1 优先级排序 | ||
| 276 | + | ||
| 277 | +**P0 (立即实施)**: | ||
| 278 | +1. 降低MediaRecorder传输间隔至100ms | ||
| 279 | +2. 优化音频编码参数 | ||
| 280 | +3. 改进前端VAD算法 | ||
| 281 | + | ||
| 282 | +**P1 (1-2周)**: | ||
| 283 | +1. 实现音频预处理管道 | ||
| 284 | +2. 添加动态阈值调整 | ||
| 285 | +3. 完善错误处理和重连机制 | ||
| 286 | + | ||
| 287 | +**P2 (1个月)**: | ||
| 288 | +1. 开发混合录音模式 | ||
| 289 | +2. 实现智能模式切换 | ||
| 290 | +3. 集成专业VAD算法 | ||
| 291 | + | ||
| 292 | +**P3 (长期规划)**: | ||
| 293 | +1. 完全迁移到服务端录音 | ||
| 294 | +2. 实现Fay级别的实时性 | ||
| 295 | +3. 支持多用户并发处理 | ||
| 296 | + | ||
| 297 | +### 5.2 技术选型建议 | ||
| 298 | + | ||
| 299 | +**保持前端录音的情况下**: | ||
| 300 | +- 使用WebRTC AudioWorklet替代MediaRecorder | ||
| 301 | +- 实现客户端音频预处理 | ||
| 302 | +- 采用WebSocket流式传输 | ||
| 303 | + | ||
| 304 | +**迁移到服务端录音**: | ||
| 305 | +- 采用pyaudio + asyncio架构 | ||
| 306 | +- 集成专业VAD库(如webrtcvad) | ||
| 307 | +- 实现多用户音频隔离 | ||
| 308 | + | ||
| 309 | +### 5.3 性能目标 | ||
| 310 | + | ||
| 311 | +| 指标 | 当前状态 | 优化目标 | | ||
| 312 | +|------|---------|----------| | ||
| 313 | +| **端到端延迟** | 1.2-1.5秒 | <300ms | | ||
| 314 | +| **识别准确率** | 85% | >95% | | ||
| 315 | +| **断句准确率** | 70% | >90% | | ||
| 316 | +| **并发用户** | 10 | 100+ | | ||
| 317 | + | ||
| 318 | +## 6. 风险评估与缓解 | ||
| 319 | + | ||
| 320 | +### 6.1 技术风险 | ||
| 321 | + | ||
| 322 | +**风险1:服务端录音权限问题** | ||
| 323 | +- 缓解:提供详细的部署文档和权限配置指南 | ||
| 324 | +- 备选:保持前端录音作为fallback方案 | ||
| 325 | + | ||
| 326 | +**风险2:多用户并发冲突** | ||
| 327 | +- 缓解:实现音频设备虚拟化和隔离 | ||
| 328 | +- 备选:限制并发数量或使用队列机制 | ||
| 329 | + | ||
| 330 | +**风险3:系统兼容性问题** | ||
| 331 | +- 缓解:支持多种音频驱动和设备 | ||
| 332 | +- 备选:提供Docker容器化部署 | ||
| 333 | + | ||
| 334 | +### 6.2 业务风险 | ||
| 335 | + | ||
| 336 | +**风险1:用户体验中断** | ||
| 337 | +- 缓解:渐进式迁移,保持向后兼容 | ||
| 338 | +- 监控:实时监控关键指标 | ||
| 339 | + | ||
| 340 | +**风险2:部署复杂度增加** | ||
| 341 | +- 缓解:提供一键部署脚本 | ||
| 342 | +- 文档:完善的运维手册 | ||
| 343 | + | ||
| 344 | +## 7. 结论与建议 | ||
| 345 | + | ||
| 346 | +### 7.1 核心结论 | ||
| 347 | + | ||
| 348 | +1. **技术差异显著**:Fay的服务端录音方案在实时性和音质方面明显优于当前前端录音方案 | ||
| 349 | + | ||
| 350 | +2. **体验提升空间大**:通过优化可将延迟从1.5秒降低到300ms以内 | ||
| 351 | + | ||
| 352 | +3. **实施可行性高**:可采用渐进式优化策略,降低迁移风险 | ||
| 353 | + | ||
| 354 | +### 7.2 推荐方案 | ||
| 355 | + | ||
| 356 | +**阶段一(立即实施)**: | ||
| 357 | +- 优化现有前端录音方案 | ||
| 358 | +- 实现基础的实时传输 | ||
| 359 | +- 改进VAD算法 | ||
| 360 | + | ||
| 361 | +**阶段二(中期目标)**: | ||
| 362 | +- 开发混合录音模式 | ||
| 363 | +- 支持智能模式切换 | ||
| 364 | +- 集成专业音频处理 | ||
| 365 | + | ||
| 366 | +**阶段三(长期愿景)**: | ||
| 367 | +- 完全采用服务端录音 | ||
| 368 | +- 达到Fay级别的用户体验 | ||
| 369 | +- 支持大规模并发 | ||
| 370 | + | ||
| 371 | +### 7.3 关键成功因素 | ||
| 372 | + | ||
| 373 | +1. **渐进式迁移**:避免一次性大改造带来的风险 | ||
| 374 | +2. **性能监控**:建立完善的指标监控体系 | ||
| 375 | +3. **用户反馈**:及时收集和响应用户体验反馈 | ||
| 376 | +4. **技术储备**:提前准备相关技术栈和人员培训 | ||
| 377 | + | ||
| 378 | +通过系统性的优化改进,eman_one项目完全可以达到甚至超越Fay项目的语音识别体验水平。 |
doc/dev/funasr_protocol_compatibility_fix.md
0 → 100644
| 1 | +# AIfeng/2025-07-17 17:04:42 | ||
| 2 | +# FunASR协议兼容性修复方案 | ||
| 3 | + | ||
| 4 | +## 问题分析 | ||
| 5 | + | ||
| 6 | +### 根本原因 | ||
| 7 | +- **协议不匹配**: FunASRSync客户端发送的分块协议与ASR_server.py期望的格式不兼容 | ||
| 8 | +- **服务端限制**: ASR_server.py只处理包含`url`或`audio_data`字段的消息 | ||
| 9 | +- **分块协议**: FunASRSync使用`audio_start`、`audio_chunk`、`audio_end`等新格式 | ||
| 10 | + | ||
| 11 | +### 现象确认 | ||
| 12 | +- 小文件正常:使用简单模式,发送标准`audio_data`格式 | ||
| 13 | +- 大文件失败:使用分块模式,发送不兼容的协议格式 | ||
| 14 | +- 服务端无响应:ASR_server.py无法识别分块协议消息 | ||
| 15 | + | ||
| 16 | +## 解决方案 | ||
| 17 | + | ||
| 18 | +### 方案一:服务端协议扩展(推荐) | ||
| 19 | + | ||
| 20 | +#### 优势 | ||
| 21 | +- 保持客户端分块优化不变 | ||
| 22 | +- 服务端支持更灵活的协议 | ||
| 23 | +- 向后兼容现有格式 | ||
| 24 | + | ||
| 25 | +#### 实施步骤 | ||
| 26 | +1. **扩展消息处理**: 在`ws_serve`函数中添加分块协议支持 | ||
| 27 | +2. **分块重组**: 实现音频分块的接收和重组逻辑 | ||
| 28 | +3. **状态管理**: 维护每个连接的分块接收状态 | ||
| 29 | + | ||
| 30 | +### 方案二:客户端协议回退(备选) | ||
| 31 | + | ||
| 32 | +#### 优势 | ||
| 33 | +- 无需修改服务端 | ||
| 34 | +- 实施简单快速 | ||
| 35 | + | ||
| 36 | +#### 劣势 | ||
| 37 | +- 失去分块传输优势 | ||
| 38 | +- 大文件仍可能超时 | ||
| 39 | + | ||
| 40 | +## 推荐实施:服务端协议扩展 | ||
| 41 | + | ||
| 42 | +### 核心修改点 | ||
| 43 | + | ||
| 44 | +#### 1. 消息路由扩展 | ||
| 45 | +```python | ||
| 46 | +# 在ws_serve函数中添加 | ||
| 47 | +if 'type' in data: | ||
| 48 | + # 处理分块协议 | ||
| 49 | + await handle_chunked_protocol(websocket, data) | ||
| 50 | +else: | ||
| 51 | + # 现有逻辑保持不变 | ||
| 52 | + if 'url' in data: | ||
| 53 | + await task_queue.put((websocket, data['url'], 'url')) | ||
| 54 | + elif 'audio_data' in data: | ||
| 55 | + await task_queue.put((websocket, data, 'audio_data')) | ||
| 56 | +``` | ||
| 57 | + | ||
| 58 | +#### 2. 分块状态管理 | ||
| 59 | +```python | ||
| 60 | +# 全局状态管理 | ||
| 61 | +chunk_sessions = {} # {user_id: {filename, chunks, total_chunks, ...}} | ||
| 62 | +``` | ||
| 63 | + | ||
| 64 | +#### 3. 分块处理逻辑 | ||
| 65 | +- **audio_start**: 初始化接收会话 | ||
| 66 | +- **audio_chunk**: 累积音频分块 | ||
| 67 | +- **audio_end**: 完成重组并处理 | ||
| 68 | + | ||
| 69 | +### 性能优化 | ||
| 70 | + | ||
| 71 | +#### 内存管理 | ||
| 72 | +- 使用临时文件而非内存缓存大文件 | ||
| 73 | +- 及时清理完成的会话 | ||
| 74 | +- 设置会话超时机制 | ||
| 75 | + | ||
| 76 | +#### 错误处理 | ||
| 77 | +- 分块丢失检测 | ||
| 78 | +- 会话超时清理 | ||
| 79 | +- 异常状态恢复 | ||
| 80 | + | ||
| 81 | +## 实施计划 | ||
| 82 | + | ||
| 83 | +### Phase 1: 基础协议支持(2小时) | ||
| 84 | +1. 添加分块消息路由 | ||
| 85 | +2. 实现基础分块重组 | ||
| 86 | +3. 测试小规模分块 | ||
| 87 | + | ||
| 88 | +### Phase 2: 稳定性增强(4小时) | ||
| 89 | +1. 完善错误处理 | ||
| 90 | +2. 添加状态管理 | ||
| 91 | +3. 实施超时机制 | ||
| 92 | + | ||
| 93 | +### Phase 3: 性能优化(1天) | ||
| 94 | +1. 内存优化 | ||
| 95 | +2. 并发处理 | ||
| 96 | +3. 监控指标 | ||
| 97 | + | ||
| 98 | +## 测试验证 | ||
| 99 | + | ||
| 100 | +### 测试用例 | ||
| 101 | +1. **小文件兼容性**: 确保现有简单模式正常 | ||
| 102 | +2. **大文件分块**: 验证2MB、5MB、10MB文件处理 | ||
| 103 | +3. **并发处理**: 多客户端同时发送 | ||
| 104 | +4. **异常恢复**: 网络中断、分块丢失等场景 | ||
| 105 | + | ||
| 106 | +### 成功指标 | ||
| 107 | +- 大文件传输成功率 >95% | ||
| 108 | +- 服务端内存使用稳定 | ||
| 109 | +- 响应时间合理(<文件大小/1MB * 2秒) | ||
| 110 | +- 向后兼容性100% | ||
| 111 | + | ||
| 112 | +## 风险评估 | ||
| 113 | + | ||
| 114 | +### 低风险 | ||
| 115 | +- 向后兼容:现有协议完全保留 | ||
| 116 | +- 渐进式:可分阶段实施 | ||
| 117 | +- 可回滚:出问题可快速恢复 | ||
| 118 | + | ||
| 119 | +### 注意事项 | ||
| 120 | +- 内存使用监控 | ||
| 121 | +- 并发连接限制 | ||
| 122 | +- 分块大小合理性 | ||
| 123 | +- 超时参数调优 | ||
| 124 | + | ||
| 125 | +## 监控指标 | ||
| 126 | + | ||
| 127 | +### 关键指标 | ||
| 128 | +- 分块接收成功率 | ||
| 129 | +- 音频重组完整性 | ||
| 130 | +- 服务端内存峰值 | ||
| 131 | +- 平均处理延迟 | ||
| 132 | + | ||
| 133 | +### 告警阈值 | ||
| 134 | +- 分块丢失率 >1% | ||
| 135 | +- 内存使用 >1GB | ||
| 136 | +- 处理延迟 >30秒 | ||
| 137 | +- 会话超时率 >5% |
doc/dev/funasr_sync_optimization.md
0 → 100644
| 1 | +# AIfeng/2025-07-17 16:38:52 | ||
| 2 | + | ||
| 3 | +# FunASRSync大文件处理优化方案 | ||
| 4 | + | ||
| 5 | +## 当前状态确认 | ||
| 6 | + | ||
| 7 | +### 启用方案 | ||
| 8 | +✅ **当前项目启用的是 `funasr_asr_sync.py` 同步版本** | ||
| 9 | + | ||
| 10 | +- **主程序**: `app.py` 第415行导入 `from funasr_asr_sync import FunASRSync` | ||
| 11 | +- **实例化**: `create_asr_connection()` 函数中创建 `FunASRSync(username)` 实例 | ||
| 12 | +- **调用路径**: `humanaudio()` → `handle_funasr()` → `ensure_asr_connection()` → `create_asr_connection()` | ||
| 13 | + | ||
| 14 | +### 当前实现分析 | ||
| 15 | + | ||
| 16 | +**连接配置**: | ||
| 17 | +- WebSocket连接超时: 5秒 (第195-219行) | ||
| 18 | +- 发送间隔: 0.04秒 (第106-120行) | ||
| 19 | +- 重连机制: 指数退避,初始延迟1秒 | ||
| 20 | + | ||
| 21 | +**大文件处理机制**: | ||
| 22 | +- 音频数据通过Base64编码一次性发送 (第160-190行) | ||
| 23 | +- 无分块处理机制 | ||
| 24 | +- 无流控制 | ||
| 25 | +- 无超时重试 | ||
| 26 | + | ||
| 27 | +## 问题分析 | ||
| 28 | + | ||
| 29 | +### 1. 大文件超时根因 | ||
| 30 | + | ||
| 31 | +**主要问题**: | ||
| 32 | +- **一次性发送**: 大文件Base64编码后体积增大33%,单次发送易超时 | ||
| 33 | +- **无分块机制**: 缺乏音频数据分片处理 | ||
| 34 | +- **连接超时过短**: 5秒超时对大文件处理不足 | ||
| 35 | +- **无进度反馈**: 客户端无法感知处理进度 | ||
| 36 | + | ||
| 37 | +**技术瓶颈**: | ||
| 38 | +- Base64编码内存占用高 | ||
| 39 | +- WebSocket单帧大小限制 | ||
| 40 | +- 网络传输稳定性依赖 | ||
| 41 | + | ||
| 42 | +### 2. 性能影响评估 | ||
| 43 | + | ||
| 44 | +| 文件大小 | Base64后大小 | 预估传输时间 | 超时风险 | | ||
| 45 | +|---------|-------------|-------------|----------| | ||
| 46 | +| 1MB | 1.33MB | 1-2秒 | 低 | | ||
| 47 | +| 5MB | 6.65MB | 5-10秒 | 中 | | ||
| 48 | +| 10MB | 13.3MB | 10-20秒 | 高 | | ||
| 49 | +| 20MB+ | 26.6MB+ | 20秒+ | 极高 | | ||
| 50 | + | ||
| 51 | +## 优化方案 | ||
| 52 | + | ||
| 53 | +### 阶段一:立即可行优化 (1-2天) | ||
| 54 | + | ||
| 55 | +#### 1.1 超时参数调优 | ||
| 56 | +```python | ||
| 57 | +# 配置优化建议 | ||
| 58 | +ASR_CONNECTION_TIMEOUT = 30 # 连接超时从5秒增加到30秒 | ||
| 59 | +ASR_SEND_TIMEOUT = 60 # 新增发送超时配置 | ||
| 60 | +ASR_CHUNK_SIZE = 1024 * 512 # 512KB分块大小 | ||
| 61 | +ASR_SEND_INTERVAL = 0.02 # 发送间隔从0.04秒减少到0.02秒 | ||
| 62 | +``` | ||
| 63 | + | ||
| 64 | +#### 1.2 分块发送机制 | ||
| 65 | +```python | ||
| 66 | +def send_audio_data_chunked(self, audio_bytes, filename="audio.wav", chunk_size=512*1024): | ||
| 67 | + """分块发送音频数据""" | ||
| 68 | + import base64 | ||
| 69 | + import math | ||
| 70 | + | ||
| 71 | + try: | ||
| 72 | + # 数据预处理 | ||
| 73 | + if hasattr(audio_bytes, 'tobytes'): | ||
| 74 | + audio_bytes = audio_bytes.tobytes() | ||
| 75 | + elif isinstance(audio_bytes, memoryview): | ||
| 76 | + audio_bytes = bytes(audio_bytes) | ||
| 77 | + | ||
| 78 | + total_size = len(audio_bytes) | ||
| 79 | + total_chunks = math.ceil(total_size / chunk_size) | ||
| 80 | + | ||
| 81 | + util.log(1, f"开始分块发送: {filename}, 总大小: {total_size} bytes, 分块数: {total_chunks}") | ||
| 82 | + | ||
| 83 | + # 发送开始信号 | ||
| 84 | + start_frame = { | ||
| 85 | + 'type': 'audio_start', | ||
| 86 | + 'filename': filename, | ||
| 87 | + 'total_size': total_size, | ||
| 88 | + 'total_chunks': total_chunks, | ||
| 89 | + 'chunk_size': chunk_size | ||
| 90 | + } | ||
| 91 | + self._send_frame_with_retry(start_frame) | ||
| 92 | + | ||
| 93 | + # 分块发送 | ||
| 94 | + for i in range(total_chunks): | ||
| 95 | + start_pos = i * chunk_size | ||
| 96 | + end_pos = min(start_pos + chunk_size, total_size) | ||
| 97 | + chunk_data = audio_bytes[start_pos:end_pos] | ||
| 98 | + | ||
| 99 | + # Base64编码分块 | ||
| 100 | + chunk_b64 = base64.b64encode(chunk_data).decode('utf-8') | ||
| 101 | + | ||
| 102 | + chunk_frame = { | ||
| 103 | + 'type': 'audio_chunk', | ||
| 104 | + 'filename': filename, | ||
| 105 | + 'chunk_index': i, | ||
| 106 | + 'chunk_data': chunk_b64, | ||
| 107 | + 'is_last': (i == total_chunks - 1) | ||
| 108 | + } | ||
| 109 | + | ||
| 110 | + # 发送分块并等待确认 | ||
| 111 | + success = self._send_frame_with_retry(chunk_frame) | ||
| 112 | + if not success: | ||
| 113 | + util.log(3, f"分块 {i+1}/{total_chunks} 发送失败") | ||
| 114 | + return False | ||
| 115 | + | ||
| 116 | + # 进度日志 | ||
| 117 | + if (i + 1) % 10 == 0 or i == total_chunks - 1: | ||
| 118 | + progress = ((i + 1) / total_chunks) * 100 | ||
| 119 | + util.log(1, f"发送进度: {progress:.1f}% ({i+1}/{total_chunks})") | ||
| 120 | + | ||
| 121 | + # 流控延迟 | ||
| 122 | + time.sleep(0.01) | ||
| 123 | + | ||
| 124 | + # 发送结束信号 | ||
| 125 | + end_frame = { | ||
| 126 | + 'type': 'audio_end', | ||
| 127 | + 'filename': filename | ||
| 128 | + } | ||
| 129 | + self._send_frame_with_retry(end_frame) | ||
| 130 | + | ||
| 131 | + util.log(1, f"音频数据分块发送完成: {filename}") | ||
| 132 | + return True | ||
| 133 | + | ||
| 134 | + except Exception as e: | ||
| 135 | + util.log(3, f"分块发送音频数据时出错: {e}") | ||
| 136 | + return False | ||
| 137 | +``` | ||
| 138 | + | ||
| 139 | +#### 1.3 重试机制增强 | ||
| 140 | +```python | ||
| 141 | +def _send_frame_with_retry(self, frame, max_retries=3, timeout=10): | ||
| 142 | + """带重试的帧发送""" | ||
| 143 | + for attempt in range(max_retries): | ||
| 144 | + try: | ||
| 145 | + if self.__ws and self.__connected: | ||
| 146 | + # 设置发送超时 | ||
| 147 | + start_time = time.time() | ||
| 148 | + self.__ws.send(json.dumps(frame)) | ||
| 149 | + | ||
| 150 | + # 简单的发送确认检查 | ||
| 151 | + time.sleep(0.05) # 等待发送完成 | ||
| 152 | + | ||
| 153 | + if time.time() - start_time < timeout: | ||
| 154 | + return True | ||
| 155 | + else: | ||
| 156 | + util.log(2, f"发送超时,尝试 {attempt + 1}/{max_retries}") | ||
| 157 | + else: | ||
| 158 | + util.log(2, f"连接不可用,尝试 {attempt + 1}/{max_retries}") | ||
| 159 | + | ||
| 160 | + except Exception as e: | ||
| 161 | + util.log(2, f"发送失败,尝试 {attempt + 1}/{max_retries}: {e}") | ||
| 162 | + | ||
| 163 | + if attempt < max_retries - 1: | ||
| 164 | + time.sleep(0.5 * (attempt + 1)) # 指数退避 | ||
| 165 | + | ||
| 166 | + return False | ||
| 167 | +``` | ||
| 168 | + | ||
| 169 | +### 阶段二:稳定性优化 (3-5天) | ||
| 170 | + | ||
| 171 | +#### 2.1 连接健康检查 | ||
| 172 | +```python | ||
| 173 | +def _health_check(self): | ||
| 174 | + """连接健康检查""" | ||
| 175 | + if not self.__connected: | ||
| 176 | + return False | ||
| 177 | + | ||
| 178 | + try: | ||
| 179 | + # 发送心跳包 | ||
| 180 | + ping_frame = {'type': 'ping', 'timestamp': time.time()} | ||
| 181 | + self.__ws.send(json.dumps(ping_frame)) | ||
| 182 | + return True | ||
| 183 | + except Exception as e: | ||
| 184 | + util.log(2, f"健康检查失败: {e}") | ||
| 185 | + return False | ||
| 186 | + | ||
| 187 | +def _start_health_monitor(self): | ||
| 188 | + """启动健康监控线程""" | ||
| 189 | + def monitor(): | ||
| 190 | + while self.__connected: | ||
| 191 | + if not self._health_check(): | ||
| 192 | + util.log(2, "连接健康检查失败,尝试重连") | ||
| 193 | + self.__attempt_reconnect() | ||
| 194 | + time.sleep(30) # 30秒检查一次 | ||
| 195 | + | ||
| 196 | + Thread(target=monitor, daemon=True).start() | ||
| 197 | +``` | ||
| 198 | + | ||
| 199 | +#### 2.2 流控机制 | ||
| 200 | +```python | ||
| 201 | +class FlowController: | ||
| 202 | + """流量控制器""" | ||
| 203 | + def __init__(self, max_pending=5, window_size=1024*1024): | ||
| 204 | + self.max_pending = max_pending | ||
| 205 | + self.window_size = window_size | ||
| 206 | + self.pending_chunks = 0 | ||
| 207 | + self.sent_bytes = 0 | ||
| 208 | + self.last_reset = time.time() | ||
| 209 | + | ||
| 210 | + def can_send(self, chunk_size): | ||
| 211 | + """检查是否可以发送""" | ||
| 212 | + # 检查待处理分块数 | ||
| 213 | + if self.pending_chunks >= self.max_pending: | ||
| 214 | + return False | ||
| 215 | + | ||
| 216 | + # 检查窗口大小 | ||
| 217 | + now = time.time() | ||
| 218 | + if now - self.last_reset > 1.0: # 每秒重置 | ||
| 219 | + self.sent_bytes = 0 | ||
| 220 | + self.last_reset = now | ||
| 221 | + | ||
| 222 | + if self.sent_bytes + chunk_size > self.window_size: | ||
| 223 | + return False | ||
| 224 | + | ||
| 225 | + return True | ||
| 226 | + | ||
| 227 | + def on_send(self, chunk_size): | ||
| 228 | + """发送时调用""" | ||
| 229 | + self.pending_chunks += 1 | ||
| 230 | + self.sent_bytes += chunk_size | ||
| 231 | + | ||
| 232 | + def on_ack(self): | ||
| 233 | + """收到确认时调用""" | ||
| 234 | + self.pending_chunks = max(0, self.pending_chunks - 1) | ||
| 235 | +``` | ||
| 236 | + | ||
| 237 | +### 阶段三:架构优化 (1周) | ||
| 238 | + | ||
| 239 | +#### 3.1 异步发送队列 | ||
| 240 | +```python | ||
| 241 | +import asyncio | ||
| 242 | +from queue import Queue | ||
| 243 | + | ||
| 244 | +class AsyncSendQueue: | ||
| 245 | + """异步发送队列""" | ||
| 246 | + def __init__(self, max_size=100): | ||
| 247 | + self.queue = Queue(maxsize=max_size) | ||
| 248 | + self.running = False | ||
| 249 | + self.worker_thread = None | ||
| 250 | + | ||
| 251 | + def start(self): | ||
| 252 | + """启动发送队列""" | ||
| 253 | + self.running = True | ||
| 254 | + self.worker_thread = Thread(target=self._worker, daemon=True) | ||
| 255 | + self.worker_thread.start() | ||
| 256 | + | ||
| 257 | + def _worker(self): | ||
| 258 | + """队列工作线程""" | ||
| 259 | + while self.running: | ||
| 260 | + try: | ||
| 261 | + item = self.queue.get(timeout=1) | ||
| 262 | + if item is None: # 停止信号 | ||
| 263 | + break | ||
| 264 | + | ||
| 265 | + frame, callback = item | ||
| 266 | + success = self._send_frame(frame) | ||
| 267 | + if callback: | ||
| 268 | + callback(success) | ||
| 269 | + | ||
| 270 | + except Exception as e: | ||
| 271 | + util.log(3, f"发送队列工作异常: {e}") | ||
| 272 | + | ||
| 273 | + def enqueue(self, frame, callback=None): | ||
| 274 | + """入队""" | ||
| 275 | + try: | ||
| 276 | + self.queue.put((frame, callback), timeout=5) | ||
| 277 | + return True | ||
| 278 | + except: | ||
| 279 | + return False | ||
| 280 | +``` | ||
| 281 | + | ||
| 282 | +## 配置文件更新 | ||
| 283 | + | ||
| 284 | +### config_util.py 新增配置 | ||
| 285 | +```python | ||
| 286 | +# FunASR大文件优化配置 | ||
| 287 | +asr_connection_timeout = 30 # 连接超时(秒) | ||
| 288 | +asr_send_timeout = 60 # 发送超时(秒) | ||
| 289 | +asr_chunk_size = 512 * 1024 # 分块大小(bytes) | ||
| 290 | +asr_max_retries = 3 # 最大重试次数 | ||
| 291 | +asr_send_interval = 0.02 # 发送间隔(秒) | ||
| 292 | +asr_flow_control_window = 1024 * 1024 # 流控窗口大小 | ||
| 293 | +asr_max_pending_chunks = 5 # 最大待处理分块数 | ||
| 294 | +asr_health_check_interval = 30 # 健康检查间隔(秒) | ||
| 295 | +``` | ||
| 296 | + | ||
| 297 | +## 实施计划 | ||
| 298 | + | ||
| 299 | +### 第1天:立即优化 | ||
| 300 | +- [ ] 更新超时配置 | ||
| 301 | +- [ ] 实现基础分块发送 | ||
| 302 | +- [ ] 添加重试机制 | ||
| 303 | +- [ ] 测试小文件兼容性 | ||
| 304 | + | ||
| 305 | +### 第2-3天:稳定性增强 | ||
| 306 | +- [ ] 实现连接健康检查 | ||
| 307 | +- [ ] 添加流控机制 | ||
| 308 | +- [ ] 优化错误处理 | ||
| 309 | +- [ ] 大文件测试验证 | ||
| 310 | + | ||
| 311 | +### 第4-5天:性能调优 | ||
| 312 | +- [ ] 异步发送队列 | ||
| 313 | +- [ ] 内存使用优化 | ||
| 314 | +- [ ] 并发处理能力 | ||
| 315 | +- [ ] 压力测试 | ||
| 316 | + | ||
| 317 | +### 第6-7天:监控与文档 | ||
| 318 | +- [ ] 添加性能监控 | ||
| 319 | +- [ ] 完善日志记录 | ||
| 320 | +- [ ] 更新技术文档 | ||
| 321 | +- [ ] 用户使用指南 | ||
| 322 | + | ||
| 323 | +## 监控指标 | ||
| 324 | + | ||
| 325 | +### 关键指标 | ||
| 326 | +- **传输成功率**: 目标 >99% | ||
| 327 | +- **平均传输时间**: 大文件 <30秒 | ||
| 328 | +- **重连频率**: <1次/小时 | ||
| 329 | +- **内存使用**: 峰值 <原来150% | ||
| 330 | + | ||
| 331 | +### 监控实现 | ||
| 332 | +```python | ||
| 333 | +class ASRMetrics: | ||
| 334 | + """ASR性能指标收集""" | ||
| 335 | + def __init__(self): | ||
| 336 | + self.total_requests = 0 | ||
| 337 | + self.success_requests = 0 | ||
| 338 | + self.total_bytes = 0 | ||
| 339 | + self.total_time = 0 | ||
| 340 | + self.reconnect_count = 0 | ||
| 341 | + | ||
| 342 | + def record_request(self, size_bytes, duration_seconds, success): | ||
| 343 | + self.total_requests += 1 | ||
| 344 | + self.total_bytes += size_bytes | ||
| 345 | + self.total_time += duration_seconds | ||
| 346 | + if success: | ||
| 347 | + self.success_requests += 1 | ||
| 348 | + | ||
| 349 | + def get_stats(self): | ||
| 350 | + if self.total_requests == 0: | ||
| 351 | + return {} | ||
| 352 | + | ||
| 353 | + return { | ||
| 354 | + 'success_rate': self.success_requests / self.total_requests, | ||
| 355 | + 'avg_throughput': self.total_bytes / self.total_time if self.total_time > 0 else 0, | ||
| 356 | + 'avg_duration': self.total_time / self.total_requests, | ||
| 357 | + 'reconnect_rate': self.reconnect_count / self.total_requests | ||
| 358 | + } | ||
| 359 | +``` | ||
| 360 | + | ||
| 361 | +## 风险评估 | ||
| 362 | + | ||
| 363 | +### 技术风险 | ||
| 364 | +- **兼容性**: 分块机制需要服务端支持 | ||
| 365 | +- **性能**: 分块可能增加延迟 | ||
| 366 | +- **复杂性**: 错误处理逻辑复杂化 | ||
| 367 | + | ||
| 368 | +### 缓解措施 | ||
| 369 | +- 保留原有发送方式作为降级方案 | ||
| 370 | +- 渐进式部署,先小范围测试 | ||
| 371 | +- 完善监控和告警机制 | ||
| 372 | + | ||
| 373 | +## 预期效果 | ||
| 374 | + | ||
| 375 | +### 性能提升 | ||
| 376 | +- 大文件(>5MB)成功率从60%提升到95%+ | ||
| 377 | +- 传输超时减少80% | ||
| 378 | +- 用户体验显著改善 | ||
| 379 | + | ||
| 380 | +### 系统稳定性 | ||
| 381 | +- 连接稳定性提升 | ||
| 382 | +- 错误恢复能力增强 | ||
| 383 | +- 资源使用更合理 | ||
| 384 | + | ||
| 385 | +--- | ||
| 386 | + | ||
| 387 | +**优化重点**: 当前项目确实使用FunASRSync同步版本,主要问题是大文件一次性Base64发送导致超时。通过分块发送、重试机制和流控优化,可以显著改善大文件处理能力。 |
doc/dev/server_recording_test_guide.md
0 → 100644
| 1 | +<!-- AIfeng/2025-07-01 17:47:46 --> | ||
| 2 | +# 服务端录音测试页面功能测试指南 | ||
| 3 | + | ||
| 4 | +## 概述 | ||
| 5 | + | ||
| 6 | +`server_recording_test.html` 是一个功能完整的服务端录音测试面板,提供了录音控制、设备管理、配置调整、实时监控等多项功能。本指南详细说明各功能的测试方法。 | ||
| 7 | + | ||
| 8 | +## 页面访问 | ||
| 9 | + | ||
| 10 | +**访问地址**: `http://localhost:8010/server-recording-test` | ||
| 11 | + | ||
| 12 | +**前置条件**: | ||
| 13 | +- 确保服务端已启动(端口8010) | ||
| 14 | +- 确保 `server_recording_api.py` 已正确注册 | ||
| 15 | +- 浏览器支持WebSocket和Web Audio API | ||
| 16 | + | ||
| 17 | +## 核心功能测试 | ||
| 18 | + | ||
| 19 | +### 1. 获取状态功能测试 | ||
| 20 | + | ||
| 21 | +**功能描述**: 获取当前录音系统的状态信息 | ||
| 22 | + | ||
| 23 | +**测试步骤**: | ||
| 24 | +1. 点击「获取状态」按钮 | ||
| 25 | +2. 观察状态面板显示内容 | ||
| 26 | +3. 检查操作日志区域 | ||
| 27 | + | ||
| 28 | +**预期结果**: | ||
| 29 | +``` | ||
| 30 | +录音状态: ⚪ 已停止 / 🔴 录音中 | ||
| 31 | +设备索引: -1 (或具体设备编号) | ||
| 32 | +采样率: 16000 Hz | ||
| 33 | +声道数: 1 | ||
| 34 | +音量阈值: 0.01 | ||
| 35 | +静音超时: 2.0s | ||
| 36 | +语音超时: 10.0s | ||
| 37 | +``` | ||
| 38 | + | ||
| 39 | +**API调用**: `GET /api/server-recording/status` | ||
| 40 | + | ||
| 41 | +**故障排查**: | ||
| 42 | +- 如显示"获取状态失败",检查服务端是否正常运行 | ||
| 43 | +- 检查网络连接和API路由配置 | ||
| 44 | + | ||
| 45 | +### 2. 列出设备功能测试 | ||
| 46 | + | ||
| 47 | +**功能描述**: 获取系统可用的音频输入设备列表 | ||
| 48 | + | ||
| 49 | +**测试步骤**: | ||
| 50 | +1. 点击「列出设备」按钮 | ||
| 51 | +2. 观察设备列表区域显示 | ||
| 52 | +3. 尝试点击选择不同设备 | ||
| 53 | + | ||
| 54 | +**预期结果**: | ||
| 55 | +- 显示所有可用音频设备 | ||
| 56 | +- 每个设备显示:设备索引、设备名称、最大输入声道数 | ||
| 57 | +- 点击设备后高亮显示,设备索引自动填入配置面板 | ||
| 58 | + | ||
| 59 | +**API调用**: `GET /api/server-recording/devices` | ||
| 60 | + | ||
| 61 | +### 3. 开始录音功能测试 | ||
| 62 | + | ||
| 63 | +**功能描述**: 启动服务端录音功能 | ||
| 64 | + | ||
| 65 | +**测试步骤**: | ||
| 66 | +1. 确保已选择合适的音频设备 | ||
| 67 | +2. 点击「开始录音」按钮 | ||
| 68 | +3. 观察录音指示器和状态变化 | ||
| 69 | +4. 对着麦克风说话测试 | ||
| 70 | + | ||
| 71 | +**预期结果**: | ||
| 72 | +- 录音指示器显示红色脉冲动画 | ||
| 73 | +- 状态面板显示"🔴 录音中" | ||
| 74 | +- 音频可视化区域显示音量和频率变化 | ||
| 75 | +- WebSocket连接正常,实时接收音频数据 | ||
| 76 | + | ||
| 77 | +**API调用**: `POST /api/server-recording/start` | ||
| 78 | + | ||
| 79 | +### 4. 停止录音功能测试 | ||
| 80 | + | ||
| 81 | +**功能描述**: 停止当前录音会话 | ||
| 82 | + | ||
| 83 | +**测试步骤**: | ||
| 84 | +1. 在录音状态下点击「停止录音」按钮 | ||
| 85 | +2. 观察状态变化 | ||
| 86 | + | ||
| 87 | +**预期结果**: | ||
| 88 | +- 录音指示器消失 | ||
| 89 | +- 状态面板显示"⚪ 已停止" | ||
| 90 | +- 音频可视化停止更新 | ||
| 91 | + | ||
| 92 | +**API调用**: `POST /api/server-recording/stop` | ||
| 93 | + | ||
| 94 | +### 5. 获取配置功能测试 | ||
| 95 | + | ||
| 96 | +**功能描述**: 获取当前录音系统配置参数 | ||
| 97 | + | ||
| 98 | +**测试步骤**: | ||
| 99 | +1. 点击「获取配置」按钮 | ||
| 100 | +2. 观察配置面板各字段是否自动填充 | ||
| 101 | + | ||
| 102 | +**预期结果**: | ||
| 103 | +- 所有配置字段显示当前服务端配置值 | ||
| 104 | +- 设备索引、采样率、声道数等参数正确显示 | ||
| 105 | + | ||
| 106 | +**API调用**: `GET /api/server-recording/config` | ||
| 107 | + | ||
| 108 | +### 6. 更新配置功能测试 | ||
| 109 | + | ||
| 110 | +**功能描述**: 修改录音系统配置参数 | ||
| 111 | + | ||
| 112 | +**测试步骤**: | ||
| 113 | +1. 修改配置面板中的参数值 | ||
| 114 | + - 设备索引: 选择有效设备编号 | ||
| 115 | + - 采样率: 16000/44100/48000 Hz | ||
| 116 | + - 声道数: 1(单声道)/2(立体声) | ||
| 117 | + - 音量阈值: 0.001-1.0 | ||
| 118 | + - 静音超时: 0.1-10.0秒 | ||
| 119 | + - 语音超时: 1.0-30.0秒 | ||
| 120 | +2. 点击「更新配置」按钮 | ||
| 121 | +3. 再次获取配置验证更新结果 | ||
| 122 | + | ||
| 123 | +**预期结果**: | ||
| 124 | +- 配置更新成功提示 | ||
| 125 | +- 新配置立即生效 | ||
| 126 | + | ||
| 127 | +**API调用**: `POST /api/server-recording/config` | ||
| 128 | + | ||
| 129 | +### 7. 测试ASR功能 | ||
| 130 | + | ||
| 131 | +**功能描述**: 模拟ASR语音识别功能测试 | ||
| 132 | + | ||
| 133 | +**测试步骤**: | ||
| 134 | +1. 点击「测试ASR」按钮 | ||
| 135 | +2. 观察ASR结果面板变化 | ||
| 136 | + | ||
| 137 | +**预期结果**: | ||
| 138 | +- 依次显示测试短语: | ||
| 139 | + - "你好,这是一个测试" | ||
| 140 | + - "语音识别功能正常" | ||
| 141 | + - "测试完成" | ||
| 142 | +- 每个结果显示置信度(85%-100%) | ||
| 143 | +- 显示识别时间戳 | ||
| 144 | + | ||
| 145 | +### 8. 清空日志功能测试 | ||
| 146 | + | ||
| 147 | +**功能描述**: 清除操作日志记录 | ||
| 148 | + | ||
| 149 | +**测试步骤**: | ||
| 150 | +1. 执行几个操作产生日志 | ||
| 151 | +2. 点击「清空日志」按钮 | ||
| 152 | + | ||
| 153 | +**预期结果**: | ||
| 154 | +- 日志面板清空 | ||
| 155 | +- 显示"日志已清空"消息 | ||
| 156 | + | ||
| 157 | +## 实时监控功能测试 | ||
| 158 | + | ||
| 159 | +### 1. WebSocket连接测试 | ||
| 160 | + | ||
| 161 | +**测试方法**: | ||
| 162 | +- 打开浏览器开发者工具 → Network → WS | ||
| 163 | +- 观察WebSocket连接状态 | ||
| 164 | +- 检查消息收发情况 | ||
| 165 | + | ||
| 166 | +**预期结果**: | ||
| 167 | +- 连接URL: `ws://localhost:8010/ws/server-recording` | ||
| 168 | +- 连接状态: 已连接 | ||
| 169 | +- 定期接收状态更新消息 | ||
| 170 | + | ||
| 171 | +### 2. 音频可视化测试 | ||
| 172 | + | ||
| 173 | +**测试内容**: | ||
| 174 | +- 音量条实时变化 | ||
| 175 | +- 频率条动态显示 | ||
| 176 | +- 音量百分比数值更新 | ||
| 177 | + | ||
| 178 | +**测试方法**: | ||
| 179 | +- 开始录音后对麦克风说话 | ||
| 180 | +- 观察可视化效果变化 | ||
| 181 | + | ||
| 182 | +### 3. 性能监控测试 | ||
| 183 | + | ||
| 184 | +**监控指标**: | ||
| 185 | +- **延迟**: API调用响应时间 | ||
| 186 | +- **吞吐量**: 每秒处理请求数 | ||
| 187 | +- **错误率**: 失败请求百分比 | ||
| 188 | +- **运行时间**: 系统运行时长 | ||
| 189 | + | ||
| 190 | +**测试方法**: | ||
| 191 | +- 执行多个操作观察指标变化 | ||
| 192 | +- 故意触发错误观察错误率统计 | ||
| 193 | + | ||
| 194 | +### 4. 质量检测测试 | ||
| 195 | + | ||
| 196 | +**检测项目**: | ||
| 197 | +- **信号质量**: 绿色(良好)/橙色(警告)/红色(错误) | ||
| 198 | +- **噪音水平**: 环境噪音评估 | ||
| 199 | +- **VAD准确性**: 语音活动检测准确度 | ||
| 200 | +- **ASR准确性**: 语音识别准确度 | ||
| 201 | + | ||
| 202 | +## 故障排查指南 | ||
| 203 | + | ||
| 204 | +### 常见问题及解决方案 | ||
| 205 | + | ||
| 206 | +1. **WebSocket连接失败** | ||
| 207 | + - 检查服务端是否启动 | ||
| 208 | + - 确认端口8010可访问 | ||
| 209 | + - 验证路由 `/ws/server-recording` 已注册 | ||
| 210 | + | ||
| 211 | +2. **获取状态失败** | ||
| 212 | + - 检查API路由 `/api/server-recording/status` | ||
| 213 | + - 确认服务端录音模块正常加载 | ||
| 214 | + | ||
| 215 | +3. **设备列表为空** | ||
| 216 | + - 检查系统音频设备 | ||
| 217 | + - 确认麦克风权限已授予 | ||
| 218 | + - 验证音频驱动正常 | ||
| 219 | + | ||
| 220 | +4. **录音无响应** | ||
| 221 | + - 检查设备索引是否有效 | ||
| 222 | + - 确认音频设备未被其他程序占用 | ||
| 223 | + - 验证采样率和声道数配置 | ||
| 224 | + | ||
| 225 | +5. **音频可视化无变化** | ||
| 226 | + - 检查麦克风是否静音 | ||
| 227 | + - 确认音量阈值设置合理 | ||
| 228 | + - 验证WebSocket消息接收 | ||
| 229 | + | ||
| 230 | +## 测试检查清单 | ||
| 231 | + | ||
| 232 | +### 基础功能测试 | ||
| 233 | +- [ ] 页面正常加载 | ||
| 234 | +- [ ] WebSocket连接成功 | ||
| 235 | +- [ ] 获取状态功能正常 | ||
| 236 | +- [ ] 设备列表获取成功 | ||
| 237 | +- [ ] 配置获取和更新正常 | ||
| 238 | + | ||
| 239 | +### 录音功能测试 | ||
| 240 | +- [ ] 开始录音成功 | ||
| 241 | +- [ ] 停止录音成功 | ||
| 242 | +- [ ] 录音状态正确显示 | ||
| 243 | +- [ ] 音频可视化正常 | ||
| 244 | + | ||
| 245 | +### 高级功能测试 | ||
| 246 | +- [ ] ASR测试功能正常 | ||
| 247 | +- [ ] 性能监控数据准确 | ||
| 248 | +- [ ] 质量检测指标更新 | ||
| 249 | +- [ ] 日志记录完整 | ||
| 250 | + | ||
| 251 | +### 异常处理测试 | ||
| 252 | +- [ ] 网络断开自动重连 | ||
| 253 | +- [ ] 错误消息正确显示 | ||
| 254 | +- [ ] 异常状态恢复正常 | ||
| 255 | + | ||
| 256 | +## 性能基准 | ||
| 257 | + | ||
| 258 | +### 响应时间标准 | ||
| 259 | +- API调用延迟: < 100ms | ||
| 260 | +- WebSocket消息延迟: < 50ms | ||
| 261 | +- 音频可视化更新: < 100ms | ||
| 262 | + | ||
| 263 | +### 资源使用标准 | ||
| 264 | +- CPU使用率: < 10% | ||
| 265 | +- 内存使用: < 100MB | ||
| 266 | +- 网络带宽: < 1MB/s | ||
| 267 | + | ||
| 268 | +## 测试环境要求 | ||
| 269 | + | ||
| 270 | +### 浏览器支持 | ||
| 271 | +- Chrome 80+ | ||
| 272 | +- Firefox 75+ | ||
| 273 | +- Safari 13+ | ||
| 274 | +- Edge 80+ | ||
| 275 | + | ||
| 276 | +### 系统要求 | ||
| 277 | +- 操作系统: Windows 10+/macOS 10.15+/Linux | ||
| 278 | +- 内存: 4GB+ | ||
| 279 | +- 音频设备: 支持录音的麦克风 | ||
| 280 | + | ||
| 281 | +### 网络要求 | ||
| 282 | +- 本地网络连接 | ||
| 283 | +- WebSocket支持 | ||
| 284 | +- 端口8010可访问 | ||
| 285 | + | ||
| 286 | +--- | ||
| 287 | + | ||
| 288 | +**注意**: 本测试指南基于当前版本的测试页面功能,如有功能更新请及时更新本文档。 |
| 1 | +# AIfeng/2025-07-07 15:19:16 | ||
| 2 | +# 流式语音识别系统优化方案 | ||
| 3 | + | ||
| 4 | +## 概述 | ||
| 5 | + | ||
| 6 | +本文档针对用户提出的三个核心优化需求,设计了完整的技术实现方案: | ||
| 7 | +1. **智能断句逻辑** - 基于静音间隔的语义分段 | ||
| 8 | +2. **VAD分片优化** - 平衡响应速度与识别精度 | ||
| 9 | +3. **结果标识机制** - 流式识别结果的完整追踪 | ||
| 10 | + | ||
| 11 | +## 1. 智能断句逻辑设计 | ||
| 12 | + | ||
| 13 | +### 1.1 需求分析 | ||
| 14 | + | ||
| 15 | +用户场景:"我看到了一幅画 一幅后现代主义的画作 上面有人物 有动物 有一条很长的河 你能猜一猜这是哪一幅名画吗" | ||
| 16 | + | ||
| 17 | +**断句策略:** | ||
| 18 | +- 静音间隔 ≥ 2秒 → 独立句子 | ||
| 19 | +- 静音间隔 1-2秒 → 语义连接判断 | ||
| 20 | +- 静音间隔 < 1秒 → 同一句子内的自然停顿 | ||
| 21 | + | ||
| 22 | +### 1.2 技术实现方案 | ||
| 23 | + | ||
| 24 | +#### 1.2.1 多级静音阈值设计 | ||
| 25 | + | ||
| 26 | +```python | ||
| 27 | +class IntelligentSentenceSegmentation: | ||
| 28 | + def __init__(self): | ||
| 29 | + self.silence_thresholds = { | ||
| 30 | + 'micro_pause': 0.3, # 词间停顿 | ||
| 31 | + 'phrase_pause': 1.0, # 短语间停顿 | ||
| 32 | + 'sentence_pause': 2.0, # 句子间停顿 | ||
| 33 | + 'topic_pause': 4.0 # 话题间停顿 | ||
| 34 | + } | ||
| 35 | + | ||
| 36 | + self.segment_types = { | ||
| 37 | + 'word_continuation': 'micro_pause', | ||
| 38 | + 'phrase_connection': 'phrase_pause', | ||
| 39 | + 'sentence_boundary': 'sentence_pause', | ||
| 40 | + 'topic_boundary': 'topic_pause' | ||
| 41 | + } | ||
| 42 | +``` | ||
| 43 | + | ||
| 44 | +#### 1.2.2 语义连接判断算法 | ||
| 45 | + | ||
| 46 | +```python | ||
| 47 | +def analyze_semantic_connection(self, prev_segment: str, current_segment: str, | ||
| 48 | + silence_duration: float) -> str: | ||
| 49 | + """ | ||
| 50 | + 分析语义连接类型 | ||
| 51 | + | ||
| 52 | + Returns: | ||
| 53 | + 'continuation' | 'new_sentence' | 'new_topic' | ||
| 54 | + """ | ||
| 55 | + # 语法完整性检查 | ||
| 56 | + if self._is_grammatically_complete(prev_segment): | ||
| 57 | + if silence_duration >= self.silence_thresholds['sentence_pause']: | ||
| 58 | + return 'new_sentence' | ||
| 59 | + | ||
| 60 | + # 语义相关性检查 | ||
| 61 | + semantic_score = self._calculate_semantic_similarity(prev_segment, current_segment) | ||
| 62 | + | ||
| 63 | + if silence_duration >= self.silence_thresholds['phrase_pause']: | ||
| 64 | + if semantic_score > 0.7: | ||
| 65 | + return 'continuation' # 语义相关,继续当前句子 | ||
| 66 | + else: | ||
| 67 | + return 'new_sentence' # 语义不相关,新句子 | ||
| 68 | + | ||
| 69 | + return 'continuation' | ||
| 70 | +``` | ||
| 71 | + | ||
| 72 | +#### 1.2.3 动态阈值调整 | ||
| 73 | + | ||
| 74 | +```python | ||
| 75 | +class AdaptiveSilenceThreshold: | ||
| 76 | + def __init__(self): | ||
| 77 | + self.user_speech_pattern = { | ||
| 78 | + 'avg_pause_duration': 1.2, | ||
| 79 | + 'speech_rate': 150, # 词/分钟 | ||
| 80 | + 'pause_variance': 0.3 | ||
| 81 | + } | ||
| 82 | + | ||
| 83 | + def adjust_thresholds(self, recent_pauses: List[float]): | ||
| 84 | + """根据用户说话习惯动态调整阈值""" | ||
| 85 | + if len(recent_pauses) >= 10: | ||
| 86 | + avg_pause = np.mean(recent_pauses) | ||
| 87 | + std_pause = np.std(recent_pauses) | ||
| 88 | + | ||
| 89 | + # 个性化阈值调整 | ||
| 90 | + self.silence_thresholds['phrase_pause'] = avg_pause + 0.5 * std_pause | ||
| 91 | + self.silence_thresholds['sentence_pause'] = avg_pause + 1.5 * std_pause | ||
| 92 | +``` | ||
| 93 | + | ||
| 94 | +## 2. VAD分片优化策略 | ||
| 95 | + | ||
| 96 | +### 2.1 问题分析 | ||
| 97 | + | ||
| 98 | +**当前挑战:** | ||
| 99 | +- 小分片:响应快但识别精度低 | ||
| 100 | +- 大分片:精度高但响应慢 | ||
| 101 | +- 需要动态平衡策略 | ||
| 102 | + | ||
| 103 | +### 2.2 自适应分片算法 | ||
| 104 | + | ||
| 105 | +#### 2.2.1 分片大小动态调整 | ||
| 106 | + | ||
| 107 | +```python | ||
| 108 | +class AdaptiveVADChunking: | ||
| 109 | + def __init__(self): | ||
| 110 | + self.chunk_strategies = { | ||
| 111 | + 'fast_response': { | ||
| 112 | + 'min_chunk_duration': 0.5, | ||
| 113 | + 'max_chunk_duration': 2.0, | ||
| 114 | + 'confidence_threshold': 0.7 | ||
| 115 | + }, | ||
| 116 | + 'high_accuracy': { | ||
| 117 | + 'min_chunk_duration': 1.5, | ||
| 118 | + 'max_chunk_duration': 4.0, | ||
| 119 | + 'confidence_threshold': 0.8 | ||
| 120 | + }, | ||
| 121 | + 'balanced': { | ||
| 122 | + 'min_chunk_duration': 1.0, | ||
| 123 | + 'max_chunk_duration': 3.0, | ||
| 124 | + 'confidence_threshold': 0.75 | ||
| 125 | + } | ||
| 126 | + } | ||
| 127 | + | ||
| 128 | + self.current_strategy = 'balanced' | ||
| 129 | + self.performance_history = [] | ||
| 130 | + | ||
| 131 | + def select_optimal_strategy(self, context: dict) -> str: | ||
| 132 | + """根据上下文选择最优分片策略""" | ||
| 133 | + # 考虑因素: | ||
| 134 | + # 1. 当前识别准确率 | ||
| 135 | + # 2. 用户交互模式(快速对话 vs 长句描述) | ||
| 136 | + # 3. 环境噪音水平 | ||
| 137 | + # 4. 系统负载 | ||
| 138 | + | ||
| 139 | + recent_accuracy = self._calculate_recent_accuracy() | ||
| 140 | + interaction_mode = context.get('interaction_mode', 'normal') | ||
| 141 | + noise_level = context.get('noise_level', 0.1) | ||
| 142 | + | ||
| 143 | + if interaction_mode == 'quick_qa' and recent_accuracy > 0.85: | ||
| 144 | + return 'fast_response' | ||
| 145 | + elif noise_level > 0.3 or recent_accuracy < 0.7: | ||
| 146 | + return 'high_accuracy' | ||
| 147 | + else: | ||
| 148 | + return 'balanced' | ||
| 149 | +``` | ||
| 150 | + | ||
| 151 | +#### 2.2.2 渐进式识别策略 | ||
| 152 | + | ||
| 153 | +```python | ||
| 154 | +class ProgressiveRecognition: | ||
| 155 | + def __init__(self): | ||
| 156 | + self.recognition_stages = { | ||
| 157 | + 'immediate': 0.8, # 800ms 快速识别 | ||
| 158 | + 'refined': 2.0, # 2s 精化识别 | ||
| 159 | + 'final': 4.0 # 4s 最终识别 | ||
| 160 | + } | ||
| 161 | + | ||
| 162 | + def process_audio_segment(self, audio_data: bytes, duration: float): | ||
| 163 | + """渐进式识别处理""" | ||
| 164 | + results = {} | ||
| 165 | + | ||
| 166 | + # 阶段1:快速识别(低延迟) | ||
| 167 | + if duration >= self.recognition_stages['immediate']: | ||
| 168 | + quick_result = self._quick_recognition(audio_data[:int(0.8 * len(audio_data))]) | ||
| 169 | + results['immediate'] = { | ||
| 170 | + 'text': quick_result, | ||
| 171 | + 'confidence': 0.6, | ||
| 172 | + 'stage': 'immediate' | ||
| 173 | + } | ||
| 174 | + | ||
| 175 | + # 阶段2:精化识别(平衡) | ||
| 176 | + if duration >= self.recognition_stages['refined']: | ||
| 177 | + refined_result = self._refined_recognition(audio_data) | ||
| 178 | + results['refined'] = { | ||
| 179 | + 'text': refined_result, | ||
| 180 | + 'confidence': 0.8, | ||
| 181 | + 'stage': 'refined' | ||
| 182 | + } | ||
| 183 | + | ||
| 184 | + # 阶段3:最终识别(高精度) | ||
| 185 | + if duration >= self.recognition_stages['final']: | ||
| 186 | + final_result = self._final_recognition(audio_data) | ||
| 187 | + results['final'] = { | ||
| 188 | + 'text': final_result, | ||
| 189 | + 'confidence': 0.9, | ||
| 190 | + 'stage': 'final' | ||
| 191 | + } | ||
| 192 | + | ||
| 193 | + return results | ||
| 194 | +``` | ||
| 195 | + | ||
| 196 | +## 3. 结果标识与追踪机制 | ||
| 197 | + | ||
| 198 | +### 3.1 识别结果标识体系 | ||
| 199 | + | ||
| 200 | +#### 3.1.1 唯一标识符设计 | ||
| 201 | + | ||
| 202 | +```python | ||
| 203 | +from dataclasses import dataclass | ||
| 204 | +from typing import List, Optional | ||
| 205 | +import uuid | ||
| 206 | +import time | ||
| 207 | + | ||
| 208 | +@dataclass | ||
| 209 | +class RecognitionSegmentID: | ||
| 210 | + """识别片段唯一标识""" | ||
| 211 | + session_id: str # 会话ID | ||
| 212 | + segment_id: str # 片段ID | ||
| 213 | + sequence_number: int # 序列号 | ||
| 214 | + parent_segment_id: Optional[str] = None # 父片段ID(用于分片关联) | ||
| 215 | + | ||
| 216 | + def __post_init__(self): | ||
| 217 | + if not self.segment_id: | ||
| 218 | + self.segment_id = f"{self.session_id}_{self.sequence_number}_{int(time.time() * 1000)}" | ||
| 219 | + | ||
| 220 | +@dataclass | ||
| 221 | +class RecognitionResult: | ||
| 222 | + """增强的识别结果""" | ||
| 223 | + id: RecognitionSegmentID | ||
| 224 | + text: str | ||
| 225 | + confidence: float | ||
| 226 | + timestamp: float | ||
| 227 | + audio_duration: float | ||
| 228 | + result_type: str # 'partial' | 'refined' | 'final' | ||
| 229 | + stage: str # 'immediate' | 'refined' | 'final' | ||
| 230 | + audio_segment_hash: str # 音频片段哈希值 | ||
| 231 | + predecessor_ids: List[str] = None # 前驱结果ID列表 | ||
| 232 | + successor_ids: List[str] = None # 后继结果ID列表 | ||
| 233 | + is_superseded: bool = False # 是否被后续结果替代 | ||
| 234 | + superseded_by: Optional[str] = None # 被哪个结果替代 | ||
| 235 | +``` | ||
| 236 | + | ||
| 237 | +#### 3.1.2 结果关联追踪 | ||
| 238 | + | ||
| 239 | +```python | ||
| 240 | +class RecognitionResultTracker: | ||
| 241 | + def __init__(self): | ||
| 242 | + self.result_graph = {} # 结果关联图 | ||
| 243 | + self.active_segments = {} # 活跃片段 | ||
| 244 | + self.completed_segments = {} # 完成片段 | ||
| 245 | + | ||
| 246 | + def add_recognition_result(self, result: RecognitionResult) -> str: | ||
| 247 | + """添加识别结果并建立关联""" | ||
| 248 | + result_id = result.id.segment_id | ||
| 249 | + | ||
| 250 | + # 建立与前驱结果的关联 | ||
| 251 | + if result.predecessor_ids: | ||
| 252 | + for pred_id in result.predecessor_ids: | ||
| 253 | + if pred_id in self.result_graph: | ||
| 254 | + self.result_graph[pred_id]['successors'].append(result_id) | ||
| 255 | + | ||
| 256 | + # 标记前驱结果为被替代 | ||
| 257 | + if result.result_type == 'final': | ||
| 258 | + self._mark_superseded(pred_id, result_id) | ||
| 259 | + | ||
| 260 | + # 添加当前结果 | ||
| 261 | + self.result_graph[result_id] = { | ||
| 262 | + 'result': result, | ||
| 263 | + 'predecessors': result.predecessor_ids or [], | ||
| 264 | + 'successors': [], | ||
| 265 | + 'created_at': time.time() | ||
| 266 | + } | ||
| 267 | + | ||
| 268 | + return result_id | ||
| 269 | + | ||
| 270 | + def get_result_chain(self, segment_id: str) -> List[RecognitionResult]: | ||
| 271 | + """获取完整的识别链路""" | ||
| 272 | + chain = [] | ||
| 273 | + | ||
| 274 | + # 向前追溯到起始结果 | ||
| 275 | + current_id = segment_id | ||
| 276 | + while current_id: | ||
| 277 | + if current_id in self.result_graph: | ||
| 278 | + result_info = self.result_graph[current_id] | ||
| 279 | + chain.insert(0, result_info['result']) | ||
| 280 | + | ||
| 281 | + # 找到前驱 | ||
| 282 | + predecessors = result_info['predecessors'] | ||
| 283 | + current_id = predecessors[0] if predecessors else None | ||
| 284 | + else: | ||
| 285 | + break | ||
| 286 | + | ||
| 287 | + # 向后追溯到最终结果 | ||
| 288 | + current_id = segment_id | ||
| 289 | + while current_id: | ||
| 290 | + if current_id in self.result_graph: | ||
| 291 | + result_info = self.result_graph[current_id] | ||
| 292 | + successors = result_info['successors'] | ||
| 293 | + | ||
| 294 | + if successors: | ||
| 295 | + # 选择最新的后继结果 | ||
| 296 | + latest_successor = max(successors, | ||
| 297 | + key=lambda x: self.result_graph[x]['created_at']) | ||
| 298 | + | ||
| 299 | + if latest_successor not in [r.id.segment_id for r in chain]: | ||
| 300 | + chain.append(self.result_graph[latest_successor]['result']) | ||
| 301 | + | ||
| 302 | + current_id = latest_successor | ||
| 303 | + else: | ||
| 304 | + break | ||
| 305 | + else: | ||
| 306 | + break | ||
| 307 | + | ||
| 308 | + return chain | ||
| 309 | +``` | ||
| 310 | + | ||
| 311 | +### 3.2 流式显示刷新机制 | ||
| 312 | + | ||
| 313 | +#### 3.2.1 增量更新策略 | ||
| 314 | + | ||
| 315 | +```python | ||
| 316 | +class StreamingDisplayManager: | ||
| 317 | + def __init__(self): | ||
| 318 | + self.display_buffer = {} # 显示缓冲区 | ||
| 319 | + self.update_queue = [] # 更新队列 | ||
| 320 | + self.refresh_strategies = { | ||
| 321 | + 'immediate': self._immediate_refresh, | ||
| 322 | + 'debounced': self._debounced_refresh, | ||
| 323 | + 'batch': self._batch_refresh | ||
| 324 | + } | ||
| 325 | + | ||
| 326 | + def update_display(self, session_id: str, result: RecognitionResult, | ||
| 327 | + strategy: str = 'debounced'): | ||
| 328 | + """更新显示内容""" | ||
| 329 | + update_info = { | ||
| 330 | + 'session_id': session_id, | ||
| 331 | + 'result': result, | ||
| 332 | + 'timestamp': time.time(), | ||
| 333 | + 'update_type': self._determine_update_type(result) | ||
| 334 | + } | ||
| 335 | + | ||
| 336 | + self.update_queue.append(update_info) | ||
| 337 | + | ||
| 338 | + # 根据策略执行刷新 | ||
| 339 | + refresh_func = self.refresh_strategies.get(strategy, self._debounced_refresh) | ||
| 340 | + refresh_func(update_info) | ||
| 341 | + | ||
| 342 | + def _determine_update_type(self, result: RecognitionResult) -> str: | ||
| 343 | + """确定更新类型""" | ||
| 344 | + if result.result_type == 'partial': | ||
| 345 | + if result.stage == 'immediate': | ||
| 346 | + return 'append' # 追加显示 | ||
| 347 | + else: | ||
| 348 | + return 'replace_partial' # 替换部分内容 | ||
| 349 | + elif result.result_type == 'final': | ||
| 350 | + return 'replace_final' # 最终替换 | ||
| 351 | + else: | ||
| 352 | + return 'append' | ||
| 353 | + | ||
| 354 | + def _debounced_refresh(self, update_info: dict, delay: float = 0.2): | ||
| 355 | + """防抖刷新策略""" | ||
| 356 | + session_id = update_info['session_id'] | ||
| 357 | + | ||
| 358 | + # 取消之前的定时器 | ||
| 359 | + if session_id in self.pending_refreshes: | ||
| 360 | + self.pending_refreshes[session_id].cancel() | ||
| 361 | + | ||
| 362 | + # 设置新的定时器 | ||
| 363 | + timer = threading.Timer(delay, self._execute_refresh, args=[session_id]) | ||
| 364 | + self.pending_refreshes[session_id] = timer | ||
| 365 | + timer.start() | ||
| 366 | +``` | ||
| 367 | + | ||
| 368 | +## 4. 配置参数优化建议 | ||
| 369 | + | ||
| 370 | +### 4.1 VAD参数调整 | ||
| 371 | + | ||
| 372 | +```json | ||
| 373 | +{ | ||
| 374 | + "streaming_vad": { | ||
| 375 | + "silence_duration_levels": { | ||
| 376 | + "micro_pause": 0.3, | ||
| 377 | + "phrase_pause": 1.0, | ||
| 378 | + "sentence_pause": 2.0, | ||
| 379 | + "topic_pause": 4.0 | ||
| 380 | + }, | ||
| 381 | + "adaptive_chunking": { | ||
| 382 | + "enabled": true, | ||
| 383 | + "min_chunk_duration": 0.8, | ||
| 384 | + "max_chunk_duration": 3.5, | ||
| 385 | + "strategy_switch_threshold": 0.75 | ||
| 386 | + }, | ||
| 387 | + "progressive_recognition": { | ||
| 388 | + "enabled": true, | ||
| 389 | + "stages": { | ||
| 390 | + "immediate": 0.8, | ||
| 391 | + "refined": 2.0, | ||
| 392 | + "final": 4.0 | ||
| 393 | + } | ||
| 394 | + } | ||
| 395 | + } | ||
| 396 | +} | ||
| 397 | +``` | ||
| 398 | + | ||
| 399 | +### 4.2 识别管理参数 | ||
| 400 | + | ||
| 401 | +```json | ||
| 402 | +{ | ||
| 403 | + "streaming_recognition": { | ||
| 404 | + "result_tracking": { | ||
| 405 | + "enabled": true, | ||
| 406 | + "max_chain_length": 10, | ||
| 407 | + "cleanup_interval": 120.0 | ||
| 408 | + }, | ||
| 409 | + "display_refresh": { | ||
| 410 | + "strategy": "debounced", | ||
| 411 | + "debounce_delay": 0.2, | ||
| 412 | + "batch_size": 5, | ||
| 413 | + "max_refresh_rate": 10 | ||
| 414 | + } | ||
| 415 | + } | ||
| 416 | +} | ||
| 417 | +``` | ||
| 418 | + | ||
| 419 | +## 5. 实施计划 | ||
| 420 | + | ||
| 421 | +### 5.1 开发阶段 | ||
| 422 | + | ||
| 423 | +**阶段1:智能断句模块(1-2天)** | ||
| 424 | +- 实现多级静音阈值检测 | ||
| 425 | +- 开发语义连接判断算法 | ||
| 426 | +- 集成动态阈值调整机制 | ||
| 427 | + | ||
| 428 | +**阶段2:VAD优化模块(2-3天)** | ||
| 429 | +- 实现自适应分片算法 | ||
| 430 | +- 开发渐进式识别策略 | ||
| 431 | +- 性能测试与调优 | ||
| 432 | + | ||
| 433 | +**阶段3:结果追踪模块(2-3天)** | ||
| 434 | +- 实现结果标识体系 | ||
| 435 | +- 开发关联追踪机制 | ||
| 436 | +- 实现流式显示管理 | ||
| 437 | + | ||
| 438 | +**阶段4:集成测试(1-2天)** | ||
| 439 | +- 端到端功能测试 | ||
| 440 | +- 性能基准测试 | ||
| 441 | +- 用户体验验证 | ||
| 442 | + | ||
| 443 | +### 5.2 验证指标 | ||
| 444 | + | ||
| 445 | +**功能指标:** | ||
| 446 | +- 断句准确率 > 90% | ||
| 447 | +- 识别延迟 < 1秒(immediate阶段) | ||
| 448 | +- 最终识别准确率 > 95% | ||
| 449 | + | ||
| 450 | +**性能指标:** | ||
| 451 | +- 内存使用 < 100MB | ||
| 452 | +- CPU使用率 < 30% | ||
| 453 | +- 并发处理能力 > 5个会话 | ||
| 454 | + | ||
| 455 | +**用户体验指标:** | ||
| 456 | +- 响应流畅度评分 > 4.5/5 | ||
| 457 | +- 识别结果可读性 > 4.0/5 | ||
| 458 | +- 整体满意度 > 4.5/5 | ||
| 459 | + | ||
| 460 | +## 6. 风险评估与缓解 | ||
| 461 | + | ||
| 462 | +### 6.1 技术风险 | ||
| 463 | + | ||
| 464 | +**风险1:语义判断准确性** | ||
| 465 | +- 缓解:建立语义模型训练数据集 | ||
| 466 | +- 备选:基于规则的语法分析 | ||
| 467 | + | ||
| 468 | +**风险2:性能开销增加** | ||
| 469 | +- 缓解:异步处理 + 缓存优化 | ||
| 470 | +- 监控:实时性能指标追踪 | ||
| 471 | + | ||
| 472 | +**风险3:复杂度增加** | ||
| 473 | +- 缓解:模块化设计 + 完善测试 | ||
| 474 | +- 文档:详细的API文档和使用指南 | ||
| 475 | + | ||
| 476 | +### 6.2 兼容性考虑 | ||
| 477 | + | ||
| 478 | +- 保持现有API接口不变 | ||
| 479 | +- 新功能通过配置开关控制 | ||
| 480 | +- 提供降级机制确保稳定性 | ||
| 481 | + | ||
| 482 | +## 7. 总结 | ||
| 483 | + | ||
| 484 | +本优化方案通过三个核心模块的协同工作,实现了: | ||
| 485 | + | ||
| 486 | +1. **智能化断句** - 基于多维度分析的语义分段 | ||
| 487 | +2. **自适应VAD** - 动态平衡响应速度与识别精度 | ||
| 488 | +3. **完整追踪** - 全链路结果标识与关联管理 | ||
| 489 | + | ||
| 490 | +预期效果: | ||
| 491 | +- 用户体验显著提升 | ||
| 492 | +- 识别准确率提高15-20% | ||
| 493 | +- 响应延迟降低30-40% | ||
| 494 | +- 系统可维护性增强 | ||
| 495 | + | ||
| 496 | +该方案采用渐进式实施策略,确保系统稳定性的同时逐步提升功能完善度。 |
doc/process/funasr_timeout_analysis.md
0 → 100644
| 1 | +# AIfeng/2025-07-17 16:25:06 | ||
| 2 | + | ||
| 3 | +# FunASR大文件超时问题分析与优化方案 | ||
| 4 | + | ||
| 5 | +## 问题现象 | ||
| 6 | + | ||
| 7 | +用户在使用FunASR进行语音识别时遇到以下问题: | ||
| 8 | +- **小文件**:识别正常,无超时问题 | ||
| 9 | +- **大文件**:出现连接超时错误 | ||
| 10 | +- **错误信息**:`[WinError 10054] 远程主机强迫关闭了一个现有的连接` | ||
| 11 | +- **发生时间**:16:17:53 | ||
| 12 | + | ||
| 13 | +## 根因分析 | ||
| 14 | + | ||
| 15 | +### 1. 超时配置问题 | ||
| 16 | + | ||
| 17 | +#### 当前超时设置 | ||
| 18 | +- **连接超时**:30秒(`config_util.py`中`asr_timeout`默认值) | ||
| 19 | +- **接收消息超时**:1秒(`_receive_messages`方法中的`asyncio.wait_for`) | ||
| 20 | +- **连接等待超时**:5秒(同步版本)/10秒(异步版本预热) | ||
| 21 | + | ||
| 22 | +#### 问题分析 | ||
| 23 | +```python | ||
| 24 | +# funasr_asr.py 第72行 - 连接超时配置 | ||
| 25 | +timeout_seconds = getattr(cfg, 'asr_timeout', 30) | ||
| 26 | +self.websocket = await asyncio.wait_for( | ||
| 27 | + websockets.connect(self.server_url), | ||
| 28 | + timeout=timeout_seconds | ||
| 29 | +) | ||
| 30 | + | ||
| 31 | +# 第145行 - 接收消息超时(过短) | ||
| 32 | +message = await asyncio.wait_for( | ||
| 33 | + self.websocket.recv(), | ||
| 34 | + timeout=1.0 # ⚠️ 仅1秒,对大文件处理不足 | ||
| 35 | +) | ||
| 36 | +``` | ||
| 37 | + | ||
| 38 | +### 2. 大文件处理机制缺陷 | ||
| 39 | + | ||
| 40 | +#### 分块发送逻辑 | ||
| 41 | +```python | ||
| 42 | +# funasr_asr.py 第500-520行 | ||
| 43 | +stride = int(60 * chunk_size / chunk_interval / 1000 * 16000 * 2) | ||
| 44 | + | ||
| 45 | +if len(audio_bytes) > stride: | ||
| 46 | + chunk_num = (len(audio_bytes) - 1) // stride + 1 | ||
| 47 | + for i in range(chunk_num): | ||
| 48 | + beg = i * stride | ||
| 49 | + chunk_data = audio_bytes[beg:beg + stride] | ||
| 50 | + self.message_queue.put(chunk_data) # ⚠️ 队列可能积压 | ||
| 51 | +``` | ||
| 52 | + | ||
| 53 | +#### 问题点 | ||
| 54 | +1. **队列积压**:大文件分块后产生大量消息,队列处理不及时 | ||
| 55 | +2. **发送频率**:`_send_message_loop`中`await asyncio.sleep(0.01)`间隔过短 | ||
| 56 | +3. **无流控机制**:缺乏背压控制,服务端可能过载 | ||
| 57 | + | ||
| 58 | +### 3. WebSocket连接稳定性 | ||
| 59 | + | ||
| 60 | +#### 心跳机制缺失 | ||
| 61 | +- **当前实现**:无主动心跳检测 | ||
| 62 | +- **连接检测**:仅依赖异常捕获 | ||
| 63 | +- **重连策略**:指数退避,但最大重连次数限制可能过严 | ||
| 64 | + | ||
| 65 | +## 优化方案 | ||
| 66 | + | ||
| 67 | +### 方案一:超时参数优化(立即可行) | ||
| 68 | + | ||
| 69 | +#### 1. 调整超时配置 | ||
| 70 | +```python | ||
| 71 | +# config_util.py 优化 | ||
| 72 | +class ConfigManager: | ||
| 73 | + def __init__(self): | ||
| 74 | + # ASR超时配置优化 | ||
| 75 | + self.asr_timeout = 60 # 连接超时:30→60秒 | ||
| 76 | + self.asr_receive_timeout = 30 # 接收超时:1→30秒 | ||
| 77 | + self.asr_send_interval = 0.05 # 发送间隔:0.01→0.05秒 | ||
| 78 | + self.asr_chunk_size = 8192 # 分块大小优化 | ||
| 79 | +``` | ||
| 80 | + | ||
| 81 | +#### 2. 动态超时计算 | ||
| 82 | +```python | ||
| 83 | +def calculate_timeout(self, audio_size_bytes): | ||
| 84 | + """根据音频大小动态计算超时时间""" | ||
| 85 | + base_timeout = 30 | ||
| 86 | + # 每MB增加10秒超时 | ||
| 87 | + size_mb = audio_size_bytes / (1024 * 1024) | ||
| 88 | + dynamic_timeout = base_timeout + (size_mb * 10) | ||
| 89 | + return min(dynamic_timeout, 300) # 最大5分钟 | ||
| 90 | +``` | ||
| 91 | + | ||
| 92 | +### 方案二:流控机制实现(推荐) | ||
| 93 | + | ||
| 94 | +#### 1. 队列大小限制 | ||
| 95 | +```python | ||
| 96 | +class FunASRAsyncClient: | ||
| 97 | + def __init__(self, username, server_url): | ||
| 98 | + # 限制队列大小,避免内存溢出 | ||
| 99 | + self.message_queue = queue.Queue(maxsize=100) | ||
| 100 | + self.send_semaphore = asyncio.Semaphore(10) # 并发控制 | ||
| 101 | +``` | ||
| 102 | + | ||
| 103 | +#### 2. 背压控制 | ||
| 104 | +```python | ||
| 105 | +async def _send_message_loop(self): | ||
| 106 | + """优化的发送消息循环""" | ||
| 107 | + while self.connected and self.websocket: | ||
| 108 | + try: | ||
| 109 | + # 检查队列大小,实现背压 | ||
| 110 | + if self.message_queue.qsize() > 50: | ||
| 111 | + await asyncio.sleep(0.1) # 队列过满时减缓发送 | ||
| 112 | + continue | ||
| 113 | + | ||
| 114 | + async with self.send_semaphore: | ||
| 115 | + message = self.message_queue.get_nowait() | ||
| 116 | + await self.websocket.send(message) | ||
| 117 | + | ||
| 118 | + except queue.Empty: | ||
| 119 | + await asyncio.sleep(0.05) # 优化等待间隔 | ||
| 120 | +``` | ||
| 121 | + | ||
| 122 | +### 方案三:分片上传机制(长期优化) | ||
| 123 | + | ||
| 124 | +#### 1. 大文件预处理 | ||
| 125 | +```python | ||
| 126 | +def preprocess_large_audio(self, audio_data, max_chunk_size=1024*1024): | ||
| 127 | + """大文件预处理和分片""" | ||
| 128 | + if len(audio_data) > max_chunk_size: | ||
| 129 | + # 按时间分片,而非简单字节分割 | ||
| 130 | + return self._split_by_time_segments(audio_data) | ||
| 131 | + return [audio_data] | ||
| 132 | + | ||
| 133 | +def _split_by_time_segments(self, audio_data, segment_seconds=30): | ||
| 134 | + """按时间段分割音频""" | ||
| 135 | + sample_rate = 16000 | ||
| 136 | + bytes_per_sample = 2 | ||
| 137 | + segment_bytes = segment_seconds * sample_rate * bytes_per_sample | ||
| 138 | + | ||
| 139 | + segments = [] | ||
| 140 | + for i in range(0, len(audio_data), segment_bytes): | ||
| 141 | + segments.append(audio_data[i:i + segment_bytes]) | ||
| 142 | + return segments | ||
| 143 | +``` | ||
| 144 | + | ||
| 145 | +#### 2. 分片识别结果合并 | ||
| 146 | +```python | ||
| 147 | +class SegmentResultManager: | ||
| 148 | + def __init__(self): | ||
| 149 | + self.segments = {} | ||
| 150 | + self.final_result = "" | ||
| 151 | + | ||
| 152 | + def add_segment_result(self, segment_id, text): | ||
| 153 | + self.segments[segment_id] = text | ||
| 154 | + self._merge_results() | ||
| 155 | + | ||
| 156 | + def _merge_results(self): | ||
| 157 | + # 按顺序合并分片结果 | ||
| 158 | + sorted_segments = sorted(self.segments.items()) | ||
| 159 | + self.final_result = " ".join([text for _, text in sorted_segments]) | ||
| 160 | +``` | ||
| 161 | + | ||
| 162 | +### 方案四:连接稳定性增强 | ||
| 163 | + | ||
| 164 | +#### 1. 心跳机制 | ||
| 165 | +```python | ||
| 166 | +async def _heartbeat_loop(self): | ||
| 167 | + """心跳检测循环""" | ||
| 168 | + while self.connected: | ||
| 169 | + try: | ||
| 170 | + # 每30秒发送心跳 | ||
| 171 | + await asyncio.sleep(30) | ||
| 172 | + if self.websocket: | ||
| 173 | + await self.websocket.ping() | ||
| 174 | + except Exception as e: | ||
| 175 | + util.log(2, f"心跳检测失败: {e}") | ||
| 176 | + self.connected = False | ||
| 177 | + break | ||
| 178 | +``` | ||
| 179 | + | ||
| 180 | +#### 2. 连接质量监控 | ||
| 181 | +```python | ||
| 182 | +class ConnectionMonitor: | ||
| 183 | + def __init__(self): | ||
| 184 | + self.success_count = 0 | ||
| 185 | + self.error_count = 0 | ||
| 186 | + self.last_success_time = time.time() | ||
| 187 | + | ||
| 188 | + def record_success(self): | ||
| 189 | + self.success_count += 1 | ||
| 190 | + self.last_success_time = time.time() | ||
| 191 | + | ||
| 192 | + def record_error(self): | ||
| 193 | + self.error_count += 1 | ||
| 194 | + | ||
| 195 | + def get_health_score(self): | ||
| 196 | + total = self.success_count + self.error_count | ||
| 197 | + if total == 0: | ||
| 198 | + return 1.0 | ||
| 199 | + return self.success_count / total | ||
| 200 | +``` | ||
| 201 | + | ||
| 202 | +## 实施建议 | ||
| 203 | + | ||
| 204 | +### 阶段一:紧急修复(1-2天) | ||
| 205 | +1. **调整超时参数**:将接收超时从1秒调整为30秒 | ||
| 206 | +2. **优化发送间隔**:从0.01秒调整为0.05秒 | ||
| 207 | +3. **增加队列大小限制**:防止内存溢出 | ||
| 208 | + | ||
| 209 | +### 阶段二:稳定性优化(3-5天) | ||
| 210 | +1. **实现动态超时计算**:根据文件大小调整超时 | ||
| 211 | +2. **添加背压控制机制**:防止队列积压 | ||
| 212 | +3. **增强错误处理和重连逻辑** | ||
| 213 | + | ||
| 214 | +### 阶段三:架构优化(1-2周) | ||
| 215 | +1. **实现分片上传机制**:支持超大文件处理 | ||
| 216 | +2. **添加连接池管理**:提高并发处理能力 | ||
| 217 | +3. **实现结果缓存机制**:避免重复处理 | ||
| 218 | + | ||
| 219 | +## 监控指标 | ||
| 220 | + | ||
| 221 | +### 关键指标 | ||
| 222 | +- **连接成功率**:>95% | ||
| 223 | +- **平均响应时间**:<文件时长×2 | ||
| 224 | +- **超时错误率**:<5% | ||
| 225 | +- **内存使用峰值**:<500MB | ||
| 226 | + | ||
| 227 | +### 告警阈值 | ||
| 228 | +- 连接失败率>10% | ||
| 229 | +- 队列积压>100条消息 | ||
| 230 | +- 单次处理时间>5分钟 | ||
| 231 | +- 内存使用>1GB | ||
| 232 | + | ||
| 233 | +## 测试验证 | ||
| 234 | + | ||
| 235 | +### 测试用例 | ||
| 236 | +1. **小文件测试**:<1MB,验证基本功能 | ||
| 237 | +2. **中等文件测试**:1-10MB,验证优化效果 | ||
| 238 | +3. **大文件测试**:>10MB,验证极限处理能力 | ||
| 239 | +4. **并发测试**:多用户同时上传 | ||
| 240 | +5. **网络异常测试**:模拟网络中断和恢复 | ||
| 241 | + | ||
| 242 | +### 性能基准 | ||
| 243 | +- **1MB文件**:<10秒完成识别 | ||
| 244 | +- **10MB文件**:<60秒完成识别 | ||
| 245 | +- **50MB文件**:<300秒完成识别 | ||
| 246 | + | ||
| 247 | +## 风险评估 | ||
| 248 | + | ||
| 249 | +### 技术风险 | ||
| 250 | +- **内存溢出**:大文件处理时内存激增 | ||
| 251 | +- **服务端压力**:并发大文件可能导致服务崩溃 | ||
| 252 | +- **网络稳定性**:长时间传输易受网络波动影响 | ||
| 253 | + | ||
| 254 | +### 缓解措施 | ||
| 255 | +- 实施内存监控和自动清理 | ||
| 256 | +- 添加服务端负载均衡 | ||
| 257 | +- 实现断点续传机制 | ||
| 258 | +- 增加详细的错误日志和监控 |
streaming/__init__.py
0 → 100644
# AIfeng/2025-01-07 09:46:00
"""
Streaming Speech Recognition Module.

Package providing real-time voice activity detection, cumulative
recognition and recognition-result management.

Components:
- StreamingVAD: streaming voice activity detection
- StreamingRecognitionManager: recognition result management
- StreamingRecorder: streaming audio recorder
"""

from .streaming_vad import StreamingVAD
from .streaming_recognition_manager import StreamingRecognitionManager
from .streaming_recorder import StreamingRecorder

# Public API re-exported from the submodules.
__all__ = [
    'StreamingVAD',
    'StreamingRecognitionManager',
    'StreamingRecorder'
]

__version__ = '1.0.0'
__author__ = 'AIfeng'
streaming/optimization/__init__.py
0 → 100644
# AIfeng/2025-07-07 15:25:48
# Package init for the streaming speech recognition optimization modules.

"""
Streaming speech recognition optimization package.

Core components:
1. IntelligentSentenceSegmentation - intelligent sentence segmentation
2. AdaptiveVADChunking - adaptive VAD chunking
3. RecognitionResultTracker - recognition result tracking
4. StreamingDisplayManager - streaming display management

These modules cooperate to provide a smarter, more efficient streaming
speech recognition experience.
"""

from .intelligent_segmentation import (
    IntelligentSentenceSegmentation,
    SpeechSegment,
    SegmentType,
    AdaptiveSilenceThreshold
)

from .adaptive_vad_chunking import (
    AdaptiveVADChunking,
    ChunkStrategy,
    RecognitionStage,
    ChunkConfig,
    AudioChunk,
    RecognitionResult,
    PerformanceMonitor,
    ProgressiveRecognition,
    ChunkQualityAssessor
)

from .recognition_result_tracker import (
    RecognitionResultTracker,
    ResultType,
    ResultStatus,
    RecognitionSegmentID,
    # Aliased: adaptive_vad_chunking also exports a RecognitionResult.
    RecognitionResult as TrackerRecognitionResult,
    ResultRelationship
)

from .streaming_display_manager import (
    StreamingDisplayManager,
    UpdateType,
    RefreshStrategy,
    DisplayPriority,
    DisplayUpdate,
    DisplaySegment,
    DisplayBuffer
)

from .optimization_manager import (
    OptimizationManager,
    OptimizationMode
)

__version__ = "1.0.0"
__author__ = "AIfeng"

__all__ = [
    # intelligent sentence segmentation
    'IntelligentSentenceSegmentation',
    'SpeechSegment',
    'SegmentType',
    'AdaptiveSilenceThreshold',

    # adaptive VAD chunking
    'AdaptiveVADChunking',
    'ChunkStrategy',
    'RecognitionStage',
    'ChunkConfig',
    'AudioChunk',
    'RecognitionResult',
    'PerformanceMonitor',
    'ProgressiveRecognition',
    'ChunkQualityAssessor',

    # recognition result tracking
    'RecognitionResultTracker',
    'ResultType',
    'ResultStatus',
    'RecognitionSegmentID',
    'TrackerRecognitionResult',
    'ResultRelationship',

    # streaming display management
    'StreamingDisplayManager',
    'UpdateType',
    'RefreshStrategy',
    'DisplayPriority',
    'DisplayUpdate',
    'DisplaySegment',
    'DisplayBuffer',

    # optimization manager
    'OptimizationManager',
    'OptimizationMode'
]
| 101 | + | ||
# Static package metadata; exposed read-only through get_module_info().
MODULE_INFO = {
    'name': 'streaming_optimization',
    'description': '流式语音识别优化模块集合',
    'version': __version__,
    'author': __author__,
    'components': {
        'intelligent_segmentation': '智能断句 - 基于静音间隔和语义分析的语音分段',
        'adaptive_vad_chunking': '自适应VAD分片 - 动态平衡响应速度与识别精度',
        'recognition_result_tracker': '识别结果追踪 - 完整的结果追踪与关联管理',
        'streaming_display_manager': '流式显示管理 - 增量更新与刷新策略'
    },
    'features': [
        '多级静音阈值智能断句',
        '自适应VAD分片策略',
        '渐进式识别处理',
        '结果唯一标识与追踪',
        '增量显示更新',
        '防抖刷新机制',
        '性能监控与优化'
    ]
}
| 124 | + | ||
def get_module_info():
    """Return the MODULE_INFO metadata dict.

    Note: the module-level dict itself is returned (not a copy); callers
    should treat it as read-only.
    """
    return MODULE_INFO
| 128 | + | ||
def get_version():
    """Return the package version string (``__version__``)."""
    return __version__
| 1 | +# AIfeng/2025-07-07 15:25:48 | ||
| 2 | +# 自适应VAD分片优化模块 - 动态平衡响应速度与识别精度 | ||
| 3 | + | ||
| 4 | +import time | ||
| 5 | +import numpy as np | ||
| 6 | +from typing import List, Dict, Optional, Tuple, Any | ||
| 7 | +from dataclasses import dataclass | ||
| 8 | +from enum import Enum | ||
| 9 | +import threading | ||
| 10 | +import logging | ||
| 11 | +from collections import deque | ||
| 12 | + | ||
class ChunkStrategy(Enum):
    """Chunking strategy selector used by AdaptiveVADChunking."""
    FAST_RESPONSE = "fast_response"  # favor low latency (speed_weight 0.7)
    HIGH_ACCURACY = "high_accuracy"  # favor accuracy (quality_weight 0.8)
    BALANCED = "balanced"            # equal quality/speed weighting
    ADAPTIVE = "adaptive"            # strategy adjusted from runtime feedback
| 19 | + | ||
class RecognitionStage(Enum):
    """Progressive-recognition stage of an audio chunk."""
    IMMEDIATE = "immediate"  # first, lowest-latency pass
    REFINED = "refined"      # intermediate refinement pass
    FINAL = "final"          # final, highest-confidence pass
| 25 | + | ||
@dataclass
class ChunkConfig:
    """Per-strategy chunking parameters (durations in seconds)."""
    min_duration: float           # minimum chunk duration
    max_duration: float           # maximum chunk duration
    confidence_threshold: float   # minimum acceptable recognition confidence
    overlap_ratio: float = 0.1    # fractional overlap between adjacent chunks
    quality_weight: float = 0.5   # weight given to recognition quality
    speed_weight: float = 0.5     # weight given to response speed
| 35 | + | ||
@dataclass
class AudioChunk:
    """One slice of audio produced by the chunker, plus recognition metadata."""
    data: bytes                            # raw audio bytes of this chunk
    duration: float                        # chunk length in seconds
    start_time: float                      # chunk start time
    end_time: float                        # chunk end time
    chunk_id: str                          # unique identifier of this chunk
    strategy: ChunkStrategy                # strategy active when the chunk was created
    stage: RecognitionStage                # progressive-recognition stage
    confidence: float = 0.0                # recognition confidence (set after processing)
    is_processed: bool = False             # True once the chunk has been recognized
    parent_chunk_id: Optional[str] = None  # id of the chunk this was derived from, if any
    is_speech: bool = True                 # speech/non-speech flag from VAD
    timestamp: float = 0.0                 # creation timestamp
| 51 | + | ||
@dataclass
class RecognitionResult:
    """Result of recognizing a single audio chunk."""
    text: str                    # recognized text
    confidence: float            # model confidence for this text
    chunk_id: str                # id of the AudioChunk this result came from
    stage: RecognitionStage      # progressive-recognition stage that produced it
    processing_time: float       # time spent producing this result
    accuracy_score: float = 0.0  # accuracy estimate consumed by PerformanceMonitor
| 61 | + | ||
class PerformanceMonitor:
    """Rolling monitor of recognition quality over a fixed-size window.

    Keeps the last ``window_size`` accuracy / latency / confidence samples
    in bounded deques and exposes their running means (0.0 when empty).
    """

    def __init__(self, window_size: int = 20):
        self.window_size = window_size
        # maxlen makes each history self-truncating to the last N samples.
        self.accuracy_history = deque(maxlen=window_size)
        self.latency_history = deque(maxlen=window_size)
        self.confidence_history = deque(maxlen=window_size)

    @staticmethod
    def _window_mean(history) -> float:
        """Mean of a sample window as a plain float; 0.0 for an empty window.

        float() normalizes the numpy scalar returned by np.mean so the
        declared ``-> float`` return types hold.
        """
        return float(np.mean(history)) if history else 0.0

    def record_result(self, result: "RecognitionResult", latency: float):
        """Record one recognition result plus its observed latency.

        The annotation is a forward reference so this class no longer
        requires RecognitionResult to be defined before it.
        """
        self.accuracy_history.append(result.accuracy_score)
        self.latency_history.append(latency)
        self.confidence_history.append(result.confidence)

    def get_recent_accuracy(self) -> float:
        """Mean accuracy over the window (0.0 when no samples yet)."""
        return self._window_mean(self.accuracy_history)

    def get_recent_latency(self) -> float:
        """Mean latency over the window (0.0 when no samples yet)."""
        return self._window_mean(self.latency_history)

    def get_recent_confidence(self) -> float:
        """Mean confidence over the window (0.0 when no samples yet)."""
        return self._window_mean(self.confidence_history)

    def update_metrics(self, metrics: Dict):
        """Append whichever of 'accuracy'/'latency'/'confidence' are present."""
        for key, history in (('accuracy', self.accuracy_history),
                             ('latency', self.latency_history),
                             ('confidence', self.confidence_history)):
            if key in metrics:
                history.append(metrics[key])
| 97 | + | ||
| 98 | +class AdaptiveVADChunking: | ||
| 99 | + """自适应VAD分片处理器""" | ||
| 100 | + | ||
| 101 | + def __init__(self, config: Dict = None): | ||
| 102 | + self.config = config or self._get_default_config() | ||
| 103 | + | ||
| 104 | + # 分片策略配置 | ||
| 105 | + self.chunk_strategies = { | ||
| 106 | + ChunkStrategy.FAST_RESPONSE: ChunkConfig( | ||
| 107 | + min_duration=0.5, | ||
| 108 | + max_duration=2.0, | ||
| 109 | + confidence_threshold=0.7, | ||
| 110 | + quality_weight=0.3, | ||
| 111 | + speed_weight=0.7 | ||
| 112 | + ), | ||
| 113 | + ChunkStrategy.HIGH_ACCURACY: ChunkConfig( | ||
| 114 | + min_duration=1.5, | ||
| 115 | + max_duration=4.0, | ||
| 116 | + confidence_threshold=0.8, | ||
| 117 | + quality_weight=0.8, | ||
| 118 | + speed_weight=0.2 | ||
| 119 | + ), | ||
| 120 | + ChunkStrategy.BALANCED: ChunkConfig( | ||
| 121 | + min_duration=1.0, | ||
| 122 | + max_duration=3.0, | ||
| 123 | + confidence_threshold=0.75, | ||
| 124 | + quality_weight=0.5, | ||
| 125 | + speed_weight=0.5 | ||
| 126 | + ), | ||
| 127 | + ChunkStrategy.ADAPTIVE: ChunkConfig( | ||
| 128 | + min_duration=0.8, | ||
| 129 | + max_duration=3.5, | ||
| 130 | + confidence_threshold=0.75, | ||
| 131 | + quality_weight=0.6, | ||
| 132 | + speed_weight=0.4 | ||
| 133 | + ) | ||
| 134 | + } | ||
| 135 | + | ||
| 136 | + self.current_strategy = ChunkStrategy.ADAPTIVE | ||
| 137 | + self.performance_monitor = PerformanceMonitor() | ||
| 138 | + self.chunk_buffer = [] | ||
| 139 | + self.processing_queue = deque() | ||
| 140 | + | ||
| 141 | + # 自适应参数 | ||
| 142 | + self.adaptation_enabled = self.config.get('adaptation_enabled', True) | ||
| 143 | + self.strategy_switch_threshold = self.config.get('strategy_switch_threshold', 0.75) | ||
| 144 | + self.min_samples_for_adaptation = self.config.get('min_samples_for_adaptation', 10) | ||
| 145 | + | ||
| 146 | + self.logger = logging.getLogger(__name__) | ||
| 147 | + self._lock = threading.Lock() | ||
| 148 | + | ||
| 149 | + # 回调函数管理 | ||
| 150 | + self.quality_callbacks = [] # 质量反馈回调 | ||
| 151 | + | ||
| 152 | + # 内存管理 | ||
| 153 | + self.last_cleanup_time = time.time() | ||
| 154 | + self.cleanup_interval = 30.0 # 30秒清理一次 | ||
| 155 | + | ||
| 156 | + def get_performance_stats(self) -> Dict: | ||
| 157 | + """获取性能统计""" | ||
| 158 | + with self._lock: | ||
| 159 | + return { | ||
| 160 | + 'current_strategy': self.current_strategy.value if hasattr(self.current_strategy, 'value') else str(self.current_strategy), | ||
| 161 | + 'total_chunks_processed': getattr(self, 'total_chunks_processed', 0), | ||
| 162 | + 'speech_chunks': getattr(self, 'speech_chunks', 0), | ||
| 163 | + 'silence_chunks': getattr(self, 'silence_chunks', 0), | ||
| 164 | + 'average_chunk_duration': getattr(self, 'average_chunk_duration', 0.0) | ||
| 165 | + } | ||
| 166 | + | ||
| 167 | + def set_strategy(self, strategy): | ||
| 168 | + """设置VAD策略""" | ||
| 169 | + with self._lock: | ||
| 170 | + self.current_strategy = strategy | ||
| 171 | + self.logger.info(f"VAD策略已设置为: {strategy}") | ||
| 172 | + | ||
    def register_quality_callback(self, callback):
        """Register a quality-feedback callback.

        The callback is later invoked by _trigger_quality_callbacks with
        ``(chunk_id, quality_metrics)``. Note: registration is not
        lock-protected; complete_session may later trim this list to the
        10 most recent entries.
        """
        self.quality_callbacks.append(callback)
        self.logger.debug("注册质量反馈回调函数")
| 177 | + | ||
| 178 | + def _trigger_quality_callbacks(self, chunk_id: str, quality_metrics: Dict): | ||
| 179 | + """触发质量反馈回调""" | ||
| 180 | + for callback in self.quality_callbacks: | ||
| 181 | + try: | ||
| 182 | + callback(chunk_id, quality_metrics) | ||
| 183 | + except Exception as e: | ||
| 184 | + self.logger.error(f"质量反馈回调执行失败: {e}") | ||
| 185 | + | ||
    def create_session(self, session_id: str):
        """Register a new chunking session.

        NOTE(review): currently this only logs the event — no per-session
        state is allocated; the chunk buffers are shared across sessions.
        """
        self.logger.info(f"VAD分片会话创建: {session_id}")
| 190 | + | ||
| 191 | + def complete_session(self, session_id: str): | ||
| 192 | + """完成会话""" | ||
| 193 | + # 清理会话相关的缓存数据 | ||
| 194 | + with self._lock: | ||
| 195 | + self.chunk_buffer.clear() | ||
| 196 | + self.processing_queue.clear() | ||
| 197 | + # 限制回调函数数量,防止内存泄漏 | ||
| 198 | + if len(self.quality_callbacks) > 10: | ||
| 199 | + self.quality_callbacks = self.quality_callbacks[-10:] | ||
| 200 | + self.logger.info(f"VAD分片会话完成: {session_id},已清理缓存数据") | ||
| 201 | + | ||
| 202 | + def process_audio(self, session_id: str, audio_data: bytes, sample_rate: int, strategy: ChunkStrategy = None) -> List: | ||
| 203 | + """处理音频数据(兼容OptimizationManager调用)""" | ||
| 204 | + try: | ||
| 205 | + timestamp = time.time() | ||
| 206 | + chunks = self.process_audio_data(audio_data, timestamp) | ||
| 207 | + return chunks | ||
| 208 | + except Exception as e: | ||
| 209 | + self.logger.error(f"处理音频数据失败: {e}") | ||
| 210 | + return [] | ||
| 211 | + | ||
| 212 | + def _get_default_config(self) -> Dict: | ||
| 213 | + """获取默认配置""" | ||
| 214 | + return { | ||
| 215 | + 'adaptation_enabled': True, | ||
| 216 | + 'strategy_switch_threshold': 0.75, | ||
| 217 | + 'min_samples_for_adaptation': 10, | ||
| 218 | + 'max_chunk_buffer_size': 50, | ||
| 219 | + 'progressive_recognition': True, | ||
| 220 | + 'quality_feedback_enabled': True | ||
| 221 | + } | ||
| 222 | + | ||
    def process_audio_data(self, audio_data: bytes, timestamp: float,
                          context: Dict = None) -> List[AudioChunk]:
        """Chunk raw audio bytes and enqueue the resulting AudioChunks.

        Args:
            audio_data: raw PCM bytes to split.
            timestamp: capture time of the first byte (seconds since epoch).
            context: optional hints (interaction_mode, noise_level, ...) for
                adaptive strategy selection.

        Returns:
            The list of created chunks; empty list on error.
        """
        try:
            with self._lock:
                # Periodic memory cleanup to limit growth of internal buffers.
                current_time = time.time()
                if current_time - self.last_cleanup_time > self.cleanup_interval:
                    self._cleanup_memory()
                    self.last_cleanup_time = current_time

                # Re-evaluate the chunking strategy from context, if enabled.
                if self.adaptation_enabled:
                    self._update_strategy(context or {})

                # Split the audio into chunks using the current strategy.
                chunks = self._create_chunks(audio_data, timestamp)

                # Enqueue chunks; bound the queue size to prevent a leak.
                max_queue_size = self.config.get('max_chunk_buffer_size', 50)
                for chunk in chunks:
                    if len(self.processing_queue) >= max_queue_size:
                        # Queue full: evict the oldest chunk first (FIFO).
                        removed_chunk = self.processing_queue.popleft()
                        self.logger.warning(f"处理队列已满,移除分片: {removed_chunk.chunk_id}")
                    self.processing_queue.append(chunk)

                return chunks

        except Exception as e:
            self.logger.error(f"处理音频数据时出错: {e}")
            return []
| 256 | + | ||
    def _update_strategy(self, context: Dict):
        """Re-select the chunking strategy from recent metrics and context."""
        # Not enough history yet to adapt reliably.
        if len(self.performance_monitor.accuracy_history) < self.min_samples_for_adaptation:
            return

        current_accuracy = self.performance_monitor.get_recent_accuracy()
        current_latency = self.performance_monitor.get_recent_latency()

        # Context hints supplied by the caller (all optional).
        interaction_mode = context.get('interaction_mode', 'normal')
        noise_level = context.get('noise_level', 0.1)
        user_patience = context.get('user_patience', 'normal')  # 'low', 'normal', 'high'

        # Strategy selection logic.
        new_strategy = self._select_optimal_strategy(
            current_accuracy, current_latency, interaction_mode,
            noise_level, user_patience
        )

        if new_strategy != self.current_strategy:
            self.logger.info(f"策略切换: {self.current_strategy.value} -> {new_strategy.value}")
            self.current_strategy = new_strategy
| 279 | + | ||
| 280 | + def _select_optimal_strategy(self, accuracy: float, latency: float, | ||
| 281 | + interaction_mode: str, noise_level: float, | ||
| 282 | + user_patience: str) -> ChunkStrategy: | ||
| 283 | + """选择最优分片策略""" | ||
| 284 | + # 快速响应条件 | ||
| 285 | + if (interaction_mode == 'quick_qa' and accuracy > 0.85 and | ||
| 286 | + user_patience == 'low'): | ||
| 287 | + return ChunkStrategy.FAST_RESPONSE | ||
| 288 | + | ||
| 289 | + # 高精度条件 | ||
| 290 | + if (noise_level > 0.3 or accuracy < 0.7 or | ||
| 291 | + interaction_mode == 'detailed_analysis'): | ||
| 292 | + return ChunkStrategy.HIGH_ACCURACY | ||
| 293 | + | ||
| 294 | + # 自适应条件 | ||
| 295 | + if self.config.get('enable_adaptive_strategy', False): | ||
| 296 | + return ChunkStrategy.ADAPTIVE | ||
| 297 | + | ||
| 298 | + # 默认平衡模式 | ||
| 299 | + return ChunkStrategy.BALANCED | ||
| 300 | + | ||
    def _create_chunks(self, audio_data: bytes, timestamp: float) -> List[AudioChunk]:
        """Split audio bytes into overlapping AudioChunks per the active strategy.

        Args:
            audio_data: raw PCM bytes.
            timestamp: capture time of the first byte.

        Returns:
            Ordered list of chunks; empty when the audio parameters are invalid.
        """
        chunks = []
        current_config = self.chunk_strategies[self.current_strategy]

        # Chunking parameters.
        data_length = len(audio_data)
        sample_rate = self.config.get('sample_rate', 16000)
        bytes_per_sample = self.config.get('bytes_per_sample', 2)

        # Estimate audio duration — guard against division by zero.
        if sample_rate <= 0 or bytes_per_sample <= 0:
            self.logger.error(f"无效的音频参数: sample_rate={sample_rate}, bytes_per_sample={bytes_per_sample}")
            return []

        audio_duration = data_length / (sample_rate * bytes_per_sample)

        # Pick the chunk duration for this pass.
        chunk_duration = self._calculate_optimal_chunk_duration(
            audio_duration, current_config
        )

        # Generate chunks with byte overlap between neighbours.
        chunk_size = int(chunk_duration * sample_rate * bytes_per_sample)
        overlap_size = int(chunk_size * current_config.overlap_ratio)

        start_pos = 0
        chunk_index = 0

        # Safety valve against infinite loops.
        max_iterations = 1000  # hard iteration cap
        iteration_count = 0

        while start_pos < data_length and iteration_count < max_iterations:
            end_pos = min(start_pos + chunk_size, data_length)

            # A chunk must contain at least one byte of data.
            if end_pos <= start_pos:
                self.logger.warning(f"无效分片位置: start_pos={start_pos}, end_pos={end_pos}")
                break

            chunk_data = audio_data[start_pos:end_pos]
            chunk_start_time = timestamp + (start_pos / (sample_rate * bytes_per_sample))
            chunk_end_time = timestamp + (end_pos / (sample_rate * bytes_per_sample))

            chunk = AudioChunk(
                data=chunk_data,
                duration=chunk_end_time - chunk_start_time,
                start_time=chunk_start_time,
                end_time=chunk_end_time,
                chunk_id=f"{int(timestamp * 1000)}_{chunk_index}",
                strategy=self.current_strategy,
                stage=RecognitionStage.IMMEDIATE,
                timestamp=chunk_start_time  # chunk-local start timestamp
            )

            chunks.append(chunk)

            # Advance; adjacent chunks overlap by overlap_size bytes.
            next_pos = end_pos - overlap_size if overlap_size > 0 else end_pos

            # Guarantee forward progress to avoid an infinite loop.
            if next_pos <= start_pos:
                next_pos = start_pos + max(1, chunk_size // 4)  # advance at least 1/4 chunk
                self.logger.warning(f"调整分片位置以防止无限循环: {start_pos} -> {next_pos}")

            start_pos = next_pos
            chunk_index += 1
            iteration_count += 1

        if iteration_count >= max_iterations:
            self.logger.error(f"分片创建达到最大迭代次数限制: {max_iterations}")

        return chunks
| 375 | + | ||
| 376 | + def _calculate_optimal_chunk_duration(self, total_duration: float, | ||
| 377 | + config: ChunkConfig) -> float: | ||
| 378 | + """计算最优分片时长""" | ||
| 379 | + # 基础分片时长 | ||
| 380 | + base_duration = min(config.max_duration, | ||
| 381 | + max(config.min_duration, total_duration / 3)) | ||
| 382 | + | ||
| 383 | + # 根据性能历史调整 | ||
| 384 | + recent_accuracy = self.performance_monitor.get_recent_accuracy() | ||
| 385 | + recent_latency = self.performance_monitor.get_recent_latency() | ||
| 386 | + | ||
| 387 | + # 动态调整因子 | ||
| 388 | + if recent_accuracy < self.strategy_switch_threshold: | ||
| 389 | + # 准确率低,增加分片时长 | ||
| 390 | + adjustment_factor = 1.2 | ||
| 391 | + elif recent_latency > 2.0: # 延迟过高 | ||
| 392 | + # 延迟高,减少分片时长 | ||
| 393 | + adjustment_factor = 0.8 | ||
| 394 | + else: | ||
| 395 | + adjustment_factor = 1.0 | ||
| 396 | + | ||
| 397 | + optimal_duration = base_duration * adjustment_factor | ||
| 398 | + | ||
| 399 | + # 确保在配置范围内 | ||
| 400 | + return max(config.min_duration, | ||
| 401 | + min(config.max_duration, optimal_duration)) | ||
| 402 | + | ||
    def process_audio_chunk(self, audio_data) -> Optional[AudioChunk]:
        """Normalize one audio buffer to bytes and return its first chunk.

        Accepts numpy arrays (float or int) or raw bytes; returns None on
        failure or when no chunk was produced.
        """
        try:
            # Normalize the input to raw int16 PCM bytes.
            if isinstance(audio_data, np.ndarray):
                # Float arrays must be rescaled into the int16 range first.
                if audio_data.dtype == np.float32 or audio_data.dtype == np.float64:
                    if audio_data.max() <= 1.0 and audio_data.min() >= 0.0:
                        # [0, 1] range: shift to [-1, 1] then scale to int16.
                        audio_data = (audio_data * 2 - 1) * 32767
                    else:
                        # Assumed [-1, 1] range: scale directly to int16.
                        audio_data = audio_data * 32767
                    # bytes(...) copies, avoiding
                    # "BufferError: memoryview has 1 exported buffer".
                    audio_int16 = audio_data.astype(np.int16)
                    audio_bytes = bytes(audio_int16.tobytes())
                else:
                    # Integer arrays: cast directly.
                    # Same BufferError workaround as above.
                    audio_int16 = audio_data.astype(np.int16)
                    audio_bytes = bytes(audio_int16.tobytes())
            elif isinstance(audio_data, bytes):
                audio_bytes = audio_data
            else:
                # Last resort: let bytes() attempt the conversion.
                audio_bytes = bytes(audio_data)

            timestamp = time.time()
            chunks = self.process_audio_data(audio_bytes, timestamp)
            return chunks[0] if chunks else None
        except Exception as e:
            self.logger.error(f"处理音频分片失败: {e}")
            return None
| 437 | + | ||
| 438 | + def select_optimal_strategy(self) -> ChunkStrategy: | ||
| 439 | + """选择最优策略""" | ||
| 440 | + try: | ||
| 441 | + recent_accuracy = self.performance_monitor.get_recent_accuracy() | ||
| 442 | + recent_latency = self.performance_monitor.get_recent_latency() | ||
| 443 | + recent_confidence = self.performance_monitor.get_recent_confidence() | ||
| 444 | + | ||
| 445 | + # 基于性能指标选择策略 | ||
| 446 | + if recent_accuracy < 0.7 or recent_confidence < 0.6: | ||
| 447 | + return ChunkStrategy.HIGH_ACCURACY | ||
| 448 | + elif recent_latency > 1.0: | ||
| 449 | + return ChunkStrategy.FAST_RESPONSE | ||
| 450 | + elif recent_accuracy > 0.9 and recent_latency < 0.5: | ||
| 451 | + return ChunkStrategy.ADAPTIVE | ||
| 452 | + else: | ||
| 453 | + return ChunkStrategy.BALANCED | ||
| 454 | + except Exception as e: | ||
| 455 | + self.logger.error(f"选择最优策略失败: {e}") | ||
| 456 | + return ChunkStrategy.BALANCED | ||
| 457 | + | ||
    def _cleanup_memory(self):
        """Trim internal buffers back to their configured bounds (leak guard)."""
        try:
            # Shrink chunk_buffer when it exceeds the configured cap,
            # keeping only the newest half-cap entries.
            max_buffer_size = self.config.get('max_chunk_buffer_size', 50)
            if len(self.chunk_buffer) > max_buffer_size:
                self.chunk_buffer = self.chunk_buffer[-max_buffer_size//2:]
                self.logger.info(f"清理chunk_buffer,保留最新的{max_buffer_size//2}个分片")

            # Cap the number of registered quality callbacks.
            if len(self.quality_callbacks) > 10:
                self.quality_callbacks = self.quality_callbacks[-10:]
                self.logger.info("清理过多的质量回调函数")

            # Rebuild the processing queue, keeping only recent chunks.
            current_time = time.time()
            old_queue_size = len(self.processing_queue)
            self.processing_queue = deque([
                chunk for chunk in self.processing_queue
                if current_time - chunk.start_time < 60.0  # keep chunks from the last 60s
            ])

            if old_queue_size != len(self.processing_queue):
                self.logger.info(f"清理处理队列,从{old_queue_size}个分片减少到{len(self.processing_queue)}个")

        except Exception as e:
            self.logger.error(f"内存清理失败: {e}")
| 485 | + | ||
class ProgressiveRecognition:
    """Progressive multi-stage recognition processor.

    Depending on how much audio a chunk carries, runs up to three passes —
    immediate (fast), refined (balanced) and final (accurate) — and caches
    the per-stage results keyed by chunk_id.
    """

    def __init__(self, config: Dict = None):
        """Args:
            config: optional settings dict (currently unused by the stages).
        """
        self.config = config or {}
        # Minimum chunk duration (seconds) required to run each stage.
        self.recognition_stages = {
            RecognitionStage.IMMEDIATE: 0.8,  # 800ms quick pass
            RecognitionStage.REFINED: 2.0,    # 2s refinement pass
            RecognitionStage.FINAL: 4.0       # 4s final pass
        }

        self.stage_results = {}  # chunk_id -> {stage: RecognitionResult}
        self.logger = logging.getLogger(__name__)

    def process_audio_segment(self, chunk: AudioChunk) -> Dict[RecognitionStage, RecognitionResult]:
        """Run every stage whose duration threshold *chunk* meets.

        Returns:
            Mapping of stage -> RecognitionResult; empty dict on failure.
        """
        results = {}

        try:
            # Stage 1: immediate recognition (lowest latency).
            if chunk.duration >= self.recognition_stages[RecognitionStage.IMMEDIATE]:
                immediate_result = self._quick_recognition(chunk)
                if immediate_result:
                    results[RecognitionStage.IMMEDIATE] = immediate_result

            # Stage 2: refined recognition (balanced).
            if chunk.duration >= self.recognition_stages[RecognitionStage.REFINED]:
                refined_result = self._refined_recognition(chunk)
                if refined_result:
                    results[RecognitionStage.REFINED] = refined_result

            # Stage 3: final recognition (highest accuracy).
            if chunk.duration >= self.recognition_stages[RecognitionStage.FINAL]:
                final_result = self._final_recognition(chunk)
                if final_result:
                    results[RecognitionStage.FINAL] = final_result

            # Cache this chunk's results.
            if results:
                self.stage_results[chunk.chunk_id] = results

            # Evict stale entries periodically to bound memory use.
            if len(self.stage_results) > 100:
                self.cleanup_old_results(max_age=60.0)

            return results

        except Exception as e:
            self.logger.error(f"渐进式识别处理出错: {e}")
            return {}

    def _quick_recognition(self, chunk: AudioChunk) -> Optional[RecognitionResult]:
        """Simulated fast pass (a real ASR backend should be called here).

        Fix: removed the unused `processing_start` local left over from a
        timing experiment; no sleeps so tests never block.
        """
        simulated_processing_time = 0.1  # pretend the pass took 100ms

        return RecognitionResult(
            text=f"快速识别结果_{chunk.chunk_id}",
            confidence=0.6,
            chunk_id=chunk.chunk_id,
            stage=RecognitionStage.IMMEDIATE,
            processing_time=simulated_processing_time,
            accuracy_score=0.7
        )

    def _refined_recognition(self, chunk: AudioChunk) -> Optional[RecognitionResult]:
        """Simulated balanced pass (no real waiting)."""
        simulated_processing_time = 0.3  # pretend the pass took 300ms

        return RecognitionResult(
            text=f"精化识别结果_{chunk.chunk_id}",
            confidence=0.8,
            chunk_id=chunk.chunk_id,
            stage=RecognitionStage.REFINED,
            processing_time=simulated_processing_time,
            accuracy_score=0.85
        )

    def _final_recognition(self, chunk: AudioChunk) -> Optional[RecognitionResult]:
        """Simulated high-accuracy pass (no real waiting)."""
        simulated_processing_time = 0.5  # pretend the pass took 500ms

        return RecognitionResult(
            text=f"最终识别结果_{chunk.chunk_id}",
            confidence=0.9,
            chunk_id=chunk.chunk_id,
            stage=RecognitionStage.FINAL,
            processing_time=simulated_processing_time,
            accuracy_score=0.95
        )

    def get_best_result(self, chunk_id: str) -> Optional[RecognitionResult]:
        """Return the highest-quality cached result for *chunk_id*, if any."""
        if chunk_id not in self.stage_results:
            return None

        results = self.stage_results[chunk_id]

        # Prefer FINAL, then REFINED, then IMMEDIATE.
        for stage in [RecognitionStage.FINAL, RecognitionStage.REFINED, RecognitionStage.IMMEDIATE]:
            if stage in results:
                return results[stage]

        return None

    def cleanup_old_results(self, max_age: float = 300.0):
        """Delete cached results older than *max_age* seconds.

        The chunk timestamp is recovered from the "millis_index" chunk_id
        format produced by the chunker; unparsable ids are kept.
        """
        current_time = time.time()
        expired_chunks = []

        for chunk_id, results in self.stage_results.items():
            try:
                chunk_timestamp = float(chunk_id.split('_')[0]) / 1000.0  # ms -> s
                if current_time - chunk_timestamp > max_age:
                    expired_chunks.append(chunk_id)
            except (ValueError, IndexError):
                # Keep results whose id cannot be parsed.
                continue

        for chunk_id in expired_chunks:
            del self.stage_results[chunk_id]

        if expired_chunks:
            self.logger.info(f"清理了 {len(expired_chunks)} 个过期识别结果")
| 618 | + | ||
class ChunkQualityAssessor:
    """Heuristic quality scorer for audio chunks.

    Combines a duration-based score with a strategy-based score; the
    signal-level metrics are placeholders for a future implementation.
    """

    def __init__(self):
        # Placeholder slots for a future signal-level assessment.
        self.quality_metrics = {
            'signal_to_noise_ratio': 0.0,
            'audio_clarity': 0.0,
            'speech_continuity': 0.0,
            'duration_appropriateness': 0.0
        }

    def assess_chunk_quality(self, chunk: AudioChunk) -> float:
        """Return an overall quality score in [0, 1] for *chunk*.

        NOTE: real audio-quality analysis is not implemented yet; the score
        is the mean of the duration and strategy heuristics.
        """
        duration_score = self._assess_duration_quality(chunk.duration)
        strategy_score = self._assess_strategy_appropriateness(chunk.strategy)
        combined = (duration_score + strategy_score) / 2
        return min(1.0, max(0.0, combined))

    def _assess_duration_quality(self, duration: float) -> float:
        """Score chunk duration: 1-3s is ideal, adjacent ranges acceptable."""
        if 1.0 <= duration <= 3.0:
            return 1.0
        if 0.5 <= duration < 1.0 or 3.0 < duration <= 5.0:
            return 0.7
        return 0.3

    def _assess_strategy_appropriateness(self, strategy: ChunkStrategy) -> float:
        """Score how suitable the chunk's strategy is (fixed table for now)."""
        scores = {
            ChunkStrategy.FAST_RESPONSE: 0.8,
            ChunkStrategy.BALANCED: 0.9,
            ChunkStrategy.HIGH_ACCURACY: 0.85,
            ChunkStrategy.ADAPTIVE: 0.95
        }
        return scores.get(strategy, 0.5)
| 1 | +# AIfeng/2025-07-07 15:25:48 | ||
| 2 | +# 智能断句模块 - 基于静音间隔的语义分段 | ||
| 3 | + | ||
| 4 | +import time | ||
| 5 | +import numpy as np | ||
| 6 | +from typing import List, Dict, Optional, Tuple | ||
| 7 | +from dataclasses import dataclass | ||
| 8 | +from enum import Enum | ||
| 9 | +import threading | ||
| 10 | +import logging | ||
| 11 | + | ||
class SegmentType(Enum):
    """Classification of the boundary detected around a speech segment."""
    WORD_CONTINUATION = "word_continuation"  # pause between words
    PHRASE_CONNECTION = "phrase_connection"  # pause between phrases
    SENTENCE_BOUNDARY = "sentence_boundary"  # sentence boundary
    TOPIC_BOUNDARY = "topic_boundary"        # topic boundary
| 18 | + | ||
@dataclass
class SpeechSegment:
    """One recognized stretch of speech plus its surrounding silence."""
    text: str                  # recognized text
    start_time: float          # segment start timestamp (seconds)
    end_time: float            # segment end timestamp (seconds)
    silence_before: float      # silence preceding the segment (seconds)
    silence_after: float       # silence following the segment (seconds)
    confidence: float          # recognition confidence in [0, 1]
    segment_type: SegmentType  # boundary classification
    is_complete: bool = False  # True once the sentence is finalized
| 30 | + | ||
| 31 | +class IntelligentSentenceSegmentation: | ||
| 32 | + """智能断句处理器""" | ||
| 33 | + | ||
    def __init__(self, config: Dict = None):
        """Initialize the segmenter.

        Args:
            config: optional overrides; see _get_default_config for keys.
        """
        self.config = config or self._get_default_config()
        self.silence_thresholds = self.config.get('silence_thresholds', {
            'micro_pause': 0.3,      # pause between words
            'phrase_pause': 1.0,     # pause between phrases
            'sentence_pause': 2.0,   # pause between sentences
            'topic_pause': 4.0       # pause between topics
        })

        self.segment_buffer = []  # pending (not yet finalized) segments
        # Rough model of the speaker's habits; informs adaptive thresholds.
        self.user_speech_pattern = {
            'avg_pause_duration': 1.2,
            'speech_rate': 150,  # words per minute
            'pause_variance': 0.3
        }

        self.recent_pauses = []  # recent pause durations (seconds)
        self.adaptive_enabled = self.config.get('adaptive_threshold', True)

        self.logger = logging.getLogger(__name__)
| 54 | + | ||
| 55 | + def _get_default_config(self) -> Dict: | ||
| 56 | + """获取默认配置""" | ||
| 57 | + return { | ||
| 58 | + 'silence_thresholds': { | ||
| 59 | + 'micro_pause': 0.3, | ||
| 60 | + 'phrase_pause': 1.0, | ||
| 61 | + 'sentence_pause': 2.0, | ||
| 62 | + 'topic_pause': 4.0 | ||
| 63 | + }, | ||
| 64 | + 'adaptive_threshold': True, | ||
| 65 | + 'semantic_analysis': True, | ||
| 66 | + 'grammar_check': True, | ||
| 67 | + 'max_segment_length': 50, # 最大片段长度(词数) | ||
| 68 | + 'min_segment_length': 3 # 最小片段长度(词数) | ||
| 69 | + } | ||
| 70 | + | ||
    def process_speech_segment(self, text: str, silence_duration: float,
                               timestamp: float, confidence: float) -> List[SpeechSegment]:
        """Ingest one recognized fragment and return any finalized segments.

        Args:
            text: recognized text of the fragment.
            silence_duration: silence (seconds) heard before this fragment.
            timestamp: fragment start time.
            confidence: recognition confidence in [0, 1].

        Returns:
            Segments completed by this fragment; empty list on error.
        """
        try:
            # Track pause durations for adaptive threshold tuning.
            if silence_duration > 0:
                self.recent_pauses.append(silence_duration)
                if len(self.recent_pauses) > 20:  # keep only the 20 most recent pauses
                    self.recent_pauses.pop(0)

            # Adaptive threshold adjustment.
            if self.adaptive_enabled:
                self._adjust_thresholds()

            # Classify the boundary that precedes this fragment.
            segment_type = self._classify_segment_type(text, silence_duration)

            # Build the segment record.
            segment = SpeechSegment(
                text=text,
                start_time=timestamp,
                end_time=timestamp + len(text.split()) * 0.4,  # rough estimate (~0.4s/word)
                silence_before=silence_duration,
                silence_after=0.0,  # updated later
                confidence=confidence,
                segment_type=segment_type
            )

            # Buffer it for merge/split analysis.
            self.segment_buffer.append(segment)

            # Merge with or finalize against the previous fragment.
            processed_segments = self._process_segment_buffer()

            return processed_segments

        except Exception as e:
            self.logger.error(f"处理语音片段时出错: {e}")
            return []
| 110 | + | ||
| 111 | + def _classify_segment_type(self, text: str, silence_duration: float) -> SegmentType: | ||
| 112 | + """分类片段类型""" | ||
| 113 | + # 确保阈值字典完整性 | ||
| 114 | + if not isinstance(self.silence_thresholds, dict): | ||
| 115 | + self.silence_thresholds = self._get_default_config()['silence_thresholds'] | ||
| 116 | + | ||
| 117 | + # 安全获取阈值,使用默认值作为后备 | ||
| 118 | + micro_pause = self.silence_thresholds.get('micro_pause', 0.3) | ||
| 119 | + phrase_pause = self.silence_thresholds.get('phrase_pause', 1.0) | ||
| 120 | + sentence_pause = self.silence_thresholds.get('sentence_pause', 2.0) | ||
| 121 | + | ||
| 122 | + # 基于静音时长的初步分类 | ||
| 123 | + if silence_duration <= micro_pause: | ||
| 124 | + return SegmentType.WORD_CONTINUATION | ||
| 125 | + elif silence_duration <= phrase_pause: | ||
| 126 | + return SegmentType.PHRASE_CONNECTION | ||
| 127 | + elif silence_duration <= sentence_pause: | ||
| 128 | + return SegmentType.SENTENCE_BOUNDARY | ||
| 129 | + else: | ||
| 130 | + return SegmentType.TOPIC_BOUNDARY | ||
| 131 | + | ||
    def _process_segment_buffer(self) -> List[SpeechSegment]:
        """Merge or finalize the two newest buffered segments.

        Returns:
            Segments that became complete during this pass (possibly none).
        """
        if len(self.segment_buffer) < 2:
            return []

        processed_segments = []
        current_segment = self.segment_buffer[-2]  # second-newest segment
        next_segment = self.segment_buffer[-1]     # newest segment

        # Decide how the newest segment relates to the previous one.
        connection_type = self._analyze_semantic_connection(
            current_segment.text,
            next_segment.text,
            next_segment.silence_before
        )

        # Act on the connection type.
        if connection_type == 'continuation':
            # Merge the pair into one ongoing segment.
            merged_segment = self._merge_segments(current_segment, next_segment)
            self.segment_buffer[-2] = merged_segment
            self.segment_buffer.pop()  # drop the now-merged newest segment
        elif connection_type == 'new_sentence':
            # The previous segment is a finished sentence.
            current_segment.is_complete = True
            processed_segments.append(current_segment)

        return processed_segments
| 160 | + | ||
| 161 | + def _analyze_semantic_connection(self, prev_text: str, current_text: str, | ||
| 162 | + silence_duration: float) -> str: | ||
| 163 | + """分析语义连接类型""" | ||
| 164 | + # 确保silence_thresholds是字典类型 | ||
| 165 | + if not isinstance(self.silence_thresholds, dict): | ||
| 166 | + self.silence_thresholds = { | ||
| 167 | + 'micro_pause': 0.3, | ||
| 168 | + 'phrase_pause': 0.8, | ||
| 169 | + 'sentence_pause': 1.5, | ||
| 170 | + 'topic_pause': 3.0 | ||
| 171 | + } | ||
| 172 | + | ||
| 173 | + # 语法完整性检查 | ||
| 174 | + if self._is_grammatically_complete(prev_text): | ||
| 175 | + sentence_pause_threshold = self.silence_thresholds.get('sentence_pause', 1.5) | ||
| 176 | + if silence_duration >= sentence_pause_threshold: | ||
| 177 | + return 'new_sentence' | ||
| 178 | + | ||
| 179 | + # 语义相关性检查 | ||
| 180 | + if self.config.get('semantic_analysis', True): | ||
| 181 | + semantic_score = self._calculate_semantic_similarity(prev_text, current_text) | ||
| 182 | + | ||
| 183 | + phrase_pause_threshold = self.silence_thresholds.get('phrase_pause', 0.8) | ||
| 184 | + if silence_duration >= phrase_pause_threshold: | ||
| 185 | + if semantic_score > 0.7: | ||
| 186 | + return 'continuation' # 语义相关,继续当前句子 | ||
| 187 | + else: | ||
| 188 | + return 'new_sentence' # 语义不相关,新句子 | ||
| 189 | + | ||
| 190 | + return 'continuation' | ||
| 191 | + | ||
| 192 | + def _is_grammatically_complete(self, text: str) -> bool: | ||
| 193 | + """检查语法完整性""" | ||
| 194 | + if not self.config.get('grammar_check', True): | ||
| 195 | + return False | ||
| 196 | + | ||
| 197 | + # 简单的语法完整性检查 | ||
| 198 | + text = text.strip() | ||
| 199 | + | ||
| 200 | + # 检查句子结束标点 | ||
| 201 | + if text.endswith(('。', '!', '?', '.', '!', '?')): | ||
| 202 | + return True | ||
| 203 | + | ||
| 204 | + # 检查常见的完整句式 | ||
| 205 | + complete_patterns = [ | ||
| 206 | + '是的', '不是', '好的', '没有', '有的', '对的', '错的', | ||
| 207 | + '可以', '不可以', '行', '不行', '是', '不是' | ||
| 208 | + ] | ||
| 209 | + | ||
| 210 | + for pattern in complete_patterns: | ||
| 211 | + if text.endswith(pattern): | ||
| 212 | + return True | ||
| 213 | + | ||
| 214 | + # 检查词数(简单启发式) | ||
| 215 | + word_count = len(text.split()) | ||
| 216 | + if word_count >= self.config.get('min_complete_words', 5): | ||
| 217 | + return True | ||
| 218 | + | ||
| 219 | + return False | ||
| 220 | + | ||
| 221 | + def _calculate_semantic_similarity(self, text1: str, text2: str) -> float: | ||
| 222 | + """计算语义相似度(简化版本)""" | ||
| 223 | + # 这里使用简单的词汇重叠度作为语义相似度的近似 | ||
| 224 | + words1 = set(text1.split()) | ||
| 225 | + words2 = set(text2.split()) | ||
| 226 | + | ||
| 227 | + if not words1 or not words2: | ||
| 228 | + return 0.0 | ||
| 229 | + | ||
| 230 | + intersection = words1.intersection(words2) | ||
| 231 | + union = words1.union(words2) | ||
| 232 | + | ||
| 233 | + return len(intersection) / len(union) if union else 0.0 | ||
| 234 | + | ||
| 235 | + def _merge_segments(self, segment1: SpeechSegment, segment2: SpeechSegment) -> SpeechSegment: | ||
| 236 | + """合并两个片段""" | ||
| 237 | + merged_text = f"{segment1.text} {segment2.text}" | ||
| 238 | + | ||
| 239 | + return SpeechSegment( | ||
| 240 | + text=merged_text, | ||
| 241 | + start_time=segment1.start_time, | ||
| 242 | + end_time=segment2.end_time, | ||
| 243 | + silence_before=segment1.silence_before, | ||
| 244 | + silence_after=segment2.silence_after, | ||
| 245 | + confidence=min(segment1.confidence, segment2.confidence), | ||
| 246 | + segment_type=segment2.segment_type, | ||
| 247 | + is_complete=False | ||
| 248 | + ) | ||
| 249 | + | ||
| 250 | + def _adjust_thresholds(self): | ||
| 251 | + """根据用户说话习惯动态调整阈值""" | ||
| 252 | + if len(self.recent_pauses) >= 10: | ||
| 253 | + avg_pause = np.mean(self.recent_pauses) | ||
| 254 | + std_pause = np.std(self.recent_pauses) | ||
| 255 | + | ||
| 256 | + # 确保silence_thresholds是字典类型 | ||
| 257 | + if not isinstance(self.silence_thresholds, dict): | ||
| 258 | + self.silence_thresholds = { | ||
| 259 | + 'micro_pause': 0.3, | ||
| 260 | + 'phrase_pause': 0.8, | ||
| 261 | + 'sentence_pause': 1.5, | ||
| 262 | + 'topic_pause': 3.0 | ||
| 263 | + } | ||
| 264 | + | ||
| 265 | + # 个性化阈值调整 | ||
| 266 | + self.silence_thresholds['phrase_pause'] = max(0.5, avg_pause + 0.5 * std_pause) | ||
| 267 | + self.silence_thresholds['sentence_pause'] = max(1.0, avg_pause + 1.5 * std_pause) | ||
| 268 | + | ||
| 269 | + phrase_threshold = self.silence_thresholds.get('phrase_pause', 0.8) | ||
| 270 | + sentence_threshold = self.silence_thresholds.get('sentence_pause', 1.5) | ||
| 271 | + self.logger.debug(f"阈值已调整: phrase={phrase_threshold:.2f}, " | ||
| 272 | + f"sentence={sentence_threshold:.2f}") | ||
| 273 | + | ||
| 274 | + def get_completed_segments(self) -> List[SpeechSegment]: | ||
| 275 | + """获取已完成的片段""" | ||
| 276 | + completed = [seg for seg in self.segment_buffer if seg.is_complete] | ||
| 277 | + # 清理已完成的片段 | ||
| 278 | + self.segment_buffer = [seg for seg in self.segment_buffer if not seg.is_complete] | ||
| 279 | + return completed | ||
| 280 | + | ||
| 281 | + def force_complete_current_segment(self) -> Optional[SpeechSegment]: | ||
| 282 | + """强制完成当前片段""" | ||
| 283 | + if self.segment_buffer: | ||
| 284 | + current_segment = self.segment_buffer[-1] | ||
| 285 | + current_segment.is_complete = True | ||
| 286 | + return current_segment | ||
| 287 | + return None | ||
| 288 | + | ||
    def reset(self):
        """Drop all buffered segments and pause history."""
        self.segment_buffer.clear()
        self.recent_pauses.clear()
        self.logger.info("智能断句器已重置")
| 294 | + | ||
    def create_session(self, session_id: str):
        """Create a segmentation session (currently log-only; no per-session state)."""
        # Placeholder for per-session data structures.
        self.logger.info(f"智能断句会话创建: {session_id}")
| 299 | + | ||
| 300 | + def update_config(self, config: Dict): | ||
| 301 | + """更新配置""" | ||
| 302 | + if 'silence_thresholds' in config: | ||
| 303 | + # 更新静音阈值配置 | ||
| 304 | + thresholds = config['silence_thresholds'] | ||
| 305 | + self.logger.info(f"更新静音阈值配置: {thresholds}") | ||
| 306 | + | ||
| 307 | + if 'semantic_analysis' in config: | ||
| 308 | + # 更新语义分析配置 | ||
| 309 | + semantic_config = config['semantic_analysis'] | ||
| 310 | + self.logger.info(f"更新语义分析配置: {semantic_config}") | ||
| 311 | + | ||
    def complete_session(self, session_id: str):
        """Finish a segmentation session (currently log-only)."""
        # Placeholder: session-scoped caches would be cleared here.
        self.logger.info(f"智能断句会话完成: {session_id}")
| 316 | + | ||
    def shutdown(self):
        """Reset all state and shut the module down."""
        self.reset()
        self.logger.info("智能断句模块已关闭")
| 321 | + | ||
| 322 | + def get_statistics(self) -> Dict: | ||
| 323 | + """获取统计信息""" | ||
| 324 | + return { | ||
| 325 | + 'buffer_size': len(self.segment_buffer), | ||
| 326 | + 'recent_pauses_count': len(self.recent_pauses), | ||
| 327 | + 'avg_pause_duration': np.mean(self.recent_pauses) if self.recent_pauses else 0, | ||
| 328 | + 'current_thresholds': self.silence_thresholds.copy(), | ||
| 329 | + 'adaptive_enabled': self.adaptive_enabled | ||
| 330 | + } | ||
| 331 | + | ||
| 332 | + def process_text(self, text: str, context: Dict = None) -> Dict: | ||
| 333 | + """处理文本分割(兼容OptimizationManager调用)""" | ||
| 334 | + try: | ||
| 335 | + # 提取上下文信息 | ||
| 336 | + timestamp = context.get('timestamp', time.time()) if context else time.time() | ||
| 337 | + confidence = context.get('confidence', 0.8) if context else 0.8 | ||
| 338 | + silence_duration = context.get('silence_duration', 1.0) if context else 1.0 | ||
| 339 | + | ||
| 340 | + # 处理语音片段 | ||
| 341 | + segments = self.process_speech_segment(text, silence_duration, timestamp, confidence) | ||
| 342 | + | ||
| 343 | + # 返回处理结果 | ||
| 344 | + if segments: | ||
| 345 | + # 返回最新的完整片段 | ||
| 346 | + latest_segment = segments[-1] | ||
| 347 | + # 安全获取segment_type的值 | ||
| 348 | + segment_type_value = latest_segment.segment_type.value if isinstance(latest_segment.segment_type, SegmentType) else str(latest_segment.segment_type) | ||
| 349 | + return { | ||
| 350 | + 'success': True, | ||
| 351 | + 'text': latest_segment.text, | ||
| 352 | + 'confidence': latest_segment.confidence, | ||
| 353 | + 'segment_type': segment_type_value, | ||
| 354 | + 'is_complete': latest_segment.is_complete | ||
| 355 | + } | ||
| 356 | + else: | ||
| 357 | + # 如果没有完整片段,返回原文本 | ||
| 358 | + return { | ||
| 359 | + 'success': True, | ||
| 360 | + 'text': text, | ||
| 361 | + 'confidence': confidence, | ||
| 362 | + 'segment_type': 'continuation', | ||
| 363 | + 'is_complete': False | ||
| 364 | + } | ||
| 365 | + | ||
| 366 | + except Exception as e: | ||
| 367 | + self.logger.error(f"处理文本分割时出错: {e}") | ||
| 368 | + return { | ||
| 369 | + 'success': False, | ||
| 370 | + 'text': text, | ||
| 371 | + 'confidence': 0.0, | ||
| 372 | + 'error': str(e) | ||
| 373 | + } | ||
| 374 | + | ||
| 375 | + def get_performance_stats(self) -> Dict: | ||
| 376 | + """获取性能统计""" | ||
| 377 | + total_segments = len(self.segment_buffer) | ||
| 378 | + completed_segments = len([seg for seg in self.segment_buffer if seg.is_complete]) | ||
| 379 | + avg_confidence = np.mean([seg.confidence for seg in self.segment_buffer]) if self.segment_buffer else 0.0 | ||
| 380 | + | ||
| 381 | + return { | ||
| 382 | + 'total_segments': total_segments, | ||
| 383 | + 'completed_segments': completed_segments, | ||
| 384 | + 'pending_segments': total_segments - completed_segments, | ||
| 385 | + 'average_confidence': avg_confidence, | ||
| 386 | + 'processing_efficiency': completed_segments / total_segments if total_segments > 0 else 0.0 | ||
| 387 | + } | ||
| 388 | + | ||
class AdaptiveSilenceThreshold:
    """Adapts silence thresholds to an individual speaker's pause habits.

    A sliding window of observed pause durations drives a scaling of a base
    threshold table: speakers who pause longer than a 1.2 s reference get
    proportionally larger thresholds (clamped to [0.5x, 2.0x]), plus a small
    variance-based safety margin for erratic speakers.
    """

    def __init__(self):
        # Rolling statistical model of the speaker's behaviour.
        self.user_speech_pattern = {
            'avg_pause_duration': 1.2,   # seconds
            'speech_rate': 150,          # words per minute
            'pause_variance': 0.3
        }
        self.history_window = 50         # number of recent pauses retained
        self.pause_history = []

    def update_speech_pattern(self, pause_duration: float, speech_rate: Optional[float] = None):
        """Record one observed pause (and optionally a measured speech rate).

        Args:
            pause_duration: Duration of the observed pause, in seconds.
            speech_rate: Current speech rate (words/minute), if measured.
        """
        self.pause_history.append(pause_duration)
        if len(self.pause_history) > self.history_window:
            self.pause_history.pop(0)

        # Refresh the derived statistics from the sliding window.
        self.user_speech_pattern['avg_pause_duration'] = np.mean(self.pause_history)
        self.user_speech_pattern['pause_variance'] = np.std(self.pause_history)

        # Bug fix: the original `if speech_rate:` silently discarded a
        # legitimate measurement of 0; compare against None explicitly.
        if speech_rate is not None:
            self.user_speech_pattern['speech_rate'] = speech_rate

    def get_adaptive_thresholds(self, base_thresholds: Dict) -> Dict:
        """Return *base_thresholds* scaled to the learned speaker profile.

        With fewer than 5 recorded pauses there is not enough signal to
        adapt, so an unmodified copy of the base table is returned.
        """
        if len(self.pause_history) < 5:
            # Robustness: return a copy so callers cannot mutate our input
            # (the adapted path below already returns a fresh dict).
            return base_thresholds.copy()

        avg_pause = self.user_speech_pattern['avg_pause_duration']
        variance = self.user_speech_pattern['pause_variance']

        adaptive_thresholds = base_thresholds.copy()

        # Scale factor clamped to [0.5, 2.0]; 1.2 s is the reference pause.
        adjustment_factor = min(2.0, max(0.5, avg_pause / 1.2))

        for key in adaptive_thresholds:
            adaptive_thresholds[key] *= adjustment_factor
            # Widen thresholds for speakers with erratic pausing.
            adaptive_thresholds[key] += variance * 0.3

        return adaptive_thresholds
| 1 | +{ | ||
| 2 | + "// AIfeng/2025-07-07 15:25:48": "流式语音识别优化模块配置文件", | ||
| 3 | + | ||
| 4 | + "intelligent_segmentation": { | ||
| 5 | + "description": "智能断句配置", | ||
| 6 | + "silence_thresholds": { | ||
| 7 | + "short_pause": 0.3, | ||
| 8 | + "medium_pause": 0.8, | ||
| 9 | + "long_pause": 1.5, | ||
| 10 | + "sentence_break": 2.0 | ||
| 11 | + }, | ||
| 12 | + "adaptive_threshold": { | ||
| 13 | + "enabled": true, | ||
| 14 | + "learning_rate": 0.1, | ||
| 15 | + "min_threshold": 0.1, | ||
| 16 | + "max_threshold": 3.0, | ||
| 17 | + "adaptation_window": 10 | ||
| 18 | + }, | ||
| 19 | + "semantic_analysis": { | ||
| 20 | + "enabled": true, | ||
| 21 | + "similarity_threshold": 0.7, | ||
| 22 | + "context_window": 5, | ||
| 23 | + "use_grammar_check": true | ||
| 24 | + }, | ||
| 25 | + "segment_constraints": { | ||
| 26 | + "min_length": 10, | ||
| 27 | + "max_length": 500, | ||
| 28 | + "min_confidence": 0.3 | ||
| 29 | + } | ||
| 30 | + }, | ||
| 31 | + | ||
| 32 | + "adaptive_vad_chunking": { | ||
| 33 | + "description": "自适应VAD分片配置", | ||
| 34 | + "strategies": { | ||
| 35 | + "fast_response": { | ||
| 36 | + "chunk_size_ms": 200, | ||
| 37 | + "overlap_ms": 50, | ||
| 38 | + "confidence_threshold": 0.6, | ||
| 39 | + "max_latency_ms": 300 | ||
| 40 | + }, | ||
| 41 | + "high_accuracy": { | ||
| 42 | + "chunk_size_ms": 800, | ||
| 43 | + "overlap_ms": 200, | ||
| 44 | + "confidence_threshold": 0.8, | ||
| 45 | + "max_latency_ms": 1000 | ||
| 46 | + }, | ||
| 47 | + "balanced": { | ||
| 48 | + "chunk_size_ms": 400, | ||
| 49 | + "overlap_ms": 100, | ||
| 50 | + "confidence_threshold": 0.7, | ||
| 51 | + "max_latency_ms": 600 | ||
| 52 | + }, | ||
| 53 | + "adaptive": { | ||
| 54 | + "initial_chunk_size_ms": 400, | ||
| 55 | + "min_chunk_size_ms": 200, | ||
| 56 | + "max_chunk_size_ms": 1000, | ||
| 57 | + "adaptation_factor": 0.2, | ||
| 58 | + "performance_window": 20 | ||
| 59 | + } | ||
| 60 | + }, | ||
| 61 | + "performance_monitoring": { | ||
| 62 | + "enabled": true, | ||
| 63 | + "metrics_window": 100, | ||
| 64 | + "latency_target_ms": 500, | ||
| 65 | + "accuracy_target": 0.85, | ||
| 66 | + "adaptation_threshold": 0.1 | ||
| 67 | + }, | ||
| 68 | + "progressive_recognition": { | ||
| 69 | + "enabled": true, | ||
| 70 | + "stages": [ | ||
| 71 | + { | ||
| 72 | + "name": "quick", | ||
| 73 | + "chunk_size_ms": 200, | ||
| 74 | + "confidence_threshold": 0.5 | ||
| 75 | + }, | ||
| 76 | + { | ||
| 77 | + "name": "refined", | ||
| 78 | + "chunk_size_ms": 600, | ||
| 79 | + "confidence_threshold": 0.8 | ||
| 80 | + }, | ||
| 81 | + { | ||
| 82 | + "name": "final", | ||
| 83 | + "chunk_size_ms": 1000, | ||
| 84 | + "confidence_threshold": 0.9 | ||
| 85 | + } | ||
| 86 | + ] | ||
| 87 | + }, | ||
| 88 | + "quality_assessment": { | ||
| 89 | + "enabled": true, | ||
| 90 | + "snr_threshold": 10.0, | ||
| 91 | + "energy_threshold": 0.01, | ||
| 92 | + "spectral_quality_threshold": 0.7 | ||
| 93 | + } | ||
| 94 | + }, | ||
| 95 | + | ||
| 96 | + "recognition_result_tracker": { | ||
| 97 | + "description": "识别结果追踪配置", | ||
| 98 | + "session_management": { | ||
| 99 | + "max_sessions": 100, | ||
| 100 | + "session_timeout_minutes": 30, | ||
| 101 | + "auto_cleanup_enabled": true, | ||
| 102 | + "cleanup_interval_minutes": 5 | ||
| 103 | + }, | ||
| 104 | + "result_tracking": { | ||
| 105 | + "max_results_per_session": 1000, | ||
| 106 | + "enable_result_chaining": true, | ||
| 107 | + "confidence_decay_rate": 0.05, | ||
| 108 | + "similarity_threshold": 0.8 | ||
| 109 | + }, | ||
| 110 | + "quality_metrics": { | ||
| 111 | + "track_confidence_trends": true, | ||
| 112 | + "track_latency_metrics": true, | ||
| 113 | + "track_accuracy_metrics": true, | ||
| 114 | + "metrics_retention_hours": 24 | ||
| 115 | + }, | ||
| 116 | + "result_relations": { | ||
| 117 | + "enable_replacement_tracking": true, | ||
| 118 | + "enable_refinement_tracking": true, | ||
| 119 | + "enable_correction_tracking": true, | ||
| 120 | + "max_relation_depth": 5 | ||
| 121 | + }, | ||
| 122 | + "archival": { | ||
| 123 | + "auto_archive_enabled": true, | ||
| 124 | + "archive_after_hours": 6, | ||
| 125 | + "compress_archived_results": true, | ||
| 126 | + "max_archived_sessions": 500 | ||
| 127 | + } | ||
| 128 | + }, | ||
| 129 | + | ||
| 130 | + "streaming_display_manager": { | ||
| 131 | + "description": "流式显示管理配置", | ||
| 132 | + "buffer_management": { | ||
| 133 | + "max_buffer_size": 1000, | ||
| 134 | + "auto_cleanup_enabled": true, | ||
| 135 | + "cleanup_threshold": 0.8 | ||
| 136 | + }, | ||
| 137 | + "refresh_strategies": { | ||
| 138 | + "default_strategy": "debounced", | ||
| 139 | + "debounce_delay_ms": 200, | ||
| 140 | + "batch_size": 5, | ||
| 141 | + "batch_timeout_ms": 1000, | ||
| 142 | + "max_refresh_rate_per_second": 10 | ||
| 143 | + }, | ||
| 144 | + "display_options": { | ||
| 145 | + "enable_highlighting": true, | ||
| 146 | + "auto_scroll": true, | ||
| 147 | + "preserve_formatting": true, | ||
| 148 | + "show_confidence_indicators": true, | ||
| 149 | + "show_timing_info": false | ||
| 150 | + }, | ||
| 151 | + "performance": { | ||
| 152 | + "max_workers": 4, | ||
| 153 | + "queue_size_warning_threshold": 50, | ||
| 154 | + "processing_time_warning_ms": 100 | ||
| 155 | + }, | ||
| 156 | + "priority_handling": { | ||
| 157 | + "urgent_immediate_processing": true, | ||
| 158 | + "high_priority_batch_size": 3, | ||
| 159 | + "normal_priority_batch_size": 5, | ||
| 160 | + "low_priority_batch_size": 10 | ||
| 161 | + } | ||
| 162 | + }, | ||
| 163 | + | ||
| 164 | + "integration": { | ||
| 165 | + "description": "模块集成配置", | ||
| 166 | + "inter_module_communication": { | ||
| 167 | + "enable_event_bus": true, | ||
| 168 | + "async_processing": true, | ||
| 169 | + "error_propagation": true | ||
| 170 | + }, | ||
| 171 | + "data_flow": { | ||
| 172 | + "segmentation_to_chunking": true, | ||
| 173 | + "chunking_to_tracking": true, | ||
| 174 | + "tracking_to_display": true, | ||
| 175 | + "feedback_loops_enabled": true | ||
| 176 | + }, | ||
| 177 | + "performance_coordination": { | ||
| 178 | + "shared_thread_pool": false, | ||
| 179 | + "resource_monitoring": true, | ||
| 180 | + "adaptive_load_balancing": true | ||
| 181 | + } | ||
| 182 | + }, | ||
| 183 | + | ||
| 184 | + "logging": { | ||
| 185 | + "description": "日志配置", | ||
| 186 | + "level": "INFO", | ||
| 187 | + "enable_module_specific_logs": true, | ||
| 188 | + "log_performance_metrics": true, | ||
| 189 | + "log_error_details": true, | ||
| 190 | + "max_log_file_size_mb": 10, | ||
| 191 | + "log_rotation_count": 5 | ||
| 192 | + }, | ||
| 193 | + | ||
| 194 | + "debugging": { | ||
| 195 | + "description": "调试配置", | ||
| 196 | + "enable_debug_mode": false, | ||
| 197 | + "trace_data_flow": false, | ||
| 198 | + "save_intermediate_results": false, | ||
| 199 | + "performance_profiling": false, | ||
| 200 | + "memory_usage_tracking": false | ||
| 201 | + }, | ||
| 202 | + | ||
| 203 | + "experimental": { | ||
| 204 | + "description": "实验性功能配置", | ||
| 205 | + "features": { | ||
| 206 | + "ai_powered_segmentation": false, | ||
| 207 | + "predictive_chunking": false, | ||
| 208 | + "semantic_result_merging": false, | ||
| 209 | + "adaptive_display_layouts": false, | ||
| 210 | + "real_time_quality_optimization": false | ||
| 211 | + }, | ||
| 212 | + "ai_models": { | ||
| 213 | + "segmentation_model_path": "", | ||
| 214 | + "chunking_model_path": "", | ||
| 215 | + "quality_model_path": "" | ||
| 216 | + } | ||
| 217 | + } | ||
| 218 | +} |
| 1 | +# AIfeng/2025-07-07 15:25:48 | ||
| 2 | +# 流式语音识别优化集成管理器 | ||
| 3 | + | ||
| 4 | +import json | ||
| 5 | +import time | ||
| 6 | +import threading | ||
| 7 | +import logging | ||
| 8 | +from typing import Dict, List, Optional, Callable, Any | ||
| 9 | +from pathlib import Path | ||
| 10 | +from dataclasses import dataclass | ||
| 11 | +from enum import Enum | ||
| 12 | +import asyncio | ||
| 13 | +from concurrent.futures import ThreadPoolExecutor | ||
| 14 | + | ||
| 15 | +from .intelligent_segmentation import IntelligentSentenceSegmentation, SpeechSegment | ||
| 16 | +from .adaptive_vad_chunking import AdaptiveVADChunking, ChunkStrategy, AudioChunk | ||
| 17 | +from .recognition_result_tracker import RecognitionResultTracker, ResultType | ||
| 18 | +from .streaming_display_manager import StreamingDisplayManager, UpdateType, DisplayPriority | ||
| 19 | + | ||
class OptimizationMode(Enum):
    """Overall optimization profile applied across all sub-modules."""
    SPEED_FIRST = "speed_first"        # minimize latency
    ACCURACY_FIRST = "accuracy_first"  # maximize recognition accuracy
    BALANCED = "balanced"              # trade-off between speed and accuracy
    ADAPTIVE = "adaptive"              # adjust dynamically from feedback
| 26 | + | ||
class ProcessingStage(Enum):
    """Named stages of the streaming-recognition pipeline."""
    AUDIO_INPUT = "audio_input"
    VAD_CHUNKING = "vad_chunking"
    SEGMENTATION = "segmentation"
    RECOGNITION = "recognition"
    RESULT_TRACKING = "result_tracking"
    DISPLAY_UPDATE = "display_update"
| 35 | + | ||
@dataclass
class ProcessingContext:
    """One unit of audio handed to the processing pipeline for a session."""
    session_id: str
    audio_data: bytes
    sample_rate: int    # Hz
    timestamp: float    # capture time, seconds since epoch
    # Optional[...] reflects the actual None default.
    metadata: Optional[Dict] = None
| 44 | + | ||
@dataclass
class OptimizationMetrics:
    """Per-processing-run latency and quality measurements."""
    total_latency_ms: float           # end-to-end wall time
    segmentation_latency_ms: float
    chunking_latency_ms: float
    tracking_latency_ms: float
    display_latency_ms: float
    accuracy_score: float
    confidence_score: float
    processing_efficiency: float      # bytes of audio per ms of total latency
| 56 | + | ||
| 57 | +class OptimizationManager: | ||
| 58 | + """流式语音识别优化管理器""" | ||
| 59 | + | ||
    def __init__(self, config_path: Optional[str] = None):
        """Build the manager and wire the four optimization sub-modules.

        Args:
            config_path: Path to a JSON config file; falls back to the
                bundled optimization_config.json, then to built-in defaults.
        """
        # Load configuration first; every sub-module below consumes a slice of it.
        # NOTE(review): _load_config runs before self.logger is assigned —
        # its failure path must not rely on self.logger.
        self.config = self._load_config(config_path)

        # Instantiate the four cooperating optimization modules.
        self.segmentation_module = IntelligentSentenceSegmentation(
            self.config.get('intelligent_segmentation', {})
        )

        self.chunking_module = AdaptiveVADChunking(
            self.config.get('adaptive_vad_chunking', {})
        )

        self.tracking_module = RecognitionResultTracker(
            self.config.get('recognition_result_tracker', {})
        )

        self.display_module = StreamingDisplayManager(
            self.config.get('streaming_display_manager', {})
        )

        # Active optimization profile (see set_optimization_mode).
        self.current_mode = OptimizationMode.BALANCED

        # Performance-monitoring state.
        self.performance_metrics = {}   # session_id -> OptimizationMetrics
        self.processing_stats = {
            'total_sessions': 0,
            'active_sessions': 0,
            'total_audio_processed_seconds': 0.0,
            'average_latency_ms': 0.0,
            'average_accuracy': 0.0
        }

        # Observer callbacks.
        self.result_callbacks = []   # recognition-result listeners
        self.error_callbacks = []    # error listeners
        self.metrics_callbacks = []  # performance-metric listeners

        # Worker pool for asynchronous audio processing.
        self.executor = ThreadPoolExecutor(
            max_workers=self.config.get('integration', {}).get('performance_coordination', {}).get('max_workers', 8),
            thread_name_prefix='OptimizationManager'
        )

        # Simplified in-process event bus.
        self.event_handlers = {}

        self.logger = logging.getLogger(__name__)
        self._lock = threading.RLock()
        self._running = True

        # Cross-module callback wiring.
        self._setup_inter_module_communication()

        self.logger.info("流式语音识别优化管理器初始化完成")
| 116 | + | ||
| 117 | + def _load_config(self, config_path: str = None) -> Dict: | ||
| 118 | + """加载配置文件""" | ||
| 119 | + if config_path is None: | ||
| 120 | + config_path = Path(__file__).parent / "optimization_config.json" | ||
| 121 | + | ||
| 122 | + try: | ||
| 123 | + with open(config_path, 'r', encoding='utf-8') as f: | ||
| 124 | + config = json.load(f) | ||
| 125 | + | ||
| 126 | + # 转换配置项:将cleanup_interval_minutes转换为cleanup_interval(秒) | ||
| 127 | + if 'recognition_result_tracker' in config: | ||
| 128 | + tracker_config = config['recognition_result_tracker'] | ||
| 129 | + if 'cleanup_interval_minutes' in tracker_config: | ||
| 130 | + # 将分钟转换为秒 | ||
| 131 | + tracker_config['cleanup_interval'] = tracker_config['cleanup_interval_minutes'] * 60 | ||
| 132 | + # 保留原配置项以兼容 | ||
| 133 | + | ||
| 134 | + return config | ||
| 135 | + except Exception as e: | ||
| 136 | + self.logger.warning(f"加载配置文件失败: {e},使用默认配置") | ||
| 137 | + return self._get_default_config() | ||
| 138 | + | ||
| 139 | + def _get_default_config(self) -> Dict: | ||
| 140 | + """获取默认配置""" | ||
| 141 | + return { | ||
| 142 | + 'intelligent_segmentation': {}, | ||
| 143 | + 'adaptive_vad_chunking': {}, | ||
| 144 | + 'recognition_result_tracker': {}, | ||
| 145 | + 'streaming_display_manager': {}, | ||
| 146 | + 'integration': { | ||
| 147 | + 'performance_coordination': { | ||
| 148 | + 'max_workers': 8 | ||
| 149 | + } | ||
| 150 | + } | ||
| 151 | + } | ||
| 152 | + | ||
    def _setup_inter_module_communication(self):
        """Wire cross-module callbacks so the sub-modules feed each other."""
        # Tracking results and display errors flow back into this manager.
        self.tracking_module.register_result_callback(self._on_tracking_result)
        self.display_module.register_error_callback(self._on_display_error)

        # Chunk-quality feedback, used for adaptive strategy adjustment.
        self.chunking_module.register_quality_callback(self._on_chunk_quality_feedback)
| 161 | + | ||
| 162 | + def set_optimization_mode(self, mode: OptimizationMode): | ||
| 163 | + """设置优化模式""" | ||
| 164 | + # 类型检查 | ||
| 165 | + if not isinstance(mode, OptimizationMode): | ||
| 166 | + raise TypeError(f"mode必须是OptimizationMode枚举类型,当前类型: {type(mode)}") | ||
| 167 | + | ||
| 168 | + self.current_mode = mode | ||
| 169 | + | ||
| 170 | + # 根据模式调整各模块参数 | ||
| 171 | + if mode == OptimizationMode.SPEED_FIRST: | ||
| 172 | + self._configure_for_speed() | ||
| 173 | + elif mode == OptimizationMode.ACCURACY_FIRST: | ||
| 174 | + self._configure_for_accuracy() | ||
| 175 | + elif mode == OptimizationMode.BALANCED: | ||
| 176 | + self._configure_for_balance() | ||
| 177 | + elif mode == OptimizationMode.ADAPTIVE: | ||
| 178 | + self._configure_for_adaptive() | ||
| 179 | + | ||
| 180 | + self.logger.info(f"优化模式已设置为: {mode.value}") | ||
| 181 | + | ||
| 182 | + def _configure_for_speed(self): | ||
| 183 | + """配置速度优先模式""" | ||
| 184 | + # 配置快速分片策略 | ||
| 185 | + self.chunking_module.set_strategy(ChunkStrategy.FAST_RESPONSE) | ||
| 186 | + | ||
| 187 | + # 配置快速断句 | ||
| 188 | + self.segmentation_module.update_config({ | ||
| 189 | + 'silence_thresholds': { | ||
| 190 | + 'short_pause': 0.2, | ||
| 191 | + 'medium_pause': 0.5, | ||
| 192 | + 'long_pause': 1.0, | ||
| 193 | + 'sentence_break': 1.5 | ||
| 194 | + } | ||
| 195 | + }) | ||
| 196 | + | ||
| 197 | + # 配置立即显示刷新 | ||
| 198 | + self.display_module.config['refresh_strategies']['default_strategy'] = 'immediate' | ||
| 199 | + | ||
| 200 | + def _configure_for_accuracy(self): | ||
| 201 | + """配置精度优先模式""" | ||
| 202 | + # 配置高精度分片策略 | ||
| 203 | + self.chunking_module.set_strategy(ChunkStrategy.HIGH_ACCURACY) | ||
| 204 | + | ||
| 205 | + # 配置精确断句 | ||
| 206 | + self.segmentation_module.update_config({ | ||
| 207 | + 'semantic_analysis': { | ||
| 208 | + 'enabled': True, | ||
| 209 | + 'similarity_threshold': 0.8, | ||
| 210 | + 'context_window': 8 | ||
| 211 | + } | ||
| 212 | + }) | ||
| 213 | + | ||
| 214 | + # 配置批量显示刷新 | ||
| 215 | + self.display_module.config['refresh_strategies']['default_strategy'] = 'batch' | ||
| 216 | + | ||
    def _configure_for_balance(self):
        """Balanced profile: mid-sized chunks plus debounced display refresh."""
        # Balanced chunking strategy.
        self.chunking_module.set_strategy(ChunkStrategy.BALANCED)

        # Debounce display refreshes.
        self.display_module.config['refresh_strategies']['default_strategy'] = 'debounced'
| 224 | + | ||
    def _configure_for_adaptive(self):
        """Adaptive profile: chunking and refresh both self-tune at runtime."""
        # Adaptive chunking strategy.
        self.chunking_module.set_strategy(ChunkStrategy.ADAPTIVE)

        # Adaptive display refresh.
        self.display_module.config['refresh_strategies']['default_strategy'] = 'adaptive'
| 232 | + | ||
    def register_result_callback(self, callback: Callable[[str, str, float, bool], None]):
        """Register a listener for (session_id, text, confidence, is_final) events."""
        self.result_callbacks.append(callback)
| 236 | + | ||
    def register_error_callback(self, callback: Callable[[str, Exception], None]):
        """Register a listener for (session_id, exception) error events."""
        self.error_callbacks.append(callback)
| 240 | + | ||
    def register_metrics_callback(self, callback: Callable[[Dict], None]):
        """Register a performance-metrics listener.

        The callback receives a single flattened dict (see
        _trigger_metrics_callbacks) whose keys include 'session_id' and the
        per-stage latency fields; the previous
        Callable[[str, OptimizationMetrics], None] annotation did not match
        the actual call site.
        """
        self.metrics_callbacks.append(callback)
| 244 | + | ||
    def create_session(self, session_id: str, config: Optional[Dict] = None) -> bool:
        """Create a processing session in every sub-module.

        Args:
            session_id: Unique identifier for the new session.
            config: Accepted for interface compatibility; currently unused.

        Returns:
            True on success, False if any module failed to create the session.
        """
        try:
            # Create the session in each pipeline module.
            # NOTE(review): no rollback — if a later module fails, earlier
            # modules keep a half-created session.
            self.segmentation_module.create_session(session_id)
            self.chunking_module.create_session(session_id)
            self.tracking_module.create_session(session_id)

            with self._lock:
                self.processing_stats['total_sessions'] += 1
                self.processing_stats['active_sessions'] += 1

            self.logger.info(f"会话创建成功: {session_id}")
            return True

        except Exception as e:
            self.logger.error(f"创建会话失败: {e}")
            self._handle_error(session_id, e)
            return False
| 264 | + | ||
| 265 | + def process_audio(self, session_id: str, audio_data: bytes, | ||
| 266 | + sample_rate: int, timestamp: float = None) -> bool: | ||
| 267 | + """处理音频数据""" | ||
| 268 | + if timestamp is None: | ||
| 269 | + timestamp = time.time() | ||
| 270 | + | ||
| 271 | + context = ProcessingContext( | ||
| 272 | + session_id=session_id, | ||
| 273 | + audio_data=audio_data, | ||
| 274 | + sample_rate=sample_rate, | ||
| 275 | + timestamp=timestamp | ||
| 276 | + ) | ||
| 277 | + | ||
| 278 | + # 异步处理音频 | ||
| 279 | + self.executor.submit(self._process_audio_async, context) | ||
| 280 | + return True | ||
| 281 | + | ||
    def complete_session(self, session_id: str) -> bool:
        """Finish a session in every sub-module and update counters.

        NOTE(review): this definition is dead code — an identically named
        method later in this class redefines complete_session and silently
        shadows this one (which, unlike the later copy, guards
        active_sessions against going negative). One of the two should be
        removed.
        """
        try:
            # Complete the session in each pipeline module.
            self.segmentation_module.complete_session(session_id)
            self.chunking_module.complete_session(session_id)
            self.tracking_module.complete_session(session_id)

            with self._lock:
                # Guarded decrement: never drops below zero.
                if self.processing_stats['active_sessions'] > 0:
                    self.processing_stats['active_sessions'] -= 1

            self.logger.info(f"会话完成: {session_id}")
            return True

        except Exception as e:
            self.logger.error(f"完成会话失败: {e}")
            self._handle_error(session_id, e)
            return False
| 301 | + | ||
    def _process_audio_async(self, context: ProcessingContext):
        """Worker-thread pipeline: VAD chunk -> segment -> track -> display.

        Runs on the executor; failures are routed to _handle_error rather
        than propagated. Per-stage latencies are accumulated into an
        OptimizationMetrics record and published at the end.
        """
        start_time = time.time()
        # Metrics accumulator for this processing run.
        metrics = OptimizationMetrics(
            total_latency_ms=0,
            segmentation_latency_ms=0,
            chunking_latency_ms=0,
            tracking_latency_ms=0,
            display_latency_ms=0,
            accuracy_score=0,
            confidence_score=0,
            processing_efficiency=0
        )

        try:
            # 1. VAD chunking.
            chunk_start = time.time()
            chunks = self.chunking_module.process_audio(
                context.session_id,
                context.audio_data,
                context.sample_rate
            )
            metrics.chunking_latency_ms = (time.time() - chunk_start) * 1000

            # 2. Intelligent sentence segmentation.
            seg_start = time.time()
            for chunk in chunks:
                if chunk.is_speech:
                    # NOTE(review): a real ASR call belongs here; the text
                    # below is a hard-coded mock result for demonstration.
                    mock_text = f"模拟识别文本_{chunk.chunk_id}"
                    mock_confidence = 0.85

                    # Run sentence segmentation on the (mock) text.
                    text_context = {
                        'session_id': context.session_id,
                        'timestamp': chunk.timestamp,
                        'confidence': mock_confidence,
                        'silence_duration': 0.0  # default
                    }
                    segment_result = self.segmentation_module.process_text(
                        mock_text,
                        text_context
                    )

                    # 3. Result tracking.
                    track_start = time.time()
                    if segment_result.get('success', False):
                        result_id = self.tracking_module.add_recognition_result(
                            context.session_id,
                            segment_result['text'],
                            segment_result['confidence'],
                            context.audio_data,  # audio_data
                            ResultType.PARTIAL if not segment_result.get('is_complete', False) else ResultType.FINAL,  # result_type
                            'processing',  # stage
                            None,  # predecessor_ids
                            None,  # parent_segment_id
                            {'timestamp': chunk.timestamp, 'duration': chunk.duration}  # metadata
                        )

                        # 4. Display update.
                        display_start = time.time()
                        self.display_module.update_display(
                            context.session_id,
                            result_id,
                            segment_result['text'],
                            UpdateType.REPLACE_FINAL if segment_result.get('is_complete', False) else UpdateType.APPEND,
                            segment_result['confidence'],
                            segment_result.get('is_complete', False),
                            DisplayPriority.HIGH if segment_result.get('is_complete', False) else DisplayPriority.NORMAL
                        )
                        metrics.display_latency_ms += (time.time() - display_start) * 1000

                    metrics.tracking_latency_ms += (time.time() - track_start) * 1000

            metrics.segmentation_latency_ms = (time.time() - seg_start) * 1000

            # Total latency and throughput (bytes of audio per ms).
            metrics.total_latency_ms = (time.time() - start_time) * 1000
            # Guard against division by zero.
            if metrics.total_latency_ms > 0:
                metrics.processing_efficiency = len(context.audio_data) / metrics.total_latency_ms
            else:
                metrics.processing_efficiency = 0.0
                self.logger.warning(f"处理延迟为0,无法计算处理效率 [{context.session_id}]")

            # Publish statistics and notify metric listeners.
            self._update_performance_stats(context.session_id, metrics)

            self._trigger_metrics_callbacks(context.session_id, metrics)

        except Exception as e:
            self.logger.error(f"处理音频时出错: {e}")
            self._handle_error(context.session_id, e)
| 397 | + | ||
| 398 | + def _on_tracking_result(self, session_id: str, result_id: str, text: str, | ||
| 399 | + confidence: float, is_final: bool): | ||
| 400 | + """处理追踪模块的结果回调""" | ||
| 401 | + # 触发结果回调 | ||
| 402 | + for callback in self.result_callbacks: | ||
| 403 | + try: | ||
| 404 | + callback(session_id, text, confidence, is_final) | ||
| 405 | + except Exception as e: | ||
| 406 | + self.logger.error(f"结果回调执行出错: {e}") | ||
| 407 | + | ||
    def _on_display_error(self, session_id: str, error: Exception):
        """Log a display-module error and fan it out to error listeners."""
        self.logger.error(f"显示模块错误 [{session_id}]: {error}")
        self._handle_error(session_id, error)
| 412 | + | ||
    def _on_chunk_quality_feedback(self, session_id: str, chunk_id: str,
                                  quality_score: float, metrics: Dict):
        """React to per-chunk quality feedback from the chunking module."""
        # Warn on low-quality chunks; only logs for now.
        if quality_score < 0.5:
            self.logger.warning(f"分片质量较低 [{session_id}:{chunk_id}]: {quality_score}")
            # Adaptive strategy adjustment could hook in here.
| 420 | + | ||
| 421 | + def _handle_error(self, session_id: str, error: Exception): | ||
| 422 | + """处理错误""" | ||
| 423 | + for callback in self.error_callbacks: | ||
| 424 | + try: | ||
| 425 | + callback(session_id, error) | ||
| 426 | + except Exception as e: | ||
| 427 | + self.logger.error(f"错误回调执行出错: {e}") | ||
| 428 | + | ||
    def _update_performance_stats(self, session_id: str, metrics: OptimizationMetrics):
        """Fold one run's metrics into the global stats under the lock."""
        with self._lock:
            # Running-average update of latency.
            # NOTE(review): the denominator is total_sessions, but this
            # method is called once per processed audio chunk — the
            # "average" mixes units; confirm whether a per-run counter was
            # intended instead.
            current_avg = self.processing_stats['average_latency_ms']
            total_sessions = self.processing_stats['total_sessions']

            if total_sessions > 0:
                new_avg = (current_avg * (total_sessions - 1) + metrics.total_latency_ms) / total_sessions
                self.processing_stats['average_latency_ms'] = new_avg

            # Keep only the most recent metrics per session.
            self.performance_metrics[session_id] = metrics
| 442 | + | ||
| 443 | + def _trigger_metrics_callbacks(self, session_id: str, metrics: OptimizationMetrics): | ||
| 444 | + """触发性能指标回调""" | ||
| 445 | + for callback in self.metrics_callbacks: | ||
| 446 | + try: | ||
| 447 | + # 将session_id包含在metrics字典中传递给回调 | ||
| 448 | + metrics_dict = { | ||
| 449 | + 'session_id': session_id, | ||
| 450 | + 'total_latency_ms': metrics.total_latency_ms, | ||
| 451 | + 'chunking_latency_ms': metrics.chunking_latency_ms, | ||
| 452 | + 'segmentation_latency_ms': metrics.segmentation_latency_ms, | ||
| 453 | + 'tracking_latency_ms': metrics.tracking_latency_ms, | ||
| 454 | + 'display_latency_ms': metrics.display_latency_ms, | ||
| 455 | + 'processing_efficiency': metrics.processing_efficiency, | ||
| 456 | + 'accuracy_score': getattr(metrics, 'accuracy_score', 0.0) | ||
| 457 | + } | ||
| 458 | + callback(metrics_dict) | ||
| 459 | + except Exception as e: | ||
| 460 | + self.logger.error(f"指标回调执行出错: {e}") | ||
| 461 | + | ||
def complete_session(self, session_id: str) -> bool:
    """Finish a processing session and release its tracked state.

    Notifies the segmentation, chunking and tracking modules that the session
    is over, then drops the session's bookkeeping (active-session counter and
    cached per-session metrics) while holding the lock.

    Args:
        session_id: Identifier of the session to finalize.

    Returns:
        True on success; False if any step raised (the error is logged and
        forwarded to the error handler).
    """
    try:
        for module in (
            self.segmentation_module,
            self.chunking_module,
            self.tracking_module,
        ):
            module.complete_session(session_id)

        with self._lock:
            self.processing_stats['active_sessions'] -= 1
            # pop() tolerates a session that never recorded metrics.
            self.performance_metrics.pop(session_id, None)

        self.logger.info(f"会话完成: {session_id}")
        return True

    except Exception as e:
        self.logger.error(f"完成会话失败: {e}")
        self._handle_error(session_id, e)
        return False
| 482 | + | ||
def get_session_results(self, session_id: str) -> List[Dict]:
    """Return all recognition results recorded for a session.

    Pulls the session's results from the tracking module and flattens each
    one into a plain dict suitable for serialization. ``result_type`` is an
    enum-like object when it exposes ``.value``; otherwise it is stringified.

    Args:
        session_id: Identifier of the session whose results are wanted.

    Returns:
        A list of result dicts (``result_id``, ``text``, ``confidence``,
        ``is_final``, ``timestamp``, ``result_type``); an empty list if the
        lookup raised (the error is logged).
    """
    # NOTE(review): a previous version also called
    # self.display_module.get_session_display(session_id) here but discarded
    # the return value without merging it; the dead call was removed — confirm
    # the getter had no required side effects before relying on this change.
    try:
        results = self.tracking_module.get_session_results(session_id)

        combined_results = []
        for result in results:
            rtype = result.result_type
            combined_results.append({
                'result_id': result.result_id,
                'text': result.text,
                'confidence': result.confidence,
                'is_final': result.is_final,
                'timestamp': result.timestamp,
                'result_type': rtype.value if hasattr(rtype, 'value') else str(rtype),
            })

        return combined_results

    except Exception as e:
        self.logger.error(f"获取会话结果失败: {e}")
        return []
| 510 | + | ||
def get_performance_stats(self) -> Dict:
    """Snapshot aggregate pipeline statistics plus per-module breakdowns.

    The shared counters are copied while holding the lock; the per-module
    statistics are gathered afterwards, outside the lock, exactly as before,
    so module getters cannot block other stat updates.

    Returns:
        A dict of the manager's counters with ``*_stats`` keys added for the
        segmentation, chunking, tracking and display modules.
    """
    with self._lock:
        snapshot = dict(self.processing_stats)

    providers = {
        'segmentation_stats': self.segmentation_module,
        'chunking_stats': self.chunking_module,
        'tracking_stats': self.tracking_module,
        'display_stats': self.display_module,
    }
    for key, module in providers.items():
        snapshot[key] = module.get_performance_stats()

    return snapshot
| 523 | + | ||
def get_optimization_metrics(self, session_id: str = None) -> Dict:
    """Return per-session optimization metrics.

    Args:
        session_id: Session to look up. A falsy value (``None`` or ``""``,
            matching the original truthiness check) selects all sessions.

    Returns:
        The metrics recorded for ``session_id`` (``{}`` if unknown), or a
        shallow copy of the whole metrics mapping when no session is given.
    """
    if not session_id:
        return dict(self.performance_metrics)
    return self.performance_metrics.get(session_id, {})
| 530 | + | ||
def shutdown(self):
    """Stop the optimization manager and release all worker resources.

    Clears the running flag, shuts the pipeline modules down in the original
    order (segmentation, chunking, tracking, display), then drains the thread
    pool, blocking until in-flight tasks finish.
    """
    self._running = False

    for module in (
        self.segmentation_module,
        self.chunking_module,
        self.tracking_module,
        self.display_module,
    ):
        module.shutdown()

    # Block until queued work completes so nothing runs past this point.
    self.executor.shutdown(wait=True)

    self.logger.info("流式语音识别优化管理器已关闭")
-
Please register or login to post a comment