# 代码质量与可维护性增强指南
**AIfeng/2025-07-02 11:24:08**
## 概述
基于对 `eman_one` 项目的深度分析,本文档提供了全面的代码质量和可维护性增强建议,涵盖架构设计、代码规范、测试策略、文档管理和持续集成等方面。
## 1. 架构设计优化
### 1.1 模块化重构建议
#### 当前状态分析
- ✅ 已实现同步架构重构
- ✅ 工具模块 `utils` 已建立
- ⚠️ 部分功能模块耦合度较高
- ⚠️ 缺少统一的接口抽象
#### 改进方案
**1. 建立分层架构**
```
eman_one/
├── core/                    # 核心业务逻辑
│   ├── asr/                 # 语音识别模块
│   ├── recorder/            # 录音模块
│   └── api/                 # API接口层
├── services/                # 服务层
│   ├── funasr_service.py
│   ├── recording_service.py
│   └── websocket_service.py
├── interfaces/              # 接口定义
│   ├── asr_interface.py
│   └── recorder_interface.py
├── utils/                   # 工具模块
└── config/                  # 配置管理
```
**2. 接口抽象设计**
```python
# interfaces/asr_interface.py
from abc import ABC, abstractmethod

class ASRInterface(ABC):
    @abstractmethod
    def connect(self) -> bool:
        pass

    @abstractmethod
    def send_audio(self, audio_data: bytes) -> None:
        pass

    @abstractmethod
    def get_result(self) -> str:
        pass
```
### 1.2 依赖注入模式
**实现依赖注入容器**
```python
# core/container.py
class DIContainer:
    def __init__(self):
        self._services = {}
        self._singletons = {}

    def register(self, interface, implementation, singleton=False):
        self._services[interface] = (implementation, singleton)

    def resolve(self, interface):
        if interface in self._singletons:
            return self._singletons[interface]
        implementation, is_singleton = self._services[interface]
        instance = implementation()
        if is_singleton:
            self._singletons[interface] = instance
        return instance
```
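以下为一个最小使用示意(其中 `FunASRClient` 是为演示假设的实现类名,实际使用时应替换为项目中的具体实现):
```python
# 用法示意:注册接口与实现,并以单例方式解析
container = DIContainer()
container.register(ASRInterface, FunASRClient, singleton=True)  # FunASRClient为假设的实现类
asr_a = container.resolve(ASRInterface)
asr_b = container.resolve(ASRInterface)
assert asr_a is asr_b  # 单例注册时两次解析返回同一实例
```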
## 2. 代码规范与质量
### 2.1 代码风格统一
#### 配置文件设置
**pyproject.toml**
```toml
[tool.black]
line-length = 88
target-version = ['py38']
include = '\.pyi?$'
[tool.isort]
profile = "black"
multi_line_output = 3
line_length = 88
# 注意:flake8 本身并不读取 pyproject.toml,此段需配合 pyproject-flake8 插件使用,或改写到 .flake8/setup.cfg
[tool.flake8]
max-line-length = 88
extend-ignore = ["E203", "W503"]
exclude = [".git", "__pycache__", "build", "dist"]
[tool.mypy]
python_version = "3.8"
warn_return_any = true
warn_unused_configs = true
disallow_untyped_defs = true
```
**pre-commit 配置**
```yaml
# .pre-commit-config.yaml
repos:
  - repo: https://github.com/psf/black
    rev: 23.3.0
    hooks:
      - id: black
  - repo: https://github.com/pycqa/isort
    rev: 5.12.0
    hooks:
      - id: isort
  - repo: https://github.com/pycqa/flake8
    rev: 6.0.0
    hooks:
      - id: flake8
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.3.0
    hooks:
      - id: mypy
```
### 2.2 类型注解增强
**示例改进**
```python
# 改进前
def process_audio(data, sample_rate):
    return data

# 改进后
from typing import Optional, Union
import numpy as np

def process_audio(
    data: Union[np.ndarray, bytes],
    sample_rate: int,
    channels: int = 1
) -> Optional[np.ndarray]:
    """处理音频数据

    Args:
        data: 音频数据,支持numpy数组或字节流
        sample_rate: 采样率
        channels: 声道数,默认为1

    Returns:
        处理后的音频数据,失败时返回None

    Raises:
        ValueError: 当采样率无效时
    """
    if sample_rate <= 0:
        raise ValueError(f"Invalid sample rate: {sample_rate}")
    # 处理逻辑...
    return processed_data
```
### 2.3 错误处理标准化
**自定义异常类**
```python
# utils/exceptions.py
class EmanOneException(Exception):
    """项目基础异常类"""
    pass

class ASRConnectionError(EmanOneException):
    """ASR连接异常"""
    pass

class AudioProcessingError(EmanOneException):
    """音频处理异常"""
    pass

class ConfigurationError(EmanOneException):
    """配置错误异常"""
    pass
```
**统一错误处理装饰器**
```python
# utils/decorators.py
from functools import wraps
from typing import Any, Callable, Optional
import logging

from utils.exceptions import EmanOneException

def handle_exceptions(logger: Optional[logging.Logger] = None):
    def decorator(func: Callable) -> Callable:
        @wraps(func)
        def wrapper(*args, **kwargs) -> Any:
            try:
                return func(*args, **kwargs)
            except EmanOneException as e:
                if logger:
                    logger.error(f"{func.__name__} failed: {e}")
                raise
            except Exception as e:
                if logger:
                    logger.error(f"Unexpected error in {func.__name__}: {e}")
                raise EmanOneException(f"Unexpected error: {e}") from e
        return wrapper
    return decorator
```
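装饰器的典型用法如下(示意代码,函数名与异常场景均为举例):
```python
import logging
from utils.exceptions import ASRConnectionError

logger = logging.getLogger("eman_one")

@handle_exceptions(logger)
def connect_asr(host: str, port: int) -> None:
    # 连接失败时抛出项目自定义异常,由装饰器统一记录日志后继续上抛
    raise ASRConnectionError(f"无法连接到 {host}:{port}")
```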
## 3. 测试策略
### 3.1 测试金字塔实现
**目录结构**
```
test/
├── unit/                    # 单元测试 (70%)
│   ├── test_asr.py
│   ├── test_recorder.py
│   └── test_utils.py
├── integration/             # 集成测试 (20%)
│   ├── test_asr_integration.py
│   └── test_api_integration.py
├── e2e/                     # 端到端测试 (10%)
│   └── test_voice_workflow.py
├── fixtures/                # 测试数据
│   ├── audio_samples/
│   └── config_samples/
└── conftest.py              # pytest配置
```
**pytest 配置示例**
```python
# test/conftest.py
import pytest
import tempfile
import os
from unittest.mock import Mock

@pytest.fixture
def temp_audio_file():
    """临时音频文件fixture"""
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
        # 创建测试音频数据
        yield f.name
    os.unlink(f.name)

@pytest.fixture
def mock_asr_client():
    """模拟ASR客户端"""
    mock = Mock()
    mock.connect.return_value = True
    mock.send_audio.return_value = None
    mock.get_result.return_value = "测试识别结果"
    return mock

@pytest.fixture(scope="session")
def test_config():
    """测试配置"""
    return {
        "asr": {
            "host": "localhost",
            "port": 10095,
            "timeout": 5
        },
        "audio": {
            "sample_rate": 16000,
            "channels": 1
        }
    }
```
### 3.2 性能测试
**基准测试示例**
```python
# test/performance/test_benchmarks.py
# 依赖 pytest-benchmark 插件提供 benchmark fixture
import pytest
from utils.util import process_audio_data

class TestPerformance:
    def test_audio_processing_speed(self, benchmark):
        """测试音频处理性能"""
        audio_data = b'\x00' * 16000  # 约0.5秒的16kHz/16bit单声道音频数据
        result = benchmark(process_audio_data, audio_data)
        assert result is not None

    @pytest.mark.parametrize("data_size", [1000, 10000, 100000])
    def test_memory_usage(self, data_size):
        """测试内存使用情况"""
        import psutil
        import os

        process = psutil.Process(os.getpid())
        memory_before = process.memory_info().rss
        # 执行测试操作
        large_data = b'\x00' * data_size
        process_audio_data(large_data)
        memory_after = process.memory_info().rss
        memory_diff = memory_after - memory_before
        # 确保内存增长在合理范围内
        assert memory_diff < data_size * 2
```
## 4. 文档管理体系
### 4.1 文档分类标准
**Diátaxis 框架应用**
```
doc/
├── tutorials/               # 教程 - 学习导向
│   ├── quick_start.md
│   └── voice_setup_guide.md
├── how-to/                  # 指南 - 问题导向
│   ├── troubleshooting.md
│   └── performance_tuning.md
├── reference/               # 参考 - 信息导向
│   ├── api_reference.md
│   ├── config_reference.md
│   └── cli_reference.md
├── explanation/             # 说明 - 理解导向
│   ├── architecture.md
│   └── design_decisions.md
└── process/                 # 过程文档
    ├── update.log
    └── meeting_notes/
```
### 4.2 自动化文档生成
**API文档生成**
```python
# scripts/generate_docs.py
import inspect
from pathlib import Path

def generate_api_docs():
    """自动生成API文档"""
    modules = [
        'funasr_asr_sync',
        'recorder_sync',
        'server_recording_api_sync'
    ]
    output_dir = Path("doc/reference")
    output_dir.mkdir(parents=True, exist_ok=True)
    for module_name in modules:
        module = __import__(module_name)
        doc_content = f"# {module_name} API Reference\n\n"
        for name, obj in inspect.getmembers(module):
            if inspect.isclass(obj) or inspect.isfunction(obj):
                doc_content += f"## {name}\n\n"
                doc_content += f"{inspect.getdoc(obj) or 'No documentation'}\n\n"
        with open(output_dir / f"{module_name}_api.md", "w", encoding="utf-8") as f:
            f.write(doc_content)

if __name__ == "__main__":
    generate_api_docs()
```
## 5. 持续集成与部署
### 5.1 GitHub Actions 配置
**.github/workflows/ci.yml**
```yaml
name: CI/CD Pipeline

on:
  push:
    branches: [ main, develop ]
  pull_request:
    branches: [ main ]

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: [3.8, 3.9, '3.10']
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install pytest pytest-cov black isort flake8
      - name: Run linting
        run: |
          black --check .
          isort --check-only .
          flake8 .
      - name: Run tests
        run: |
          pytest --cov=. --cov-report=xml
      - name: Upload coverage
        uses: codecov/codecov-action@v3
        with:
          file: ./coverage.xml
  security:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Run security scan
        uses: pypa/gh-action-pip-audit@v1.0.8
```
### 5.2 代码质量监控
**SonarQube 配置**
```properties
# sonar-project.properties
sonar.projectKey=eman_one
sonar.projectName=Eman One Voice Processing
sonar.projectVersion=1.0
sonar.sources=.
sonar.exclusions=**/*_test.py,**/test_*.py,**/__pycache__/**
sonar.python.coverage.reportPaths=coverage.xml
sonar.python.xunit.reportPath=test-results.xml
sonar.qualitygate.wait=true
```
## 6. 监控与可观测性
### 6.1 结构化日志
**日志配置增强**
```python
# utils/logging_config.py
import logging
import json
from datetime import datetime

class StructuredFormatter(logging.Formatter):
    def format(self, record):
        log_entry = {
            'timestamp': datetime.utcnow().isoformat(),
            'level': record.levelname,
            'logger': record.name,
            'message': record.getMessage(),
            'module': record.module,
            'function': record.funcName,
            'line': record.lineno
        }
        if hasattr(record, 'user_id'):
            log_entry['user_id'] = record.user_id
        if hasattr(record, 'request_id'):
            log_entry['request_id'] = record.request_id
        return json.dumps(log_entry, ensure_ascii=False)

def setup_structured_logging():
    formatter = StructuredFormatter()
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    logger = logging.getLogger('eman_one')
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)
    return logger
```
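使用时通过 `extra` 传入自定义字段,即可被 `StructuredFormatter` 识别并输出(示意,字段值为举例):
```python
logger = setup_structured_logging()
# extra中的键会写入LogRecord属性,被上面的formatter捕获输出
logger.info("录音已开始", extra={"request_id": "req-001", "user_id": "u-42"})
```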
### 6.2 性能指标收集
**指标收集器**
```python
# utils/metrics.py
import time
from collections import defaultdict
from contextlib import contextmanager
from typing import Dict, Any

class MetricsCollector:
    def __init__(self):
        self.counters = defaultdict(int)
        self.timers = defaultdict(list)
        self.gauges = defaultdict(float)

    def increment(self, name: str, value: int = 1):
        """计数器递增"""
        self.counters[name] += value

    def set_gauge(self, name: str, value: float):
        """设置仪表值"""
        self.gauges[name] = value

    @contextmanager
    def timer(self, name: str):
        """计时器上下文管理器"""
        start_time = time.time()
        try:
            yield
        finally:
            duration = time.time() - start_time
            self.timers[name].append(duration)

    def get_metrics(self) -> Dict[str, Any]:
        """获取所有指标"""
        return {
            'counters': dict(self.counters),
            'timers': {
                name: {
                    'count': len(times),
                    'avg': sum(times) / len(times) if times else 0,
                    'min': min(times) if times else 0,
                    'max': max(times) if times else 0
                }
                for name, times in self.timers.items()
            },
            'gauges': dict(self.gauges)
        }

# 全局指标收集器
metrics = MetricsCollector()
```
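各业务模块可直接复用全局 `metrics` 实例,示意如下:
```python
metrics.increment("asr.requests")
with metrics.timer("asr.recognize"):
    pass  # 此处执行识别逻辑
metrics.set_gauge("asr.queue_depth", 3.0)
print(metrics.get_metrics())  # 导出计数、计时统计与仪表值
```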
## 7. 安全性增强
### 7.1 配置安全
**敏感信息管理**
```python
# utils/security.py
import os
from cryptography.fernet import Fernet
from typing import Optional

class SecureConfig:
    def __init__(self):
        self.key = self._get_or_create_key()
        self.cipher = Fernet(self.key)

    def _get_or_create_key(self) -> bytes:
        key_file = '.encryption_key'
        if os.path.exists(key_file):
            with open(key_file, 'rb') as f:
                return f.read()
        else:
            key = Fernet.generate_key()
            with open(key_file, 'wb') as f:
                f.write(key)
            return key

    def encrypt_value(self, value: str) -> str:
        """加密敏感值"""
        return self.cipher.encrypt(value.encode()).decode()

    def decrypt_value(self, encrypted_value: str) -> str:
        """解密敏感值"""
        return self.cipher.decrypt(encrypted_value.encode()).decode()

    def get_env_or_encrypted(self, key: str, encrypted_fallback: Optional[str] = None) -> Optional[str]:
        """优先从环境变量获取,否则使用加密值"""
        env_value = os.getenv(key)
        if env_value:
            return env_value
        if encrypted_fallback:
            return self.decrypt_value(encrypted_fallback)
        return None
```
### 7.2 输入验证
**数据验证器**
```python
# utils/validators.py
import re
from pydantic import BaseModel, validator

class AudioConfig(BaseModel):
    sample_rate: int
    channels: int
    chunk_size: int

    @validator('sample_rate')
    def validate_sample_rate(cls, v):
        if v not in [8000, 16000, 22050, 44100, 48000]:
            raise ValueError('Invalid sample rate')
        return v

    @validator('channels')
    def validate_channels(cls, v):
        if v not in [1, 2]:
            raise ValueError('Channels must be 1 or 2')
        return v

class ASRConfig(BaseModel):
    host: str
    port: int
    timeout: int

    @validator('host')
    def validate_host(cls, v):
        # 简单的主机名/IP验证
        if not re.match(r'^[a-zA-Z0-9.-]+$', v):
            raise ValueError('Invalid host format')
        return v

    @validator('port')
    def validate_port(cls, v):
        if not 1 <= v <= 65535:
            raise ValueError('Port must be between 1 and 65535')
        return v
```
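非法配置会在模型构造时立即抛出校验错误,便于尽早暴露问题(示意):
```python
from pydantic import ValidationError

try:
    AudioConfig(sample_rate=12345, channels=3, chunk_size=1024)
except ValidationError as e:
    print(e)  # 会同时报告sample_rate与channels两处错误
```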
## 8. 实施计划
### 8.1 优先级分级
**高优先级 (立即实施)**
1. ✅ 依赖包管理 (已完成)
2. 🔄 代码格式化工具配置
3. 🔄 基础测试框架搭建
4. 🔄 错误处理标准化
**中优先级 (1-2周内)**
1. 接口抽象设计
2. 结构化日志实现
3. 性能监控集成
4. 安全性增强
**低优先级 (1个月内)**
1. 完整CI/CD流水线
2. 自动化文档生成
3. 高级监控仪表盘
4. 性能基准测试
### 8.2 成功指标
**代码质量指标**
- 代码覆盖率 > 80%
- 代码重复率 < 5%
- 技术债务评级 A
- 安全漏洞数量 = 0
**可维护性指标**
- 平均修复时间 < 2小时
- 新功能开发周期 < 1周
- 文档覆盖率 > 90%
- 团队满意度 > 4.5/5
## 总结
本指南提供了全面的代码质量和可维护性增强方案,通过系统性的改进措施,将显著提升 `eman_one` 项目的代码质量、开发效率和长期可维护性。建议按照优先级逐步实施,并持续监控改进效果。
---
# FunASR语音识别优化分析报告
**AIfeng/2025-07-01 16:51:01**
## 概述
基于参考项目Fay-main的FunASR WebSocket架构分析,对当前eman_one项目的语音识别方案进行技术对比和优化建议。重点分析前端录音与服务端录音的技术差异,提出体验改进方案。
## 1. 技术架构对比分析
### 1.1 当前eman_one项目架构
**前端录音方案**
- **录音方式**:浏览器MediaRecorder API
- **数据流**:前端采集 → WebM/Opus编码 → Base64传输 → 服务端解码
- **传输协议**:WebSocket (10197端口)
- **处理模式**:分段录音 + 批量传输
**技术特点**
```javascript
// 当前实现方式
mediaRecorder = new MediaRecorder(audioStream, {
    mimeType: 'audio/webm;codecs=opus'
});
mediaRecorder.start(1000); // 每秒收集一次数据

// 数据处理
mediaRecorder.ondataavailable = function(event) {
    if (event.data.size > 0) {
        audioChunks.push(event.data);
    }
};
```
### 1.2 参考项目Fay架构
**服务端录音方案**
- **录音方式**:Python pyaudio直接采集
- **数据流**:服务端采集 → PCM原始数据 → 实时流式传输
- **处理模式**:连续录音 + 实时VAD断句
- **音频参数**:16kHz, 单声道, 16bit PCM
**技术特点**
```python
# Fay实现方式
stream = pyaudio.PyAudio().open(
    format=pyaudio.paInt16,
    channels=1,
    rate=16000,
    input=True,
    frames_per_buffer=1024
)

# 实时音频处理
while recording:
    audio_data = stream.read(1024)
    # 实时VAD检测和传输
```
## 2. 关键技术差异分析
### 2.1 音频采集方式
| 对比维度 | eman_one (前端录音) | Fay (服务端录音) |
|---------|-------------------|------------------|
| **延迟性** | 较高 (1秒批量) | 极低 (实时流) |
| **音质** | 有损压缩 (Opus) | 无损 (PCM) |
| **兼容性** | 浏览器依赖 | 系统级控制 |
| **资源占用** | 客户端CPU | 服务端CPU |
| **网络传输** | 压缩后较小 | 原始数据较大 |
### 2.2 VAD语音活动检测
**eman_one现状**
- 前端简单音量检测
- 静音超时触发断句
- 缺乏智能语音边界检测
**Fay优势**
- 服务端专业VAD算法
- 动态阈值自适应
- 历史音频缓存机制
- 环境噪音适应
### 2.3 实时性对比
**延迟分析**
```
eman_one延迟链路:
录音(1s) → 编码 → 传输 → 解码 → 识别 ≈ 1.2-1.5秒
Fay延迟链路:
录音(20ms) → 传输 → 识别 ≈ 100-200毫秒
```
## 3. 体验问题识别
### 3.1 当前eman_one存在的问题
1. **响应延迟高**
- 1秒批量传输导致明显延迟
- 用户体验不够流畅
2. **断句不准确**
- 简单音量阈值容易误判
- 无法处理复杂语音场景
3. **音质损失**
- Opus压缩影响识别准确率
- Base64传输增加数据量
4. **打断处理不完善**
- 缺乏智能打断机制
- 回音消除不够完善
### 3.2 技术风险评估
**前端录音方案风险**
- 浏览器兼容性问题
- 移动端性能限制
- 网络不稳定影响
- 用户权限管理复杂
**服务端录音方案风险**
- 需要本地部署
- 硬件设备依赖
- 多用户并发处理
- 系统权限要求
## 4. 优化改进方案
### 4.1 短期优化(保持前端录音)
#### 4.1.1 降低传输延迟
```javascript
// 优化:减少批量间隔
mediaRecorder.start(100); // 改为100ms

// 实时传输优化
mediaRecorder.ondataavailable = function(event) {
    if (event.data.size > 0) {
        // 立即发送,不等待批量
        sendAudioToASR(event.data);
    }
};
```
#### 4.1.2 改进VAD算法
```javascript
// 增强的VAD检测
function enhancedVAD(audioData) {
    // 1. 音量检测
    const volume = calculateRMS(audioData);
    // 2. 频谱分析
    const spectrum = analyzeSpectrum(audioData);
    // 3. 动态阈值
    updateDynamicThreshold(volume);
    // 4. 语音边界检测
    return detectSpeechBoundary(volume, spectrum);
}
```
#### 4.1.3 音频质量优化
```javascript
// 使用更高质量的编码参数
mediaRecorder = new MediaRecorder(audioStream, {
    mimeType: 'audio/webm;codecs=opus',
    audioBitsPerSecond: 128000 // 提高比特率
});

// 音频预处理
function preprocessAudio(audioData) {
    // 降噪处理
    // 音量归一化
    // 格式标准化
    return processedAudio;
}
```
### 4.2 中期优化(混合方案)
#### 4.2.1 双模式支持
```python
# 服务端支持多种输入模式
class HybridASRServer:
    def __init__(self):
        self.web_mode = True     # 前端录音模式
        self.local_mode = False  # 服务端录音模式

    async def handle_web_audio(self, websocket, audio_data):
        """处理前端传输的音频"""
        # 解码和预处理
        processed_audio = self.preprocess_web_audio(audio_data)
        return await self.recognize(processed_audio)

    async def handle_local_audio(self):
        """处理服务端录音"""
        # 直接录音和处理
        audio_stream = self.capture_local_audio()
        return await self.recognize(audio_stream)
```
#### 4.2.2 智能模式切换
```javascript
// 根据环境自动选择最优模式
function selectOptimalMode() {
    const factors = {
        networkLatency: measureNetworkLatency(),
        devicePerformance: assessDevicePerformance(),
        audioQuality: testAudioQuality()
    };
    return factors.networkLatency < 50 ? 'web' : 'local';
}
```
### 4.3 长期优化(全面升级)
#### 4.3.1 采用Fay架构模式
```python
# 参考Fay实现服务端录音
class FayStyleASR:
    def __init__(self):
        self.recorder = AudioRecorder(
            sample_rate=16000,
            channels=1,
            chunk_size=1024
        )
        self.vad = VoiceActivityDetector()
        self.asr_client = FunASRClient()

    async def continuous_recording(self):
        """连续录音处理"""
        while self.recording:
            audio_chunk = await self.recorder.read_chunk()
            # VAD检测
            if self.vad.is_speech(audio_chunk):
                await self.asr_client.send_audio(audio_chunk)
            # 断句检测
            if self.vad.is_sentence_end():
                await self.process_sentence_end()
```
#### 4.3.2 完整的VAD系统
```python
# 专业VAD实现
class AdvancedVAD:
    def __init__(self):
        self.volume_threshold = 0.01
        self.silence_duration = 0
        self.speech_duration = 0
        self.history_buffer = []

    def detect(self, audio_chunk):
        # 1. 音量计算
        volume = self.calculate_volume(audio_chunk)
        # 2. 动态阈值调整
        self.update_threshold(volume)
        # 3. 语音活动判断
        is_speech = volume > self.volume_threshold
        # 4. 状态机处理
        return self.state_machine(is_speech)
```
## 5. 实施建议
### 5.1 优先级排序
**P0 (立即实施)**
1. 降低MediaRecorder传输间隔至100ms
2. 优化音频编码参数
3. 改进前端VAD算法
**P1 (1-2周)**
1. 实现音频预处理管道
2. 添加动态阈值调整
3. 完善错误处理和重连机制
**P2 (1个月)**
1. 开发混合录音模式
2. 实现智能模式切换
3. 集成专业VAD算法
**P3 (长期规划)**
1. 完全迁移到服务端录音
2. 实现Fay级别的实时性
3. 支持多用户并发处理
### 5.2 技术选型建议
**保持前端录音的情况下**
- 使用WebRTC AudioWorklet替代MediaRecorder
- 实现客户端音频预处理
- 采用WebSocket流式传输
**迁移到服务端录音**
- 采用pyaudio + asyncio架构
- 集成专业VAD库(如webrtcvad,接入示意见下方代码)
- 实现多用户音频隔离
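以 webrtcvad 为例,帧级VAD的接入方式大致如下(示意代码,假设输入为16kHz、16bit单声道PCM;webrtcvad 仅支持10/20/30ms帧长):
```python
import webrtcvad

vad = webrtcvad.Vad(2)  # 模式0-3,数值越大判定越严格
SAMPLE_RATE = 16000
FRAME_MS = 30
FRAME_BYTES = SAMPLE_RATE * FRAME_MS // 1000 * 2  # 16bit每采样2字节

def iter_speech_flags(pcm: bytes):
    """按30ms帧遍历PCM数据,逐帧返回是否检测到语音"""
    for offset in range(0, len(pcm) - FRAME_BYTES + 1, FRAME_BYTES):
        frame = pcm[offset:offset + FRAME_BYTES]
        yield vad.is_speech(frame, SAMPLE_RATE)
```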
### 5.3 性能目标
| 指标 | 当前状态 | 优化目标 |
|------|---------|----------|
| **端到端延迟** | 1.2-1.5秒 | <300ms |
| **识别准确率** | 85% | >95% |
| **断句准确率** | 70% | >90% |
| **并发用户** | 10 | 100+ |
## 6. 风险评估与缓解
### 6.1 技术风险
**风险1:服务端录音权限问题**
- 缓解:提供详细的部署文档和权限配置指南
- 备选:保持前端录音作为fallback方案
**风险2:多用户并发冲突**
- 缓解:实现音频设备虚拟化和隔离
- 备选:限制并发数量或使用队列机制
**风险3:系统兼容性问题**
- 缓解:支持多种音频驱动和设备
- 备选:提供Docker容器化部署
### 6.2 业务风险
**风险1:用户体验中断**
- 缓解:渐进式迁移,保持向后兼容
- 监控:实时监控关键指标
**风险2:部署复杂度增加**
- 缓解:提供一键部署脚本
- 文档:完善的运维手册
## 7. 结论与建议
### 7.1 核心结论
1. **技术差异显著**:Fay的服务端录音方案在实时性和音质方面明显优于当前前端录音方案
2. **体验提升空间大**:通过优化可将延迟从1.5秒降低到300ms以内
3. **实施可行性高**:可采用渐进式优化策略,降低迁移风险
### 7.2 推荐方案
**阶段一(立即实施)**
- 优化现有前端录音方案
- 实现基础的实时传输
- 改进VAD算法
**阶段二(中期目标)**
- 开发混合录音模式
- 支持智能模式切换
- 集成专业音频处理
**阶段三(长期愿景)**
- 完全采用服务端录音
- 达到Fay级别的用户体验
- 支持大规模并发
### 7.3 关键成功因素
1. **渐进式迁移**:避免一次性大改造带来的风险
2. **性能监控**:建立完善的指标监控体系
3. **用户反馈**:及时收集和响应用户体验反馈
4. **技术储备**:提前准备相关技术栈和人员培训
通过系统性的优化改进,eman_one项目完全可以达到甚至超越Fay项目的语音识别体验水平。
---
# AIfeng/2025-07-17 17:04:42
# FunASR协议兼容性修复方案
## 问题分析
### 根本原因
- **协议不匹配**: FunASRSync客户端发送的分块协议与ASR_server.py期望的格式不兼容
- **服务端限制**: ASR_server.py只处理包含`url`或`audio_data`字段的消息
- **分块协议**: FunASRSync使用`audio_start`、`audio_chunk`、`audio_end`等新格式
### 现象确认
- 小文件正常:使用简单模式,发送标准`audio_data`格式
- 大文件失败:使用分块模式,发送不兼容的协议格式
- 服务端无响应:ASR_server.py无法识别分块协议消息
## 解决方案
### 方案一:服务端协议扩展(推荐)
#### 优势
- 保持客户端分块优化不变
- 服务端支持更灵活的协议
- 向后兼容现有格式
#### 实施步骤
1. **扩展消息处理**: 在`ws_serve`函数中添加分块协议支持
2. **分块重组**: 实现音频分块的接收和重组逻辑
3. **状态管理**: 维护每个连接的分块接收状态
### 方案二:客户端协议回退(备选)
#### 优势
- 无需修改服务端
- 实施简单快速
#### 劣势
- 失去分块传输优势
- 大文件仍可能超时
## 推荐实施:服务端协议扩展
### 核心修改点
#### 1. 消息路由扩展
```python
# 在ws_serve函数中添加
if 'type' in data:
    # 处理分块协议
    await handle_chunked_protocol(websocket, data)
else:
    # 现有逻辑保持不变
    if 'url' in data:
        await task_queue.put((websocket, data['url'], 'url'))
    elif 'audio_data' in data:
        await task_queue.put((websocket, data, 'audio_data'))
```
#### 2. 分块状态管理
```python
# 全局状态管理
chunk_sessions = {} # {user_id: {filename, chunks, total_chunks, ...}}
```
#### 3. 分块处理逻辑
- **audio_start**: 初始化接收会话
- **audio_chunk**: 累积音频分块
- **audio_end**: 完成重组并处理(整体流程见下方示意代码)
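下面给出 `handle_chunked_protocol` 的一个最小实现草图(字段名沿用上文协议定义;为简化示意,这里以websocket连接作为会话键,并省略了超时清理与丢块检测,实际可按user_id组织并补全校验,`task_queue` 复用现有服务端队列,入队字段格式以服务端实现为准):
```python
import base64

chunk_sessions = {}  # {websocket: {'filename': str, 'chunks': {index: bytes}}}

async def handle_chunked_protocol(websocket, data):
    msg_type = data['type']
    if msg_type == 'audio_start':
        chunk_sessions[websocket] = {'filename': data['filename'], 'chunks': {}}
    elif msg_type == 'audio_chunk':
        session = chunk_sessions.get(websocket)
        if session is not None:
            session['chunks'][data['chunk_index']] = base64.b64decode(data['chunk_data'])
    elif msg_type == 'audio_end':
        session = chunk_sessions.pop(websocket, None)
        if session is not None:
            # 按分块序号拼接,复用现有audio_data处理路径
            audio = b''.join(session['chunks'][i] for i in sorted(session['chunks']))
            await task_queue.put((websocket, {'audio_data': audio,
                                              'filename': session['filename']}, 'audio_data'))
```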
### 性能优化
#### 内存管理
- 使用临时文件而非内存缓存大文件
- 及时清理完成的会话
- 设置会话超时机制
#### 错误处理
- 分块丢失检测
- 会话超时清理(示意实现见下方代码)
- 异常状态恢复
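会话超时清理可参考如下示意实现(假设每收到一个分块就刷新会话的 `last_active` 时间戳,超时时长需按分块大小与网络状况调优):
```python
import asyncio
import time

SESSION_TIMEOUT = 60  # 秒,示例值

async def cleanup_expired_sessions():
    """后台任务:周期性丢弃长时间无新分块的会话,释放内存"""
    while True:
        await asyncio.sleep(10)
        now = time.time()
        expired = [key for key, session in chunk_sessions.items()
                   if now - session.get('last_active', now) > SESSION_TIMEOUT]
        for key in expired:
            chunk_sessions.pop(key, None)
```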
## 实施计划
### Phase 1: 基础协议支持(2小时)
1. 添加分块消息路由
2. 实现基础分块重组
3. 测试小规模分块
### Phase 2: 稳定性增强(4小时)
1. 完善错误处理
2. 添加状态管理
3. 实施超时机制
### Phase 3: 性能优化(1天)
1. 内存优化
2. 并发处理
3. 监控指标
## 测试验证
### 测试用例
1. **小文件兼容性**: 确保现有简单模式正常
2. **大文件分块**: 验证2MB、5MB、10MB文件处理
3. **并发处理**: 多客户端同时发送
4. **异常恢复**: 网络中断、分块丢失等场景
### 成功指标
- 大文件传输成功率 >95%
- 服务端内存使用稳定
- 响应时间合理(不超过文件大小(MB)×2秒)
- 向后兼容性100%
## 风险评估
### 低风险
- 向后兼容:现有协议完全保留
- 渐进式:可分阶段实施
- 可回滚:出问题可快速恢复
### 注意事项
- 内存使用监控
- 并发连接限制
- 分块大小合理性
- 超时参数调优
## 监控指标
### 关键指标
- 分块接收成功率
- 音频重组完整性
- 服务端内存峰值
- 平均处理延迟
### 告警阈值
- 分块丢失率 >1%
- 内存使用 >1GB
- 处理延迟 >30秒
- 会话超时率 >5%
---
# AIfeng/2025-07-17 16:38:52
# FunASRSync大文件处理优化方案
## 当前状态确认
### 启用方案
**当前项目启用的是 `funasr_asr_sync.py` 同步版本**
- **主程序**: `app.py` 第415行导入 `from funasr_asr_sync import FunASRSync`
- **实例化**: `create_asr_connection()` 函数中创建 `FunASRSync(username)` 实例
- **调用路径**: `humanaudio()` → `handle_funasr()` → `ensure_asr_connection()` → `create_asr_connection()`
### 当前实现分析
**连接配置**:
- WebSocket连接超时: 5秒 (第195-219行)
- 发送间隔: 0.04秒 (第106-120行)
- 重连机制: 指数退避,初始延迟1秒
**大文件处理机制**:
- 音频数据通过Base64编码一次性发送 (第160-190行)
- 无分块处理机制
- 无流控制
- 无超时重试
## 问题分析
### 1. 大文件超时根因
**主要问题**:
- **一次性发送**: 大文件Base64编码后体积增大33%,单次发送易超时
- **无分块机制**: 缺乏音频数据分片处理
- **连接超时过短**: 5秒超时对大文件处理不足
- **无进度反馈**: 客户端无法感知处理进度
**技术瓶颈**:
- Base64编码内存占用高
- WebSocket单帧大小限制
- 网络传输稳定性依赖
### 2. 性能影响评估
| 文件大小 | Base64后大小 | 预估传输时间 | 超时风险 |
|---------|-------------|-------------|----------|
| 1MB | 1.33MB | 1-2秒 | 低 |
| 5MB | 6.65MB | 5-10秒 | 中 |
| 10MB | 13.3MB | 10-20秒 | 高 |
| 20MB+ | 26.6MB+ | 20秒+ | 极高 |
## 优化方案
### 阶段一:立即可行优化 (1-2天)
#### 1.1 超时参数调优
```python
# 配置优化建议
ASR_CONNECTION_TIMEOUT = 30 # 连接超时从5秒增加到30秒
ASR_SEND_TIMEOUT = 60 # 新增发送超时配置
ASR_CHUNK_SIZE = 1024 * 512 # 512KB分块大小
ASR_SEND_INTERVAL = 0.02 # 发送间隔从0.04秒减少到0.02秒
```
#### 1.2 分块发送机制
```python
def send_audio_data_chunked(self, audio_bytes, filename="audio.wav", chunk_size=512*1024):
    """分块发送音频数据"""
    import base64
    import math

    try:
        # 数据预处理
        if hasattr(audio_bytes, 'tobytes'):
            audio_bytes = audio_bytes.tobytes()
        elif isinstance(audio_bytes, memoryview):
            audio_bytes = bytes(audio_bytes)

        total_size = len(audio_bytes)
        total_chunks = math.ceil(total_size / chunk_size)
        util.log(1, f"开始分块发送: {filename}, 总大小: {total_size} bytes, 分块数: {total_chunks}")

        # 发送开始信号
        start_frame = {
            'type': 'audio_start',
            'filename': filename,
            'total_size': total_size,
            'total_chunks': total_chunks,
            'chunk_size': chunk_size
        }
        self._send_frame_with_retry(start_frame)

        # 分块发送
        for i in range(total_chunks):
            start_pos = i * chunk_size
            end_pos = min(start_pos + chunk_size, total_size)
            chunk_data = audio_bytes[start_pos:end_pos]

            # Base64编码分块
            chunk_b64 = base64.b64encode(chunk_data).decode('utf-8')
            chunk_frame = {
                'type': 'audio_chunk',
                'filename': filename,
                'chunk_index': i,
                'chunk_data': chunk_b64,
                'is_last': (i == total_chunks - 1)
            }

            # 发送分块并等待确认
            success = self._send_frame_with_retry(chunk_frame)
            if not success:
                util.log(3, f"分块 {i+1}/{total_chunks} 发送失败")
                return False

            # 进度日志
            if (i + 1) % 10 == 0 or i == total_chunks - 1:
                progress = ((i + 1) / total_chunks) * 100
                util.log(1, f"发送进度: {progress:.1f}% ({i+1}/{total_chunks})")

            # 流控延迟
            time.sleep(0.01)

        # 发送结束信号
        end_frame = {
            'type': 'audio_end',
            'filename': filename
        }
        self._send_frame_with_retry(end_frame)
        util.log(1, f"音频数据分块发送完成: {filename}")
        return True
    except Exception as e:
        util.log(3, f"分块发送音频数据时出错: {e}")
        return False
```
#### 1.3 重试机制增强
```python
def _send_frame_with_retry(self, frame, max_retries=3, timeout=10):
    """带重试的帧发送"""
    for attempt in range(max_retries):
        try:
            if self.__ws and self.__connected:
                # 设置发送超时
                start_time = time.time()
                self.__ws.send(json.dumps(frame))
                # 简单的发送确认检查
                time.sleep(0.05)  # 等待发送完成
                if time.time() - start_time < timeout:
                    return True
                else:
                    util.log(2, f"发送超时,尝试 {attempt + 1}/{max_retries}")
            else:
                util.log(2, f"连接不可用,尝试 {attempt + 1}/{max_retries}")
        except Exception as e:
            util.log(2, f"发送失败,尝试 {attempt + 1}/{max_retries}: {e}")
        if attempt < max_retries - 1:
            time.sleep(0.5 * (attempt + 1))  # 线性递增退避
    return False
```
### 阶段二:稳定性优化 (3-5天)
#### 2.1 连接健康检查
```python
def _health_check(self):
    """连接健康检查"""
    if not self.__connected:
        return False
    try:
        # 发送心跳包
        ping_frame = {'type': 'ping', 'timestamp': time.time()}
        self.__ws.send(json.dumps(ping_frame))
        return True
    except Exception as e:
        util.log(2, f"健康检查失败: {e}")
        return False

def _start_health_monitor(self):
    """启动健康监控线程"""
    def monitor():
        while self.__connected:
            if not self._health_check():
                util.log(2, "连接健康检查失败,尝试重连")
                self.__attempt_reconnect()
            time.sleep(30)  # 30秒检查一次
    Thread(target=monitor, daemon=True).start()
```
#### 2.2 流控机制
```python
import time

class FlowController:
    """流量控制器"""
    def __init__(self, max_pending=5, window_size=1024*1024):
        self.max_pending = max_pending
        self.window_size = window_size
        self.pending_chunks = 0
        self.sent_bytes = 0
        self.last_reset = time.time()

    def can_send(self, chunk_size):
        """检查是否可以发送"""
        # 检查待处理分块数
        if self.pending_chunks >= self.max_pending:
            return False
        # 检查窗口大小
        now = time.time()
        if now - self.last_reset > 1.0:  # 每秒重置
            self.sent_bytes = 0
            self.last_reset = now
        if self.sent_bytes + chunk_size > self.window_size:
            return False
        return True

    def on_send(self, chunk_size):
        """发送时调用"""
        self.pending_chunks += 1
        self.sent_bytes += chunk_size

    def on_ack(self):
        """收到确认时调用"""
        self.pending_chunks = max(0, self.pending_chunks - 1)
```
### 阶段三:架构优化 (1周)
#### 3.1 异步发送队列
```python
from queue import Queue, Empty
from threading import Thread

class AsyncSendQueue:
    """异步发送队列"""
    def __init__(self, max_size=100):
        self.queue = Queue(maxsize=max_size)
        self.running = False
        self.worker_thread = None

    def start(self):
        """启动发送队列"""
        self.running = True
        self.worker_thread = Thread(target=self._worker, daemon=True)
        self.worker_thread.start()

    def _worker(self):
        """队列工作线程"""
        while self.running:
            try:
                item = self.queue.get(timeout=1)
                if item is None:  # 停止信号
                    break
                frame, callback = item
                success = self._send_frame(frame)
                if callback:
                    callback(success)
            except Empty:
                continue  # 队列暂时为空,继续等待
            except Exception as e:
                util.log(3, f"发送队列工作异常: {e}")

    def enqueue(self, frame, callback=None):
        """入队"""
        try:
            self.queue.put((frame, callback), timeout=5)
            return True
        except Exception:
            return False
```
## 配置文件更新
### config_util.py 新增配置
```python
# FunASR大文件优化配置
asr_connection_timeout = 30 # 连接超时(秒)
asr_send_timeout = 60 # 发送超时(秒)
asr_chunk_size = 512 * 1024 # 分块大小(bytes)
asr_max_retries = 3 # 最大重试次数
asr_send_interval = 0.02 # 发送间隔(秒)
asr_flow_control_window = 1024 * 1024 # 流控窗口大小
asr_max_pending_chunks = 5 # 最大待处理分块数
asr_health_check_interval = 30 # 健康检查间隔(秒)
```
## 实施计划
### 第1天:立即优化
- [ ] 更新超时配置
- [ ] 实现基础分块发送
- [ ] 添加重试机制
- [ ] 测试小文件兼容性
### 第2-3天:稳定性增强
- [ ] 实现连接健康检查
- [ ] 添加流控机制
- [ ] 优化错误处理
- [ ] 大文件测试验证
### 第4-5天:性能调优
- [ ] 异步发送队列
- [ ] 内存使用优化
- [ ] 并发处理能力
- [ ] 压力测试
### 第6-7天:监控与文档
- [ ] 添加性能监控
- [ ] 完善日志记录
- [ ] 更新技术文档
- [ ] 用户使用指南
## 监控指标
### 关键指标
- **传输成功率**: 目标 >99%
- **平均传输时间**: 大文件 <30秒
- **重连频率**: <1次/小时
- **内存使用**: 峰值 <原来150%
### 监控实现
```python
class ASRMetrics:
    """ASR性能指标收集"""
    def __init__(self):
        self.total_requests = 0
        self.success_requests = 0
        self.total_bytes = 0
        self.total_time = 0
        self.reconnect_count = 0

    def record_request(self, size_bytes, duration_seconds, success):
        self.total_requests += 1
        self.total_bytes += size_bytes
        self.total_time += duration_seconds
        if success:
            self.success_requests += 1

    def get_stats(self):
        if self.total_requests == 0:
            return {}
        return {
            'success_rate': self.success_requests / self.total_requests,
            'avg_throughput': self.total_bytes / self.total_time if self.total_time > 0 else 0,
            'avg_duration': self.total_time / self.total_requests,
            'reconnect_rate': self.reconnect_count / self.total_requests
        }
```
## 风险评估
### 技术风险
- **兼容性**: 分块机制需要服务端支持
- **性能**: 分块可能增加延迟
- **复杂性**: 错误处理逻辑复杂化
### 缓解措施
- 保留原有发送方式作为降级方案
- 渐进式部署,先小范围测试
- 完善监控和告警机制
## 预期效果
### 性能提升
- 大文件(>5MB)成功率从60%提升到95%+
- 传输超时减少80%
- 用户体验显著改善
### 系统稳定性
- 连接稳定性提升
- 错误恢复能力增强
- 资源使用更合理
---
**优化重点**: 当前项目确实使用FunASRSync同步版本,主要问题是大文件一次性Base64发送导致超时。通过分块发送、重试机制和流控优化,可以显著改善大文件处理能力。
---
<!-- AIfeng/2025-07-01 17:47:46 -->
# 服务端录音测试页面功能测试指南
## 概述
`server_recording_test.html` 是一个功能完整的服务端录音测试面板,提供了录音控制、设备管理、配置调整、实时监控等多项功能。本指南详细说明各功能的测试方法。
## 页面访问
**访问地址**: `http://localhost:8010/server-recording-test`
**前置条件**:
- 确保服务端已启动(端口8010)
- 确保 `server_recording_api.py` 已正确注册
- 浏览器支持WebSocket和Web Audio API
## 核心功能测试
### 1. 获取状态功能测试
**功能描述**: 获取当前录音系统的状态信息
**测试步骤**:
1. 点击「获取状态」按钮
2. 观察状态面板显示内容
3. 检查操作日志区域
**预期结果**:
```
录音状态: ⚪ 已停止 / 🔴 录音中
设备索引: -1 (或具体设备编号)
采样率: 16000 Hz
声道数: 1
音量阈值: 0.01
静音超时: 2.0s
语音超时: 10.0s
```
**API调用**: `GET /api/server-recording/status`
**故障排查**:
- 如显示"获取状态失败",检查服务端是否正常运行
- 检查网络连接和API路由配置
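排查时也可以绕开页面,直接用脚本验证状态接口(示意代码,返回字段以服务端实际实现为准):
```python
import requests

resp = requests.get("http://localhost:8010/api/server-recording/status", timeout=5)
resp.raise_for_status()
print(resp.json())  # 期望包含录音状态、设备索引、采样率等字段
```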
### 2. 列出设备功能测试
**功能描述**: 获取系统可用的音频输入设备列表
**测试步骤**:
1. 点击「列出设备」按钮
2. 观察设备列表区域显示
3. 尝试点击选择不同设备
**预期结果**:
- 显示所有可用音频设备
- 每个设备显示:设备索引、设备名称、最大输入声道数
- 点击设备后高亮显示,设备索引自动填入配置面板
**API调用**: `GET /api/server-recording/devices`
### 3. 开始录音功能测试
**功能描述**: 启动服务端录音功能
**测试步骤**:
1. 确保已选择合适的音频设备
2. 点击「开始录音」按钮
3. 观察录音指示器和状态变化
4. 对着麦克风说话测试
**预期结果**:
- 录音指示器显示红色脉冲动画
- 状态面板显示"🔴 录音中"
- 音频可视化区域显示音量和频率变化
- WebSocket连接正常,实时接收音频数据
**API调用**: `POST /api/server-recording/start`
### 4. 停止录音功能测试
**功能描述**: 停止当前录音会话
**测试步骤**:
1. 在录音状态下点击「停止录音」按钮
2. 观察状态变化
**预期结果**:
- 录音指示器消失
- 状态面板显示"⚪ 已停止"
- 音频可视化停止更新
**API调用**: `POST /api/server-recording/stop`
### 5. 获取配置功能测试
**功能描述**: 获取当前录音系统配置参数
**测试步骤**:
1. 点击「获取配置」按钮
2. 观察配置面板各字段是否自动填充
**预期结果**:
- 所有配置字段显示当前服务端配置值
- 设备索引、采样率、声道数等参数正确显示
**API调用**: `GET /api/server-recording/config`
### 6. 更新配置功能测试
**功能描述**: 修改录音系统配置参数
**测试步骤**:
1. 修改配置面板中的参数值
- 设备索引: 选择有效设备编号
- 采样率: 16000/44100/48000 Hz
- 声道数: 1(单声道)/2(立体声)
- 音量阈值: 0.001-1.0
- 静音超时: 0.1-10.0秒
- 语音超时: 1.0-30.0秒
2. 点击「更新配置」按钮
3. 再次获取配置验证更新结果
**预期结果**:
- 配置更新成功提示
- 新配置立即生效
**API调用**: `POST /api/server-recording/config`
### 7. 测试ASR功能
**功能描述**: 模拟ASR语音识别功能测试
**测试步骤**:
1. 点击「测试ASR」按钮
2. 观察ASR结果面板变化
**预期结果**:
- 依次显示测试短语:
- "你好,这是一个测试"
- "语音识别功能正常"
- "测试完成"
- 每个结果显示置信度(85%-100%)
- 显示识别时间戳
### 8. 清空日志功能测试
**功能描述**: 清除操作日志记录
**测试步骤**:
1. 执行几个操作产生日志
2. 点击「清空日志」按钮
**预期结果**:
- 日志面板清空
- 显示"日志已清空"消息
## 实时监控功能测试
### 1. WebSocket连接测试
**测试方法**:
- 打开浏览器开发者工具 → Network → WS
- 观察WebSocket连接状态
- 检查消息收发情况
**预期结果**:
- 连接URL: `ws://localhost:8010/ws/server-recording`
- 连接状态: 已连接
- 定期接收状态更新消息
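除开发者工具外,也可以用 `websockets` 库编写最小连接探测脚本(示意代码,假设服务端以JSON文本帧推送状态消息):
```python
import asyncio
import websockets

async def probe():
    uri = "ws://localhost:8010/ws/server-recording"
    async with websockets.connect(uri) as ws:
        for _ in range(3):  # 接收几条状态消息后退出
            print("收到消息:", await ws.recv())

asyncio.run(probe())
```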
### 2. 音频可视化测试
**测试内容**:
- 音量条实时变化
- 频率条动态显示
- 音量百分比数值更新
**测试方法**:
- 开始录音后对麦克风说话
- 观察可视化效果变化
### 3. 性能监控测试
**监控指标**:
- **延迟**: API调用响应时间
- **吞吐量**: 每秒处理请求数
- **错误率**: 失败请求百分比
- **运行时间**: 系统运行时长
**测试方法**:
- 执行多个操作观察指标变化
- 故意触发错误观察错误率统计
### 4. 质量检测测试
**检测项目**:
- **信号质量**: 绿色(良好)/橙色(警告)/红色(错误)
- **噪音水平**: 环境噪音评估
- **VAD准确性**: 语音活动检测准确度
- **ASR准确性**: 语音识别准确度
## 故障排查指南
### 常见问题及解决方案
1. **WebSocket连接失败**
- 检查服务端是否启动
- 确认端口8010可访问
- 验证路由 `/ws/server-recording` 已注册
2. **获取状态失败**
- 检查API路由 `/api/server-recording/status`
- 确认服务端录音模块正常加载
3. **设备列表为空**
- 检查系统音频设备
- 确认麦克风权限已授予
- 验证音频驱动正常
4. **录音无响应**
- 检查设备索引是否有效
- 确认音频设备未被其他程序占用
- 验证采样率和声道数配置
5. **音频可视化无变化**
- 检查麦克风是否静音
- 确认音量阈值设置合理
- 验证WebSocket消息接收
## 测试检查清单
### 基础功能测试
- [ ] 页面正常加载
- [ ] WebSocket连接成功
- [ ] 获取状态功能正常
- [ ] 设备列表获取成功
- [ ] 配置获取和更新正常
### 录音功能测试
- [ ] 开始录音成功
- [ ] 停止录音成功
- [ ] 录音状态正确显示
- [ ] 音频可视化正常
### 高级功能测试
- [ ] ASR测试功能正常
- [ ] 性能监控数据准确
- [ ] 质量检测指标更新
- [ ] 日志记录完整
### 异常处理测试
- [ ] 网络断开自动重连
- [ ] 错误消息正确显示
- [ ] 异常状态恢复正常
## 性能基准
### 响应时间标准
- API调用延迟: < 100ms
- WebSocket消息延迟: < 50ms
- 音频可视化更新: < 100ms
### 资源使用标准
- CPU使用率: < 10%
- 内存使用: < 100MB
- 网络带宽: < 1MB/s
## 测试环境要求
### 浏览器支持
- Chrome 80+
- Firefox 75+
- Safari 13+
- Edge 80+
### 系统要求
- 操作系统: Windows 10+/macOS 10.15+/Linux
- 内存: 4GB+
- 音频设备: 支持录音的麦克风
### 网络要求
- 本地网络连接
- WebSocket支持
- 端口8010可访问
---
**注意**: 本测试指南基于当前版本的测试页面功能,如有功能更新请及时更新本文档。
---
# AIfeng/2025-07-07 15:19:16
# 流式语音识别系统优化方案
## 概述
本文档针对用户提出的三个核心优化需求,设计了完整的技术实现方案:
1. **智能断句逻辑** - 基于静音间隔的语义分段
2. **VAD分片优化** - 平衡响应速度与识别精度
3. **结果标识机制** - 流式识别结果的完整追踪
## 1. 智能断句逻辑设计
### 1.1 需求分析
用户场景:"我看到了一幅画 一幅后现代主义的画作 上面有人物 有动物 有一条很长的河 你能猜一猜这是哪一幅名画吗"
**断句策略:**
- 静音间隔 ≥ 2秒 → 独立句子
- 静音间隔 1-2秒 → 语义连接判断
- 静音间隔 < 1秒 → 同一句子内的自然停顿
### 1.2 技术实现方案
#### 1.2.1 多级静音阈值设计
```python
class IntelligentSentenceSegmentation:
    def __init__(self):
        self.silence_thresholds = {
            'micro_pause': 0.3,     # 词间停顿
            'phrase_pause': 1.0,    # 短语间停顿
            'sentence_pause': 2.0,  # 句子间停顿
            'topic_pause': 4.0      # 话题间停顿
        }
        self.segment_types = {
            'word_continuation': 'micro_pause',
            'phrase_connection': 'phrase_pause',
            'sentence_boundary': 'sentence_pause',
            'topic_boundary': 'topic_pause'
        }
```
#### 1.2.2 语义连接判断算法
```python
def analyze_semantic_connection(self, prev_segment: str, current_segment: str,
                                silence_duration: float) -> str:
    """
    分析语义连接类型

    Returns:
        'continuation' | 'new_sentence' | 'new_topic'
    """
    # 语法完整性检查
    if self._is_grammatically_complete(prev_segment):
        if silence_duration >= self.silence_thresholds['sentence_pause']:
            return 'new_sentence'
    # 语义相关性检查
    semantic_score = self._calculate_semantic_similarity(prev_segment, current_segment)
    if silence_duration >= self.silence_thresholds['phrase_pause']:
        if semantic_score > 0.7:
            return 'continuation'  # 语义相关,继续当前句子
        else:
            return 'new_sentence'  # 语义不相关,新句子
    return 'continuation'
```
#### 1.2.3 动态阈值调整
```python
import numpy as np
from typing import List

class AdaptiveSilenceThreshold:
    def __init__(self):
        self.user_speech_pattern = {
            'avg_pause_duration': 1.2,
            'speech_rate': 150,  # 词/分钟
            'pause_variance': 0.3
        }
        # 初始阈值,取1.2.1节中的默认值,供动态调整使用
        self.silence_thresholds = {'phrase_pause': 1.0, 'sentence_pause': 2.0}

    def adjust_thresholds(self, recent_pauses: List[float]):
        """根据用户说话习惯动态调整阈值"""
        if len(recent_pauses) >= 10:
            avg_pause = np.mean(recent_pauses)
            std_pause = np.std(recent_pauses)
            # 个性化阈值调整
            self.silence_thresholds['phrase_pause'] = avg_pause + 0.5 * std_pause
            self.silence_thresholds['sentence_pause'] = avg_pause + 1.5 * std_pause
```
## 2. VAD分片优化策略
### 2.1 问题分析
**当前挑战:**
- 小分片:响应快但识别精度低
- 大分片:精度高但响应慢
- 需要动态平衡策略
### 2.2 自适应分片算法
#### 2.2.1 分片大小动态调整
```python
class AdaptiveVADChunking:
    def __init__(self):
        self.chunk_strategies = {
            'fast_response': {
                'min_chunk_duration': 0.5,
                'max_chunk_duration': 2.0,
                'confidence_threshold': 0.7
            },
            'high_accuracy': {
                'min_chunk_duration': 1.5,
                'max_chunk_duration': 4.0,
                'confidence_threshold': 0.8
            },
            'balanced': {
                'min_chunk_duration': 1.0,
                'max_chunk_duration': 3.0,
                'confidence_threshold': 0.75
            }
        }
        self.current_strategy = 'balanced'
        self.performance_history = []

    def select_optimal_strategy(self, context: dict) -> str:
        """根据上下文选择最优分片策略"""
        # 考虑因素:
        # 1. 当前识别准确率
        # 2. 用户交互模式(快速对话 vs 长句描述)
        # 3. 环境噪音水平
        # 4. 系统负载
        recent_accuracy = self._calculate_recent_accuracy()
        interaction_mode = context.get('interaction_mode', 'normal')
        noise_level = context.get('noise_level', 0.1)

        if interaction_mode == 'quick_qa' and recent_accuracy > 0.85:
            return 'fast_response'
        elif noise_level > 0.3 or recent_accuracy < 0.7:
            return 'high_accuracy'
        else:
            return 'balanced'
```
#### 2.2.2 渐进式识别策略
```python
class ProgressiveRecognition:
    def __init__(self):
        self.recognition_stages = {
            'immediate': 0.8,  # 800ms 快速识别
            'refined': 2.0,    # 2s 精化识别
            'final': 4.0       # 4s 最终识别
        }

    def process_audio_segment(self, audio_data: bytes, duration: float):
        """渐进式识别处理"""
        results = {}
        # 阶段1:快速识别(低延迟)
        if duration >= self.recognition_stages['immediate']:
            quick_result = self._quick_recognition(audio_data[:int(0.8 * len(audio_data))])
            results['immediate'] = {
                'text': quick_result,
                'confidence': 0.6,
                'stage': 'immediate'
            }
        # 阶段2:精化识别(平衡)
        if duration >= self.recognition_stages['refined']:
            refined_result = self._refined_recognition(audio_data)
            results['refined'] = {
                'text': refined_result,
                'confidence': 0.8,
                'stage': 'refined'
            }
        # 阶段3:最终识别(高精度)
        if duration >= self.recognition_stages['final']:
            final_result = self._final_recognition(audio_data)
            results['final'] = {
                'text': final_result,
                'confidence': 0.9,
                'stage': 'final'
            }
        return results
```
## 3. 结果标识与追踪机制
### 3.1 识别结果标识体系
#### 3.1.1 唯一标识符设计
```python
from dataclasses import dataclass
from typing import List, Optional
import uuid
import time

@dataclass
class RecognitionSegmentID:
    """识别片段唯一标识"""
    session_id: str                          # 会话ID
    segment_id: str                          # 片段ID
    sequence_number: int                     # 序列号
    parent_segment_id: Optional[str] = None  # 父片段ID(用于分片关联)

    def __post_init__(self):
        if not self.segment_id:
            self.segment_id = f"{self.session_id}_{self.sequence_number}_{int(time.time() * 1000)}"

@dataclass
class RecognitionResult:
    """增强的识别结果"""
    id: RecognitionSegmentID
    text: str
    confidence: float
    timestamp: float
    audio_duration: float
    result_type: str                     # 'partial' | 'refined' | 'final'
    stage: str                           # 'immediate' | 'refined' | 'final'
    audio_segment_hash: str              # 音频片段哈希值
    predecessor_ids: List[str] = None    # 前驱结果ID列表
    successor_ids: List[str] = None      # 后继结果ID列表
    is_superseded: bool = False          # 是否被后续结果替代
    superseded_by: Optional[str] = None  # 被哪个结果替代
```
#### 3.1.2 结果关联追踪
```python
class RecognitionResultTracker:
    def __init__(self):
        self.result_graph = {}        # 结果关联图
        self.active_segments = {}     # 活跃片段
        self.completed_segments = {}  # 完成片段

    def add_recognition_result(self, result: RecognitionResult) -> str:
        """添加识别结果并建立关联"""
        result_id = result.id.segment_id
        # 建立与前驱结果的关联
        if result.predecessor_ids:
            for pred_id in result.predecessor_ids:
                if pred_id in self.result_graph:
                    self.result_graph[pred_id]['successors'].append(result_id)
                    # 标记前驱结果为被替代
                    if result.result_type == 'final':
                        self._mark_superseded(pred_id, result_id)
        # 添加当前结果
        self.result_graph[result_id] = {
            'result': result,
            'predecessors': result.predecessor_ids or [],
            'successors': [],
            'created_at': time.time()
        }
        return result_id

    def get_result_chain(self, segment_id: str) -> List[RecognitionResult]:
        """获取完整的识别链路"""
        chain = []
        # 向前追溯到起始结果
        current_id = segment_id
        while current_id:
            if current_id in self.result_graph:
                result_info = self.result_graph[current_id]
                chain.insert(0, result_info['result'])
                # 找到前驱
                predecessors = result_info['predecessors']
                current_id = predecessors[0] if predecessors else None
            else:
                break
        # 向后追溯到最终结果
        current_id = segment_id
        while current_id:
            if current_id in self.result_graph:
                result_info = self.result_graph[current_id]
                successors = result_info['successors']
                if successors:
                    # 选择最新的后继结果
                    latest_successor = max(successors,
                                           key=lambda x: self.result_graph[x]['created_at'])
                    if latest_successor not in [r.id.segment_id for r in chain]:
                        chain.append(self.result_graph[latest_successor]['result'])
                    current_id = latest_successor
                else:
                    break
            else:
                break
        return chain
```
### 3.2 流式显示刷新机制
#### 3.2.1 增量更新策略
```python
import threading
import time

class StreamingDisplayManager:
    def __init__(self):
        self.display_buffer = {}     # 显示缓冲区
        self.update_queue = []       # 更新队列
        self.pending_refreshes = {}  # 各会话待执行的防抖定时器
        self.refresh_strategies = {
            'immediate': self._immediate_refresh,
            'debounced': self._debounced_refresh,
            'batch': self._batch_refresh
        }

    def update_display(self, session_id: str, result: RecognitionResult,
                       strategy: str = 'debounced'):
        """更新显示内容"""
        update_info = {
            'session_id': session_id,
            'result': result,
            'timestamp': time.time(),
            'update_type': self._determine_update_type(result)
        }
        self.update_queue.append(update_info)
        # 根据策略执行刷新
        refresh_func = self.refresh_strategies.get(strategy, self._debounced_refresh)
        refresh_func(update_info)

    def _determine_update_type(self, result: RecognitionResult) -> str:
        """确定更新类型"""
        if result.result_type == 'partial':
            if result.stage == 'immediate':
                return 'append'           # 追加显示
            else:
                return 'replace_partial'  # 替换部分内容
        elif result.result_type == 'final':
            return 'replace_final'        # 最终替换
        else:
            return 'append'

    def _debounced_refresh(self, update_info: dict, delay: float = 0.2):
        """防抖刷新策略"""
        session_id = update_info['session_id']
        # 取消之前的定时器
        if session_id in self.pending_refreshes:
            self.pending_refreshes[session_id].cancel()
        # 设置新的定时器
        timer = threading.Timer(delay, self._execute_refresh, args=[session_id])
        self.pending_refreshes[session_id] = timer
        timer.start()
```
## 4. 配置参数优化建议
### 4.1 VAD参数调整
```json
{
  "streaming_vad": {
    "silence_duration_levels": {
      "micro_pause": 0.3,
      "phrase_pause": 1.0,
      "sentence_pause": 2.0,
      "topic_pause": 4.0
    },
    "adaptive_chunking": {
      "enabled": true,
      "min_chunk_duration": 0.8,
      "max_chunk_duration": 3.5,
      "strategy_switch_threshold": 0.75
    },
    "progressive_recognition": {
      "enabled": true,
      "stages": {
        "immediate": 0.8,
        "refined": 2.0,
        "final": 4.0
      }
    }
  }
}
```
### 4.2 识别管理参数
```json
{
  "streaming_recognition": {
    "result_tracking": {
      "enabled": true,
      "max_chain_length": 10,
      "cleanup_interval": 120.0
    },
    "display_refresh": {
      "strategy": "debounced",
      "debounce_delay": 0.2,
      "batch_size": 5,
      "max_refresh_rate": 10
    }
  }
}
```
## 5. 实施计划
### 5.1 开发阶段
**阶段1:智能断句模块(1-2天)**
- 实现多级静音阈值检测
- 开发语义连接判断算法
- 集成动态阈值调整机制
**阶段2:VAD优化模块(2-3天)**
- 实现自适应分片算法
- 开发渐进式识别策略
- 性能测试与调优
**阶段3:结果追踪模块(2-3天)**
- 实现结果标识体系
- 开发关联追踪机制
- 实现流式显示管理
**阶段4:集成测试(1-2天)**
- 端到端功能测试
- 性能基准测试
- 用户体验验证
### 5.2 验证指标
**功能指标:**
- 断句准确率 > 90%
- 识别延迟 < 1秒(immediate阶段)
- 最终识别准确率 > 95%
**性能指标:**
- 内存使用 < 100MB
- CPU使用率 < 30%
- 并发处理能力 > 5个会话
**用户体验指标:**
- 响应流畅度评分 > 4.5/5
- 识别结果可读性 > 4.0/5
- 整体满意度 > 4.5/5
## 6. 风险评估与缓解
### 6.1 技术风险
**风险1:语义判断准确性**
- 缓解:建立语义模型训练数据集
- 备选:基于规则的语法分析
**风险2:性能开销增加**
- 缓解:异步处理 + 缓存优化
- 监控:实时性能指标追踪
**风险3:复杂度增加**
- 缓解:模块化设计 + 完善测试
- 文档:详细的API文档和使用指南
### 6.2 兼容性考虑
- 保持现有API接口不变
- 新功能通过配置开关控制
- 提供降级机制确保稳定性
## 7. 总结
本优化方案通过三个核心模块的协同工作,实现了:
1. **智能化断句** - 基于多维度分析的语义分段
2. **自适应VAD** - 动态平衡响应速度与识别精度
3. **完整追踪** - 全链路结果标识与关联管理
预期效果:
- 用户体验显著提升
- 识别准确率提高15-20%
- 响应延迟降低30-40%
- 系统可维护性增强
该方案采用渐进式实施策略,确保系统稳定性的同时逐步提升功能完善度。
---
# AIfeng/2025-07-17 16:25:06
# FunASR大文件超时问题分析与优化方案
## 问题现象
用户在使用FunASR进行语音识别时遇到以下问题:
- **小文件**:识别正常,无超时问题
- **大文件**:出现连接超时错误
- **错误信息**:`[WinError 10054] 远程主机强迫关闭了一个现有的连接`
- **发生时间**:16:17:53
## 根因分析
### 1. 超时配置问题
#### 当前超时设置
- **连接超时**:30秒(`config_util.py`中`asr_timeout`默认值)
- **接收消息超时**:1秒(`_receive_messages`方法中的`asyncio.wait_for`)
- **连接等待超时**:5秒(同步版本)/10秒(异步版本预热)
#### 问题分析
```python
# funasr_asr.py 第72行 - 连接超时配置
timeout_seconds = getattr(cfg, 'asr_timeout', 30)
self.websocket = await asyncio.wait_for(
    websockets.connect(self.server_url),
    timeout=timeout_seconds
)

# 第145行 - 接收消息超时(过短)
message = await asyncio.wait_for(
    self.websocket.recv(),
    timeout=1.0  # ⚠️ 仅1秒,对大文件处理不足
)
```
### 2. 大文件处理机制缺陷
#### 分块发送逻辑
```python
# funasr_asr.py 第500-520行
stride = int(60 * chunk_size / chunk_interval / 1000 * 16000 * 2)
if len(audio_bytes) > stride:
    chunk_num = (len(audio_bytes) - 1) // stride + 1
    for i in range(chunk_num):
        beg = i * stride
        chunk_data = audio_bytes[beg:beg + stride]
        self.message_queue.put(chunk_data)  # ⚠️ 队列可能积压
```
#### 问题点
1. **队列积压**:大文件分块后产生大量消息,队列处理不及时
2. **发送频率**:`_send_message_loop`中`await asyncio.sleep(0.01)`间隔过短
3. **无流控机制**:缺乏背压控制,服务端可能过载
### 3. WebSocket连接稳定性
#### 心跳机制缺失
- **当前实现**:无主动心跳检测
- **连接检测**:仅依赖异常捕获
- **重连策略**:指数退避,但最大重连次数限制可能过严
## 优化方案
### 方案一:超时参数优化(立即可行)
#### 1. 调整超时配置
```python
# config_util.py 优化
class ConfigManager:
    def __init__(self):
        # ASR超时配置优化
        self.asr_timeout = 60          # 连接超时:30→60秒
        self.asr_receive_timeout = 30  # 接收超时:1→30秒
        self.asr_send_interval = 0.05  # 发送间隔:0.01→0.05秒
        self.asr_chunk_size = 8192     # 分块大小优化
```
#### 2. 动态超时计算
```python
def calculate_timeout(self, audio_size_bytes):
    """根据音频大小动态计算超时时间"""
    base_timeout = 30
    # 每MB增加10秒超时
    size_mb = audio_size_bytes / (1024 * 1024)
    dynamic_timeout = base_timeout + (size_mb * 10)
    return min(dynamic_timeout, 300)  # 最大5分钟
```
### 方案二:流控机制实现(推荐)
#### 1. 队列大小限制
```python
class FunASRAsyncClient:
    def __init__(self, username, server_url):
        # 限制队列大小,避免内存溢出
        self.message_queue = queue.Queue(maxsize=100)
        self.send_semaphore = asyncio.Semaphore(10)  # 并发控制
```
#### 2. 背压控制
```python
async def _send_message_loop(self):
    """优化的发送消息循环"""
    while self.connected and self.websocket:
        try:
            # 检查队列大小,实现背压
            if self.message_queue.qsize() > 50:
                await asyncio.sleep(0.1)  # 队列过满时减缓发送
                continue
            async with self.send_semaphore:
                message = self.message_queue.get_nowait()
                await self.websocket.send(message)
        except queue.Empty:
            await asyncio.sleep(0.05)  # 优化等待间隔
```
### 方案三:分片上传机制(长期优化)
#### 1. 大文件预处理
```python
def preprocess_large_audio(self, audio_data, max_chunk_size=1024*1024):
    """大文件预处理和分片"""
    if len(audio_data) > max_chunk_size:
        # 按时间分片,而非简单字节分割
        return self._split_by_time_segments(audio_data)
    return [audio_data]

def _split_by_time_segments(self, audio_data, segment_seconds=30):
    """按时间段分割音频"""
    sample_rate = 16000
    bytes_per_sample = 2
    segment_bytes = segment_seconds * sample_rate * bytes_per_sample
    segments = []
    for i in range(0, len(audio_data), segment_bytes):
        segments.append(audio_data[i:i + segment_bytes])
    return segments
```
#### 2. 分片识别结果合并
```python
class SegmentResultManager:
    def __init__(self):
        self.segments = {}
        self.final_result = ""

    def add_segment_result(self, segment_id, text):
        self.segments[segment_id] = text
        self._merge_results()

    def _merge_results(self):
        # 按顺序合并分片结果
        sorted_segments = sorted(self.segments.items())
        self.final_result = " ".join([text for _, text in sorted_segments])
```
### 方案四:连接稳定性增强
#### 1. 心跳机制
```python
async def _heartbeat_loop(self):
    """心跳检测循环"""
    while self.connected:
        try:
            # 每30秒发送心跳
            await asyncio.sleep(30)
            if self.websocket:
                await self.websocket.ping()
        except Exception as e:
            util.log(2, f"心跳检测失败: {e}")
            self.connected = False
            break
```
#### 2. 连接质量监控
```python
class ConnectionMonitor:
    def __init__(self):
        self.success_count = 0
        self.error_count = 0
        self.last_success_time = time.time()

    def record_success(self):
        self.success_count += 1
        self.last_success_time = time.time()

    def record_error(self):
        self.error_count += 1

    def get_health_score(self):
        total = self.success_count + self.error_count
        if total == 0:
            return 1.0
        return self.success_count / total
```
## 实施建议
### 阶段一:紧急修复(1-2天)
1. **调整超时参数**:将接收超时从1秒调整为30秒
2. **优化发送间隔**:从0.01秒调整为0.05秒
3. **增加队列大小限制**:防止内存溢出
### 阶段二:稳定性优化(3-5天)
1. **实现动态超时计算**:根据文件大小调整超时
2. **添加背压控制机制**:防止队列积压
3. **增强错误处理和重连逻辑**
### 阶段三:架构优化(1-2周)
1. **实现分片上传机制**:支持超大文件处理
2. **添加连接池管理**:提高并发处理能力
3. **实现结果缓存机制**:避免重复处理
## 监控指标
### 关键指标
- **连接成功率**:>95%
- **平均响应时间**:<文件时长×2
- **超时错误率**:<5%
- **内存使用峰值**:<500MB
### 告警阈值
- 连接失败率>10%
- 队列积压>100条消息
- 单次处理时间>5分钟
- 内存使用>1GB
## 测试验证
### 测试用例
1. **小文件测试**:<1MB,验证基本功能
2. **中等文件测试**:1-10MB,验证优化效果
3. **大文件测试**:>10MB,验证极限处理能力
4. **并发测试**:多用户同时上传
5. **网络异常测试**:模拟网络中断和恢复
### 性能基准
- **1MB文件**:<10秒完成识别
- **10MB文件**:<60秒完成识别
- **50MB文件**:<300秒完成识别
## 风险评估
### 技术风险
- **内存溢出**:大文件处理时内存激增
- **服务端压力**:并发大文件可能导致服务崩溃
- **网络稳定性**:长时间传输易受网络波动影响
### 缓解措施
- 实施内存监控和自动清理
- 添加服务端负载均衡
- 实现断点续传机制
- 增加详细的错误日志和监控
---
# AIfeng/2025-01-07 09:46:00
"""
Streaming Speech Recognition Module
流式语音识别模块,提供实时语音活动检测、累积识别和结果管理功能。
主要组件:
- StreamingVAD: 流式语音活动检测
- StreamingRecognitionManager: 识别结果管理
- StreamingRecorder: 流式录音器
"""
from .streaming_vad import StreamingVAD
from .streaming_recognition_manager import StreamingRecognitionManager
from .streaming_recorder import StreamingRecorder
__all__ = [
    'StreamingVAD',
    'StreamingRecognitionManager',
    'StreamingRecorder'
]
__version__ = '1.0.0'
__author__ = 'AIfeng'
---
# AIfeng/2025-07-07 15:25:48
# 流式语音识别优化模块包初始化文件
"""
流式语音识别优化模块
本模块包含以下核心组件:
1. IntelligentSentenceSegmentation - 智能断句模块
2. AdaptiveVADChunking - 自适应VAD分片模块
3. RecognitionResultTracker - 识别结果追踪模块
4. StreamingDisplayManager - 流式显示管理模块
这些模块协同工作,提供更智能、更高效的流式语音识别体验。
"""
from .intelligent_segmentation import (
    IntelligentSentenceSegmentation,
    SpeechSegment,
    SegmentType,
    AdaptiveSilenceThreshold
)
from .adaptive_vad_chunking import (
    AdaptiveVADChunking,
    ChunkStrategy,
    RecognitionStage,
    ChunkConfig,
    AudioChunk,
    RecognitionResult,
    PerformanceMonitor,
    ProgressiveRecognition,
    ChunkQualityAssessor
)
from .recognition_result_tracker import (
    RecognitionResultTracker,
    ResultType,
    ResultStatus,
    RecognitionSegmentID,
    RecognitionResult as TrackerRecognitionResult,
    ResultRelationship
)
from .streaming_display_manager import (
    StreamingDisplayManager,
    UpdateType,
    RefreshStrategy,
    DisplayPriority,
    DisplayUpdate,
    DisplaySegment,
    DisplayBuffer
)
from .optimization_manager import (
    OptimizationManager,
    OptimizationMode
)

__version__ = "1.0.0"
__author__ = "AIfeng"

__all__ = [
    # 智能断句模块
    'IntelligentSentenceSegmentation',
    'SpeechSegment',
    'SegmentType',
    'AdaptiveSilenceThreshold',
    # 自适应VAD分片模块
    'AdaptiveVADChunking',
    'ChunkStrategy',
    'RecognitionStage',
    'ChunkConfig',
    'AudioChunk',
    'RecognitionResult',
    'PerformanceMonitor',
    'ProgressiveRecognition',
    'ChunkQualityAssessor',
    # 识别结果追踪模块
    'RecognitionResultTracker',
    'ResultType',
    'ResultStatus',
    'RecognitionSegmentID',
    'TrackerRecognitionResult',
    'ResultRelationship',
    # 流式显示管理模块
    'StreamingDisplayManager',
    'UpdateType',
    'RefreshStrategy',
    'DisplayPriority',
    'DisplayUpdate',
    'DisplaySegment',
    'DisplayBuffer',
    # 优化管理器
    'OptimizationManager',
    'OptimizationMode'
]

# 模块信息
MODULE_INFO = {
    'name': 'streaming_optimization',
    'description': '流式语音识别优化模块集合',
    'version': __version__,
    'author': __author__,
    'components': {
        'intelligent_segmentation': '智能断句 - 基于静音间隔和语义分析的语音分段',
        'adaptive_vad_chunking': '自适应VAD分片 - 动态平衡响应速度与识别精度',
        'recognition_result_tracker': '识别结果追踪 - 完整的结果追踪与关联管理',
        'streaming_display_manager': '流式显示管理 - 增量更新与刷新策略'
    },
    'features': [
        '多级静音阈值智能断句',
        '自适应VAD分片策略',
        '渐进式识别处理',
        '结果唯一标识与追踪',
        '增量显示更新',
        '防抖刷新机制',
        '性能监控与优化'
    ]
}

def get_module_info():
    """获取模块信息"""
    return MODULE_INFO

def get_version():
    """获取版本信息"""
    return __version__
---
# AIfeng/2025-07-07 15:25:48
# 自适应VAD分片优化模块 - 动态平衡响应速度与识别精度
import time
import numpy as np
from typing import List, Dict, Optional, Tuple, Any
from dataclasses import dataclass
from enum import Enum
import threading
import logging
from collections import deque
class ChunkStrategy(Enum):
    """分片策略类型"""
    FAST_RESPONSE = "fast_response"  # 快速响应
    HIGH_ACCURACY = "high_accuracy"  # 高精度
    BALANCED = "balanced"            # 平衡模式
    ADAPTIVE = "adaptive"            # 自适应

class RecognitionStage(Enum):
    """识别阶段"""
    IMMEDIATE = "immediate"  # 即时识别
    REFINED = "refined"      # 精化识别
    FINAL = "final"          # 最终识别
@dataclass
class ChunkConfig:
    """分片配置"""
    min_duration: float
    max_duration: float
    confidence_threshold: float
    overlap_ratio: float = 0.1
    quality_weight: float = 0.5
    speed_weight: float = 0.5

@dataclass
class AudioChunk:
    """音频分片数据结构"""
    data: bytes
    duration: float
    start_time: float
    end_time: float
    chunk_id: str
    strategy: ChunkStrategy
    stage: RecognitionStage
    confidence: float = 0.0
    is_processed: bool = False
    parent_chunk_id: Optional[str] = None
    is_speech: bool = True  # 语音检测标志
    timestamp: float = 0.0  # 分片起始时间戳

@dataclass
class RecognitionResult:
    """识别结果"""
    text: str
    confidence: float
    chunk_id: str
    stage: RecognitionStage
    processing_time: float
    accuracy_score: float = 0.0
class PerformanceMonitor:
    """性能监控器"""
    def __init__(self, window_size: int = 20):
        self.window_size = window_size
        self.accuracy_history = deque(maxlen=window_size)
        self.latency_history = deque(maxlen=window_size)
        self.confidence_history = deque(maxlen=window_size)

    def record_result(self, result: RecognitionResult, latency: float):
        """记录识别结果"""
        self.accuracy_history.append(result.accuracy_score)
        self.latency_history.append(latency)
        self.confidence_history.append(result.confidence)

    def get_recent_accuracy(self) -> float:
        """获取最近的准确率"""
        return np.mean(self.accuracy_history) if self.accuracy_history else 0.0

    def get_recent_latency(self) -> float:
        """获取最近的延迟"""
        return np.mean(self.latency_history) if self.latency_history else 0.0

    def get_recent_confidence(self) -> float:
        """获取最近的置信度"""
        return np.mean(self.confidence_history) if self.confidence_history else 0.0

    def update_metrics(self, metrics: Dict):
        """更新性能指标"""
        if 'accuracy' in metrics:
            self.accuracy_history.append(metrics['accuracy'])
        if 'latency' in metrics:
            self.latency_history.append(metrics['latency'])
        if 'confidence' in metrics:
            self.confidence_history.append(metrics['confidence'])
class AdaptiveVADChunking:
    """自适应VAD分片处理器"""
    def __init__(self, config: Dict = None):
        self.config = config or self._get_default_config()
        # 分片策略配置
        self.chunk_strategies = {
            ChunkStrategy.FAST_RESPONSE: ChunkConfig(
                min_duration=0.5,
                max_duration=2.0,
                confidence_threshold=0.7,
                quality_weight=0.3,
                speed_weight=0.7
            ),
            ChunkStrategy.HIGH_ACCURACY: ChunkConfig(
                min_duration=1.5,
                max_duration=4.0,
                confidence_threshold=0.8,
                quality_weight=0.8,
                speed_weight=0.2
            ),
            ChunkStrategy.BALANCED: ChunkConfig(
                min_duration=1.0,
                max_duration=3.0,
                confidence_threshold=0.75,
                quality_weight=0.5,
                speed_weight=0.5
            ),
            ChunkStrategy.ADAPTIVE: ChunkConfig(
                min_duration=0.8,
                max_duration=3.5,
                confidence_threshold=0.75,
                quality_weight=0.6,
                speed_weight=0.4
            )
        }
        self.current_strategy = ChunkStrategy.ADAPTIVE
        self.performance_monitor = PerformanceMonitor()
        self.chunk_buffer = []
        self.processing_queue = deque()
        # 自适应参数
        self.adaptation_enabled = self.config.get('adaptation_enabled', True)
        self.strategy_switch_threshold = self.config.get('strategy_switch_threshold', 0.75)
        self.min_samples_for_adaptation = self.config.get('min_samples_for_adaptation', 10)
        self.logger = logging.getLogger(__name__)
        self._lock = threading.Lock()
        # 回调函数管理
        self.quality_callbacks = []  # 质量反馈回调
        # 内存管理
        self.last_cleanup_time = time.time()
        self.cleanup_interval = 30.0  # 30秒清理一次
    def get_performance_stats(self) -> Dict:
        """获取性能统计"""
        with self._lock:
            return {
                'current_strategy': self.current_strategy.value if hasattr(self.current_strategy, 'value') else str(self.current_strategy),
                'total_chunks_processed': getattr(self, 'total_chunks_processed', 0),
                'speech_chunks': getattr(self, 'speech_chunks', 0),
                'silence_chunks': getattr(self, 'silence_chunks', 0),
                'average_chunk_duration': getattr(self, 'average_chunk_duration', 0.0)
            }

    def set_strategy(self, strategy):
        """设置VAD策略"""
        with self._lock:
            self.current_strategy = strategy
            self.logger.info(f"VAD策略已设置为: {strategy}")

    def register_quality_callback(self, callback):
        """注册质量反馈回调函数"""
        self.quality_callbacks.append(callback)
        self.logger.debug("注册质量反馈回调函数")

    def _trigger_quality_callbacks(self, chunk_id: str, quality_metrics: Dict):
        """触发质量反馈回调"""
        for callback in self.quality_callbacks:
            try:
                callback(chunk_id, quality_metrics)
            except Exception as e:
                self.logger.error(f"质量反馈回调执行失败: {e}")

    def create_session(self, session_id: str):
        """创建会话"""
        # 为会话初始化相关数据结构
        self.logger.info(f"VAD分片会话创建: {session_id}")

    def complete_session(self, session_id: str):
        """完成会话"""
        # 清理会话相关的缓存数据
        with self._lock:
            self.chunk_buffer.clear()
            self.processing_queue.clear()
            # 限制回调函数数量,防止内存泄漏
            if len(self.quality_callbacks) > 10:
                self.quality_callbacks = self.quality_callbacks[-10:]
            self.logger.info(f"VAD分片会话完成: {session_id},已清理缓存数据")

    def process_audio(self, session_id: str, audio_data: bytes, sample_rate: int, strategy: ChunkStrategy = None) -> List:
        """处理音频数据(兼容OptimizationManager调用)"""
        try:
            timestamp = time.time()
            chunks = self.process_audio_data(audio_data, timestamp)
            return chunks
        except Exception as e:
            self.logger.error(f"处理音频数据失败: {e}")
            return []
    def _get_default_config(self) -> Dict:
        """获取默认配置"""
        return {
            'adaptation_enabled': True,
            'strategy_switch_threshold': 0.75,
            'min_samples_for_adaptation': 10,
            'max_chunk_buffer_size': 50,
            'progressive_recognition': True,
            'quality_feedback_enabled': True
        }

    def process_audio_data(self, audio_data: bytes, timestamp: float,
                           context: Dict = None) -> List[AudioChunk]:
        """处理音频数据并生成分片"""
        try:
            with self._lock:
                # 定期清理内存,防止内存泄漏
                current_time = time.time()
                if current_time - self.last_cleanup_time > self.cleanup_interval:
                    self._cleanup_memory()
                    self.last_cleanup_time = current_time
                # 根据上下文自适应更新分片策略
                if self.adaptation_enabled:
                    self._update_strategy(context or {})
                # 生成音频分片
                chunks = self._create_chunks(audio_data, timestamp)
                # 添加到处理队列,限制队列大小防止内存泄漏
                max_queue_size = self.config.get('max_chunk_buffer_size', 50)
                for chunk in chunks:
                    if len(self.processing_queue) >= max_queue_size:
                        # 队列满时,移除最旧的分片
                        removed_chunk = self.processing_queue.popleft()
                        self.logger.warning(f"处理队列已满,移除分片: {removed_chunk.chunk_id}")
                    self.processing_queue.append(chunk)
                return chunks
        except Exception as e:
            self.logger.error(f"处理音频数据时出错: {e}")
            return []
    def _update_strategy(self, context: Dict):
        """更新分片策略"""
        if len(self.performance_monitor.accuracy_history) < self.min_samples_for_adaptation:
            return
        current_accuracy = self.performance_monitor.get_recent_accuracy()
        current_latency = self.performance_monitor.get_recent_latency()
        # 获取上下文信息
        interaction_mode = context.get('interaction_mode', 'normal')
        noise_level = context.get('noise_level', 0.1)
        user_patience = context.get('user_patience', 'normal')  # 'low', 'normal', 'high'
        # 策略选择逻辑
        new_strategy = self._select_optimal_strategy(
            current_accuracy, current_latency, interaction_mode,
            noise_level, user_patience
        )
        if new_strategy != self.current_strategy:
            self.logger.info(f"策略切换: {self.current_strategy.value} -> {new_strategy.value}")
            self.current_strategy = new_strategy

    def _select_optimal_strategy(self, accuracy: float, latency: float,
                                 interaction_mode: str, noise_level: float,
                                 user_patience: str) -> ChunkStrategy:
        """选择最优分片策略"""
        # 快速响应条件
        if (interaction_mode == 'quick_qa' and accuracy > 0.85 and
                user_patience == 'low'):
            return ChunkStrategy.FAST_RESPONSE
        # 高精度条件
        if (noise_level > 0.3 or accuracy < 0.7 or
                interaction_mode == 'detailed_analysis'):
            return ChunkStrategy.HIGH_ACCURACY
        # 自适应条件
        if self.config.get('enable_adaptive_strategy', False):
            return ChunkStrategy.ADAPTIVE
        # 默认平衡模式
        return ChunkStrategy.BALANCED
def _create_chunks(self, audio_data: bytes, timestamp: float) -> List[AudioChunk]:
"""创建音频分片"""
chunks = []
current_config = self.chunk_strategies[self.current_strategy]
# 计算分片参数
data_length = len(audio_data)
sample_rate = self.config.get('sample_rate', 16000)
bytes_per_sample = self.config.get('bytes_per_sample', 2)
# 估算音频时长 - 添加除零保护
if sample_rate <= 0 or bytes_per_sample <= 0:
self.logger.error(f"无效的音频参数: sample_rate={sample_rate}, bytes_per_sample={bytes_per_sample}")
return []
audio_duration = data_length / (sample_rate * bytes_per_sample)
# 确定分片大小
chunk_duration = self._calculate_optimal_chunk_duration(
audio_duration, current_config
)
# 生成分片
chunk_size = int(chunk_duration * sample_rate * bytes_per_sample)
overlap_size = int(chunk_size * current_config.overlap_ratio)
start_pos = 0
chunk_index = 0
# 防止无限循环的安全检查
max_iterations = 1000 # 最大迭代次数
iteration_count = 0
while start_pos < data_length and iteration_count < max_iterations:
end_pos = min(start_pos + chunk_size, data_length)
# 确保分片有效(至少有一些数据)
if end_pos <= start_pos:
self.logger.warning(f"无效分片位置: start_pos={start_pos}, end_pos={end_pos}")
break
chunk_data = audio_data[start_pos:end_pos]
chunk_start_time = timestamp + (start_pos / (sample_rate * bytes_per_sample))
chunk_end_time = timestamp + (end_pos / (sample_rate * bytes_per_sample))
chunk = AudioChunk(
data=chunk_data,
duration=chunk_end_time - chunk_start_time,
start_time=chunk_start_time,
end_time=chunk_end_time,
chunk_id=f"{int(timestamp * 1000)}_{chunk_index}",
strategy=self.current_strategy,
stage=RecognitionStage.IMMEDIATE,
                    timestamp=chunk_start_time  # 分片时间戳取分片起始时间
)
chunks.append(chunk)
# 安全的位置更新:确保始终向前推进
next_pos = end_pos - overlap_size if overlap_size > 0 else end_pos
# 防止无限循环:确保位置至少前进1个字节
if next_pos <= start_pos:
next_pos = start_pos + max(1, chunk_size // 4) # 至少前进1/4分片大小
self.logger.warning(f"调整分片位置以防止无限循环: {start_pos} -> {next_pos}")
start_pos = next_pos
chunk_index += 1
iteration_count += 1
if iteration_count >= max_iterations:
self.logger.error(f"分片创建达到最大迭代次数限制: {max_iterations}")
return chunks
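    # 分片参数计算示例(假设值,仅供说明):
    #   sample_rate=16000, bytes_per_sample=2, chunk_duration=0.5s
    #   => chunk_size = 0.5 * 16000 * 2 = 16000 字节
    #   若 overlap_ratio=0.25,则 overlap_size = 4000 字节,
    #   每轮循环 start_pos 前进 16000 - 4000 = 12000 字节(约0.375秒)。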
def _calculate_optimal_chunk_duration(self, total_duration: float,
config: ChunkConfig) -> float:
"""计算最优分片时长"""
# 基础分片时长
base_duration = min(config.max_duration,
max(config.min_duration, total_duration / 3))
# 根据性能历史调整
recent_accuracy = self.performance_monitor.get_recent_accuracy()
recent_latency = self.performance_monitor.get_recent_latency()
# 动态调整因子
if recent_accuracy < self.strategy_switch_threshold:
# 准确率低,增加分片时长
adjustment_factor = 1.2
elif recent_latency > 2.0: # 延迟过高
# 延迟高,减少分片时长
adjustment_factor = 0.8
else:
adjustment_factor = 1.0
optimal_duration = base_duration * adjustment_factor
# 确保在配置范围内
return max(config.min_duration,
min(config.max_duration, optimal_duration))
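    # 时长调整示例(假设值):total_duration=6s, min_duration=0.5, max_duration=3.0
    #   base_duration = min(3.0, max(0.5, 6/3)) = 2.0s
    #   若近期准确率低于strategy_switch_threshold,则 2.0 * 1.2 = 2.4s,
    #   仍在[0.5, 3.0]范围内,最终分片时长为2.4s。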
def process_audio_chunk(self, audio_data) -> Optional[AudioChunk]:
"""处理单个音频分片"""
try:
            # 处理不同类型的音频数据
            if isinstance(audio_data, np.ndarray):
                if audio_data.dtype in (np.float32, np.float64):
                    # 浮点数组:假设取值范围为[-1, 1]或[0, 1],统一缩放到int16范围
                    # 注意:此处为启发式判断,全零等特殊输入可能被误判为[0, 1]范围
                    if audio_data.min() >= 0.0 and audio_data.max() <= 1.0:
                        # [0, 1] 范围:先映射到 [-1, 1] 再缩放到 [-32767, 32767]
                        audio_data = (audio_data * 2 - 1) * 32767
                    else:
                        # [-1, 1] 范围:直接缩放到 [-32767, 32767]
                        audio_data = audio_data * 32767
                # 统一转换为int16字节流;bytes()拷贝可避免
                # BufferError: memoryview has 1 exported buffer
                audio_bytes = bytes(audio_data.astype(np.int16).tobytes())
            elif isinstance(audio_data, bytes):
                audio_bytes = audio_data
            else:
                # 尝试转换为bytes
                audio_bytes = bytes(audio_data)
timestamp = time.time()
chunks = self.process_audio_data(audio_bytes, timestamp)
return chunks[0] if chunks else None
except Exception as e:
self.logger.error(f"处理音频分片失败: {e}")
return None
def select_optimal_strategy(self) -> ChunkStrategy:
"""选择最优策略"""
try:
recent_accuracy = self.performance_monitor.get_recent_accuracy()
recent_latency = self.performance_monitor.get_recent_latency()
recent_confidence = self.performance_monitor.get_recent_confidence()
# 基于性能指标选择策略
if recent_accuracy < 0.7 or recent_confidence < 0.6:
return ChunkStrategy.HIGH_ACCURACY
elif recent_latency > 1.0:
return ChunkStrategy.FAST_RESPONSE
elif recent_accuracy > 0.9 and recent_latency < 0.5:
return ChunkStrategy.ADAPTIVE
else:
return ChunkStrategy.BALANCED
except Exception as e:
self.logger.error(f"选择最优策略失败: {e}")
return ChunkStrategy.BALANCED
def _cleanup_memory(self):
"""清理内存,防止内存泄漏"""
try:
# 清理过大的chunk_buffer
max_buffer_size = self.config.get('max_chunk_buffer_size', 50)
if len(self.chunk_buffer) > max_buffer_size:
self.chunk_buffer = self.chunk_buffer[-max_buffer_size//2:]
self.logger.info(f"清理chunk_buffer,保留最新的{max_buffer_size//2}个分片")
# 清理过多的回调函数
if len(self.quality_callbacks) > 10:
self.quality_callbacks = self.quality_callbacks[-10:]
self.logger.info("清理过多的质量回调函数")
# 清理处理队列中过旧的分片
current_time = time.time()
old_queue_size = len(self.processing_queue)
self.processing_queue = deque([
chunk for chunk in self.processing_queue
if current_time - chunk.start_time < 60.0 # 保留60秒内的分片
])
if old_queue_size != len(self.processing_queue):
self.logger.info(f"清理处理队列,从{old_queue_size}个分片减少到{len(self.processing_queue)}个")
except Exception as e:
self.logger.error(f"内存清理失败: {e}")
class ProgressiveRecognition:
"""渐进式识别处理器"""
def __init__(self, config: Dict = None):
self.config = config or {}
self.recognition_stages = {
RecognitionStage.IMMEDIATE: 0.8, # 800ms 快速识别
RecognitionStage.REFINED: 2.0, # 2s 精化识别
RecognitionStage.FINAL: 4.0 # 4s 最终识别
}
self.stage_results = {} # 存储各阶段结果
self.logger = logging.getLogger(__name__)
def process_audio_segment(self, chunk: AudioChunk) -> Dict[RecognitionStage, RecognitionResult]:
"""渐进式识别处理"""
results = {}
try:
# 阶段1:快速识别(低延迟)
if chunk.duration >= self.recognition_stages[RecognitionStage.IMMEDIATE]:
immediate_result = self._quick_recognition(chunk)
if immediate_result:
results[RecognitionStage.IMMEDIATE] = immediate_result
# 阶段2:精化识别(平衡)
if chunk.duration >= self.recognition_stages[RecognitionStage.REFINED]:
refined_result = self._refined_recognition(chunk)
if refined_result:
results[RecognitionStage.REFINED] = refined_result
# 阶段3:最终识别(高精度)
if chunk.duration >= self.recognition_stages[RecognitionStage.FINAL]:
final_result = self._final_recognition(chunk)
if final_result:
results[RecognitionStage.FINAL] = final_result
# 存储结果到stage_results中
if results:
self.stage_results[chunk.chunk_id] = results
# 定期清理过期结果,防止内存泄漏
if len(self.stage_results) > 100: # 当结果数量超过100时清理
self.cleanup_old_results(max_age=60.0) # 清理60秒前的结果
return results
except Exception as e:
self.logger.error(f"渐进式识别处理出错: {e}")
return {}
def _quick_recognition(self, chunk: AudioChunk) -> Optional[RecognitionResult]:
"""快速识别(模拟)"""
# 这里应该调用实际的ASR服务
# 模拟快速识别结果(不使用sleep以避免测试卡住)
processing_start = time.time()
# 模拟处理时间(不实际等待)
simulated_processing_time = 0.1 # 100ms 模拟处理时间
return RecognitionResult(
text=f"快速识别结果_{chunk.chunk_id}",
confidence=0.6,
chunk_id=chunk.chunk_id,
stage=RecognitionStage.IMMEDIATE,
processing_time=simulated_processing_time,
accuracy_score=0.7
)
def _refined_recognition(self, chunk: AudioChunk) -> Optional[RecognitionResult]:
"""精化识别(模拟)"""
# 模拟处理时间(不实际等待)
simulated_processing_time = 0.3 # 300ms 模拟处理时间
return RecognitionResult(
text=f"精化识别结果_{chunk.chunk_id}",
confidence=0.8,
chunk_id=chunk.chunk_id,
stage=RecognitionStage.REFINED,
processing_time=simulated_processing_time,
accuracy_score=0.85
)
def _final_recognition(self, chunk: AudioChunk) -> Optional[RecognitionResult]:
"""最终识别(模拟)"""
# 模拟处理时间(不实际等待)
simulated_processing_time = 0.5 # 500ms 模拟处理时间
return RecognitionResult(
text=f"最终识别结果_{chunk.chunk_id}",
confidence=0.9,
chunk_id=chunk.chunk_id,
stage=RecognitionStage.FINAL,
processing_time=simulated_processing_time,
accuracy_score=0.95
)
def get_best_result(self, chunk_id: str) -> Optional[RecognitionResult]:
"""获取指定分片的最佳识别结果"""
if chunk_id not in self.stage_results:
return None
results = self.stage_results[chunk_id]
# 优先返回最终结果,其次是精化结果,最后是即时结果
for stage in [RecognitionStage.FINAL, RecognitionStage.REFINED, RecognitionStage.IMMEDIATE]:
if stage in results:
return results[stage]
return None
def cleanup_old_results(self, max_age: float = 300.0):
"""清理过期的识别结果"""
current_time = time.time()
expired_chunks = []
for chunk_id, results in self.stage_results.items():
# 从chunk_id中提取时间戳(格式:timestamp_index)
try:
chunk_timestamp = float(chunk_id.split('_')[0]) / 1000.0 # 转换为秒
if current_time - chunk_timestamp > max_age:
expired_chunks.append(chunk_id)
except (ValueError, IndexError):
# 如果无法解析时间戳,保留结果
continue
# 清理过期结果
for chunk_id in expired_chunks:
del self.stage_results[chunk_id]
if expired_chunks:
self.logger.info(f"清理了 {len(expired_chunks)} 个过期识别结果")
class ChunkQualityAssessor:
"""分片质量评估器"""
def __init__(self):
self.quality_metrics = {
'signal_to_noise_ratio': 0.0,
'audio_clarity': 0.0,
'speech_continuity': 0.0,
'duration_appropriateness': 0.0
}
def assess_chunk_quality(self, chunk: AudioChunk) -> float:
"""评估分片质量"""
# 这里应该实现实际的音频质量评估算法
# 目前返回模拟值
# 基于时长的质量评估
duration_score = self._assess_duration_quality(chunk.duration)
# 基于策略的质量评估
strategy_score = self._assess_strategy_appropriateness(chunk.strategy)
# 综合质量分数
overall_quality = (duration_score + strategy_score) / 2
return min(1.0, max(0.0, overall_quality))
def _assess_duration_quality(self, duration: float) -> float:
"""评估时长质量"""
# 理想时长范围:1-3秒
if 1.0 <= duration <= 3.0:
return 1.0
elif 0.5 <= duration < 1.0 or 3.0 < duration <= 5.0:
return 0.7
else:
return 0.3
def _assess_strategy_appropriateness(self, strategy: ChunkStrategy) -> float:
"""评估策略适当性"""
# 这里可以根据当前上下文评估策略的适当性
# 目前返回固定值
strategy_scores = {
ChunkStrategy.FAST_RESPONSE: 0.8,
ChunkStrategy.BALANCED: 0.9,
ChunkStrategy.HIGH_ACCURACY: 0.85,
ChunkStrategy.ADAPTIVE: 0.95
}
return strategy_scores.get(strategy, 0.5)
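
# 最小使用示意(草图,非正式实现):假设本模块的主类为AdaptiveVADChunking
# 且构造函数接受配置字典,与optimization_manager中的导入及用法一致。
if __name__ == "__main__":
    chunker = AdaptiveVADChunking({'sample_rate': 16000, 'bytes_per_sample': 2})
    # 0.5秒的16kHz/16bit单声道静音PCM(仅演示数据流,非真实语音)
    demo_audio = b"\x00\x00" * 8000
    demo_chunks = chunker.process_audio_data(demo_audio, time.time())
    for c in demo_chunks:
        print(c.chunk_id, f"{c.duration:.3f}s", c.strategy.value)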
... ...
# AIfeng/2025-07-07 15:25:48
# 智能断句模块 - 基于静音间隔的语义分段
import time
import numpy as np
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass
from enum import Enum
import threading
import logging
class SegmentType(Enum):
"""语音片段类型"""
WORD_CONTINUATION = "word_continuation" # 词间连接
PHRASE_CONNECTION = "phrase_connection" # 短语连接
SENTENCE_BOUNDARY = "sentence_boundary" # 句子边界
TOPIC_BOUNDARY = "topic_boundary" # 话题边界
@dataclass
class SpeechSegment:
"""语音片段数据结构"""
text: str
start_time: float
end_time: float
silence_before: float
silence_after: float
confidence: float
segment_type: SegmentType
is_complete: bool = False
class IntelligentSentenceSegmentation:
"""智能断句处理器"""
def __init__(self, config: Dict = None):
self.config = config or self._get_default_config()
self.silence_thresholds = self.config.get('silence_thresholds', {
'micro_pause': 0.3, # 词间停顿
'phrase_pause': 1.0, # 短语间停顿
'sentence_pause': 2.0, # 句子间停顿
'topic_pause': 4.0 # 话题间停顿
})
self.segment_buffer = [] # 片段缓冲区
self.user_speech_pattern = {
'avg_pause_duration': 1.2,
'speech_rate': 150, # 词/分钟
'pause_variance': 0.3
}
self.recent_pauses = [] # 最近的停顿记录
self.adaptive_enabled = self.config.get('adaptive_threshold', True)
self.logger = logging.getLogger(__name__)
def _get_default_config(self) -> Dict:
"""获取默认配置"""
return {
'silence_thresholds': {
'micro_pause': 0.3,
'phrase_pause': 1.0,
'sentence_pause': 2.0,
'topic_pause': 4.0
},
'adaptive_threshold': True,
'semantic_analysis': True,
'grammar_check': True,
'max_segment_length': 50, # 最大片段长度(词数)
'min_segment_length': 3 # 最小片段长度(词数)
}
def process_speech_segment(self, text: str, silence_duration: float,
timestamp: float, confidence: float) -> List[SpeechSegment]:
"""处理语音片段"""
try:
# 记录停顿时长用于自适应调整
if silence_duration > 0:
self.recent_pauses.append(silence_duration)
if len(self.recent_pauses) > 20: # 保持最近20个停顿记录
self.recent_pauses.pop(0)
# 自适应阈值调整
if self.adaptive_enabled:
self._adjust_thresholds()
# 确定片段类型
segment_type = self._classify_segment_type(text, silence_duration)
# 创建语音片段
segment = SpeechSegment(
text=text,
start_time=timestamp,
end_time=timestamp + len(text.split()) * 0.4, # 估算结束时间
silence_before=silence_duration,
silence_after=0.0, # 后续更新
confidence=confidence,
segment_type=segment_type
)
# 添加到缓冲区
self.segment_buffer.append(segment)
# 处理片段合并和分割
processed_segments = self._process_segment_buffer()
return processed_segments
except Exception as e:
self.logger.error(f"处理语音片段时出错: {e}")
return []
def _classify_segment_type(self, text: str, silence_duration: float) -> SegmentType:
"""分类片段类型"""
# 确保阈值字典完整性
if not isinstance(self.silence_thresholds, dict):
self.silence_thresholds = self._get_default_config()['silence_thresholds']
# 安全获取阈值,使用默认值作为后备
micro_pause = self.silence_thresholds.get('micro_pause', 0.3)
phrase_pause = self.silence_thresholds.get('phrase_pause', 1.0)
sentence_pause = self.silence_thresholds.get('sentence_pause', 2.0)
# 基于静音时长的初步分类
if silence_duration <= micro_pause:
return SegmentType.WORD_CONTINUATION
elif silence_duration <= phrase_pause:
return SegmentType.PHRASE_CONNECTION
elif silence_duration <= sentence_pause:
return SegmentType.SENTENCE_BOUNDARY
else:
return SegmentType.TOPIC_BOUNDARY
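    # 分类示例(使用默认阈值):silence_duration=1.2s
    #   1.2 > micro_pause(0.3),1.2 > phrase_pause(1.0),1.2 <= sentence_pause(2.0)
    #   => 返回 SegmentType.SENTENCE_BOUNDARY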
def _process_segment_buffer(self) -> List[SpeechSegment]:
"""处理片段缓冲区"""
if len(self.segment_buffer) < 2:
return []
processed_segments = []
current_segment = self.segment_buffer[-2] # 倒数第二个片段
next_segment = self.segment_buffer[-1] # 最新片段
# 语义连接分析
connection_type = self._analyze_semantic_connection(
current_segment.text,
next_segment.text,
next_segment.silence_before
)
# 根据连接类型决定处理方式
if connection_type == 'continuation':
# 合并片段
merged_segment = self._merge_segments(current_segment, next_segment)
self.segment_buffer[-2] = merged_segment
self.segment_buffer.pop() # 移除最新片段
elif connection_type == 'new_sentence':
# 标记当前片段为完成
current_segment.is_complete = True
processed_segments.append(current_segment)
return processed_segments
def _analyze_semantic_connection(self, prev_text: str, current_text: str,
silence_duration: float) -> str:
"""分析语义连接类型"""
# 确保silence_thresholds是字典类型
        if not isinstance(self.silence_thresholds, dict):
            self.silence_thresholds = self._get_default_config()['silence_thresholds']
# 语法完整性检查
if self._is_grammatically_complete(prev_text):
sentence_pause_threshold = self.silence_thresholds.get('sentence_pause', 1.5)
if silence_duration >= sentence_pause_threshold:
return 'new_sentence'
# 语义相关性检查
if self.config.get('semantic_analysis', True):
semantic_score = self._calculate_semantic_similarity(prev_text, current_text)
phrase_pause_threshold = self.silence_thresholds.get('phrase_pause', 0.8)
if silence_duration >= phrase_pause_threshold:
if semantic_score > 0.7:
return 'continuation' # 语义相关,继续当前句子
else:
return 'new_sentence' # 语义不相关,新句子
return 'continuation'
def _is_grammatically_complete(self, text: str) -> bool:
"""检查语法完整性"""
if not self.config.get('grammar_check', True):
return False
# 简单的语法完整性检查
text = text.strip()
# 检查句子结束标点
if text.endswith(('。', '!', '?', '.', '!', '?')):
return True
# 检查常见的完整句式
complete_patterns = [
'是的', '不是', '好的', '没有', '有的', '对的', '错的',
'可以', '不可以', '行', '不行', '是', '不是'
]
for pattern in complete_patterns:
if text.endswith(pattern):
return True
# 检查词数(简单启发式)
word_count = len(text.split())
if word_count >= self.config.get('min_complete_words', 5):
return True
return False
def _calculate_semantic_similarity(self, text1: str, text2: str) -> float:
"""计算语义相似度(简化版本)"""
# 这里使用简单的词汇重叠度作为语义相似度的近似
words1 = set(text1.split())
words2 = set(text2.split())
if not words1 or not words2:
return 0.0
intersection = words1.intersection(words2)
union = words1.union(words2)
return len(intersection) / len(union) if union else 0.0
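    # Jaccard相似度示例:text1="今天 天气 很好", text2="天气 不错"
    #   words1={今天, 天气, 很好}, words2={天气, 不错}
    #   交集={天气}(1个),并集共4个 => 相似度 = 1/4 = 0.25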
def _merge_segments(self, segment1: SpeechSegment, segment2: SpeechSegment) -> SpeechSegment:
"""合并两个片段"""
merged_text = f"{segment1.text} {segment2.text}"
return SpeechSegment(
text=merged_text,
start_time=segment1.start_time,
end_time=segment2.end_time,
silence_before=segment1.silence_before,
silence_after=segment2.silence_after,
confidence=min(segment1.confidence, segment2.confidence),
segment_type=segment2.segment_type,
is_complete=False
)
def _adjust_thresholds(self):
"""根据用户说话习惯动态调整阈值"""
if len(self.recent_pauses) >= 10:
avg_pause = np.mean(self.recent_pauses)
std_pause = np.std(self.recent_pauses)
# 确保silence_thresholds是字典类型
            if not isinstance(self.silence_thresholds, dict):
                self.silence_thresholds = self._get_default_config()['silence_thresholds']
# 个性化阈值调整
self.silence_thresholds['phrase_pause'] = max(0.5, avg_pause + 0.5 * std_pause)
self.silence_thresholds['sentence_pause'] = max(1.0, avg_pause + 1.5 * std_pause)
phrase_threshold = self.silence_thresholds.get('phrase_pause', 0.8)
sentence_threshold = self.silence_thresholds.get('sentence_pause', 1.5)
self.logger.debug(f"阈值已调整: phrase={phrase_threshold:.2f}, "
f"sentence={sentence_threshold:.2f}")
def get_completed_segments(self) -> List[SpeechSegment]:
"""获取已完成的片段"""
completed = [seg for seg in self.segment_buffer if seg.is_complete]
# 清理已完成的片段
self.segment_buffer = [seg for seg in self.segment_buffer if not seg.is_complete]
return completed
def force_complete_current_segment(self) -> Optional[SpeechSegment]:
"""强制完成当前片段"""
if self.segment_buffer:
current_segment = self.segment_buffer[-1]
current_segment.is_complete = True
return current_segment
return None
def reset(self):
"""重置分割器状态"""
self.segment_buffer.clear()
self.recent_pauses.clear()
self.logger.info("智能断句器已重置")
    def create_session(self, session_id: str):
        """创建会话(当前为占位实现,仅记录日志)"""
        self.logger.info(f"智能断句会话创建: {session_id}")
    def update_config(self, config: Dict):
        """更新配置"""
        if 'silence_thresholds' in config:
            # 应用静音阈值配置(键名需与micro_pause/phrase_pause/sentence_pause/topic_pause一致)
            thresholds = config['silence_thresholds']
            if isinstance(thresholds, dict) and isinstance(self.silence_thresholds, dict):
                self.silence_thresholds.update(thresholds)
            self.logger.info(f"更新静音阈值配置: {thresholds}")
        if 'semantic_analysis' in config:
            # 应用语义分析配置
            semantic_config = config['semantic_analysis']
            self.config['semantic_analysis'] = semantic_config
            self.logger.info(f"更新语义分析配置: {semantic_config}")
    def complete_session(self, session_id: str):
        """完成会话(当前为占位实现,仅记录日志)"""
        self.logger.info(f"智能断句会话完成: {session_id}")
def shutdown(self):
"""关闭模块"""
self.reset()
self.logger.info("智能断句模块已关闭")
def get_statistics(self) -> Dict:
"""获取统计信息"""
return {
'buffer_size': len(self.segment_buffer),
'recent_pauses_count': len(self.recent_pauses),
'avg_pause_duration': np.mean(self.recent_pauses) if self.recent_pauses else 0,
'current_thresholds': self.silence_thresholds.copy(),
'adaptive_enabled': self.adaptive_enabled
}
def process_text(self, text: str, context: Dict = None) -> Dict:
"""处理文本分割(兼容OptimizationManager调用)"""
try:
# 提取上下文信息
timestamp = context.get('timestamp', time.time()) if context else time.time()
confidence = context.get('confidence', 0.8) if context else 0.8
silence_duration = context.get('silence_duration', 1.0) if context else 1.0
# 处理语音片段
segments = self.process_speech_segment(text, silence_duration, timestamp, confidence)
# 返回处理结果
if segments:
# 返回最新的完整片段
latest_segment = segments[-1]
# 安全获取segment_type的值
segment_type_value = latest_segment.segment_type.value if isinstance(latest_segment.segment_type, SegmentType) else str(latest_segment.segment_type)
return {
'success': True,
'text': latest_segment.text,
'confidence': latest_segment.confidence,
'segment_type': segment_type_value,
'is_complete': latest_segment.is_complete
}
else:
# 如果没有完整片段,返回原文本
return {
'success': True,
'text': text,
'confidence': confidence,
'segment_type': 'continuation',
'is_complete': False
}
except Exception as e:
self.logger.error(f"处理文本分割时出错: {e}")
return {
'success': False,
'text': text,
'confidence': 0.0,
'error': str(e)
}
def get_performance_stats(self) -> Dict:
"""获取性能统计"""
total_segments = len(self.segment_buffer)
completed_segments = len([seg for seg in self.segment_buffer if seg.is_complete])
avg_confidence = np.mean([seg.confidence for seg in self.segment_buffer]) if self.segment_buffer else 0.0
return {
'total_segments': total_segments,
'completed_segments': completed_segments,
'pending_segments': total_segments - completed_segments,
'average_confidence': avg_confidence,
'processing_efficiency': completed_segments / total_segments if total_segments > 0 else 0.0
}
class AdaptiveSilenceThreshold:
"""自适应静音阈值调整器"""
def __init__(self):
self.user_speech_pattern = {
'avg_pause_duration': 1.2,
'speech_rate': 150, # 词/分钟
'pause_variance': 0.3
}
self.history_window = 50 # 历史窗口大小
self.pause_history = []
def update_speech_pattern(self, pause_duration: float, speech_rate: float = None):
"""更新用户说话模式"""
self.pause_history.append(pause_duration)
if len(self.pause_history) > self.history_window:
self.pause_history.pop(0)
# 更新平均停顿时长
self.user_speech_pattern['avg_pause_duration'] = np.mean(self.pause_history)
self.user_speech_pattern['pause_variance'] = np.std(self.pause_history)
if speech_rate:
self.user_speech_pattern['speech_rate'] = speech_rate
def get_adaptive_thresholds(self, base_thresholds: Dict) -> Dict:
"""获取自适应阈值"""
if len(self.pause_history) < 5:
return base_thresholds
avg_pause = self.user_speech_pattern['avg_pause_duration']
variance = self.user_speech_pattern['pause_variance']
# 基于用户习惯调整阈值
adaptive_thresholds = base_thresholds.copy()
# 调整系数
adjustment_factor = min(2.0, max(0.5, avg_pause / 1.2)) # 基准1.2秒
for key in adaptive_thresholds:
adaptive_thresholds[key] *= adjustment_factor
# 添加方差影响
adaptive_thresholds[key] += variance * 0.3
return adaptive_thresholds
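
# 最小使用示意(草图):演示断句处理与自适应阈值的配合,
# 文本、停顿时长与置信度均为假设值。
if __name__ == "__main__":
    seg = IntelligentSentenceSegmentation()
    now = time.time()
    seg.process_speech_segment("今天天气很好", silence_duration=0.2,
                               timestamp=now, confidence=0.9)
    completed = seg.process_speech_segment("我们出去走走吧", silence_duration=2.5,
                                           timestamp=now + 2.0, confidence=0.85)
    for s in completed:
        print(s.text, s.segment_type.value, s.is_complete)

    adaptive = AdaptiveSilenceThreshold()
    for pause in [0.8, 1.0, 1.5, 1.2, 0.9]:
        adaptive.update_speech_pattern(pause)
    print(adaptive.get_adaptive_thresholds({'phrase_pause': 1.0, 'sentence_pause': 2.0}))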
... ...
{
"// AIfeng/2025-07-07 15:25:48": "流式语音识别优化模块配置文件",
"intelligent_segmentation": {
"description": "智能断句配置",
"silence_thresholds": {
"short_pause": 0.3,
"medium_pause": 0.8,
"long_pause": 1.5,
"sentence_break": 2.0
},
"adaptive_threshold": {
"enabled": true,
"learning_rate": 0.1,
"min_threshold": 0.1,
"max_threshold": 3.0,
"adaptation_window": 10
},
"semantic_analysis": {
"enabled": true,
"similarity_threshold": 0.7,
"context_window": 5,
"use_grammar_check": true
},
"segment_constraints": {
"min_length": 10,
"max_length": 500,
"min_confidence": 0.3
}
},
"adaptive_vad_chunking": {
"description": "自适应VAD分片配置",
"strategies": {
"fast_response": {
"chunk_size_ms": 200,
"overlap_ms": 50,
"confidence_threshold": 0.6,
"max_latency_ms": 300
},
"high_accuracy": {
"chunk_size_ms": 800,
"overlap_ms": 200,
"confidence_threshold": 0.8,
"max_latency_ms": 1000
},
"balanced": {
"chunk_size_ms": 400,
"overlap_ms": 100,
"confidence_threshold": 0.7,
"max_latency_ms": 600
},
"adaptive": {
"initial_chunk_size_ms": 400,
"min_chunk_size_ms": 200,
"max_chunk_size_ms": 1000,
"adaptation_factor": 0.2,
"performance_window": 20
}
},
"performance_monitoring": {
"enabled": true,
"metrics_window": 100,
"latency_target_ms": 500,
"accuracy_target": 0.85,
"adaptation_threshold": 0.1
},
"progressive_recognition": {
"enabled": true,
"stages": [
{
"name": "quick",
"chunk_size_ms": 200,
"confidence_threshold": 0.5
},
{
"name": "refined",
"chunk_size_ms": 600,
"confidence_threshold": 0.8
},
{
"name": "final",
"chunk_size_ms": 1000,
"confidence_threshold": 0.9
}
]
},
"quality_assessment": {
"enabled": true,
"snr_threshold": 10.0,
"energy_threshold": 0.01,
"spectral_quality_threshold": 0.7
}
},
"recognition_result_tracker": {
"description": "识别结果追踪配置",
"session_management": {
"max_sessions": 100,
"session_timeout_minutes": 30,
"auto_cleanup_enabled": true,
"cleanup_interval_minutes": 5
},
"result_tracking": {
"max_results_per_session": 1000,
"enable_result_chaining": true,
"confidence_decay_rate": 0.05,
"similarity_threshold": 0.8
},
"quality_metrics": {
"track_confidence_trends": true,
"track_latency_metrics": true,
"track_accuracy_metrics": true,
"metrics_retention_hours": 24
},
"result_relations": {
"enable_replacement_tracking": true,
"enable_refinement_tracking": true,
"enable_correction_tracking": true,
"max_relation_depth": 5
},
"archival": {
"auto_archive_enabled": true,
"archive_after_hours": 6,
"compress_archived_results": true,
"max_archived_sessions": 500
}
},
"streaming_display_manager": {
"description": "流式显示管理配置",
"buffer_management": {
"max_buffer_size": 1000,
"auto_cleanup_enabled": true,
"cleanup_threshold": 0.8
},
"refresh_strategies": {
"default_strategy": "debounced",
"debounce_delay_ms": 200,
"batch_size": 5,
"batch_timeout_ms": 1000,
"max_refresh_rate_per_second": 10
},
"display_options": {
"enable_highlighting": true,
"auto_scroll": true,
"preserve_formatting": true,
"show_confidence_indicators": true,
"show_timing_info": false
},
"performance": {
"max_workers": 4,
"queue_size_warning_threshold": 50,
"processing_time_warning_ms": 100
},
"priority_handling": {
"urgent_immediate_processing": true,
"high_priority_batch_size": 3,
"normal_priority_batch_size": 5,
"low_priority_batch_size": 10
}
},
"integration": {
"description": "模块集成配置",
"inter_module_communication": {
"enable_event_bus": true,
"async_processing": true,
"error_propagation": true
},
"data_flow": {
"segmentation_to_chunking": true,
"chunking_to_tracking": true,
"tracking_to_display": true,
"feedback_loops_enabled": true
},
"performance_coordination": {
"shared_thread_pool": false,
"resource_monitoring": true,
"adaptive_load_balancing": true
}
},
"logging": {
"description": "日志配置",
"level": "INFO",
"enable_module_specific_logs": true,
"log_performance_metrics": true,
"log_error_details": true,
"max_log_file_size_mb": 10,
"log_rotation_count": 5
},
"debugging": {
"description": "调试配置",
"enable_debug_mode": false,
"trace_data_flow": false,
"save_intermediate_results": false,
"performance_profiling": false,
"memory_usage_tracking": false
},
"experimental": {
"description": "实验性功能配置",
"features": {
"ai_powered_segmentation": false,
"predictive_chunking": false,
"semantic_result_merging": false,
"adaptive_display_layouts": false,
"real_time_quality_optimization": false
},
"ai_models": {
"segmentation_model_path": "",
"chunking_model_path": "",
"quality_model_path": ""
}
}
}
... ...
# AIfeng/2025-07-07 15:25:48
# 流式语音识别优化集成管理器
import json
import time
import threading
import logging
from typing import Dict, List, Optional, Callable, Any
from pathlib import Path
from dataclasses import dataclass
from enum import Enum
import asyncio
from concurrent.futures import ThreadPoolExecutor
from .intelligent_segmentation import IntelligentSentenceSegmentation, SpeechSegment
from .adaptive_vad_chunking import AdaptiveVADChunking, ChunkStrategy, AudioChunk
from .recognition_result_tracker import RecognitionResultTracker, ResultType
from .streaming_display_manager import StreamingDisplayManager, UpdateType, DisplayPriority
class OptimizationMode(Enum):
"""优化模式"""
SPEED_FIRST = "speed_first" # 速度优先
ACCURACY_FIRST = "accuracy_first" # 精度优先
BALANCED = "balanced" # 平衡模式
ADAPTIVE = "adaptive" # 自适应模式
class ProcessingStage(Enum):
"""处理阶段"""
AUDIO_INPUT = "audio_input"
VAD_CHUNKING = "vad_chunking"
SEGMENTATION = "segmentation"
RECOGNITION = "recognition"
RESULT_TRACKING = "result_tracking"
DISPLAY_UPDATE = "display_update"
@dataclass
class ProcessingContext:
"""处理上下文"""
session_id: str
audio_data: bytes
sample_rate: int
timestamp: float
metadata: Dict = None
@dataclass
class OptimizationMetrics:
"""优化指标"""
total_latency_ms: float
segmentation_latency_ms: float
chunking_latency_ms: float
tracking_latency_ms: float
display_latency_ms: float
accuracy_score: float
confidence_score: float
processing_efficiency: float
class OptimizationManager:
"""流式语音识别优化管理器"""
def __init__(self, config_path: str = None):
        self.logger = logging.getLogger(__name__)
        # 加载配置(logger需先初始化,_load_config失败时会记录告警)
        self.config = self._load_config(config_path)
# 初始化各个优化模块
self.segmentation_module = IntelligentSentenceSegmentation(
self.config.get('intelligent_segmentation', {})
)
self.chunking_module = AdaptiveVADChunking(
self.config.get('adaptive_vad_chunking', {})
)
self.tracking_module = RecognitionResultTracker(
self.config.get('recognition_result_tracker', {})
)
self.display_module = StreamingDisplayManager(
self.config.get('streaming_display_manager', {})
)
# 优化模式
self.current_mode = OptimizationMode.BALANCED
# 性能监控
self.performance_metrics = {}
self.processing_stats = {
'total_sessions': 0,
'active_sessions': 0,
'total_audio_processed_seconds': 0.0,
'average_latency_ms': 0.0,
'average_accuracy': 0.0
}
# 回调函数
self.result_callbacks = [] # 识别结果回调
self.error_callbacks = [] # 错误处理回调
self.metrics_callbacks = [] # 性能指标回调
# 线程池
self.executor = ThreadPoolExecutor(
max_workers=self.config.get('integration', {}).get('performance_coordination', {}).get('max_workers', 8),
thread_name_prefix='OptimizationManager'
)
# 事件总线(简化实现)
self.event_handlers = {}
self._lock = threading.RLock()
self._running = True
# 注册模块间的回调
self._setup_inter_module_communication()
self.logger.info("流式语音识别优化管理器初始化完成")
def _load_config(self, config_path: str = None) -> Dict:
"""加载配置文件"""
if config_path is None:
config_path = Path(__file__).parent / "optimization_config.json"
try:
with open(config_path, 'r', encoding='utf-8') as f:
config = json.load(f)
# 转换配置项:将cleanup_interval_minutes转换为cleanup_interval(秒)
if 'recognition_result_tracker' in config:
tracker_config = config['recognition_result_tracker']
if 'cleanup_interval_minutes' in tracker_config:
# 将分钟转换为秒
tracker_config['cleanup_interval'] = tracker_config['cleanup_interval_minutes'] * 60
# 保留原配置项以兼容
return config
except Exception as e:
self.logger.warning(f"加载配置文件失败: {e},使用默认配置")
return self._get_default_config()
def _get_default_config(self) -> Dict:
"""获取默认配置"""
return {
'intelligent_segmentation': {},
'adaptive_vad_chunking': {},
'recognition_result_tracker': {},
'streaming_display_manager': {},
'integration': {
'performance_coordination': {
'max_workers': 8
}
}
}
def _setup_inter_module_communication(self):
"""设置模块间通信"""
# 注册显示更新回调
self.tracking_module.register_result_callback(self._on_tracking_result)
self.display_module.register_error_callback(self._on_display_error)
# 注册分片质量反馈
self.chunking_module.register_quality_callback(self._on_chunk_quality_feedback)
def set_optimization_mode(self, mode: OptimizationMode):
"""设置优化模式"""
# 类型检查
if not isinstance(mode, OptimizationMode):
raise TypeError(f"mode必须是OptimizationMode枚举类型,当前类型: {type(mode)}")
self.current_mode = mode
# 根据模式调整各模块参数
if mode == OptimizationMode.SPEED_FIRST:
self._configure_for_speed()
elif mode == OptimizationMode.ACCURACY_FIRST:
self._configure_for_accuracy()
elif mode == OptimizationMode.BALANCED:
self._configure_for_balance()
elif mode == OptimizationMode.ADAPTIVE:
self._configure_for_adaptive()
self.logger.info(f"优化模式已设置为: {mode.value}")
def _configure_for_speed(self):
"""配置速度优先模式"""
# 配置快速分片策略
self.chunking_module.set_strategy(ChunkStrategy.FAST_RESPONSE)
        # 配置快速断句(阈值键名与IntelligentSentenceSegmentation保持一致)
        self.segmentation_module.update_config({
            'silence_thresholds': {
                'micro_pause': 0.2,
                'phrase_pause': 0.5,
                'sentence_pause': 1.0,
                'topic_pause': 1.5
            }
        })
# 配置立即显示刷新
self.display_module.config['refresh_strategies']['default_strategy'] = 'immediate'
def _configure_for_accuracy(self):
"""配置精度优先模式"""
# 配置高精度分片策略
self.chunking_module.set_strategy(ChunkStrategy.HIGH_ACCURACY)
# 配置精确断句
self.segmentation_module.update_config({
'semantic_analysis': {
'enabled': True,
'similarity_threshold': 0.8,
'context_window': 8
}
})
# 配置批量显示刷新
self.display_module.config['refresh_strategies']['default_strategy'] = 'batch'
def _configure_for_balance(self):
"""配置平衡模式"""
# 配置平衡分片策略
self.chunking_module.set_strategy(ChunkStrategy.BALANCED)
# 配置防抖显示刷新
self.display_module.config['refresh_strategies']['default_strategy'] = 'debounced'
def _configure_for_adaptive(self):
"""配置自适应模式"""
# 配置自适应分片策略
self.chunking_module.set_strategy(ChunkStrategy.ADAPTIVE)
# 配置自适应显示刷新
self.display_module.config['refresh_strategies']['default_strategy'] = 'adaptive'
def register_result_callback(self, callback: Callable[[str, str, float, bool], None]):
"""注册识别结果回调"""
self.result_callbacks.append(callback)
def register_error_callback(self, callback: Callable[[str, Exception], None]):
"""注册错误处理回调"""
self.error_callbacks.append(callback)
    def register_metrics_callback(self, callback: Callable[[Dict], None]):
        """注册性能指标回调(回调接收包含session_id的指标字典)"""
self.metrics_callbacks.append(callback)
def create_session(self, session_id: str, config: Dict = None) -> bool:
"""创建处理会话"""
try:
# 在各个模块中创建会话
self.segmentation_module.create_session(session_id)
self.chunking_module.create_session(session_id)
self.tracking_module.create_session(session_id)
with self._lock:
self.processing_stats['total_sessions'] += 1
self.processing_stats['active_sessions'] += 1
self.logger.info(f"会话创建成功: {session_id}")
return True
except Exception as e:
self.logger.error(f"创建会话失败: {e}")
self._handle_error(session_id, e)
return False
def process_audio(self, session_id: str, audio_data: bytes,
sample_rate: int, timestamp: float = None) -> bool:
"""处理音频数据"""
if timestamp is None:
timestamp = time.time()
context = ProcessingContext(
session_id=session_id,
audio_data=audio_data,
sample_rate=sample_rate,
timestamp=timestamp
)
# 异步处理音频
self.executor.submit(self._process_audio_async, context)
return True
def _process_audio_async(self, context: ProcessingContext):
"""异步处理音频数据"""
start_time = time.time()
metrics = OptimizationMetrics(
total_latency_ms=0,
segmentation_latency_ms=0,
chunking_latency_ms=0,
tracking_latency_ms=0,
display_latency_ms=0,
accuracy_score=0,
confidence_score=0,
processing_efficiency=0
)
try:
# 1. VAD分片处理
chunk_start = time.time()
chunks = self.chunking_module.process_audio(
context.session_id,
context.audio_data,
context.sample_rate
)
metrics.chunking_latency_ms = (time.time() - chunk_start) * 1000
# 2. 智能断句处理
seg_start = time.time()
for chunk in chunks:
if chunk.is_speech:
# 这里应该调用ASR服务获取识别结果
# 为了演示,我们模拟一个识别结果
mock_text = f"模拟识别文本_{chunk.chunk_id}"
mock_confidence = 0.85
# 进行智能断句
text_context = {
'session_id': context.session_id,
'timestamp': chunk.timestamp,
'confidence': mock_confidence,
'silence_duration': 0.0 # 默认值
}
segment_result = self.segmentation_module.process_text(
mock_text,
text_context
)
# 3. 结果追踪
track_start = time.time()
if segment_result.get('success', False):
result_id = self.tracking_module.add_recognition_result(
context.session_id,
segment_result['text'],
segment_result['confidence'],
context.audio_data, # audio_data
ResultType.PARTIAL if not segment_result.get('is_complete', False) else ResultType.FINAL, # result_type
'processing', # stage
None, # predecessor_ids
None, # parent_segment_id
{'timestamp': chunk.timestamp, 'duration': chunk.duration} # metadata
)
# 4. 显示更新
display_start = time.time()
self.display_module.update_display(
context.session_id,
result_id,
segment_result['text'],
UpdateType.REPLACE_FINAL if segment_result.get('is_complete', False) else UpdateType.APPEND,
segment_result['confidence'],
segment_result.get('is_complete', False),
DisplayPriority.HIGH if segment_result.get('is_complete', False) else DisplayPriority.NORMAL
)
metrics.display_latency_ms += (time.time() - display_start) * 1000
metrics.tracking_latency_ms += (time.time() - track_start) * 1000
metrics.segmentation_latency_ms = (time.time() - seg_start) * 1000
# 计算总延迟和效率
metrics.total_latency_ms = (time.time() - start_time) * 1000
# 防止除零错误
if metrics.total_latency_ms > 0:
metrics.processing_efficiency = len(context.audio_data) / metrics.total_latency_ms
else:
metrics.processing_efficiency = 0.0
self.logger.warning(f"处理延迟为0,无法计算处理效率 [{context.session_id}]")
# 更新性能统计
self._update_performance_stats(context.session_id, metrics)
# 触发指标回调
self._trigger_metrics_callbacks(context.session_id, metrics)
except Exception as e:
self.logger.error(f"处理音频时出错: {e}")
self._handle_error(context.session_id, e)
def _on_tracking_result(self, session_id: str, result_id: str, text: str,
confidence: float, is_final: bool):
"""处理追踪模块的结果回调"""
# 触发结果回调
for callback in self.result_callbacks:
try:
callback(session_id, text, confidence, is_final)
except Exception as e:
self.logger.error(f"结果回调执行出错: {e}")
def _on_display_error(self, session_id: str, error: Exception):
"""处理显示模块的错误回调"""
self.logger.error(f"显示模块错误 [{session_id}]: {error}")
self._handle_error(session_id, error)
def _on_chunk_quality_feedback(self, session_id: str, chunk_id: str,
quality_score: float, metrics: Dict):
"""处理分片质量反馈"""
# 根据质量反馈调整策略
if quality_score < 0.5:
self.logger.warning(f"分片质量较低 [{session_id}:{chunk_id}]: {quality_score}")
# 可以在这里实施自适应调整
def _handle_error(self, session_id: str, error: Exception):
"""处理错误"""
for callback in self.error_callbacks:
try:
callback(session_id, error)
except Exception as e:
self.logger.error(f"错误回调执行出错: {e}")
def _update_performance_stats(self, session_id: str, metrics: OptimizationMetrics):
"""更新性能统计"""
with self._lock:
# 更新平均延迟
current_avg = self.processing_stats['average_latency_ms']
total_sessions = self.processing_stats['total_sessions']
if total_sessions > 0:
new_avg = (current_avg * (total_sessions - 1) + metrics.total_latency_ms) / total_sessions
self.processing_stats['average_latency_ms'] = new_avg
# 存储会话指标
self.performance_metrics[session_id] = metrics
def _trigger_metrics_callbacks(self, session_id: str, metrics: OptimizationMetrics):
"""触发性能指标回调"""
for callback in self.metrics_callbacks:
try:
# 将session_id包含在metrics字典中传递给回调
metrics_dict = {
'session_id': session_id,
'total_latency_ms': metrics.total_latency_ms,
'chunking_latency_ms': metrics.chunking_latency_ms,
'segmentation_latency_ms': metrics.segmentation_latency_ms,
'tracking_latency_ms': metrics.tracking_latency_ms,
'display_latency_ms': metrics.display_latency_ms,
'processing_efficiency': metrics.processing_efficiency,
'accuracy_score': getattr(metrics, 'accuracy_score', 0.0)
}
callback(metrics_dict)
except Exception as e:
self.logger.error(f"指标回调执行出错: {e}")
def complete_session(self, session_id: str) -> bool:
"""完成处理会话"""
try:
# 完成各个模块的会话
self.segmentation_module.complete_session(session_id)
self.chunking_module.complete_session(session_id)
self.tracking_module.complete_session(session_id)
            with self._lock:
                if self.processing_stats['active_sessions'] > 0:
                    self.processing_stats['active_sessions'] -= 1
if session_id in self.performance_metrics:
del self.performance_metrics[session_id]
self.logger.info(f"会话完成: {session_id}")
return True
except Exception as e:
self.logger.error(f"完成会话失败: {e}")
self._handle_error(session_id, e)
return False
def get_session_results(self, session_id: str) -> List[Dict]:
"""获取会话的所有结果"""
try:
# 从追踪模块获取结果
results = self.tracking_module.get_session_results(session_id)
# 从显示模块获取显示信息
display_segments = self.display_module.get_session_display(session_id)
# 合并结果
combined_results = []
for result in results:
result_dict = {
'result_id': result.result_id,
'text': result.text,
'confidence': result.confidence,
'is_final': result.is_final,
'timestamp': result.timestamp,
'result_type': result.result_type.value if hasattr(result.result_type, 'value') else str(result.result_type)
}
combined_results.append(result_dict)
return combined_results
except Exception as e:
self.logger.error(f"获取会话结果失败: {e}")
return []
def get_performance_stats(self) -> Dict:
"""获取性能统计"""
with self._lock:
stats = self.processing_stats.copy()
# 添加各模块的性能统计
stats['segmentation_stats'] = self.segmentation_module.get_performance_stats()
stats['chunking_stats'] = self.chunking_module.get_performance_stats()
stats['tracking_stats'] = self.tracking_module.get_performance_stats()
stats['display_stats'] = self.display_module.get_performance_stats()
return stats
def get_optimization_metrics(self, session_id: str = None) -> Dict:
"""获取优化指标"""
if session_id:
return self.performance_metrics.get(session_id, {})
else:
return self.performance_metrics.copy()
def shutdown(self):
"""关闭优化管理器"""
self._running = False
# 关闭各个模块
self.segmentation_module.shutdown()
self.chunking_module.shutdown()
self.tracking_module.shutdown()
self.display_module.shutdown()
# 关闭线程池
self.executor.shutdown(wait=True)
self.logger.info("流式语音识别优化管理器已关闭")
... ...