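# recorder_sync_improved.py (11.5 KB)
# AIfeng/2025-01-02 16:03:47
# Improved synchronous recorder - addresses over-segmentation of speech.
# Key improvements: silence-duration detection, dynamic threshold tuning,
# and a minimum speech length limit.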

import pyaudio
import wave
import threading
import time
import tempfile
import os
from funasr_asr_sync import FunASRSync
from logger import get_logger

logger = get_logger("RecorderSyncImproved")

class RecorderSyncImproved:
    """Microphone recorder with energy-based voice activity detection (VAD).

    Audio is read chunk by chunk on a background thread. A chunk whose RMS
    volume exceeds the current dynamic threshold counts as speech; an
    utterance ends once ``silence_frames`` consecutive non-speech chunks have
    been seen. Utterances with at least ``min_speech_frames`` speech chunks
    are written to a WAV file and passed to ``self.asr_client``.
    """

    # Directory that receives the per-utterance WAV files.
    CACHE_DIR = 'cache_data'

    def __init__(self, chunk=1024, format=pyaudio.paInt16, channels=1, rate=16000,
                 volume_threshold=0.03, silence_duration=1.5, min_speech_duration=0.5,
                 pre_buffer_duration=0.5, dynamic_threshold_factor=0.8):
        """Create the recorder; no audio device is opened until start_recording().

        Args:
            chunk: Number of frames read from the stream per iteration.
            format: PyAudio sample format constant.
            channels: Number of input channels.
            rate: Sampling rate in Hz.
            volume_threshold: Lower bound of the speech/silence volume
                threshold (normalised RMS, 0..1).
            silence_duration: Seconds of continuous silence that end an
                utterance.
            min_speech_duration: Minimum amount of speech (seconds) for an
                utterance to be kept.
            pre_buffer_duration: Seconds of audio kept from before the speech
                onset and prepended to the utterance.
            dynamic_threshold_factor: Multiplier applied to the 75th
                percentile of recent volumes when adapting the threshold.
        """
        self.chunk = chunk
        self.format = format
        self.channels = channels
        self.rate = rate
        self.volume_threshold = volume_threshold
        self.silence_duration = silence_duration
        self.min_speech_duration = min_speech_duration
        self.pre_buffer_duration = pre_buffer_duration
        self.dynamic_threshold_factor = dynamic_threshold_factor

        # Durations expressed as a number of chunks (truncated).
        self.silence_frames = int(silence_duration * rate / chunk)
        self.min_speech_frames = int(min_speech_duration * rate / chunk)
        self.pre_buffer_frames = int(pre_buffer_duration * rate / chunk)

        # Capture state.
        self.is_recording_flag = False
        self.recording_thread = None
        self.audio = None
        self.stream = None

        # VAD state.
        self.is_speaking = False
        self.silence_counter = 0
        self.speech_counter = 0
        self.current_recording = []
        self.pre_buffer = []

        # Dynamic threshold state.
        self.volume_history = []
        self.history_size = 50
        self.dynamic_threshold = volume_threshold

        # ASR client that receives the saved utterance files.
        self.asr_client = FunASRSync()

        logger.info(f"RecorderSyncImproved初始化完成 - 静音阈值:{silence_duration}s, 最小语音:{min_speech_duration}s")

    def _calculate_volume(self, data):
        """Return the RMS volume of a raw chunk, normalised to the int16 range.

        The buffer is always interpreted as 16-bit signed samples, regardless
        of ``self.format``. An empty buffer yields 0.0.
        """
        import numpy as np

        if not data:
            return 0.0
        # Widen before squaring: squaring int16 values wraps around for any
        # |sample| > 181, which corrupts the mean (and can make it negative).
        samples = np.frombuffer(data, dtype=np.int16).astype(np.float64)
        return float(np.sqrt(np.mean(samples * samples)) / 32768.0)

    def _update_dynamic_threshold(self, volume):
        """Record ``volume`` and recompute the adaptive speech threshold.

        Keeps the last ``history_size`` volumes. Once at least 10 samples are
        available the threshold becomes the 75th percentile of the history
        times ``dynamic_threshold_factor``, never dropping below
        ``volume_threshold``.
        """
        history = self.volume_history
        history.append(volume)
        if len(history) > self.history_size:
            del history[0]

        if len(history) < 10:
            return

        import numpy as np
        adaptive = np.percentile(history, 75) * self.dynamic_threshold_factor
        self.dynamic_threshold = max(self.volume_threshold, adaptive)

    def _save_audio_segment(self, audio_data):
        """Write the chunks in ``audio_data`` to a WAV file and send it to ASR.

        Segments shorter than ``min_speech_frames`` chunks are skipped. Any
        failure is logged and swallowed so that the capture loop keeps going.
        The file is created with ``delete=False`` and is not removed here.
        """
        if len(audio_data) < self.min_speech_frames:
            logger.debug(f"音频片段太短,跳过处理: {len(audio_data)} < {self.min_speech_frames}")
            return

        try:
            # NamedTemporaryFile fails if the target directory is missing.
            os.makedirs(self.CACHE_DIR, exist_ok=True)

            # Reserve a unique file name; the file is reopened by `wave` below.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.wav', dir=self.CACHE_DIR) as temp_file:
                temp_filename = temp_file.name

            with wave.open(temp_filename, 'wb') as wf:
                wf.setnchannels(self.channels)
                wf.setsampwidth(pyaudio.get_sample_size(self.format))
                wf.setframerate(self.rate)
                wf.writeframes(b''.join(audio_data))

            logger.info(f"保存音频片段: {temp_filename}, 帧数: {len(audio_data)}")

            # Hand the file over to the ASR client.
            self.asr_client.send_audio_file(temp_filename)

        except Exception as e:
            logger.error(f"保存音频片段失败: {e}")

    def _reset_utterance_state(self):
        """Clear the per-utterance VAD state (speaking flag, counters, chunks)."""
        self.is_speaking = False
        self.silence_counter = 0
        self.speech_counter = 0
        self.current_recording = []

    def _process_chunk(self, data):
        """Run one VAD step on a freshly captured chunk of raw audio."""
        volume = self._calculate_volume(data)
        self._update_dynamic_threshold(volume)

        # Rolling window of the most recent chunks (includes the current one).
        self.pre_buffer.append(data)
        if len(self.pre_buffer) > self.pre_buffer_frames:
            self.pre_buffer.pop(0)

        if volume > self.dynamic_threshold:
            if self.is_speaking:
                # Speech continues.
                self.speech_counter += 1
                self.silence_counter = 0
                self.current_recording.append(data)
            else:
                # Speech onset.
                logger.debug(f"检测到语音开始 - 音量:{volume:.4f}, 阈值:{self.dynamic_threshold:.4f}")
                self.is_speaking = True
                self.silence_counter = 0
                self.speech_counter = 1
                # Seed the utterance with the pre-buffer. When pre-buffering
                # is disabled the window is empty, so fall back to the
                # triggering chunk to avoid dropping it.
                self.current_recording = list(self.pre_buffer) or [data]
            return

        if not self.is_speaking:
            # Ongoing silence outside an utterance: nothing to do.
            return

        # Silence inside an utterance.
        self.silence_counter += 1
        self.current_recording.append(data)
        if self.silence_counter < self.silence_frames:
            return

        # Silence lasted long enough: the utterance is over.
        logger.debug(f"检测到语音结束 - 语音帧数:{self.speech_counter}, 静音帧数:{self.silence_counter}")
        if self.speech_counter >= self.min_speech_frames:
            # Drop the trailing silence before saving.
            speech_data = self.current_recording[:-self.silence_counter]
            self._save_audio_segment(speech_data)
        else:
            logger.debug(f"语音片段太短,跳过: {self.speech_counter} < {self.min_speech_frames}")
        self._reset_utterance_state()

    def _recording_loop(self):
        """Thread target: read chunks and feed them to the VAD until stopped.

        The loop ends when ``is_recording_flag`` is cleared or when reading or
        processing raises. In the error case the flag is cleared here so that
        ``is_recording()`` does not keep reporting an active session.
        """
        logger.info("开始录音循环")

        while self.is_recording_flag:
            try:
                data = self.stream.read(self.chunk, exception_on_overflow=False)
                self._process_chunk(data)
            except Exception as e:
                logger.error(f"录音循环错误: {e}")
                self.is_recording_flag = False
                break

        logger.info("录音循环结束")

    def start_recording(self, device_index=None):
        """Open the input stream and start the capture thread.

        Args:
            device_index: PyAudio input device index; ``None`` is passed
                through to ``PyAudio.open`` unchanged.

        Does nothing (besides a warning) if a recording is already running.
        Failures are logged and the recorder is returned to the stopped state.
        """
        if self.is_recording_flag:
            logger.warning("录音已在进行中")
            return

        # A session whose loop died on an error leaves its stream open;
        # release it (and flush pending audio) before opening a new one.
        if self.stream is not None or self.audio is not None:
            self.stop_recording()

        try:
            self.audio = pyaudio.PyAudio()

            self.stream = self.audio.open(
                format=self.format,
                channels=self.channels,
                rate=self.rate,
                input=True,
                input_device_index=device_index,
                frames_per_buffer=self.chunk
            )

            # Fresh VAD and threshold state for the new session.
            self.is_recording_flag = True
            self._reset_utterance_state()
            self.pre_buffer = []
            self.volume_history = []
            self.dynamic_threshold = self.volume_threshold

            self.recording_thread = threading.Thread(target=self._recording_loop)
            self.recording_thread.start()

            logger.info("录音开始")

        except Exception as e:
            logger.error(f"启动录音失败: {e}")
            self.stop_recording()

    def _release_audio_resources(self):
        """Close the stream and terminate PyAudio, logging instead of raising.

        Each resource is released independently so that a failure while
        closing the stream cannot prevent PyAudio from being terminated.
        """
        if self.stream is not None:
            try:
                self.stream.stop_stream()
                self.stream.close()
            except Exception as e:
                logger.error(f"关闭音频流失败: {e}")
            finally:
                self.stream = None

        if self.audio is not None:
            try:
                self.audio.terminate()
            except Exception as e:
                logger.error(f"释放音频设备失败: {e}")
            finally:
                self.audio = None

    def stop_recording(self):
        """Stop capturing, release the audio device and flush pending speech.

        Safe to call when no recording is active. If an utterance is in
        progress and holds at least ``min_speech_frames`` chunks, it is saved
        and sent to ASR (trailing silence is not stripped in this case).
        """
        logger.info("停止录音")

        self.is_recording_flag = False

        worker = self.recording_thread
        # is_alive() is False for a thread that was never started, which
        # avoids the RuntimeError join() raises in that case.
        if worker is not None and worker.is_alive():
            worker.join(timeout=2)
            if worker.is_alive():
                logger.warning("录音线程未在超时时间内退出")
        self.recording_thread = None

        self._release_audio_resources()

        # Flush the utterance that was in progress, if any.
        if self.is_speaking and len(self.current_recording) >= self.min_speech_frames:
            logger.info("处理最后的录音片段")
            self._save_audio_segment(self.current_recording)

        self._reset_utterance_state()
        self.pre_buffer = []

    def is_recording(self):
        """Return True while a recording session is active."""
        return self.is_recording_flag

    def get_status(self):
        """Return a snapshot of the recorder configuration and VAD state."""
        return {
            'is_recording': self.is_recording_flag,
            'is_speaking': self.is_speaking,
            'dynamic_threshold': self.dynamic_threshold,
            'volume_threshold': self.volume_threshold,
            'silence_duration': self.silence_duration,
            'min_speech_duration': self.min_speech_duration,
            'current_speech_frames': self.speech_counter,
            'current_silence_frames': self.silence_counter
        }

    def list_audio_devices(self):
        """Return the devices that expose at least one input channel.

        Each entry is a dict with the keys ``index``, ``name``, ``channels``
        and ``sample_rate`` taken from PyAudio's device info.
        """
        audio = pyaudio.PyAudio()
        try:
            devices = []
            for i in range(audio.get_device_count()):
                device_info = audio.get_device_info_by_index(i)
                if device_info['maxInputChannels'] > 0:
                    devices.append({
                        'index': i,
                        'name': device_info['name'],
                        'channels': device_info['maxInputChannels'],
                        'sample_rate': device_info['defaultSampleRate']
                    })
            return devices
        finally:
            # Terminate even if querying a device raises.
            audio.terminate()

if __name__ == "__main__":
    # Manual smoke test: list the input devices, then record between two
    # presses of Enter using the default input device.
    recorder = RecorderSyncImproved(
        volume_threshold=0.03,
        silence_duration=1.5,
        min_speech_duration=0.5
    )

    print("可用音频设备:")
    for device in recorder.list_audio_devices():
        print(f"  {device['index']}: {device['name']}")

    print("\n按Enter开始录音...")
    input()

    recorder.start_recording()
    try:
        print("录音中... 按Enter停止")
        input()
    finally:
        # Always stop, even on Ctrl-C/EOF: the capture thread is not a
        # daemon thread and would otherwise keep the process alive.
        recorder.stop_recording()
    print("录音结束")