# recorder_sync_improved.py (11.5 KB)
# AIfeng/2025-01-02 16:03:47
# 改进版同步录音器 - 解决语音过度分割问题
# 核心改进:静音持续时间检测、动态阈值优化、最小语音长度限制
import pyaudio
import wave
import threading
import time
import tempfile
import os
from funasr_asr_sync import FunASRSync
from logger import get_logger
# Module-level logger used by RecorderSyncImproved for all of its log output.
logger = get_logger("RecorderSyncImproved")
class RecorderSyncImproved:
    """Microphone recorder that cuts the input into utterances and sends them to ASR.

    A background thread reads fixed-size chunks from a PyAudio input stream and
    runs a simple energy-based voice activity detector on them:

    * a chunk counts as speech when its RMS volume exceeds the dynamic threshold;
    * an utterance ends only after ``silence_duration`` seconds of uninterrupted
      silence, so short pauses do not split it;
    * utterances with fewer voiced chunks than ``min_speech_duration`` allows
      are discarded;
    * the chunks captured just before speech onset (``pre_buffer_duration``)
      are prepended so the start of the utterance is not clipped.

    Every accepted utterance is written to a WAV file under ``CACHE_DIR`` and
    passed to ``FunASRSync.send_audio_file``.
    """

    # Directory that receives the temporary WAV files.
    CACHE_DIR = 'cache_data'

    def __init__(self, chunk=1024, format=pyaudio.paInt16, channels=1, rate=16000,
                 volume_threshold=0.03, silence_duration=1.5, min_speech_duration=0.5,
                 pre_buffer_duration=0.5, dynamic_threshold_factor=0.8):
        """Configure the recorder; no audio device is opened until start_recording().

        Args:
            chunk: Number of frames read from the stream per iteration.
            format: PyAudio sample format constant.
            channels: Number of input channels.
            rate: Sampling rate in Hz.
            volume_threshold: Base (minimum) volume threshold, as a fraction of
                int16 full scale.
            silence_duration: Seconds of continuous silence that end an utterance.
            min_speech_duration: Minimum voiced duration in seconds for an
                utterance to be kept.
            pre_buffer_duration: Seconds of audio kept from before speech onset.
            dynamic_threshold_factor: Multiplier applied to the 75th percentile
                of recent volumes when deriving the dynamic threshold.
        """
        self.chunk = chunk
        self.format = format
        self.channels = channels
        self.rate = rate
        self.volume_threshold = volume_threshold
        self.silence_duration = silence_duration
        self.min_speech_duration = min_speech_duration
        self.pre_buffer_duration = pre_buffer_duration
        self.dynamic_threshold_factor = dynamic_threshold_factor

        # Durations converted to a number of chunks (each chunk lasts chunk/rate seconds).
        self.silence_frames = int(silence_duration * rate / chunk)
        self.min_speech_frames = int(min_speech_duration * rate / chunk)
        self.pre_buffer_frames = int(pre_buffer_duration * rate / chunk)

        # Recording state.
        self.is_recording_flag = False
        self.recording_thread = None
        self.audio = None
        self.stream = None

        # Voice activity detection state.
        self.is_speaking = False
        self.silence_counter = 0
        self.speech_counter = 0
        self.current_recording = []
        self.pre_buffer = []

        # Dynamic threshold state.
        self.volume_history = []
        self.history_size = 50
        self.dynamic_threshold = volume_threshold

        # ASR client that receives the saved audio files.
        self.asr_client = FunASRSync()

        logger.info(f"RecorderSyncImproved初始化完成 - 静音阈值:{silence_duration}s, 最小语音:{min_speech_duration}s")

    def _calculate_volume(self, data):
        """Return the RMS volume of a raw chunk as a float in [0.0, 1.0].

        The bytes are always decoded as 16-bit signed samples, whatever
        ``self.format`` is set to.
        """
        import numpy as np

        samples = np.frombuffer(data, dtype=np.int16)
        if samples.size == 0:
            # The mean of an empty array is NaN; treat "no samples" as silence.
            return 0.0
        # Square in float64: squaring in int16 wraps around for any sample whose
        # magnitude exceeds 181, which can make the mean negative and the RMS NaN.
        samples = samples.astype(np.float64)
        return float(np.sqrt(np.mean(samples ** 2)) / 32768.0)

    def _update_dynamic_threshold(self, volume):
        """Record ``volume`` and recompute the dynamic threshold from recent history."""
        self.volume_history.append(volume)
        if len(self.volume_history) > self.history_size:
            self.volume_history.pop(0)

        # Wait for a minimum amount of history before adapting.
        if len(self.volume_history) >= 10:
            import numpy as np

            percentile_75 = np.percentile(self.volume_history, 75)
            # Never drop below the configured base threshold.
            self.dynamic_threshold = max(
                self.volume_threshold,
                percentile_75 * self.dynamic_threshold_factor,
            )

    def _save_audio_segment(self, audio_data):
        """Write a list of raw chunks to a WAV file and hand the file to the ASR client.

        Segments shorter than ``min_speech_frames`` chunks are skipped. Any
        failure is logged and swallowed so the caller keeps running.
        """
        if len(audio_data) < self.min_speech_frames:
            logger.debug(f"音频片段太短,跳过处理: {len(audio_data)} < {self.min_speech_frames}")
            return

        try:
            # The cache directory may not exist yet (e.g. on a fresh checkout);
            # without it NamedTemporaryFile fails and every segment is lost.
            os.makedirs(self.CACHE_DIR, exist_ok=True)

            # Reserve a unique file name; the file is reopened through ``wave`` below.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.wav',
                                             dir=self.CACHE_DIR) as temp_file:
                temp_filename = temp_file.name

            with wave.open(temp_filename, 'wb') as wf:
                wf.setnchannels(self.channels)
                wf.setsampwidth(pyaudio.get_sample_size(self.format))
                wf.setframerate(self.rate)
                wf.writeframes(b''.join(audio_data))

            logger.info(f"保存音频片段: {temp_filename}, 帧数: {len(audio_data)}")

            self.asr_client.send_audio_file(temp_filename)
        except Exception as e:
            logger.error(f"保存音频片段失败: {e}")

    def _reset_detection_state(self):
        """Clear the per-utterance detection state (the pre-buffer is left alone)."""
        self.is_speaking = False
        self.silence_counter = 0
        self.speech_counter = 0
        self.current_recording = []

    def _finalize_segment(self):
        """Save the current utterance without its trailing silence, if long enough."""
        if self.speech_counter >= self.min_speech_frames:
            # Slice by explicit end index: ``[:-0]`` would yield an empty list
            # when no trailing silence has been counted.
            speech_end = len(self.current_recording) - self.silence_counter
            self._save_audio_segment(self.current_recording[:speech_end])
        else:
            logger.debug(f"语音片段太短,跳过: {self.speech_counter} < {self.min_speech_frames}")

    def _process_chunk(self, data):
        """Feed one raw chunk through the voice activity detector."""
        volume = self._calculate_volume(data)
        self._update_dynamic_threshold(volume)

        # Keep a rolling window of the most recent chunks (current one included).
        self.pre_buffer.append(data)
        if len(self.pre_buffer) > self.pre_buffer_frames:
            self.pre_buffer.pop(0)

        if volume > self.dynamic_threshold:
            if not self.is_speaking:
                # Speech onset.
                logger.debug(f"检测到语音开始 - 音量:{volume:.4f}, 阈值:{self.dynamic_threshold:.4f}")
                self.is_speaking = True
                self.silence_counter = 0
                self.speech_counter = 1
                # Seed the utterance with the pre-buffered context. The current
                # chunk is appended explicitly so that it is not lost when the
                # pre-buffer is configured to hold zero chunks.
                self.current_recording = self.pre_buffer[:-1] + [data]
            else:
                # Speech continues; any pause counted so far is cancelled.
                self.speech_counter += 1
                self.silence_counter = 0
                self.current_recording.append(data)
            return

        if not self.is_speaking:
            # Silence outside an utterance: nothing to do.
            return

        # Silence inside an utterance: keep the chunk in case speech resumes.
        self.silence_counter += 1
        self.current_recording.append(data)
        if self.silence_counter < self.silence_frames:
            return

        # The pause lasted long enough: the utterance is over.
        logger.debug(f"检测到语音结束 - 语音帧数:{self.speech_counter}, 静音帧数:{self.silence_counter}")
        self._finalize_segment()
        self._reset_detection_state()

    def _recording_loop(self):
        """Thread body: read chunks from the stream until recording is stopped.

        The loop exits on the first error. ``is_recording_flag`` is left set in
        that case, so stop_recording() must still be called to release the
        stream.
        """
        logger.info("开始录音循环")

        while self.is_recording_flag:
            try:
                data = self.stream.read(self.chunk, exception_on_overflow=False)
                self._process_chunk(data)
            except Exception as e:
                logger.error(f"录音循环错误: {e}")
                break

        logger.info("录音循环结束")

    def start_recording(self, device_index=None):
        """Open the input stream and start the background recording thread.

        Args:
            device_index: PyAudio input device index, or None to let PyAudio
                pick the device.

        Does nothing if recording is already in progress. Start-up failures are
        logged and followed by stop_recording() to release partial resources.
        """
        if self.is_recording_flag:
            logger.warning("录音已在进行中")
            return

        try:
            self.audio = pyaudio.PyAudio()
            self.stream = self.audio.open(
                format=self.format,
                channels=self.channels,
                rate=self.rate,
                input=True,
                input_device_index=device_index,
                frames_per_buffer=self.chunk,
            )

            # Start every session from a clean detector state.
            self.is_recording_flag = True
            self._reset_detection_state()
            self.pre_buffer = []
            self.volume_history = []
            self.dynamic_threshold = self.volume_threshold

            self.recording_thread = threading.Thread(target=self._recording_loop)
            self.recording_thread.start()

            logger.info("录音开始")
        except Exception as e:
            logger.error(f"启动录音失败: {e}")
            self.stop_recording()

    def _release_audio(self):
        """Close the stream and terminate PyAudio; each step runs even if an earlier one raises."""
        stream, self.stream = self.stream, None
        audio, self.audio = self.audio, None
        try:
            if stream is not None:
                try:
                    stream.stop_stream()
                finally:
                    stream.close()
        finally:
            if audio is not None:
                audio.terminate()

    def stop_recording(self):
        """Stop the recording thread, release the audio device and flush pending speech.

        An utterance still in progress is saved under the same rule the
        recording loop applies: enough voiced chunks, trailing silence removed.
        """
        logger.info("停止录音")

        self.is_recording_flag = False

        worker = self.recording_thread
        # join() raises on a thread that was never started or on the calling
        # thread itself, so only wait for a live worker other than the caller.
        if (worker is not None and worker.is_alive()
                and worker is not threading.current_thread()):
            worker.join(timeout=2)

        try:
            self._release_audio()
        finally:
            # Flush and reset even if releasing the device failed.
            if self.is_speaking and self.current_recording:
                logger.info("处理最后的录音片段")
                self._finalize_segment()

            self._reset_detection_state()
            self.pre_buffer = []

    def is_recording(self):
        """Return True while recording is flagged as active."""
        return self.is_recording_flag

    def get_status(self):
        """Return a dict snapshot of the recorder configuration and detector state."""
        return {
            'is_recording': self.is_recording_flag,
            'is_speaking': self.is_speaking,
            'dynamic_threshold': self.dynamic_threshold,
            'volume_threshold': self.volume_threshold,
            'silence_duration': self.silence_duration,
            'min_speech_duration': self.min_speech_duration,
            'current_speech_frames': self.speech_counter,
            'current_silence_frames': self.silence_counter,
        }

    def list_audio_devices(self):
        """Return the devices that have at least one input channel.

        Returns:
            A list of dicts with the keys ``index``, ``name``, ``channels`` and
            ``sample_rate``.
        """
        audio = pyaudio.PyAudio()
        try:
            devices = []
            for i in range(audio.get_device_count()):
                device_info = audio.get_device_info_by_index(i)
                if device_info['maxInputChannels'] > 0:
                    devices.append({
                        'index': i,
                        'name': device_info['name'],
                        'channels': device_info['maxInputChannels'],
                        'sample_rate': device_info['defaultSampleRate'],
                    })
            return devices
        finally:
            # Release PortAudio even when a device query raises.
            audio.terminate()


if __name__ == "__main__":
    # Manual smoke test: list the input devices, then record between two Enter presses.
    recorder = RecorderSyncImproved(
        volume_threshold=0.03,
        silence_duration=1.5,
        min_speech_duration=0.5,
    )

    print("可用音频设备:")
    for device in recorder.list_audio_devices():
        print(f" {device['index']}: {device['name']}")

    print("\n按Enter开始录音...")
    input()

    recorder.start_recording()
    try:
        print("录音中... 按Enter停止")
        input()
    finally:
        # Always stop: the recording thread is not a daemon, so leaving it
        # running after Ctrl-C/EOF would keep the process alive.
        recorder.stop_recording()
    print("录音结束")