# audio_recognition_app.py (41.1 KB)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
import datetime
import json
import os
import queue
import struct
import threading
import time
import tkinter as tk
import wave
from tkinter import filedialog

import pyaudio
import websocket
class AudioRecognitionApp:
    """Tkinter desktop client for a WebSocket speech-recognition service.

    The window lets the user connect to the recognition server, upload a WAV
    file, stream microphone audio live, test the microphone locally, or
    record in "cache first, then send" mode (one WAV file per segment).

    Threading model: PyAudio runs the stream callbacks on its own thread and
    websocket-client runs the socket callbacks on the thread executing
    ``run_forever``.  Those threads never touch Tk widgets directly; every
    widget mutation requested through ``update_result`` / ``update_status``
    is put on ``_ui_queue`` and executed by ``_drain_ui_queue`` on the Tk
    main thread.
    """

    # Server used when connect_websocket() is called without an explicit URL.
    DEFAULT_WS_URL = "ws://localhost:10197"
    # Interval (ms) at which the Tk main loop drains queued UI updates.
    UI_POLL_MS = 50
    # A cache segment is "silent" when more than this share of frames is silent.
    SILENT_SEGMENT_RATIO = 0.7
    # 16-bit PCM limits used when clamping amplified samples.
    INT16_MIN = -32768
    INT16_MAX = 32767

    def __init__(self, root):
        """Initialise audio/WebSocket state and build the UI.

        Args:
            root: The ``tk.Tk`` root window that hosts the application.
        """
        self.root = root
        self.root.title("音频识别应用")
        self.root.geometry("500x400")

        # Connection / capture state.
        self.ws = None
        self.is_recording = False
        self.is_testing_mic = False
        self.is_cache_recording = False  # "cache first, then send" mode
        self.audio_stream = None
        # Created exactly once; terminated in on_closing().
        self.p = pyaudio.PyAudio()

        # Audio format: 16 kHz, mono, 16-bit, 100 ms per chunk.
        self.sample_rate = 16000
        self.chunk_size = 1600  # samples per chunk (16000 * 0.1)
        self.format = pyaudio.paInt16
        self.channels = 1

        # Silence detection.
        self.silence_threshold = 300  # RMS below this counts as silence
        self.silence_frames = 0  # consecutive silent chunks
        self.max_silence_frames = 20  # 20 chunks of 100 ms = 2 s
        self.silence_threshold_var = tk.IntVar(value=self.silence_threshold)

        # Miscellaneous settings.
        self.recording_frames = []  # amplified chunks of the current recording
        self.ws_timeout = 10  # WebSocket timeout in seconds
        self.volume_gain = 5.0  # gain multiplier applied to captured audio

        # Live-streaming send buffer.
        self.buffer_frames = []
        self.buffer_max_frames = 16  # flush after 16 chunks (about 1.6 s)
        self.last_send_time = 0
        self.min_send_interval = 0.8  # also flush once this many seconds passed

        # Cache-recording settings.
        self.cache_segment_duration = 1.0  # seconds of audio per cache file
        self.cache_segment_frames = int(
            self.sample_rate * self.cache_segment_duration / self.chunk_size
        )
        self.cache_dir = "cache"
        self.cache_files = []
        self.current_cache_frames = []
        os.makedirs(self.cache_dir, exist_ok=True)

        # Counts chunks seen by apply_volume_gain() so its diagnostics can be
        # throttled independently of which capture mode is running.
        self._gain_frame_count = 0
        # Cross-thread hand-off of UI work; see the class docstring.
        self._ui_queue = queue.Queue()
        self._ui_poll_id = None

        self.setup_ui()
        self._drain_ui_queue()

    # ------------------------------------------------------------------
    # UI plumbing
    # ------------------------------------------------------------------
    def _post_ui(self, func, *args):
        """Queue ``func(*args)`` for execution on the Tk main thread."""
        self._ui_queue.put((func, args))

    def _drain_ui_queue(self):
        """Run every queued UI update, then re-arm the poll timer."""
        try:
            while True:
                try:
                    func, args = self._ui_queue.get_nowait()
                except queue.Empty:
                    break
                func(*args)
        finally:
            # Re-arm even if an update raised so later updates still run.
            self._ui_poll_id = self.root.after(self.UI_POLL_MS, self._drain_ui_queue)

    def _apply_status(self, text, color):
        """Set the status label; must run on the Tk main thread."""
        self.status_label.config(text=text, fg=color)

    def _append_line(self, is_server_result, text):
        """Append one line to the chosen text area; Tk main thread only."""
        target = self.result_text if is_server_result else self.client_log_text
        target.insert(tk.END, text + "\n")
        target.see(tk.END)

    def _set_connection_controls(self, connected):
        """Enable or disable the buttons that depend on a live connection."""
        dependent_state = tk.NORMAL if connected else tk.DISABLED
        for button in (self.upload_button, self.mic_button,
                       self.cache_record_button, self.disconnect_button):
            button.config(state=dependent_state)
        self.connect_button.config(state=tk.DISABLED if connected else tk.NORMAL)

    def _sync_toggle_labels(self):
        """Make the three toggle buttons reflect the actual capture flags."""
        self.mic_button.config(
            text="停止麦克风录音" if self.is_recording else "开始麦克风录音")
        self.test_mic_button.config(
            text="停止麦克风测试" if self.is_testing_mic else "测试麦克风")
        self.cache_record_button.config(
            text="停止先存后传" if self.is_cache_recording else "先存后传录音")

    def _build_log_area(self, title, height):
        """Create a titled, scrollable text area and return the Text widget."""
        frame = tk.LabelFrame(self.root, text=title)
        frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=5)
        # The scrollbar is a sibling of the Text widget (not a child) so it
        # does not cover the text.
        scrollbar = tk.Scrollbar(frame)
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
        text = tk.Text(frame, height=height, width=60, yscrollcommand=scrollbar.set)
        text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True, padx=5, pady=5)
        scrollbar.config(command=text.yview)
        return text

    def _list_input_devices(self):
        """Return ``(indices, labels)`` of all devices with input channels."""
        indices = []
        labels = []
        for i in range(self.p.get_device_count()):
            info = self.p.get_device_info_by_index(i)
            if info['maxInputChannels'] > 0:
                indices.append(i)
                labels.append(f"{i}: {info['name']}")
        return indices, labels

    def setup_ui(self):
        """Build all widgets of the main window."""
        # Enlarged to fit the log areas and sliders.
        self.root.geometry("600x700")

        # Top bar: status, connect, disconnect.
        control_frame = tk.Frame(self.root)
        control_frame.pack(fill=tk.X, padx=10, pady=10)
        self.status_label = tk.Label(control_frame, text="未连接", fg="red")
        self.status_label.pack(side=tk.LEFT, padx=5)
        self.connect_button = tk.Button(
            control_frame, text="连接WebSocket", command=self.connect_websocket)
        self.connect_button.pack(side=tk.LEFT, padx=5)
        self.disconnect_button = tk.Button(
            control_frame, text="断开连接", command=self.disconnect_websocket,
            state=tk.DISABLED)
        self.disconnect_button.pack(side=tk.LEFT, padx=5)

        # Microphone selection.
        mic_frame = tk.Frame(self.root)
        mic_frame.pack(fill=tk.X, padx=10, pady=5)
        mic_label = tk.Label(mic_frame, text="选择麦克风设备:")
        mic_label.pack(side=tk.LEFT, padx=5)
        self.mic_devices, self.mic_device_names = self._list_input_devices()
        self.selected_mic_index = tk.StringVar()
        if self.mic_device_names:
            self.selected_mic_index.set(self.mic_device_names[0])
        # OptionMenu needs at least one value; an empty entry stands in when
        # no input device exists (empty selection means "system default").
        menu_values = self.mic_device_names or [""]
        self.mic_dropdown = tk.OptionMenu(
            mic_frame, self.selected_mic_index, *menu_values)
        self.mic_dropdown.pack(side=tk.LEFT, padx=5)
        refresh_button = tk.Button(
            mic_frame, text="刷新设备列表", command=self.refresh_mic_devices)
        refresh_button.pack(side=tk.LEFT, padx=5)

        # Volume gain slider.
        gain_frame = tk.Frame(self.root)
        gain_frame.pack(fill=tk.X, padx=10, pady=5)
        gain_label = tk.Label(gain_frame, text="音量增益:")
        gain_label.pack(side=tk.LEFT, padx=5)
        self.gain_value = tk.DoubleVar(value=self.volume_gain)
        self.gain_slider = tk.Scale(
            gain_frame, from_=1.0, to=10.0, resolution=0.5,
            orient=tk.HORIZONTAL, length=200,
            variable=self.gain_value, command=self.update_gain)
        self.gain_slider.pack(side=tk.LEFT, padx=5)
        self.gain_label = tk.Label(gain_frame, text=f"{self.volume_gain:.1f}x")
        self.gain_label.pack(side=tk.LEFT, padx=5)

        # Operation buttons.
        operation_frame = tk.Frame(self.root)
        operation_frame.pack(fill=tk.X, padx=10, pady=5)
        self.upload_button = tk.Button(
            operation_frame, text="上传音频文件", command=self.upload_audio_file,
            state=tk.DISABLED)
        self.upload_button.pack(side=tk.LEFT, padx=5)
        self.mic_button = tk.Button(
            operation_frame, text="开始麦克风录音", command=self.toggle_microphone,
            state=tk.DISABLED)
        self.mic_button.pack(side=tk.LEFT, padx=5)
        self.test_mic_button = tk.Button(
            operation_frame, text="测试麦克风", command=self.toggle_test_microphone)
        self.test_mic_button.pack(side=tk.LEFT, padx=5)
        self.cache_record_button = tk.Button(
            operation_frame, text="先存后传录音", command=self.toggle_cache_recording,
            state=tk.DISABLED)
        self.cache_record_button.pack(side=tk.LEFT, padx=5)

        # Client log and server result areas.
        self.client_log_text = self._build_log_area("客户端操作日志", 6)
        self.result_text = self._build_log_area("服务端识别结果", 8)

        # Silence threshold slider.
        silence_frame = tk.Frame(self.root)
        silence_frame.pack(fill=tk.X, padx=10, pady=5)
        silence_label = tk.Label(silence_frame, text="静默阈值:")
        silence_label.pack(side=tk.LEFT, padx=5)
        self.silence_slider = tk.Scale(
            silence_frame, from_=100, to=2000, resolution=50,
            orient=tk.HORIZONTAL, length=200,
            variable=self.silence_threshold_var,
            command=self.update_silence_threshold)
        self.silence_slider.pack(side=tk.LEFT, padx=5)
        self.silence_value_label = tk.Label(
            silence_frame, text=f"{self.silence_threshold}")
        self.silence_value_label.pack(side=tk.LEFT, padx=5)

    def update_status(self, text, color):
        """Show ``text`` in ``color`` on the status label (any thread)."""
        self._post_ui(self._apply_status, text, color)

    def update_result(self, text, is_server_result=False):
        """Append a line to the result area or the client log (any thread).

        Args:
            text: The text to display.
            is_server_result: True for recognition results returned by the
                server, False for client-side log messages.
        """
        self._post_ui(self._append_line, is_server_result, text)

    def update_gain(self, value):
        """Slider callback: store and display the new volume gain."""
        self.volume_gain = float(value)
        self.gain_label.config(text=f"{self.volume_gain:.1f}x")
        self.update_result(f"音量增益已调整为: {self.volume_gain:.1f}x")

    def update_silence_threshold(self, value):
        """Slider callback: store and display the new silence threshold."""
        self.silence_threshold = int(float(value))
        self.silence_value_label.config(text=f"{self.silence_threshold}")
        self.update_result(f"静默阈值已调整为: {self.silence_threshold}")

    def update_volume_indicator(self, rms):
        """Show the current input level on the status label."""
        volume_level = min(int(rms / 10), 100)  # normalised to 0-100
        volume_text = "█" * (volume_level // 5)  # one block per 5 units
        if rms > self.silence_threshold:
            color, status = "green", "检测到声音"
        else:
            color, status = "orange", "静默"
        self.update_status(f"{status}: {int(rms)} [{volume_text}]", color)
        # Only log clearly loud levels, and only some of them, to limit noise.
        if rms > self.silence_threshold * 2 and volume_level % 20 == 0:
            self.update_result(f"当前音量: {int(rms)}")

    # ------------------------------------------------------------------
    # WebSocket handling
    # ------------------------------------------------------------------
    def _is_connected(self):
        """Return True when a WebSocket with a connected socket exists."""
        ws = self.ws
        sock = ws.sock if ws is not None else None
        return bool(sock is not None and sock.connected)

    def _handle_ws_opened(self, ws):
        """Main-thread reaction to a successful connection of ``ws``."""
        if ws is not self.ws:
            return  # a connection that has since been replaced or dropped
        self.update_status("已连接", "green")
        self._set_connection_controls(True)

    def _handle_ws_closed(self, ws):
        """Main-thread reaction to ``ws`` closing."""
        if ws is not self.ws:
            return  # already handled by disconnect_websocket() or replaced
        # Streaming modes cannot continue without a connection, and their
        # buttons are about to be disabled, so stop them here.
        if self.is_recording:
            self.stop_recording()
        if self.is_cache_recording:
            self.stop_cache_recording()
        self._sync_toggle_labels()
        self.update_status("连接已关闭", "red")
        self._set_connection_controls(False)

    def connect_websocket(self, url=None):
        """Open the WebSocket connection on a background thread.

        Args:
            url: Server URL; defaults to ``DEFAULT_WS_URL``.
        """
        target_url = url or self.DEFAULT_WS_URL

        # The callbacks below run on the websocket thread, so they only log
        # and queue work; they never touch widgets directly.
        def on_message(ws, message):
            try:
                self.update_result(f"收到WebSocket消息,长度:{len(message)}字节")
                result = json.loads(message)
                if isinstance(result, dict) and "text" in result:
                    text = str(result["text"])
                    # Recognition result goes to the result area, plus a
                    # short preview in the client log.
                    self.update_result(text, is_server_result=True)
                    self.update_result(f"收到识别结果: {text[:20]}{'...' if len(text) > 20 else ''}")
                elif isinstance(result, dict) and "status" in result:
                    self.update_result(f"服务端状态: {result['status']} - {result.get('message', '')}")
                else:
                    self.update_result(f"收到消息: {message}")
            except json.JSONDecodeError:
                self.update_result(f"收到非JSON消息: {message}")
            except Exception as e:
                # Thread boundary: report instead of killing the socket loop.
                self.update_result(f"处理WebSocket消息时发生异常:{str(e)}")

        def on_error(ws, error):
            self.update_status("连接错误", "red")
            self.update_result(f"WebSocket错误: {error}")
            print(f"错误: {error}")

        def on_close(ws, close_status_code, close_msg):
            self.update_result(f"WebSocket连接已关闭,状态码:{close_status_code},消息:{close_msg}")
            self._post_ui(self._handle_ws_closed, ws)
            print("连接已关闭")

        def on_open(ws):
            self.update_result("WebSocket连接已建立,准备发送/接收数据")
            self._post_ui(self._handle_ws_opened, ws)
            print("连接已建立")

        try:
            # Drop any previous connection first.
            if self.ws:
                self.disconnect_websocket()
            ws_app = websocket.WebSocketApp(
                target_url,
                on_open=on_open,
                on_message=on_message,
                on_error=on_error,
                on_close=on_close)
            self.ws = ws_app
            # Bind the thread to this specific app object so a later change
            # of self.ws cannot affect it.
            wst = threading.Thread(
                target=ws_app.run_forever,
                kwargs={"ping_interval": 30, "ping_timeout": 10,
                        "ping_payload": "ping"},
                daemon=True)
            wst.start()
            self.update_status("正在连接...", "orange")
        except Exception as e:
            self.update_status(f"连接失败: {str(e)}", "red")

    def disconnect_websocket(self):
        """Stop streaming modes, close the connection and reset the controls."""
        # Stop while still connected so the buffered audio and the "end"
        # command can actually be sent.
        if self.is_recording:
            self.stop_recording()
        if self.is_cache_recording:
            self.stop_cache_recording()
        self._sync_toggle_labels()
        if self.ws:
            # Clear self.ws first so the resulting on_close is seen as stale.
            ws_app, self.ws = self.ws, None
            ws_app.close()
        self.update_status("已断开连接", "red")
        self._set_connection_controls(False)
        self.update_result("已断开与服务器的连接")

    def _send_end_command(self):
        """Tell the server the audio stream is complete, if connected."""
        if not self._is_connected():
            return
        try:
            self.ws.send(json.dumps({"command": "end"}), websocket.ABNF.OPCODE_TEXT)
            self.update_result("已发送结束命令")
        except Exception as e:
            self.update_result(f"发送结束命令时出现错误: {str(e)}")

    def upload_audio_file(self):
        """Let the user pick a WAV file and send its bytes to the server."""
        file_path = filedialog.askopenfilename(
            filetypes=[("音频文件", "*.wav *.mp3")]
        )
        if not file_path:
            return
        if not file_path.lower().endswith('.wav'):
            self.update_result("目前只支持WAV格式文件")
            return
        if not self._is_connected():
            self.update_result("WebSocket未连接,无法发送文件")
            return
        try:
            with open(file_path, 'rb') as audio_file:
                payload = audio_file.read()
            self.ws.send(payload, websocket.ABNF.OPCODE_BINARY)
            self.update_result(f"已发送文件: {os.path.basename(file_path)}")
        except Exception as e:
            self.update_result(f"发送文件错误: {str(e)}")

    def send_audio_data(self, audio_data):
        """Send one binary audio payload to the server, logging diagnostics."""
        if not self._is_connected():
            self.update_result("WebSocket未连接,无法发送数据")
            return
        data_size = len(audio_data)
        if data_size == 0:
            self.update_result("警告:尝试发送空数据")
            return
        rms = self.calculate_rms(audio_data)
        self.update_result(f"发送WebSocket数据:{data_size}字节,RMS值:{int(rms)}")
        if data_size > 10:
            first_bytes = audio_data[:10]
            self.update_result(f"数据前10字节:{first_bytes}")
        try:
            result = self.ws.send(audio_data, websocket.ABNF.OPCODE_BINARY)
            self.update_result(f"WebSocket发送结果:{result}")
        except Exception as e:
            # May run inside the audio callback; never let it propagate.
            self.update_result(f"发送数据错误: {str(e)}")

    # ------------------------------------------------------------------
    # Audio stream helpers
    # ------------------------------------------------------------------
    def _stream_busy(self):
        """Return True (and log) when an input stream is already open.

        All capture modes share ``self.audio_stream``; opening a second
        stream would orphan the first one.
        """
        if self.audio_stream is None:
            return False
        self.update_result("已有音频流在运行,请先停止当前录音")
        return True

    def _resolve_input_device(self, verb):
        """Return the selected device index, or None for the system default.

        Args:
            verb: Prefix used in the log line ("使用" or "测试").
        """
        selected_device = self.selected_mic_index.get()
        if selected_device:
            device_index = int(selected_device.partition(":")[0])
            self.update_result(f"{verb}麦克风设备: {selected_device}")
            return device_index
        self.update_result(f"{verb}系统默认麦克风设备")
        return None

    def _open_input_stream(self, device_index, callback):
        """Open and start a callback-mode input stream.

        Returns:
            True when the stream is running and stored in
            ``self.audio_stream``; False (after logging) on failure.
        """
        stream = None
        try:
            stream = self.p.open(
                format=self.format,
                channels=self.channels,
                rate=self.sample_rate,
                input=True,
                output=False,
                frames_per_buffer=self.chunk_size,
                input_device_index=device_index,
                stream_callback=callback
            )
            stream.start_stream()
        except (OSError, ValueError) as e:
            if stream is not None:
                stream.close()
            self.update_result(f"打开音频流失败: {str(e)}")
            return False
        self.audio_stream = stream
        return True

    def _close_audio_stream(self, error_label):
        """Stop and close the current stream, logging errors under ``error_label``."""
        stream = self.audio_stream
        if stream is None:
            return
        try:
            if stream.is_active():
                stream.stop_stream()
            stream.close()
        except OSError as e:
            # Raised e.g. on timeout or when the stream is already closed.
            self.update_result(f"{error_label}: {str(e)}")
        finally:
            self.audio_stream = None

    @staticmethod
    def _rms(samples):
        """Return the root-mean-square of a non-empty sequence of ints."""
        return (sum(sample * sample for sample in samples) / len(samples)) ** 0.5

    def calculate_rms(self, data):
        """Return the RMS level of 16-bit PCM ``data`` (0 for empty/invalid).

        A trailing odd byte is ignored.
        """
        try:
            count = len(data) // 2
            if count == 0:
                self.update_result("警告:计算RMS时收到空数据")
                return 0
            samples = struct.unpack(f"{count}h", bytes(data[:count * 2]))
            return self._rms(samples)
        except (TypeError, struct.error) as e:
            self.update_result(f"错误:计算RMS值时发生异常:{str(e)}")
            return 0

    def apply_volume_gain(self, data):
        """Return 16-bit PCM ``data`` multiplied by ``self.volume_gain``.

        Samples are clamped to the int16 range.  A trailing odd byte is
        passed through unchanged.  On error the input is returned as-is.
        """
        count = len(data) // 2
        if count == 0:
            self.update_result("错误:应用音量增益时收到空数据")
            return data
        pcm_format = f"{count}h"
        try:
            raw = bytes(data)
            samples = struct.unpack(pcm_format, raw[:count * 2])
            self._gain_frame_count += 1
            # Diagnostics are throttled by a dedicated counter so they stay
            # rare in every capture mode.
            if (self._gain_frame_count % 50 == 0
                    and all(abs(sample) <= 10 for sample in samples[:100])):
                self.update_result("警告:检测到可能的静默数据")
            gain = self.volume_gain
            amplified = [
                max(self.INT16_MIN, min(self.INT16_MAX, int(sample * gain)))
                for sample in samples
            ]
            if self._gain_frame_count % 16 == 0:
                # RMS values are only computed when they are actually logged.
                original_rms = self._rms(samples)
                amplified_rms = self._rms(amplified)
                self.update_result(f"音量增益:原始RMS={int(original_rms)} -> 放大后RMS={int(amplified_rms)},增益倍数={self.volume_gain:.1f}x")
            return struct.pack(pcm_format, *amplified) + raw[count * 2:]
        except Exception as e:
            # Runs inside the audio callback: fall back to the raw data.
            self.update_result(f"错误:应用音量增益时发生异常:{str(e)}")
            return data

    def _write_wav(self, filename, audio_data):
        """Write ``audio_data`` as a WAV file and return its size in bytes."""
        with wave.open(filename, 'wb') as wf:
            wf.setnchannels(self.channels)
            wf.setsampwidth(self.p.get_sample_size(self.format))
            wf.setframerate(self.sample_rate)
            wf.writeframes(audio_data)
        return os.path.getsize(filename)

    def save_recording(self, prefix="recording"):
        """Save ``self.recording_frames`` to ``<prefix>_<timestamp>.wav``.

        Returns:
            The file name, or None when there is nothing to save or writing
            failed.
        """
        if not self.recording_frames:
            self.update_result("没有录音数据可保存")
            return None
        audio_data = b''.join(self.recording_frames)
        data_size = len(audio_data)
        if data_size == 0:
            self.update_result("错误:没有有效的音频数据可保存")
            return None
        frame_count = data_size // (2 * self.channels)  # 2 bytes per sample
        duration = frame_count / self.sample_rate  # seconds
        # 8000 bytes is 0.25 s at 16 kHz mono 16-bit; warn but still save.
        if data_size < 8000:
            self.update_result(f"警告:录音数据较短({data_size}字节,约{duration:.2f}秒)")
        timestamp = time.strftime("%Y%m%d-%H%M%S")
        filename = f"{prefix}_{timestamp}.wav"
        self.update_result(f"准备保存录音文件:{filename},数据大小:{data_size}字节,时长:{duration:.1f}秒")
        try:
            file_size = self._write_wav(filename, audio_data)
        except (OSError, ValueError, wave.Error) as e:
            self.update_result(f"保存录音文件时出错:{str(e)}")
            return None
        self.update_result(f"录音文件已保存:{filename},文件大小:{file_size}字节")
        return filename

    def refresh_mic_devices(self):
        """Re-enumerate input devices and rebuild the drop-down menu."""
        if self.audio_stream is None:
            # PortAudio enumerates devices when it is initialised, so it is
            # re-initialised here (only while no stream is open) to pick up
            # devices plugged in since start-up.
            self.p.terminate()
            self.p = pyaudio.PyAudio()
        self.mic_devices, self.mic_device_names = self._list_input_devices()
        menu = self.mic_dropdown["menu"]
        menu.delete(0, "end")
        for name in self.mic_device_names:
            menu.add_command(
                label=name,
                command=lambda value=name: self.selected_mic_index.set(value))
        if self.mic_device_names:
            self.selected_mic_index.set(self.mic_device_names[0])
            self.update_result(f"已刷新麦克风设备列表,找到 {len(self.mic_device_names)} 个设备")
        else:
            # Clear a selection that no longer refers to an existing device.
            self.selected_mic_index.set("")
            self.update_result("未找到麦克风设备")

    # ------------------------------------------------------------------
    # Live streaming
    # ------------------------------------------------------------------
    def toggle_microphone(self):
        """Start or stop live microphone streaming."""
        if self.is_recording:
            self.stop_recording()
        else:
            self.start_recording()
        self._sync_toggle_labels()

    def _recording_callback(self, in_data, frame_count, time_info, status):
        """PyAudio callback for live streaming (runs on the audio thread)."""
        if self.is_recording:
            # Level is measured on the raw signal, before amplification.
            rms = self.calculate_rms(in_data)
            amplified_data = self.apply_volume_gain(in_data)
            self.update_volume_indicator(rms)
            self.recording_frames.append(amplified_data)
            self.buffer_frames.append(amplified_data)
            # Silence is only tracked here; it never interrupts streaming.
            if rms < self.silence_threshold:
                self.silence_frames += 1
            else:
                self.silence_frames = 0
            # Flush when enough chunks accumulated or enough time passed.
            current_time = time.time()
            if (len(self.buffer_frames) >= self.buffer_max_frames or
                    (current_time - self.last_send_time >= self.min_send_interval
                     and self.buffer_frames)):
                buffer_data = b''.join(self.buffer_frames)
                if len(buffer_data) > 0:
                    self.send_audio_data(buffer_data)
                    self.last_send_time = current_time
                # Only the send buffer is cleared; recording_frames keeps
                # the complete recording.
                self.buffer_frames = []
        return (in_data, pyaudio.paContinue)

    def start_recording(self):
        """Start capturing from the microphone and streaming to the server."""
        if self._stream_busy():
            return
        self.recording_frames = []
        self.buffer_frames = []
        self.silence_frames = 0
        self.last_send_time = time.time()
        device_index = self._resolve_input_device("使用")
        # The flag must be set before the stream starts delivering chunks.
        self.is_recording = True
        if not self._open_input_stream(device_index, self._recording_callback):
            self.is_recording = False
            return
        self.update_result("开始录音...")

    def stop_recording(self):
        """Stop live streaming, flush pending audio and save the recording."""
        # Clear the flag first so the callback stops processing chunks.
        self.is_recording = False
        self._close_audio_stream("关闭音频流时出现错误")
        # Send whatever is still buffered.
        if self.buffer_frames and self._is_connected():
            buffer_data = b''.join(self.buffer_frames)
            if len(buffer_data) > 0:
                self.send_audio_data(buffer_data)
        self.buffer_frames = []
        self._send_end_command()
        # The complete recording is saved once, when streaming stops.
        if self.recording_frames:
            filename = self.save_recording()
            if filename:
                self.update_result(f"已保存完整录音: {filename}")
            self.recording_frames = []
        self.update_result("录音已停止")

    # ------------------------------------------------------------------
    # Microphone test
    # ------------------------------------------------------------------
    def toggle_test_microphone(self):
        """Start or stop the local microphone test."""
        if self.is_testing_mic:
            self.stop_test_recording()
        else:
            self.start_test_recording()
        self._sync_toggle_labels()

    def _test_callback(self, in_data, frame_count, time_info, status):
        """PyAudio callback for the microphone test (audio thread)."""
        if self.is_testing_mic:
            rms = self.calculate_rms(in_data)
            amplified_data = self.apply_volume_gain(in_data)
            self.recording_frames.append(amplified_data)
            self.update_volume_indicator(rms)
        return (in_data, pyaudio.paContinue)

    def start_test_recording(self):
        """Start recording locally to check the microphone (nothing is sent)."""
        if self._stream_busy():
            return
        self.recording_frames = []
        device_index = self._resolve_input_device("测试")
        self.is_testing_mic = True
        if not self._open_input_stream(device_index, self._test_callback):
            self.is_testing_mic = False
            return
        self.update_result("麦克风测试已开始")

    def stop_test_recording(self):
        """Stop the microphone test and save what was recorded."""
        self.is_testing_mic = False
        self._close_audio_stream("关闭测试音频流时出现错误")
        if self.recording_frames:
            filename = self.save_recording("test_recording")
            if filename:
                self.update_result(f"测试录音已保存到: {filename}")
            self.recording_frames = []
        self.update_result("麦克风测试已停止")

    # ------------------------------------------------------------------
    # Cache-first recording
    # ------------------------------------------------------------------
    def toggle_cache_recording(self):
        """Start or stop "cache first, then send" recording."""
        if self.is_cache_recording:
            self.stop_cache_recording()
        else:
            self.start_cache_recording()
        self._sync_toggle_labels()

    def _flush_cache_segment(self, skip_silent):
        """Save the current segment, optionally send it, then reset it.

        Args:
            skip_silent: When True, a mostly-silent segment is saved but
                not sent to the server.
        """
        frames = self.current_cache_frames
        cache_filename = self.save_cache_segment()
        if cache_filename:
            self.cache_files.append(cache_filename)
            if skip_silent and self._silence_ratio(frames) > self.SILENT_SEGMENT_RATIO:
                self.update_result(f"缓存段静默比例过高,不发送: {os.path.basename(cache_filename)}")
            else:
                self.send_cache_file(cache_filename)
        self.current_cache_frames = []

    def _cache_callback(self, in_data, frame_count, time_info, status):
        """PyAudio callback for cache-first recording (audio thread)."""
        if self.is_cache_recording:
            rms = self.calculate_rms(in_data)
            amplified_data = self.apply_volume_gain(in_data)
            self.update_volume_indicator(rms)
            is_silent = rms < self.silence_threshold
            if is_silent:
                self.silence_frames += 1
                if self.silence_frames % 10 == 0:  # log every 10th silent chunk
                    self.update_result(f"缓存录音检测到静默: {self.silence_frames}帧, RMS={int(rms)}")
            else:
                if self.silence_frames > 0:
                    self.update_result(f"缓存录音静默结束,持续了{self.silence_frames}帧")
                self.silence_frames = 0
            # Every chunk is kept; silence is only recorded as a flag.
            self.current_cache_frames.append({
                'data': amplified_data,
                'is_silent': is_silent,
                'rms': rms
            })
            if len(self.current_cache_frames) >= self.cache_segment_frames:
                self._flush_cache_segment(skip_silent=True)
        return (in_data, pyaudio.paContinue)

    def start_cache_recording(self):
        """Start recording into per-segment WAV files that are then sent."""
        if self._stream_busy():
            return
        self.current_cache_frames = []
        self.cache_files = []
        self.silence_frames = 0
        # Remove WAV files left over from a previous session.
        os.makedirs(self.cache_dir, exist_ok=True)
        for file in os.listdir(self.cache_dir):
            if file.endswith(".wav"):
                try:
                    os.remove(os.path.join(self.cache_dir, file))
                except OSError as e:
                    self.update_result(f"清理缓存文件时出错: {str(e)}")
        device_index = self._resolve_input_device("使用")
        self.update_result(f"开始先存后传录音... 静默阈值: {self.silence_threshold}")
        self.is_cache_recording = True
        if not self._open_input_stream(device_index, self._cache_callback):
            self.is_cache_recording = False

    def save_cache_segment(self):
        """Write ``self.current_cache_frames`` to a WAV file in the cache dir.

        Returns:
            The file path, or None when the segment is empty, too short, or
            could not be written.
        """
        frames = self.current_cache_frames
        if not frames:
            return None
        audio_data = b''.join(frame['data'] for frame in frames)
        # 1000 bytes is about 31 ms at 16 kHz mono 16-bit; not worth a file.
        if len(audio_data) < 1000:
            self.update_result("缓存段数据太少,不保存")
            return None
        # Keep three digits of %f, i.e. millisecond precision.
        timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S-%f")[:19]
        is_silent = self.is_segment_silent(frames)
        prefix = "silent" if is_silent else "cache"
        filename = os.path.join(self.cache_dir, f"{prefix}_{timestamp}.wav")
        try:
            file_size = self._write_wav(filename, audio_data)
        except (OSError, ValueError, wave.Error) as e:
            self.update_result(f"保存缓存段时出错: {str(e)}")
            return None
        duration = len(frames) * self.chunk_size / self.sample_rate
        status = "静默" if is_silent else "有声音"
        self.update_result(f"已保存{status}缓存段: {filename}, 大小: {file_size}字节, 时长: {duration:.2f}秒")
        return filename

    @staticmethod
    def _silence_ratio(frames):
        """Return the share of frames flagged silent (1.0 for no frames)."""
        if not frames:
            return 1.0
        return sum(1 for frame in frames if frame['is_silent']) / len(frames)

    def is_segment_silent(self, frames):
        """Return True when more than 70% of ``frames`` are silent.

        An empty segment counts as silent.  Silent segments are logged with
        their ratio and average RMS.
        """
        if not frames:
            return True
        silent_ratio = self._silence_ratio(frames)
        is_silent = silent_ratio > self.SILENT_SEGMENT_RATIO
        if is_silent:
            avg_rms = sum(frame['rms'] for frame in frames) / len(frames)
            self.update_result(f"缓存段静默比例: {silent_ratio:.2f}, 平均RMS: {int(avg_rms)}")
        return is_silent

    def send_cache_file(self, filename):
        """Send one cached WAV file to the server; return True on success."""
        if not self._is_connected():
            self.update_result(f"WebSocket未连接,无法发送缓存文件: {filename}")
            return False
        try:
            with open(filename, 'rb') as f:
                audio_data = f.read()
            self.update_result(f"发送缓存文件: {os.path.basename(filename)}, 大小: {len(audio_data)}字节")
            self.ws.send(audio_data, websocket.ABNF.OPCODE_BINARY)
            return True
        except Exception as e:
            # May run inside the audio callback; never let it propagate.
            self.update_result(f"发送缓存文件时出错: {str(e)}")
            return False

    def stop_cache_recording(self):
        """Stop cache-first recording, flush the last segment and report."""
        self.is_cache_recording = False
        self._close_audio_stream("关闭音频流时出现错误")
        # The final, possibly partial segment is always sent.
        if self.current_cache_frames:
            self._flush_cache_segment(skip_silent=False)
        self._send_end_command()
        if self.cache_files:
            total_size = sum(os.path.getsize(f) for f in self.cache_files)
            self.update_result(f"先存后传录音已完成,共{len(self.cache_files)}个文件,总大小: {total_size}字节")
        else:
            self.update_result("先存后传录音已完成,但没有保存任何文件")
        self.current_cache_frames = []

    # ------------------------------------------------------------------
    # Shutdown
    # ------------------------------------------------------------------
    def on_closing(self):
        """Window-close handler: stop everything and destroy the window."""
        try:
            if self.is_recording:
                self.stop_recording()
            if self.is_testing_mic:
                self.stop_test_recording()
            if self.is_cache_recording:
                self.stop_cache_recording()
            if self.ws:
                self.ws.close()
            if self._ui_poll_id is not None:
                self.root.after_cancel(self._ui_poll_id)
                self._ui_poll_id = None
            self.p.terminate()
        finally:
            # The window must close even if part of the cleanup failed.
            self.root.destroy()
# Script entry point: build the window, make sure closing it releases the
# audio device and the WebSocket, then hand control to the Tk event loop.
if __name__ == "__main__":
    root = tk.Tk()
    app = AudioRecognitionApp(root)
    root.protocol("WM_DELETE_WINDOW", app.on_closing)
    root.mainloop()