@@ -1,20 +1,31 @@
+import base64
+import gc
+import io
+from pathlib import Path
+
import gradio as gr
-import speech_recognition as sr
+import numpy as np
+import torch
+import whisper
+from pydub import AudioSegment

from modules import shared

input_hijack = {
    'state': False,
    'value': ["", ""]
}

-# parameters which can be customized in settings.json of webui
+# parameters which can be customized in settings.yaml of webui
params = {
    'whipser_language': 'english',
    'whipser_model': 'small.en',
    'auto_submit': True
}

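+# load the configured Whisper model once at import time, on the GPU when torch
+# can see one; reload_whispermodel() below swaps it out at runtime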
+startup_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+WHISPERMODEL = whisper.load_model(params['whipser_model'], device=startup_device)
+

def chat_input_modifier(text, visible_text, state):
    global input_hijack
@@ -25,47 +36,84 @@ def chat_input_modifier(text, visible_text, state):
        return text, visible_text


-def do_stt(audio, whipser_model, whipser_language):
-    transcription = ""
-    r = sr.Recognizer()
-
-    # Convert to AudioData
-    audio_data = sr.AudioData(sample_rate=audio[0], frame_data=audio[1], sample_width=4)
+def do_stt(audio, whipser_language):
+    # use pydub to convert the sample rate, channel count and sample width to
+    # what whisper expects as input
+    dubaudio = AudioSegment.from_file(io.BytesIO(audio))
+    dubaudio = dubaudio.set_channels(1)
+    dubaudio = dubaudio.set_frame_rate(16000)
+    dubaudio = dubaudio.set_sample_width(2)

-    try:
-        transcription = r.recognize_whisper(audio_data, language=whipser_language, model=whipser_model)
-    except sr.UnknownValueError:
-        print("Whisper could not understand audio")
-    except sr.RequestError as e:
-        print("Could not request results from Whisper", e)
+    # same method to get the array as the openai whisper repo uses for wav files
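+    # int16 PCM spans -32768..32767, so dividing by 32768.0 rescales it to the
+    # float32 range [-1.0, 1.0) that whisper's transcribe() works on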
+    audio_np = np.frombuffer(dubaudio.raw_data, np.int16).flatten().astype(np.float32) / 32768.0

-    return transcription
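+    # an empty language string leaves the language argument unset, letting
+    # whisper auto-detect it from up to the first 30 seconds of audio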
+    if len(whipser_language) == 0:
+        result = WHISPERMODEL.transcribe(audio=audio_np)
+    else:
+        result = WHISPERMODEL.transcribe(audio=audio_np, language=whipser_language)
+    return result["text"]


-def auto_transcribe(audio, auto_submit, whipser_model, whipser_language):
-    if audio is None:
+def auto_transcribe(audio, auto_submit, whipser_language):
+    if audio is None or audio == "":
+        print("Whisper received no audio data")
        return "", ""
-    transcription = do_stt(audio, whipser_model, whipser_language)
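+    # script.js delivers the recording as a base64 data URL
+    # ("data:audio/...;base64,<payload>"); keep only the payload after the comma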
+    audio_bytes = base64.b64decode(audio.split(',')[1])
+
+    transcription = do_stt(audio_bytes, whipser_language)
    if auto_submit:
        input_hijack.update({"state": True, "value": [transcription, transcription]})
+    return transcription
+
+
+def reload_whispermodel(whisper_model_name: str, whisper_language: str, device: str):
+    if len(whisper_model_name) > 0:
+        global WHISPERMODEL
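+        # drop the old model reference, then empty the CUDA cache and force a
+        # GC pass so the old weights are actually freed before reloading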
+        WHISPERMODEL = None
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        gc.collect()

-    return transcription, None
+        if device != "none":
+            if device == "cuda":
+                device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+            WHISPERMODEL = whisper.load_model(whisper_model_name, device=device)
+            params.update({"whipser_model": whisper_model_name})
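+            # *.en checkpoints are English-only, so pin the language setting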
+            if ".en" in whisper_model_name:
+                whisper_language = "english"
+            audio_update = gr.Audio.update(interactive=True)
+        else:
+            audio_update = gr.Audio.update(interactive=False)
+        return [whisper_model_name, whisper_language, str(device), audio_update]


def ui():
    with gr.Accordion("Whisper STT", open=True):
        with gr.Row():
-            audio = gr.Audio(source="microphone")
+            audio = gr.Textbox(elem_id="audio-base64", visible=False)
+            record_button = gr.Button("Rec.", elem_id="record-button", elem_classes="custom-button")
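+            # script.js (served by custom_js below) hooks #record-button and
+            # writes the captured audio into the hidden #audio-base64 textbox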
        with gr.Row():
            with gr.Accordion("Settings", open=False):
                auto_submit = gr.Checkbox(label='Submit the transcribed audio automatically', value=params['auto_submit'])
-                whipser_model = gr.Dropdown(label='Whisper Model', value=params['whipser_model'], choices=["tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "large"])
-                whipser_language = gr.Dropdown(label='Whisper Language', value=params['whipser_language'], choices=["chinese", "german", "spanish", "russian", "korean", "french", "japanese", "portuguese", "turkish", "polish", "catalan", "dutch", "arabic", "swedish", "italian", "indonesian", "hindi", "finnish", "vietnamese", "hebrew", "ukrainian", "greek", "malay", "czech", "romanian", "danish", "hungarian", "tamil", "norwegian", "thai", "urdu", "croatian", "bulgarian", "lithuanian", "latin", "maori", "malayalam", "welsh", "slovak", "telugu", "persian", "latvian", "bengali", "serbian", "azerbaijani", "slovenian", "kannada", "estonian", "macedonian", "breton", "basque", "icelandic", "armenian", "nepali", "mongolian", "bosnian", "kazakh", "albanian", "swahili", "galician", "marathi", "punjabi", "sinhala", "khmer", "shona", "yoruba", "somali", "afrikaans", "occitan", "georgian", "belarusian", "tajik", "sindhi", "gujarati", "amharic", "yiddish", "lao", "uzbek", "faroese", "haitian creole", "pashto", "turkmen", "nynorsk", "maltese", "sanskrit", "luxembourgish", "myanmar", "tibetan", "tagalog", "malagasy", "assamese", "tatar", "hawaiian", "lingala", "hausa", "bashkir", "javanese", "sundanese"])
+                device_dropd = gr.Dropdown(label='Device', value=str(startup_device), choices=["cuda", "cpu", "none"])
+                whisper_model_dropd = gr.Dropdown(label='Whisper Model', value=params['whipser_model'], choices=["tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "large"])
+                whisper_language = gr.Dropdown(label='Whisper Language', value=params['whipser_language'], choices=["english", "chinese", "german", "spanish", "russian", "korean", "french", "japanese", "portuguese", "turkish", "polish", "catalan", "dutch", "arabic", "swedish", "italian", "indonesian", "hindi", "finnish", "vietnamese", "hebrew", "ukrainian", "greek", "malay", "czech", "romanian", "danish", "hungarian", "tamil", "norwegian", "thai", "urdu", "croatian", "bulgarian", "lithuanian", "latin", "maori", "malayalam", "welsh", "slovak", "telugu", "persian", "latvian", "bengali", "serbian", "azerbaijani", "slovenian", "kannada", "estonian", "macedonian", "breton", "basque", "icelandic", "armenian", "nepali", "mongolian", "bosnian", "kazakh", "albanian", "swahili", "galician", "marathi", "punjabi", "sinhala", "khmer", "shona", "yoruba", "somali", "afrikaans", "occitan", "georgian", "belarusian", "tajik", "sindhi", "gujarati", "amharic", "yiddish", "lao", "uzbek", "faroese", "haitian creole", "pashto", "turkmen", "nynorsk", "maltese", "sanskrit", "luxembourgish", "myanmar", "tibetan", "tagalog", "malagasy", "assamese", "tatar", "hawaiian", "lingala", "hausa", "bashkir", "javanese", "sundanese"])

-        audio.stop_recording(
-            auto_transcribe, [audio, auto_submit, whipser_model, whipser_language], [shared.gradio['textbox'], audio]).then(
-            None, auto_submit, None, js="(check) => {if (check) { document.getElementById('Generate').click() }}")
+        audio.change(
+            auto_transcribe, [audio, auto_submit, whisper_language], [shared.gradio['textbox']]).then(
+            None, auto_submit, None, _js="(check) => {if (check) { document.getElementById('Generate').click() }}")
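+        # each new base64 payload lands the transcription in the main textbox;
+        # the chained _js click then presses Generate when auto_submit is on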

-        whipser_model.change(lambda x: params.update({"whipser_model": x}), whipser_model, None)
-        whipser_language.change(lambda x: params.update({"whipser_language": x}), whipser_language, None)
+        device_dropd.input(reload_whispermodel, [whisper_model_dropd, whisper_language, device_dropd], [whisper_model_dropd, whisper_language, device_dropd, audio])
+        whisper_model_dropd.change(reload_whispermodel, [whisper_model_dropd, whisper_language, device_dropd], [whisper_model_dropd, whisper_language, device_dropd, audio])
+        whisper_language.change(lambda x: params.update({"whipser_language": x}), whisper_language, None)
        auto_submit.change(lambda x: params.update({"auto_submit": x}), auto_submit, None)
+
+
+def custom_js():
+    """
+    Returns the extension's custom JavaScript as a string; it is injected into
+    the page whenever the web UI is loaded.
+    """
+    with open(Path(__file__).parent.resolve() / "script.js", "r") as f:
+        return f.read()
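+
+# script.js itself is not part of this diff; given the wiring above it is
+# expected to bind #record-button, capture microphone audio (for example via
+# the browser MediaRecorder API) and write the recording into the hidden
+# #audio-base64 textbox as a base64 data URL, which fires audio.change()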