# 2_Whisper_AzureAISpeech_endpoint.py
"""
This script is a demo app for the Whisper model through Azure AI Speech endpoint.
Source code is taken from: https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/master/samples/batch/python/python-client/main.py
"""
import gradio as gr
import sys, requests, time,os
import azure.cognitiveservices.speech as speechsdk
from dotenv import load_dotenv
load_dotenv("azure.env")
class Azure:
    def __init__(self):
        self.speech_endpoint = None
        self.speech_key = None
        self.speech_region = None


azure = Azure()

# Setting Azure Speech parameters
azure.speech_endpoint = os.getenv('AZURE_SPEECH_ENDPOINT')
azure.speech_key = os.getenv('AZURE_SPEECH_KEY')
azure.speech_region = os.getenv('AZURE_SPEECH_REGION')
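
# The batch-transcription helpers below come from the linked sample and expect
# these module-level constants. The values here are illustrative placeholders
# (assumptions, not part of the original script); adjust them to your scenario.
NAME = "Simple transcription"
DESCRIPTION = "Simple transcription description"
LOCALE = "en-US"
MODEL_REFERENCE = None  # set to a custom model ID before calling transcribe_with_custom_model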

def translateAudioLanguage(text2Speech, paramVoice):
    """Synthesise `text2Speech` into audible speech, using the voice named by `paramVoice`."""
    speech_config = speechsdk.SpeechConfig(subscription=azure.speech_key, region=azure.speech_region)
    speech_config.speech_synthesis_voice_name = paramVoice
    # Use the default speaker as audio output.
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)
    return speech_synthesizer.speak_text_async(text2Speech).get()
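# Example call, with an assumed (not original) voice name:
#     translateAudioLanguage("Hello, world!", "en-US-JennyNeural")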

def transcribe_from_single_blob(uri, properties):
    """
    Transcribe a single audio file located at `uri` using the settings specified in `properties`
    using the base model for the specified locale.
    """
    transcription_definition = swagger_client.Transcription(
        display_name=NAME,
        description=DESCRIPTION,
        locale=LOCALE,
        content_urls=[uri],
        properties=properties
    )
    return transcription_definition

def transcribe_with_custom_model(client, uri, properties):
    """
    Transcribe a single audio file located at `uri` using the settings specified in `properties`
    using the custom model identified by `MODEL_REFERENCE`.
    """
    # Model information (MODEL_REFERENCE) must be set above.
    if MODEL_REFERENCE is None:
        logging.error("Custom model ids must be set when using custom models")
        sys.exit()

    model = {'self': f'{client.configuration.host}/models/{MODEL_REFERENCE}'}

    transcription_definition = swagger_client.Transcription(
        display_name=NAME,
        description=DESCRIPTION,
        locale=LOCALE,
        content_urls=[uri],
        model=model,
        properties=properties
    )
    return transcription_definition
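# A minimal sketch of how one of these transcription definitions could be
# submitted through the generated batch client. The class and method names
# below follow the linked sample's generated `swagger_client` and are
# assumptions here, so the sketch is left commented out:
#
#     configuration = swagger_client.Configuration()
#     configuration.api_key["Ocp-Apim-Subscription-Key"] = azure.speech_key
#     configuration.host = f"https://{azure.speech_region}.api.cognitive.microsoft.com/speechtotext/v3.1"
#     api = swagger_client.CustomSpeechTranscriptionsApi(api_client=swagger_client.ApiClient(configuration))
#     definition = transcribe_from_single_blob("<audio blob URI>", swagger_client.TranscriptionProperties())
#     created_transcription, status, headers = api.transcriptions_create_with_http_info(transcription=definition)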

def transcribe(audio):
    # Build the recognizer from the recorded audio file and return the transcribed text.
    speech_config = speechsdk.SpeechConfig(subscription=azure.speech_key, region=azure.speech_region)
    audio_config = speechsdk.audio.AudioConfig(filename=audio)
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config, language="en-US")
    result = speech_recognizer.recognize_once_async().get()
    return result.text

demo = gr.Interface(
    transcribe,
    gr.Audio(source="microphone", type="filepath", label="Audio Recording"),
    "textbox",
    title="Demo App 2: Whisper model through Azure AI Speech endpoint",
    description="Record your speech via microphone and press the Submit button to transcribe it into text. Please note that the size of the audio file should be less than 1 GB."
)

if __name__ == "__main__":
    demo.launch()