"""
This script is a demo app for the Whisper model through Azure AI Speech endpoint.
Source code is taken from: https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/master/samples/batch/python/python-client/main.py
"""
import gradio as gr
import logging
import sys
import requests
import time
import swagger_client
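# Note: `swagger_client` is not a PyPI package. Per the Azure sample linked in the
# docstring above, it is a Python client generated from the Speech-to-Text REST API
# specification; see that sample's README for how to generate it.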
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG,
                    format="%(asctime)s %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p %Z")
# Your subscription key and region for the speech service
SUBSCRIPTION_KEY = "YourSubscriptionKey"
SERVICE_REGION = "YourServiceRegion"
NAME = "Simple transcription"
DESCRIPTION = "Simple transcription description"
LOCALE = "en-US"
RECORDINGS_BLOB_URI = "<Your SAS Uri to the recording>"
# Provide the uri of a container with audio files for transcribing all of them
# with a single request. At least 'read' and 'list' (rl) permissions are required.
RECORDINGS_CONTAINER_URI = "<Your SAS Uri to a container of audio files>"
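# Note: RECORDINGS_CONTAINER_URI is kept from the original sample but is not used
# anywhere in this script, which transcribes only the single blob above.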
# Set model information when doing transcription with custom models
MODEL_REFERENCE = None # guid of a custom model
def transcribe_from_single_blob(uri, properties):
    """
    Transcribe a single audio file located at `uri` using the settings specified in `properties`
    using the base model for the specified locale.
    """
    transcription_definition = swagger_client.Transcription(
        display_name=NAME,
        description=DESCRIPTION,
        locale=LOCALE,
        content_urls=[uri],
        properties=properties
    )

    return transcription_definition
def transcribe_with_custom_model(client, uri, properties):
    """
    Transcribe a single audio file located at `uri` using the settings specified in `properties`
    using the custom model identified by `MODEL_REFERENCE`.
    """
    # Model information (MODEL_REFERENCE) must be set above.
    if MODEL_REFERENCE is None:
        logging.error("Custom model ids must be set when using custom models")
        sys.exit()

    model = {'self': f'{client.configuration.host}/models/{MODEL_REFERENCE}'}

    transcription_definition = swagger_client.Transcription(
        display_name=NAME,
        description=DESCRIPTION,
        locale=LOCALE,
        content_urls=[uri],
        model=model,
        properties=properties
    )

    return transcription_definition
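# The batch flow below calls `_paginate`, which is defined in the Azure sample this
# script was taken from but was dropped in this copy. The version below is restored
# from that sample; treat it as a sketch and verify against the linked main.py.
def _paginate(api, paginated_object):
    """
    The autogenerated client does not handle pagination. This function returns a
    generator over all items of the collection that `paginated_object` is part of,
    following `next_link` until the last page.
    """
    yield from paginated_object.values
    typename = type(paginated_object).__name__
    auth_settings = ["api_key"]
    while paginated_object.next_link:
        link = paginated_object.next_link[len(api.api_client.configuration.host):]
        paginated_object, status, headers = api.api_client.call_api(
            link, "GET", response_type=typename, auth_settings=auth_settings)
        if status == 200:
            yield from paginated_object.values
        else:
            raise Exception(f"could not receive paginated data: status {status}")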
def transcribe(audio):
    # Work in progress: the recorded file path arrives in `audio` but is not used yet.
    # The early return below short-circuits the batch-transcription flow that follows;
    # the rest of the function is kept for reference.
    return "This is still work in progress. Please check back later."

    logging.info("Starting transcription client...")

    # configure API key authorization: subscription_key
    configuration = swagger_client.Configuration()
    configuration.api_key["Ocp-Apim-Subscription-Key"] = SUBSCRIPTION_KEY
    configuration.host = f"https://{SERVICE_REGION}.api.cognitive.microsoft.com/speechtotext/v3.1"

    # create the client object and authenticate
    client = swagger_client.ApiClient(configuration)

    # create an instance of the transcription api class
    api = swagger_client.CustomSpeechTranscriptionsApi(api_client=client)

    # Specify transcription properties by passing a dict to the properties parameter. See
    # https://learn.microsoft.com/azure/cognitive-services/speech-service/batch-transcription-create?pivots=rest-api#request-configuration-options
    # for supported parameters.
    properties = swagger_client.TranscriptionProperties()
    # properties.word_level_timestamps_enabled = True
    # properties.display_form_word_level_timestamps_enabled = True
    # properties.punctuation_mode = "DictatedAndAutomatic"
    # properties.profanity_filter_mode = "Masked"
    # properties.destination_container_url = "<SAS Uri with at least write (w) permissions for an Azure Storage blob container that results should be written to>"
    # properties.time_to_live = "PT1H"

    # uncomment the following block to enable and configure speaker separation
    # properties.diarization_enabled = True
    # properties.diarization = swagger_client.DiarizationProperties(
    #     swagger_client.DiarizationSpeakersProperties(min_count=1, max_count=5))

    # properties.language_identification = swagger_client.LanguageIdentificationProperties(["en-US", "ja-JP"])

    # Use base models for transcription. Comment this block if you are using a custom model.
    transcription_definition = transcribe_from_single_blob(RECORDINGS_BLOB_URI, properties)

    # Uncomment this block to use custom models for transcription.
    # transcription_definition = transcribe_with_custom_model(client, RECORDINGS_BLOB_URI, properties)

    created_transcription, status, headers = api.transcriptions_create_with_http_info(transcription=transcription_definition)

    # get the transcription Id from the location URI
    transcription_id = headers["location"].split("/")[-1]

    # Log information about the created transcription. If you ask for support, please
    # include this information.
    logging.info(f"Created new transcription with id '{transcription_id}' in region {SERVICE_REGION}")

    logging.info("Checking status.")
    completed = False

    while not completed:
        # wait for 5 seconds before refreshing the transcription status
        time.sleep(5)

        transcription = api.transcriptions_get(transcription_id)
        logging.info(f"Transcription status: {transcription.status}")

        if transcription.status in ("Failed", "Succeeded"):
            completed = True

        if transcription.status == "Succeeded":
            pag_files = api.transcriptions_list_files(transcription_id)
            for file_data in _paginate(api, pag_files):
                if file_data.kind != "Transcription":
                    continue

                audiofilename = file_data.name
                results_url = file_data.links.content_url
                results = requests.get(results_url)
                logging.info(f"Results for {audiofilename}:\n{results.content.decode('utf-8')}")
        elif transcription.status == "Failed":
            logging.info(f"Transcription failed: {transcription.properties.error.message}")

    # return "This is still work in progress. Please check back later."
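# For reference: once the flow above is enabled, the downloaded result is JSON. A
# minimal sketch of reducing it to plain text, assuming the v3.1 batch result schema
# with "combinedRecognizedPhrases" entries carrying a "display" field; verify these
# key names against the batch-transcription docs before relying on them.
def extract_display_text(results_json):
    """Join the display-form text of all recognized channels in a batch result."""
    phrases = results_json.get("combinedRecognizedPhrases", [])
    return "\n".join(p.get("display", "") for p in phrases)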
demo = gr.Interface(
    transcribe,
    gr.Audio(source="microphone", type="filepath", label="Audio Recording"),
    "textbox",
    title="Demo App 2: Whisper model through Azure AI Speech endpoint",
    description="Record your speech via microphone and press the Submit button to transcribe it into text. Please note that the audio file should be smaller than 1 GB."
)

if __name__ == "__main__":
    demo.launch()
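# To run locally (assuming gradio and requests are installed alongside the generated
# swagger_client package):
#     python 2_Whisper_AzureAISpeech_endpoint.py
# Gradio then prints a local URL (http://127.0.0.1:7860 by default) to open in a browser.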