# speech_to_text_qdrant.py
import os

import ocifs
import pandas as pd
import torch
import whisper
from json import loads
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Fetch environment variables
recording_name = os.environ.get("recording_name", "dammac recording")
print("Recording name (from environment variable 'recording_name'): " + recording_name)
######## ----------------------------------------------------- #################
# Step 1 - Copy the recordings from Object Storage to a local folder
######## ----------------------------------------------------- #################

# Create a local folder (inside the job)
path_input_locally = "/home/datascience/input_recording/"
bucket = "oci://West_BP@frqap2zhtzbe/dammac/"

try:
    if not os.path.exists(path_input_locally):
        os.makedirs(path_input_locally)
except OSError:
    print("Error: could not create the input recording directory")

# Copy the recordings from the bucket to the local folder
fs = ocifs.OCIFileSystem()
print(fs.ls(bucket))
fs.get(bucket, path_input_locally, recursive=True, refresh=True)
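
# (Optional) A quick sanity check that the copy succeeded; lists whatever
# landed in the local folder. Purely illustrative; uncomment if needed.
# print(os.listdir(path_input_locally))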
######## ----------------------------------------------------- #################
# Step 2 - Load the Whisper model and detect languages
######## ----------------------------------------------------- #################

# Load the Whisper model
model = whisper.load_model("base", device=DEVICE)

# Detect the spoken language of each recording in the folder
for recording in os.listdir(path_input_locally):
    if recording.endswith(".mp3"):
        audio_recording = os.path.join(path_input_locally, recording)
        # Load the audio and pad/trim it to fit 30 seconds
        audio = whisper.load_audio(audio_recording)
        audio = whisper.pad_or_trim(audio)
        # Make a log-Mel spectrogram and move it to the same device as the model
        mel = whisper.log_mel_spectrogram(audio).to(model.device)
        # Detect the spoken language
        _, probs = model.detect_language(mel)
        print(f"{recording}: detected language '{max(probs, key=probs.get)}'")
######## ----------------------------------------------------- #################
# Step 3 - Transcribe the recordings
######## ----------------------------------------------------- #################
output = []

# Whisper shells out to ffmpeg for audio decoding, so fetch a static build.
# The original notebook used a `!wget` shell magic; os.system keeps the same
# command valid in a plain Python script.
os.system(
    "wget -O - -q https://github.com/yt-dlp/FFmpeg-Builds/releases/download/latest/"
    "ffmpeg-master-latest-linux64-gpl.tar.xz | xz -qdc | tar -x"
)

for recording in os.listdir(path_input_locally):
    if recording.endswith(".mp3"):
        audio_recording = os.path.join(path_input_locally, recording)
        # Transcribe the recording
        result = model.transcribe(audio_recording)
        # Append the transcription of each recording to the list
        output.append(result["text"])
        print(recording + " is transcribed")

# Collect all transcriptions in a dataframe
df_transcriptions = pd.DataFrame(output, columns=["text"])
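
# (Optional) A small sketch that persists the raw transcriptions back to the
# same bucket before embedding. The CSV path and object name are assumptions,
# not part of the original pipeline; fs.put is the standard fsspec upload call
# that ocifs inherits.
# df_transcriptions.to_csv("/home/datascience/transcriptions.csv", index=False)
# fs.put("/home/datascience/transcriptions.csv", bucket + "transcriptions.csv")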
######## ----------------------------------------------------- #################
# Step 4 - Create embeddings and push them to Qdrant
######## ----------------------------------------------------- #################
encoder = SentenceTransformer("multi-qa-distilbert-cos-v1")  # Model used to create the embeddings

# Transform df_transcriptions into the record format Qdrant expects
texts_damac = df_transcriptions.to_json(orient="records")
documents_damac = loads(texts_damac)

# Establish a connection to the remote Qdrant vector database
qdrant = QdrantClient("138.3.241.32", port=6333)

# Create a collection to store the transcriptions
qdrant.recreate_collection(
    collection_name="damac_v1",
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(),  # Vector size is defined by the model used
        distance=models.Distance.COSINE
    )
)
# Vectorize the transcriptions and upload them to Qdrant
qdrant.upload_records(
    collection_name="damac_v1",
    records=[
        models.Record(
            id=idx,
            vector=encoder.encode(doc["text"]).tolist(),
            payload=doc
        ) for idx, doc in enumerate(documents_damac)
    ]
)
print("-------------------------------------------------------")
print("-------------------------------------------------------")
print("Encoded Text / Embeddings are pushed to Qdrant")
print("-------------------------------------------------------")
print("-------------------------------------------------------")