-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfaster_whisper_ollama.py
248 lines (213 loc) · 9.01 KB
/
faster_whisper_ollama.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
#!/usr/bin/env python
"""
Single-file script for local audio recording + transcription. It depends on
no cloud services (e.g. redis, sockets), so it is suitable for offline use.
Install dependencies:
pip3 install pyaudio webrtcvad faster-whisper
Run:
python3 faster_whisper_ollama.py
"""
import collections
import io
import logging
import queue
import threading
import typing
import wave
from io import BytesIO
import codefast as cf
import pyaudio
import webrtcvad
from faster_whisper import WhisperModel
import asyncio
from ollama import AsyncClient
import json
# Workaround for the duplicate-OpenMP-runtime abort (libiomp5 loaded twice,
# common when ctranslate2/torch each ship their own copy).
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
# Module-level conversation history shared with the Chat thread; seeded with
# the system persona sent to the Ollama model on every request.
history = [{
    'role': 'system',
    'content': 'your name is Conor,and you are a helpful assistant,you are good at coding by python,you always give the right answers step by step!',
}]
logging.basicConfig(level=logging.INFO,
                    format='%(name)s - %(levelname)s - %(message)s')
class Queues:
    # Thread-safe hand-off points shared by the three worker threads.
    audio = queue.Queue()  # WAV byte blobs: AudioRecorder -> Transcriber
    text = queue.Queue()   # transcribed utterances: Transcriber -> Chat
class Transcriber(threading.Thread):
    """Background thread that turns recorded WAV bytes into text.

    Pulls audio blobs from ``Queues.audio``, transcribes them with
    faster-whisper, and pushes the concatenated text onto ``Queues.text``.
    Use as a context manager: the model is loaded in ``__enter__``.
    """

    def __init__(
            self,
            model_size: str,
            device: str = "auto",
            compute_type: str = "default",
            prompt: str = '实时/低延迟语音转写服务,林黛玉、倒拔、杨柳树、鲁迅、周树人、关键词、转写正确') -> None:
        """FasterWhisper speech transcription.

        Args:
            model_size (str): Model size, one of "tiny", "base", "small",
                "medium", "large". See https://github.com/openai/whisper
            device (str, optional): Device to run the model on.
            compute_type (str, optional): Computation type. Defaults to "default".
            prompt (str, optional): Initial prompt. Use a Simplified-Chinese
                prompt to bias the output toward Simplified Chinese.
        """
        super().__init__()
        self.model_size = model_size
        self.device = device
        self.compute_type = compute_type
        self.prompt = prompt

    def __enter__(self) -> 'Transcriber':
        # Model loading is deferred here so construction stays cheap.
        try:
            self._model = WhisperModel(self.model_size,
                                       device=self.device,
                                       compute_type=self.compute_type)
        except Exception as e:
            logging.error("Failed to initialize WhisperModel: %s", e)
            # Bare raise preserves the original traceback (`raise e` resets it).
            raise
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        # Nothing to release; WhisperModel owns no explicit resources here.
        pass

    def __call__(self, audio: bytes) -> typing.Generator[str, None, None]:
        """Yield text segments transcribed from one in-memory WAV blob."""
        if not audio:
            logging.error("Received empty audio data.")
            return
        try:
            segments, info = self._model.transcribe(BytesIO(audio),
                                                    initial_prompt=self.prompt,
                                                    vad_filter=True)
            for segment in segments:
                t = segment.text
                # Drop segments that merely echo the initial prompt
                # (Whisper sometimes hallucinates the prompt back).
                if self.prompt in t.strip():
                    continue
                # Drop segments that contain nothing but dots/whitespace.
                if t.strip().replace('.', ''):
                    yield t
        except Exception as e:
            logging.error("Error during transcription: %s", e)

    def run(self):
        # Consume audio blobs forever; the blocking get() idles the thread.
        while True:
            audio = Queues.audio.get()
            if not audio:
                continue
            text = ''
            for seg in self(audio):
                logging.info(cf.fp.cyan(seg))
                text += seg
            # Fix: only hand non-empty transcripts to the chat thread; the
            # original enqueued '' whenever every segment was filtered out.
            if text:
                Queues.text.put(text)
class AudioRecorder(threading.Thread):
    """Microphone recorder with WebRTC-VAD utterance segmentation.

    Args:
        channels (int, optional): Number of channels; default 1 (mono).
        sample_rate (int, optional): Sampling rate; default 16000 Hz.
        chunk (int, optional): Frames per buffer; default 256.
        frame_duration (int, optional): Per-frame duration in ms; default 30.
    """

    def __init__(self,
                 channels: int = 1,
                 sample_rate: int = 16000,
                 chunk: int = 256,
                 frame_duration: int = 30) -> None:
        super().__init__()
        self.sample_rate = sample_rate
        self.channels = channels
        self.chunk = chunk
        # Samples per VAD frame; WebRTC VAD accepts 10/20/30 ms frames.
        self.frame_size = (sample_rate * frame_duration // 1000)
        self.__frames: typing.List[bytes] = []

    def __enter__(self) -> 'AudioRecorder':
        try:
            self.vad = webrtcvad.Vad()
            # VAD aggressiveness, integer 0..3: 0 is least sensitive to
            # non-speech, 3 is most aggressive at filtering it.
            self.vad.set_mode(1)
            self.audio = pyaudio.PyAudio()
            self.sample_width = self.audio.get_sample_size(pyaudio.paInt16)
            self.stream = self.audio.open(format=pyaudio.paInt16,
                                          channels=self.channels,
                                          rate=self.sample_rate,
                                          input=True,
                                          frames_per_buffer=self.chunk)
        except Exception as e:
            logging.error("Failed to initialize audio recorder: %s", e)
            # Bare raise preserves the original traceback (`raise e` resets it).
            raise
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        # Best-effort teardown of the PortAudio stream and host API.
        try:
            self.stream.stop_stream()
            self.stream.close()
            self.audio.terminate()
        except Exception as e:
            logging.error("Error during cleanup of audio recorder: %s", e)

    def __bytes__(self) -> bytes:
        """Serialize the accumulated frames as an in-memory WAV file and clear them."""
        buf = io.BytesIO()
        try:
            with wave.open(buf, 'wb') as wf:
                wf.setnchannels(self.channels)
                wf.setsampwidth(self.sample_width)
                wf.setframerate(self.sample_rate)
                wf.writeframes(b''.join(self.__frames))
            self.__frames.clear()
        except Exception as e:
            logging.error("Error during byte conversion: %s", e)
        return buf.getvalue()

    def run(self):
        """Record continuously and emit one WAV blob per detected utterance."""
        MAXLEN = 30
        watcher = collections.deque(maxlen=MAXLEN)
        triggered, ratio = False, 0.5
        while True:
            try:
                frame = self.stream.read(self.frame_size)
                is_speech = self.vad.is_speech(frame, self.sample_rate)
                watcher.append(is_speech)
                self.__frames.append(frame)
                if not triggered:
                    num_voiced = len([x for x in watcher if x])
                    if num_voiced > ratio * watcher.maxlen:
                        logging.info("start recording...")
                        triggered = True
                        watcher.clear()
                        # Keep the last MAXLEN frames as pre-roll context.
                        self.__frames = self.__frames[-MAXLEN:]
                    else:
                        # Fix: bound the pre-roll buffer while idle. The
                        # original only trimmed at trigger time, so frames
                        # grew without bound during long silence.
                        del self.__frames[:-MAXLEN]
                else:
                    num_unvoiced = len([x for x in watcher if not x])
                    if num_unvoiced > ratio * watcher.maxlen:
                        logging.info("stop recording...")
                        triggered = False
                        audio_data = bytes(self)
                        if audio_data:
                            Queues.audio.put(audio_data)
                            logging.info("audio task number: {}".format(Queues.audio.qsize()))
            except Exception as e:
                logging.error("Error during audio recording: %s", e)
class Chat(threading.Thread):
    """Thread that streams transcribed utterances to an Ollama chat model.

    Pulls text from ``Queues.text``, sends the running ``history`` to the
    model, streams the reply to stdout, and persists the history to
    ``history.json`` after each exchange.
    """

    def __init__(self, prompt: str) -> None:
        super().__init__()
        # NOTE(review): `prompt` is stored but never read by this class.
        self.prompt = prompt

    async def chat_to_ollama(self, text):
        """Send one user message and stream the assistant reply to stdout."""
        history.append({'role': 'user', 'content': text})
        answer = ''
        try:
            client = AsyncClient(host='http://124.223.159.146:11434')  # replace the host IP with your own ollama serve IP
            async for part in await client.chat(model='deepseek-coder', messages=history, stream=True):  # replace the model your own
                print(part['message']['content'], end='', flush=True)
                answer += part['message']['content']
            # Fix: terminate the streamed line; the original left the cursor
            # mid-line, so the next log message ran into the answer text.
            print(flush=True)
            history.append({'role': 'assistant', 'content': answer})
            with open('history.json', 'w') as f:
                json.dump(history, f)
        except Exception as e:
            logging.error("Error during chat communication: %s", e)

    def run(self):
        # Each thread needs its own asyncio loop; create it once and reuse it.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        while True:
            text = Queues.text.get()
            if text:
                loop.run_until_complete(self.chat_to_ollama(text))
def main():
    """Wire up the recorder, transcriber and chat threads and block until they exit."""
    try:
        # model_size options: tiny, base, small, medium, large, large-v1, large-v2, large-v3
        with AudioRecorder(channels=1, sample_rate=16000) as recorder, \
                Transcriber(model_size="small") as transcriber:
            recorder.start()
            transcriber.start()
            chat = Chat("")
            chat.start()
            for worker in (recorder, transcriber, chat):
                worker.join()
    except KeyboardInterrupt:
        print("KeyboardInterrupt: terminating...")
    except Exception as e:
        logging.error(e, exc_info=True, stack_info=True)


if __name__ == "__main__":
    main()