36
36
_available = False
37
37
# fmt: on
38
38
39
- # Global queue for audio playback
39
# Global queues and thread controls

# Synthesized audio waiting to be played, stored as (samples, sample_rate) tuples.
audio_queue: queue.Queue[tuple["np.ndarray", int]] = queue.Queue()
# Text chunks waiting to be sent to the TTS server. A None entry is the stop
# sentinel consumed by the TTS processor thread.
tts_request_queue: queue.Queue[str | None] = queue.Queue()
# Worker threads; both start as None and are created on demand by ensure_threads().
playback_thread: threading.Thread | None = None
tts_processor_thread: threading.Thread | None = None
# Current TTS volume (the value set_volume() reports in its log message).
current_volume = 1.0
# Speed value sent as the "speed" parameter of every TTS request.
current_speed = 1.3
44
46
@@ -61,14 +63,27 @@ def set_volume(volume):
61
63
log .info (f"TTS volume set to { current_volume :.2f} " )
62
64
63
65
64
def stop() -> None:
    """Stop audio playback, discard pending work, and stop the TTS processor.

    Steps, in order:

    1. Stop whatever sounddevice is currently playing.
    2. Drop audio that was synthesized but not played yet (``clear_queue``).
    3. Drop text chunks that have not been sent to the TTS server yet.
    4. If the TTS processor thread is alive, enqueue the ``None`` stop
       sentinel and wait up to one second for the thread to exit.

    The function is best-effort and intentionally quiet: it never raises when
    the processor thread cannot be joined.
    """
    sd.stop()

    # Clear both queues silently
    clear_queue()

    # Drain through the public API so that every removed item is matched by a
    # task_done() call. Clearing the underlying deque directly leaves the
    # queue's unfinished-task counter above zero, which makes every later
    # tts_request_queue.join() block forever.
    while True:
        try:
            tts_request_queue.get_nowait()
        except queue.Empty:
            break
        tts_request_queue.task_done()

    # Stop processor thread quietly. Only a read of the module-level name is
    # needed here, so no ``global`` declaration is required.
    processor = tts_processor_thread
    if processor is not None and processor.is_alive():
        tts_request_queue.put(None)
        try:
            processor.join(timeout=1)
        except RuntimeError as e:
            # Thread.join() raises RuntimeError when a thread tries to join
            # itself; keep stop() non-fatal but leave a trace for debugging.
            log.debug(f"Could not join TTS processor thread: {e}")
69
84
70
85
71
- def clear_queue ():
86
+ def clear_queue () -> None :
72
87
"""Clear the audio queue without stopping current playback."""
73
88
while not audio_queue .empty ():
74
89
try :
@@ -78,7 +93,7 @@ def clear_queue():
78
93
break
79
94
80
95
81
- def split_text (text , max_words = 50 ):
96
+ def split_text (text : str , max_words = 50 ) -> list [ str ] :
82
97
"""Split text into chunks at sentence boundaries, respecting word count, paragraphs, and markdown lists."""
83
98
# Split into paragraphs
84
99
paragraphs = [p .strip () for p in text .split ("\n \n " ) if p .strip ()]
@@ -187,11 +202,29 @@ def split_sentences(text):
187
202
return result
188
203
189
204
205
# Matches runs of emoji and pictographic symbols that should not be sent to the
# TTS engine. Ranges are Unicode blocks; anything outside them is left alone.
emoji_pattern = re.compile(
    "["
    "\U0001f600-\U0001f64f"  # emoticons
    "\U0001f300-\U0001f5ff"  # symbols & pictographs
    "\U0001f680-\U0001f6ff"  # transport & map symbols
    "\U0001f1e0-\U0001f1ff"  # flags (regional indicator symbols)
    "\U0001f900-\U0001f9ff"  # supplemental symbols & pictographs (robot, broom)
    "\U0001fa70-\U0001faff"  # symbols & pictographs extended-A
    # Miscellaneous Symbols + Dingbats. The check mark (U+2705) and sparkles
    # (U+2728) live here, which is why the astral-plane ranges above miss them;
    # the block also covers the warning sign, heavy heart, crosses, etc.
    "\u2600-\u27bf"
    # Variation selector-16 follows many of the symbols above ("emoji style");
    # strip it too so no invisible character is left behind.
    "\ufe0f"
    "]+",
    flags=re.UNICODE,
)
218
+
219
+
190
220
def clean_for_speech (content : str ) -> str :
191
221
"""
192
222
Clean content for speech by removing:
193
223
- <thinking> tags and their content
194
224
- Tool use blocks (```tool ...```)
225
+ - **Bold** markup (the enclosed text is kept)
226
+ - Additional (details) that may not need to be spoken
227
+ - Emojis and other non-speech content
195
228
196
229
Returns the cleaned content suitable for speech.
197
230
"""
@@ -201,10 +234,19 @@ def clean_for_speech(content: str) -> str:
201
234
# Remove tool use blocks
202
235
content = re_tool_use .sub ("" , content )
203
236
237
+ # Remove **bold** markup, keeping the enclosed text
238
+ content = re .sub (r"\*\*(.*?)\*\*" , r"\1" , content )
239
+
240
+ # Remove (details)
241
+ content = re .sub (r"\(.*?\)" , "" , content )
242
+
243
+ # Remove emojis
244
+ content = emoji_pattern .sub ("" , content )
245
+
204
246
return content .strip ()
205
247
206
248
207
- def get_output_device ():
249
+ def get_output_device () -> tuple [ int , int ] :
208
250
"""Get the best available output device and its sample rate.
209
251
210
252
Returns:
@@ -265,7 +307,7 @@ def get_output_device():
265
307
return output_device , device_sr
266
308
267
309
268
- def audio_player_thread ():
310
+ def audio_player_thread () -> None :
269
311
"""Background thread for playing audio."""
270
312
log .debug ("Audio player thread started" )
271
313
while True :
@@ -299,13 +341,82 @@ def audio_player_thread():
299
341
log .error (f"Error in audio playback: { e } " )
300
342
301
343
302
- def ensure_playback_thread ():
303
- """Ensure the playback thread is running."""
304
- global playback_thread
344
def tts_processor_thread_func() -> None:
    """Background worker that turns queued text chunks into playable audio.

    Loop body, per item taken from ``tts_request_queue``:

    1. ``None`` is the stop sentinel: log and leave the loop.
    2. Otherwise request ``http://{host}:{port}/tts`` with the chunk text, the
       current speed and, if ``GPTME_TTS_VOICE`` is set, the voice.
    3. Read the response body as a WAV file, resample it to the output
       device's sample rate when the two differ, convert non-float32 data to
       float32, and put ``(data, sample_rate)`` on ``audio_queue``.

    A chunk is skipped (with a log message) when the server is unreachable,
    returns a non-200 status, no output device can be resolved, or any other
    exception occurs; the worker then moves on to the next chunk.

    Every ``get()`` is matched by exactly one ``task_done()`` -- including the
    sentinel -- so ``tts_request_queue.join()`` cannot hang on an item this
    thread has already consumed.
    """
    log.debug("TTS processor ready")
    while True:
        # Get next chunk from queue (blocks until one is available)
        chunk = tts_request_queue.get()
        try:
            if chunk is None:  # Sentinel value to stop thread
                log.debug("Received stop signal for TTS processor")
                break

            # Make request to the TTS server
            url = f"http://{host}:{port}/tts"
            params = {"text": chunk, "speed": current_speed}
            if voice := os.getenv("GPTME_TTS_VOICE"):
                params["voice"] = voice

            try:
                # NOTE(review): no timeout is passed, so a server that accepts
                # the connection but never answers blocks this worker
                # indefinitely. Pick a timeout that fits synthesis latency.
                response = requests.get(url, params=params)
            except requests.exceptions.ConnectionError:
                log.warning(f"TTS server unavailable at {url}")
                continue

            if response.status_code != 200:
                log.error(f"TTS server returned status {response.status_code}")
                if response.content:
                    log.error(f"Error content: {response.content.decode()} for {chunk}")
                continue

            # Process audio response
            audio_data = io.BytesIO(response.content)
            sample_rate, data = wavfile.read(audio_data)

            # Get output device for sample rate
            try:
                _, device_sr = get_output_device()
                # Resample if needed
                if sample_rate != device_sr:
                    data = resample_audio(data, sample_rate, device_sr)
                    sample_rate = device_sr
            except RuntimeError as e:
                log.error(f"Device error: {e}")
                continue

            # Normalize audio
            # NOTE(review): np.iinfo() only accepts integer dtypes; data that
            # arrives here as float64 raises and the chunk is dropped by the
            # generic handler below -- confirm what resample_audio returns.
            if data.dtype != np.float32:
                data = data.astype(np.float32) / np.iinfo(data.dtype).max

            # Queue for playback
            audio_queue.put((data, sample_rate))

        except Exception as e:
            # Keep the worker alive: one bad chunk must not stop later speech.
            log.error(f"Error in TTS processing: {e}")
        finally:
            # Runs on every exit from the try block (normal completion,
            # continue, break, handled exception), so each dequeued item is
            # marked done exactly once.
            tts_request_queue.task_done()
402
+
403
+
404
def ensure_threads():
    """Start the playback and TTS processor threads unless already running.

    Each worker is (re)created as a daemon thread when its module-level
    handle is ``None`` or refers to a thread that is no longer alive.
    """
    global playback_thread, tts_processor_thread

    # Audio playback worker
    playback_running = playback_thread is not None and playback_thread.is_alive()
    if not playback_running:
        playback_thread = threading.Thread(target=audio_player_thread, daemon=True)
        playback_thread.start()

    # TTS request worker
    processor_running = (
        tts_processor_thread is not None and tts_processor_thread.is_alive()
    )
    if not processor_running:
        tts_processor_thread = threading.Thread(
            target=tts_processor_thread_func, daemon=True
        )
        tts_processor_thread.start()
419
+
309
420
310
421
def resample_audio (data , orig_sr , target_sr ):
311
422
"""Resample audio data to target sample rate."""
@@ -326,11 +437,13 @@ def speak(text, block=False, interrupt=True, clean=True):
326
437
- Automatic chunking of long texts
327
438
- Non-blocking operation with optional blocking mode
328
439
- Interruption of current speech
440
+ - Background processing of TTS requests
329
441
330
442
Args:
331
443
text: Text to speak
332
444
block: If True, wait for audio to finish playing
333
445
interrupt: If True, stop current speech and clear queue before speaking
446
+ clean: If True, clean text for speech (remove markup, emojis, etc.)
334
447
335
448
Example:
336
449
>>> from gptme.tools.tts import speak, set_speed, set_volume
@@ -346,67 +459,29 @@ def speak(text, block=False, interrupt=True, clean=True):
346
459
347
460
# Stop current speech if requested
348
461
if interrupt :
349
- clear_queue ()
350
-
351
- # Split text into chunks if needed
352
- chunks = split_text (text )
353
- chunks = [c .replace ("gptme" , "gpt-me" ) for c in chunks ] # Fix pronunciation
462
+ stop ()
354
463
355
464
try :
356
- # Get output device and sample rate
357
- output_device , device_sr = get_output_device ()
465
+ # Split text into chunks
466
+ chunks = split_text (text )
467
+ chunks = [c .replace ("gptme" , "gpt-me" ) for c in chunks ] # Fix pronunciation
358
468
359
- # Ensure playback thread is running
360
- ensure_playback_thread ()
469
+ # Ensure both threads are running
470
+ ensure_threads ()
361
471
362
- # Process each chunk
472
+ # Queue chunks for processing
363
473
for chunk in chunks :
364
- if not chunk .strip ().strip ("`" ):
365
- continue
366
-
367
- # Make request to the TTS server
368
- url = f"http://{ host } :{ port } /tts"
369
- params = {"text" : chunk , "speed" : current_speed }
370
- if voice := os .getenv ("GPTME_TTS_VOICE" ):
371
- params ["voice" ] = voice
372
-
373
- try :
374
- response = requests .get (url , params = params )
375
- except requests .exceptions .ConnectionError :
376
- log .warning (f"TTS server was not available at { url } " )
377
- return
378
-
379
- if response .status_code != 200 :
380
- log .error (f"TTS server returned status { response .status_code } " )
381
- if response .content :
382
- log .error (f"Error content: { response .content .decode ()} " )
383
- continue
384
-
385
- # Convert response to audio
386
- audio_data = io .BytesIO (response .content )
387
- sample_rate , data = wavfile .read (audio_data )
388
-
389
- log .debug (
390
- f"Audio: { len (data )} samples at { sample_rate } Hz ({ len (data )/ sample_rate :.2f} seconds)"
391
- )
392
-
393
- # Resample if needed
394
- if sample_rate != device_sr :
395
- data = resample_audio (data , sample_rate , device_sr )
396
- sample_rate = device_sr
397
-
398
- # Normalize audio to float32 in range [-1, 1]
399
- if data .dtype != np .float32 :
400
- data = data .astype (np .float32 ) / np .iinfo (data .dtype ).max
401
-
402
- # Queue audio for playback
403
- audio_queue .put ((data , sample_rate ))
474
+ if chunk .strip ():
475
+ tts_request_queue .put (chunk )
404
476
405
477
if block :
406
- audio_queue .join () # Wait for audio to finish playing
478
+ # Wait for all TTS processing to complete
479
+ tts_request_queue .join ()
480
+ # Then wait for all audio to finish playing
481
+ audio_queue .join ()
407
482
408
483
except Exception as e :
409
- log .error (f"Failed to speak text: { e } " )
484
+ log .error (f"Failed to queue text for speech : { e } " )
410
485
411
486
412
487
tool = ToolSpec (
0 commit comments