Skip to content

Commit 44c3dfe

Browse files
authored
fix(tts): make tts server requests non-blocking, improve clean_for_speech (#422)
1 parent 9b4e911 commit 44c3dfe

File tree

1 file changed

+139
-64
lines changed

1 file changed

+139
-64
lines changed

gptme/tools/tts.py

+139-64
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,11 @@
3636
_available = False
3737
# fmt: on
3838

39-
# Global queue for audio playback
39+
# Global queues and thread controls
4040
audio_queue: queue.Queue[tuple["np.ndarray", int]] = queue.Queue()
41-
playback_thread = None
41+
tts_request_queue: queue.Queue[str | None] = queue.Queue()
42+
playback_thread: threading.Thread | None = None
43+
tts_processor_thread: threading.Thread | None = None
4244
current_volume = 1.0
4345
current_speed = 1.3
4446

@@ -61,14 +63,27 @@ def set_volume(volume):
6163
log.info(f"TTS volume set to {current_volume:.2f}")
6264

6365

64-
def stop():
65-
"""Stop audio playback and clear the queue."""
66+
def stop() -> None:
    """Stop audio playback, discard everything still queued, and stop the TTS worker.

    Steps, in order:

    1. Call ``sd.stop()`` to halt whatever is currently playing.
    2. Empty the audio queue via ``clear_queue()``.
    3. Discard every pending TTS request.
    4. If the TTS processor thread is alive, send it the ``None`` stop
       sentinel and wait up to one second for it to exit.
    """
    global tts_processor_thread

    sd.stop()

    # Clear both queues silently
    clear_queue()

    # Discard pending requests one by one and acknowledge each of them.
    # Wiping the underlying deque directly would leave the queue's
    # unfinished-task counter untouched, so a later
    # ``tts_request_queue.join()`` would wait forever for items that no
    # longer exist.
    while True:
        try:
            tts_request_queue.get_nowait()
        except queue.Empty:
            break
        tts_request_queue.task_done()

    # Stop processor thread quietly
    if tts_processor_thread and tts_processor_thread.is_alive():
        tts_request_queue.put(None)  # sentinel understood by the worker loop
        try:
            # Bounded wait: the worker may be busy with a request.
            tts_processor_thread.join(timeout=1)
        except RuntimeError as e:
            # Best effort: failing to join must never break stop().
            log.debug(f"Could not join TTS processor thread: {e}")
6984

7085

71-
def clear_queue():
86+
def clear_queue() -> None:
7287
"""Clear the audio queue without stopping current playback."""
7388
while not audio_queue.empty():
7489
try:
@@ -78,7 +93,7 @@ def clear_queue():
7893
break
7994

8095

81-
def split_text(text, max_words=50):
96+
def split_text(text: str, max_words=50) -> list[str]:
8297
"""Split text into chunks at sentence boundaries, respecting word count, paragraphs, and markdown lists."""
8398
# Split into paragraphs
8499
paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
@@ -187,11 +202,29 @@ def split_sentences(text):
187202
return result
188203

189204

205+
# Matches runs of one or more emoji so they can be stripped before the text
# is sent to the speech synthesizer.
emoji_pattern = re.compile(
    "".join(
        (
            "[",
            "\U0001f600-\U0001f64f",  # emoticons
            "\U0001f300-\U0001f5ff",  # symbols & pictographs
            "\U0001f680-\U0001f6ff",  # transport & map symbols
            "\U0001f1e0-\U0001f1ff",  # flags (iOS)
            "\U0001f900-\U0001f9ff",  # supplemental symbols, has 🧹
            # ✅ (U+2705) and ✨ (U+2728) lie outside the ranges above and
            # must be listed explicitly; 🤖 (U+1F916) is already covered by
            # the supplemental range, so naming it here is merely redundant.
            "✅",
            "🤖",
            "✨",
            "]+",
        )
    ),
    flags=re.UNICODE,
)
218+
219+
190220
def clean_for_speech(content: str) -> str:
191221
"""
192222
Clean content for speech by removing:
193223
- <thinking> tags and their content
194224
- Tool use blocks (```tool ...```)
225+
- **Bold** markup (the asterisks are stripped, the enclosed text is kept)
226+
- Additional (details) that may not need to be spoken
227+
- Emojis and other non-speech content
195228
196229
Returns the cleaned content suitable for speech.
197230
"""
@@ -201,10 +234,19 @@ def clean_for_speech(content: str) -> str:
201234
# Remove tool use blocks
202235
content = re_tool_use.sub("", content)
203236

237+
# Strip **bold** markers, keeping the enclosed text
238+
content = re.sub(r"\*\*(.*?)\*\*", r"\1", content)
239+
240+
# Remove (details)
241+
content = re.sub(r"\(.*?\)", "", content)
242+
243+
# Remove emojis
244+
content = emoji_pattern.sub("", content)
245+
204246
return content.strip()
205247

206248

207-
def get_output_device():
249+
def get_output_device() -> tuple[int, int]:
208250
"""Get the best available output device and its sample rate.
209251
210252
Returns:
@@ -265,7 +307,7 @@ def get_output_device():
265307
return output_device, device_sr
266308

267309

268-
def audio_player_thread():
310+
def audio_player_thread() -> None:
269311
"""Background thread for playing audio."""
270312
log.debug("Audio player thread started")
271313
while True:
@@ -299,13 +341,82 @@ def audio_player_thread():
299341
log.error(f"Error in audio playback: {e}")
300342

301343

302-
def ensure_playback_thread():
303-
"""Ensure the playback thread is running."""
304-
global playback_thread
344+
def tts_processor_thread_func() -> None:
    """Background worker that turns queued text chunks into playable audio.

    Loops pulling items from ``tts_request_queue``:

    - ``None`` is the stop sentinel; the loop exits when it is received.
    - Any other item is sent as the ``text`` parameter of a GET request to
      ``http://{host}:{port}/tts`` together with ``current_speed`` and, when
      the ``GPTME_TTS_VOICE`` environment variable is set, a ``voice``.
      The WAV response is resampled to the output device's sample rate when
      the two differ, converted to float32, and put on ``audio_queue``.

    Failures (server unreachable, non-200 status, device error, or any
    unexpected exception) are logged and the chunk is dropped; the worker
    keeps running.

    Every item taken from the queue, the sentinel included, is acknowledged
    with exactly one ``task_done()`` so that ``tts_request_queue.join()``
    can return.
    """
    log.debug("TTS processor ready")
    while True:
        # Get next chunk from queue (blocks until one is available)
        chunk = tts_request_queue.get()
        try:
            if chunk is None:  # Sentinel value to stop thread
                log.debug("Received stop signal for TTS processor")
                break

            # Make request to the TTS server
            url = f"http://{host}:{port}/tts"
            params = {"text": chunk, "speed": current_speed}
            if voice := os.getenv("GPTME_TTS_VOICE"):
                params["voice"] = voice

            try:
                # NOTE(review): no timeout is passed, so an unresponsive
                # server stalls this worker; consider adding one.
                response = requests.get(url, params=params)
            except requests.exceptions.ConnectionError:
                log.warning(f"TTS server unavailable at {url}")
                continue

            if response.status_code != 200:
                log.error(f"TTS server returned status {response.status_code}")
                if response.content:
                    log.error(f"Error content: {response.content.decode()} for {chunk}")
                continue

            # Process audio response
            audio_data = io.BytesIO(response.content)
            sample_rate, data = wavfile.read(audio_data)

            # Get output device for sample rate
            try:
                _, device_sr = get_output_device()
                # Resample if needed
                if sample_rate != device_sr:
                    data = resample_audio(data, sample_rate, device_sr)
                    sample_rate = device_sr
            except RuntimeError as e:
                log.error(f"Device error: {e}")
                continue

            # Normalize audio: scale by the dtype's maximum value
            if data.dtype != np.float32:
                data = data.astype(np.float32) / np.iinfo(data.dtype).max

            # Queue for playback
            audio_queue.put((data, sample_rate))

        except Exception as e:
            # Boundary of the worker thread: log and move on to the next chunk.
            log.error(f"Error in TTS processing: {e}")
        finally:
            # Runs on every exit path (success, `continue`, `break`, error),
            # so each get() is matched by exactly one task_done().
            tts_request_queue.task_done()
402+
403+
404+
def ensure_threads():
    """Start the playback and TTS processor threads unless they are already alive.

    A thread is (re)created as a daemon when its module-level handle is
    ``None`` or refers to a thread that is no longer alive. The playback
    thread is handled first, then the TTS processor thread.
    """
    global playback_thread, tts_processor_thread

    # Playback thread: consumes the audio queue.
    playback_running = playback_thread is not None and playback_thread.is_alive()
    if not playback_running:
        playback_thread = threading.Thread(target=audio_player_thread, daemon=True)
        playback_thread.start()

    # TTS processor thread: consumes the request queue.
    processor_running = (
        tts_processor_thread is not None and tts_processor_thread.is_alive()
    )
    if not processor_running:
        tts_processor_thread = threading.Thread(
            target=tts_processor_thread_func, daemon=True
        )
        tts_processor_thread.start()
419+
309420

310421
def resample_audio(data, orig_sr, target_sr):
311422
"""Resample audio data to target sample rate."""
@@ -326,11 +437,13 @@ def speak(text, block=False, interrupt=True, clean=True):
326437
- Automatic chunking of long texts
327438
- Non-blocking operation with optional blocking mode
328439
- Interruption of current speech
440+
- Background processing of TTS requests
329441
330442
Args:
331443
text: Text to speak
332444
block: If True, wait for audio to finish playing
333445
interrupt: If True, stop current speech and clear queue before speaking
446+
clean: If True, clean text for speech (remove markup, emojis, etc.)
334447
335448
Example:
336449
>>> from gptme.tools.tts import speak, set_speed, set_volume
@@ -346,67 +459,29 @@ def speak(text, block=False, interrupt=True, clean=True):
346459

347460
# Stop current speech if requested
348461
if interrupt:
349-
clear_queue()
350-
351-
# Split text into chunks if needed
352-
chunks = split_text(text)
353-
chunks = [c.replace("gptme", "gpt-me") for c in chunks] # Fix pronunciation
462+
stop()
354463

355464
try:
356-
# Get output device and sample rate
357-
output_device, device_sr = get_output_device()
465+
# Split text into chunks
466+
chunks = split_text(text)
467+
chunks = [c.replace("gptme", "gpt-me") for c in chunks] # Fix pronunciation
358468

359-
# Ensure playback thread is running
360-
ensure_playback_thread()
469+
# Ensure both threads are running
470+
ensure_threads()
361471

362-
# Process each chunk
472+
# Queue chunks for processing
363473
for chunk in chunks:
364-
if not chunk.strip().strip("`"):
365-
continue
366-
367-
# Make request to the TTS server
368-
url = f"http://{host}:{port}/tts"
369-
params = {"text": chunk, "speed": current_speed}
370-
if voice := os.getenv("GPTME_TTS_VOICE"):
371-
params["voice"] = voice
372-
373-
try:
374-
response = requests.get(url, params=params)
375-
except requests.exceptions.ConnectionError:
376-
log.warning(f"TTS server was not available at {url}")
377-
return
378-
379-
if response.status_code != 200:
380-
log.error(f"TTS server returned status {response.status_code}")
381-
if response.content:
382-
log.error(f"Error content: {response.content.decode()}")
383-
continue
384-
385-
# Convert response to audio
386-
audio_data = io.BytesIO(response.content)
387-
sample_rate, data = wavfile.read(audio_data)
388-
389-
log.debug(
390-
f"Audio: {len(data)} samples at {sample_rate}Hz ({len(data)/sample_rate:.2f} seconds)"
391-
)
392-
393-
# Resample if needed
394-
if sample_rate != device_sr:
395-
data = resample_audio(data, sample_rate, device_sr)
396-
sample_rate = device_sr
397-
398-
# Normalize audio to float32 in range [-1, 1]
399-
if data.dtype != np.float32:
400-
data = data.astype(np.float32) / np.iinfo(data.dtype).max
401-
402-
# Queue audio for playback
403-
audio_queue.put((data, sample_rate))
474+
if chunk.strip():
475+
tts_request_queue.put(chunk)
404476

405477
if block:
406-
audio_queue.join() # Wait for audio to finish playing
478+
# Wait for all TTS processing to complete
479+
tts_request_queue.join()
480+
# Then wait for all audio to finish playing
481+
audio_queue.join()
407482

408483
except Exception as e:
409-
log.error(f"Failed to speak text: {e}")
484+
log.error(f"Failed to queue text for speech: {e}")
410485

411486

412487
tool = ToolSpec(

0 commit comments

Comments
 (0)