Zg/tts python livekit example (#173)

zgreathouse · web-flow · commit 6a4c09057e23 · 2025-05-21T21:02:27.000Z
diff --git a/tts/tts-python-livekit/.env.example b/tts/tts-python-livekit/.env.example
@@ -0,0 +1,7 @@
+HUME_API_KEY=
+ANTHROPIC_API_KEY=
+GROQ_API_KEY=
+
+LIVEKIT_URL=
+LIVEKIT_API_KEY=
+LIVEKIT_API_SECRET=
diff --git a/tts/tts-python-livekit/.gitignore b/tts/tts-python-livekit/.gitignore
@@ -0,0 +1,30 @@
+# Python internals
+__pycache__/
+*.py[cod]
+*.pyd
+*.pyo
+*.so
+*.dylib
+
+#  Virtual-environment directories
+.venv/
+venv/
+env/
+
+#  Secrets & local config
+.env
+
+#  Build / packaging artefacts
+build/
+dist/
+*.egg-info/
+*.egg
+*.whl
+
+#  Logs & runtime files
+*.log
+logs/
+
+#  OS-specific noise
+.DS_Store
+Thumbs.db
diff --git a/tts/tts-python-livekit/.python-version b/tts/tts-python-livekit/.python-version
@@ -0,0 +1 @@
+3.11
diff --git a/tts/tts-python-livekit/README.md b/tts/tts-python-livekit/README.md
@@ -0,0 +1,69 @@
+<div align="center">
+  <img src="https://storage.googleapis.com/hume-public-logos/hume/hume-banner.png">
+  <h1>Text-to-Speech | Python LiveKit Agents Example</h1>
+</div>
+
+## Overview
+
+This example demonstrates how to use the **Hume Python LiveKit plugin** to integrate:
+
+1. **Speech-to-Text with Voice Activity Detection (VAD)** (Silero VAD + Groq Whisper)
+2. **A conversational LLM** (Anthropic Claude Haiku)
+3. **Low-latency Text-to-Speech** (Hume AI's streaming API for Octave)
+
+…inside a LiveKit Agents worker that runs in **console mode** by default. No front-end required—just your terminal and microphone.
+
+## Instructions
+
+1. **Clone this examples repository**
+
+   ```sh
+   git clone https://github.com/humeai/hume-api-examples
+   cd hume-api-examples/tts/tts-python-livekit
+   ```
+
+2. **Set up the environment**
+
+   We recommend `uv`, but you can adapt these commands to your preferred package manager.
+
+   ```sh
+   uv sync
+   ```
+
+3. **Configure your API keys**
+
+   You’ll need accounts and credentials for:
+
+   - **Hume AI**: https://platform.hume.ai
+   - **Anthropic**: https://console.anthropic.com
+   - **Groq**: https://console.groq.com
+   - **LiveKit**: https://livekit.com
+
+   Copy the example and fill in your credentials:
+
+   ```sh
+   cp .env.example .env
+   ```
+
+   Edit `.env` to include:
+
+   ```dotenv
+   HUME_API_KEY=…        # from Hume AI
+   GROQ_API_KEY=…        # from Groq console
+   ANTHROPIC_API_KEY=…   # from Anthropic console
+   LIVEKIT_URL=…         # your LiveKit deployment URL
+   LIVEKIT_API_KEY=…     # your LiveKit API key
+   LIVEKIT_API_SECRET=…  # your LiveKit API secret
+   ```
+
+4. **Run the demo**
+
+   Start the console-based assistant and begin talking:
+
+   ```sh
+   uv run python main.py
+   ```
+
+   Speak into your mic and the assistant will respond.
+
+   > **Optional**: Tweak additional demo settings in `settings.py` (e.g. models, prompt, voice, VAD thresholds).
diff --git a/tts/tts-python-livekit/main.py b/tts/tts-python-livekit/main.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+"""
+Demo: LiveKit Agents with STT (Groq), LLM (Claude Haiku), and TTS (Hume).
+"""
+
+import sys
+
+# third-party
+from livekit.agents import Agent, AgentSession, JobContext, WorkerOptions, cli
+from livekit.agents.stt.stream_adapter import StreamAdapter
+from livekit.plugins import hume, groq, anthropic, silero
+
+# local
+from utils import validate_env_vars
+from settings import (
+    STT_MODEL,
+    LLM_MODEL,
+    LLM_TEMPERATURE,
+    LLM_PROMPT,
+    GREETING,
+    HUME_VOICE,
+    VAD_SPEECH_DURATION,
+    VAD_SILENCE_DURATION,
+)
+
+
+class VoiceAssistant(Agent):
+    """Agent using the voice-assistant prompt."""
+    def __init__(self):
+        super().__init__(instructions=LLM_PROMPT)
+
+
+async def entrypoint(ctx: JobContext):
+    """Configure and run STT, LLM, and TTS in a LiveKit session."""
+    await ctx.connect()
+
+    # voice-activity detection + buffering for non-streaming STT
+    vad = silero.VAD.load(
+        min_speech_duration=VAD_SPEECH_DURATION,
+        min_silence_duration=VAD_SILENCE_DURATION
+    )
+    stt = StreamAdapter(
+        stt=groq.STT(model=STT_MODEL, language="en"),
+        vad=vad,
+    )
+
+    # assemble the pipeline
+    session = AgentSession(
+        vad=vad,
+        stt=stt,
+        llm=anthropic.LLM(model=LLM_MODEL, temperature=LLM_TEMPERATURE),
+        tts=hume.TTS(voice=HUME_VOICE, instant_mode=True),
+    )
+
+    await session.start(agent=VoiceAssistant(), room=ctx.room)
+    await session.generate_reply(instructions=GREETING)
+
+
+def main():
+    """Validate env vars, default to console mode, then launch the worker."""
+    validate_env_vars()  # fail fast if keys/URLs are missing
+
+    if len(sys.argv) == 1:
+        sys.argv.append("console")
+    cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tts/tts-python-livekit/pyproject.toml b/tts/tts-python-livekit/pyproject.toml
@@ -0,0 +1,13 @@
+[project]
+name = "tts-python-livekit"
+version = "0.1.0"
+description = "LiveKit Agents voice assistant example using Groq STT, Anthropic Claude, and Hume TTS"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "livekit-agents[hume]>=1.0.20",
+    "livekit-plugins-anthropic>=1.0.20",
+    "livekit-plugins-groq>=1.0.20",
+    "livekit-plugins-silero>=1.0.20",
+    "python-dotenv>=1.1.0",
+]
diff --git a/tts/tts-python-livekit/settings.py b/tts/tts-python-livekit/settings.py
@@ -0,0 +1,54 @@
+"""Settings for the LiveKit Agents demo."""
+
+# — STT (speech-to-text) — 
+# The Groq Whisper model used for transcribing incoming audio
+STT_MODEL = "whisper-large-v3-turbo"
+
+
+# — LLM (language model) —
+# The Anthropic Claude model for generating replies
+LLM_MODEL = "claude-3-5-haiku-latest"
+# How "creative" the LLM should be: 0.0 = fully deterministic, higher = more varied
+LLM_TEMPERATURE = 0.5
+# The system prompt passed to the LLM at startup to set persona & tone
+LLM_PROMPT = """\
+VOICE ASSISTANT GUIDELINES
+
+CORE IDENTITY:
+- Helpful, professional voice assistant communicating via audio
+- Warm, conversational tone using short, clear sentences
+- No references to underlying model or implementation
+
+INTERACTION PATTERN:
+- Keep responses concise (~50 words/30 seconds of spoken audio)
+- Provide longer responses only when explicitly requested
+- Ask one focused follow-up question if user request is unclear
+- When interrupted, stop immediately and respond to new input
+
+INFORMATION HANDLING:
+- Prioritize accuracy over completeness
+- Acknowledge uncertainty rather than guessing
+- When unsure, offer to suggest next steps
+"""
+
+
+# — TTS (text-to-speech) —
+# Pick a voice in the Hume Voice Library
+# https://platform.hume.ai/tts/voice-library
+# Use "HUME_AI" for Hume library voices or "CUSTOM_VOICE" for voices you’ve created
+HUME_VOICE = {
+    "name": "Male English Actor",
+    "provider": "HUME_AI",
+}
+
+
+# — Initial greeting —
+# Instructions for the agent's first reply on startup. main.py passes this to
+# session.generate_reply(instructions=...), so it is phrased as a directive to
+# the LLM rather than as the literal text to speak.
+GREETING = "Say 'Hi there! How can I help you today?'"
+
+
+# — VAD (voice-activity detection) —
+# Minimum seconds of continuous speech before sending to STT
+VAD_SPEECH_DURATION = 0.1
+# Minimum seconds of silence to mark the end of a speech segment
+VAD_SILENCE_DURATION = 0.5
diff --git a/tts/tts-python-livekit/utils.py b/tts/tts-python-livekit/utils.py
@@ -0,0 +1,43 @@
+"""
+Utility functions for the LiveKit Agents demo.
+"""
+
+import os
+import sys
+
+# third-party
+from dotenv import load_dotenv
+
+
+# Environment variables required to run the demo
+REQUIRED_ENV_VARS = [
+    "HUME_API_KEY",
+    "GROQ_API_KEY",
+    "ANTHROPIC_API_KEY",
+    "LIVEKIT_URL",
+    "LIVEKIT_API_KEY",
+    "LIVEKIT_API_SECRET",
+]
+
+
+def validate_env_vars():
+    """
+    Load environment variables from .env, then ensure all REQUIRED_ENV_VARS are set.
+    If any are missing, exit with a helpful message pointing to the .env.example file.
+    """
+    # Load from .env into environment
+    load_dotenv(override=True)
+
+    # Check which vars are missing
+    missing = [var for var in REQUIRED_ENV_VARS if not os.getenv(var)]
+    if missing:
+        example_filename = ".env.example"
+        message = (
+            "\nERROR: Missing environment variables: "
+            + ", ".join(missing)
+            + "\n\nPlease create a .env file in the project root "
+            + f"based on {example_filename} and fill in the values:\n\n"
+            + "\n".join(f"  {var}=" for var in REQUIRED_ENV_VARS)
+            + "\n"
+        )
+        sys.exit(message)
diff --git a/tts/tts-python-livekit/uv.lock b/tts/tts-python-livekit/uv.lock