Skip to content

feat(speech-to-speech) #463

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 121 additions & 0 deletions apps/sim/app/api/proxy/tts/stream/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import type { NextRequest } from 'next/server'
import { env } from '@/lib/env'
import { createLogger } from '@/lib/logs/console-logger'

// Route-scoped logger; the tag identifies this proxy route in log output.
const logger = createLogger('ProxyTTSStreamAPI')

/**
 * Proxies a text-to-speech request to the ElevenLabs streaming endpoint and
 * relays the audio stream back to the client.
 *
 * Expects a JSON body: `{ text, voiceId, modelId? }`. The ElevenLabs API key
 * is read from server-side env (`ELEVENLABS_API_KEY`) so clients never supply
 * or see the key.
 *
 * @param request - Incoming POST request with the TTS parameters as JSON.
 * @returns 200 with an `audio/mpeg` stream on success; 400 for a bad body,
 *          503 when the API key is not configured, the upstream status on
 *          upstream failure, 422 when upstream returns no body, 500 otherwise.
 */
export async function POST(request: NextRequest) {
  try {
    // Malformed JSON is a client error — report 400, not a generic 500.
    let body: { text?: string; voiceId?: string; modelId?: string }
    try {
      body = await request.json()
    } catch {
      return new Response('Invalid JSON body', { status: 400 })
    }

    const { text, voiceId, modelId = 'eleven_turbo_v2_5' } = body

    if (!text || !voiceId) {
      return new Response('Missing required parameters', { status: 400 })
    }

    // Use server-side API key instead of client-provided key.
    const apiKey = env.ELEVENLABS_API_KEY
    if (!apiKey) {
      logger.error('ELEVENLABS_API_KEY not configured on server')
      return new Response('ElevenLabs service not configured', { status: 503 })
    }

    const endpoint = `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}/stream`

    const response = await fetch(endpoint, {
      method: 'POST',
      headers: {
        Accept: 'audio/mpeg',
        'Content-Type': 'application/json',
        'xi-api-key': apiKey,
      },
      body: JSON.stringify({
        text,
        model_id: modelId,
        // Maximum performance settings
        optimize_streaming_latency: 4,
        output_format: 'mp3_22050_32', // Fastest format
        voice_settings: {
          stability: 0.5,
          similarity_boost: 0.8,
          style: 0.0,
          use_speaker_boost: false,
        },
        enable_ssml_parsing: false,
        apply_text_normalization: 'off',
        // Use auto mode for fastest possible streaming
        // Note: This may sacrifice some quality for speed
        use_pvc_as_ivc: false, // Use fastest voice processing
      }),
    })

    if (!response.ok) {
      logger.error(`Failed to generate Stream TTS: ${response.status} ${response.statusText}`)
      return new Response(`Failed to generate TTS: ${response.status} ${response.statusText}`, {
        status: response.status,
      })
    }

    if (!response.body) {
      logger.error('No response body received from ElevenLabs')
      return new Response('No audio stream received', { status: 422 })
    }

    // Pass the upstream ReadableStream straight through as the response body.
    // This preserves backpressure and error/cancellation propagation, unlike
    // a manual reader/writer pump with fire-and-forget writes (which drops
    // write errors and buffers without regard to the consumer).
    //
    // Note: Transfer-Encoding is intentionally NOT set — it is a forbidden
    // header for fetch Responses; the runtime manages transfer framing.
    return new Response(response.body, {
      headers: {
        'Content-Type': 'audio/mpeg',
        'Cache-Control': 'no-cache, no-store, must-revalidate',
        Pragma: 'no-cache',
        Expires: '0',
        'X-Content-Type-Options': 'nosniff',
        'Access-Control-Allow-Origin': '*',
        Connection: 'keep-alive',
        // Stream headers for better streaming
        'X-Accel-Buffering': 'no', // Disable nginx buffering
        'X-Stream-Type': 'real-time',
      },
    })
  } catch (error) {
    logger.error('Error in Stream TTS:', error)

    return new Response(
      `Internal Server Error: ${error instanceof Error ? error.message : 'Unknown error'}`,
      { status: 500 }
    )
  }
}
160 changes: 146 additions & 14 deletions apps/sim/app/chat/[subdomain]/chat-client.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import { type RefObject, useCallback, useEffect, useRef, useState } from 'react'
import { v4 as uuidv4 } from 'uuid'
import { createLogger } from '@/lib/logs/console-logger'
import { getFormattedGitHubStars } from '@/app/(landing)/actions/github'
import EmailAuth from './components/auth/email/email-auth'
import PasswordAuth from './components/auth/password/password-auth'
Expand All @@ -11,8 +12,12 @@ import { ChatInput } from './components/input/input'
import { ChatLoadingState } from './components/loading-state/loading-state'
import type { ChatMessage } from './components/message/message'
import { ChatMessageContainer } from './components/message-container/message-container'
import { VoiceInterface } from './components/voice-interface/voice-interface'
import { useAudioStreaming } from './hooks/use-audio-streaming'
import { useChatStreaming } from './hooks/use-chat-streaming'

const logger = createLogger('ChatClient')

interface ChatConfig {
id: string
title: string
Expand All @@ -26,6 +31,10 @@ interface ChatConfig {
authType?: 'public' | 'password' | 'email'
}

const DEFAULT_VOICE_SETTINGS = {
voiceId: 'EXAVITQu4vr4xnSDxMaL', // Default ElevenLabs voice (Bella)
}

function throttle<T extends (...args: any[]) => any>(func: T, delay: number): T {
let timeoutId: NodeJS.Timeout | null = null
let lastExecTime = 0
Expand Down Expand Up @@ -60,19 +69,17 @@ export default function ChatClient({ subdomain }: { subdomain: string }) {
const [starCount, setStarCount] = useState('3.4k')
const [conversationId, setConversationId] = useState('')

// Simple state for showing scroll button
const [showScrollButton, setShowScrollButton] = useState(false)

// Track if user has manually scrolled during response
const [userHasScrolled, setUserHasScrolled] = useState(false)
const isUserScrollingRef = useRef(false)

// Authentication state
const [authRequired, setAuthRequired] = useState<'password' | 'email' | null>(null)

// Use the custom streaming hook
const [isVoiceFirstMode, setIsVoiceFirstMode] = useState(false)
const { isStreamingResponse, abortControllerRef, stopStreaming, handleStreamedResponse } =
useChatStreaming()
const audioContextRef = useRef<AudioContext | null>(null)
const { isPlayingAudio, streamTextToAudio, stopAudio } = useAudioStreaming(audioContextRef)

const scrollToBottom = useCallback(() => {
if (messagesEndRef.current) {
Expand Down Expand Up @@ -193,7 +200,7 @@ export default function ChatClient({ subdomain }: { subdomain: string }) {
])
}
} catch (error) {
console.error('Error fetching chat config:', error)
logger.error('Error fetching chat config:', error)
setError('This chat is currently unavailable. Please try again later.')
}
}
Expand All @@ -208,7 +215,7 @@ export default function ChatClient({ subdomain }: { subdomain: string }) {
setStarCount(formattedStars)
})
.catch((err) => {
console.error('Failed to fetch GitHub stars:', err)
logger.error('Failed to fetch GitHub stars:', err)
})
}, [subdomain])

Expand All @@ -224,7 +231,7 @@ export default function ChatClient({ subdomain }: { subdomain: string }) {
}

// Handle sending a message
const handleSendMessage = async (messageParam?: string) => {
const handleSendMessage = async (messageParam?: string, isVoiceInput = false) => {
const messageToSend = messageParam ?? inputValue
if (!messageToSend.trim() || isLoading) return

Expand Down Expand Up @@ -278,18 +285,44 @@ export default function ChatClient({ subdomain }: { subdomain: string }) {
const contentType = response.headers.get('Content-Type') || ''

if (contentType.includes('text/plain')) {
// Handle streaming response - pass the current userHasScrolled value
// Prepare audio streaming handler if voice mode is enabled
const shouldPlayAudio = isVoiceInput || isVoiceFirstMode

const audioStreamHandler = shouldPlayAudio
? async (text: string) => {
try {
await streamTextToAudio(text, {
voiceId: DEFAULT_VOICE_SETTINGS.voiceId,
onError: (error) => {
logger.error('Audio streaming error:', error)
},
})
} catch (error) {
logger.error('TTS error:', error)
}
}
: undefined

// Handle streaming response with audio support
await handleStreamedResponse(
response,
setMessages,
setIsLoading,
scrollToBottom,
userHasScrolled
userHasScrolled,
{
voiceSettings: {
isVoiceEnabled: true,
voiceId: DEFAULT_VOICE_SETTINGS.voiceId,
autoPlayResponses: isVoiceInput || isVoiceFirstMode,
},
audioStreamHandler,
}
)
} else {
// Fallback to JSON response handling
const responseData = await response.json()
console.log('Message response:', responseData)
logger.info('Message response:', responseData)

// Handle different response formats from API
if (
Expand Down Expand Up @@ -321,6 +354,23 @@ export default function ChatClient({ subdomain }: { subdomain: string }) {

// Add all messages at once
setMessages((prev) => [...prev, ...assistantMessages])

// Play audio for the full response if voice mode is enabled
if (isVoiceInput || isVoiceFirstMode) {
const fullContent = assistantMessages.map((m: ChatMessage) => m.content).join(' ')
if (fullContent.trim()) {
try {
await streamTextToAudio(fullContent, {
voiceId: DEFAULT_VOICE_SETTINGS.voiceId,
onError: (error) => {
logger.error('Audio playback error:', error)
},
})
} catch (error) {
logger.error('TTS error:', error)
}
}
}
} else {
// Handle single output as before
let messageContent = responseData.output
Expand Down Expand Up @@ -349,10 +399,29 @@ export default function ChatClient({ subdomain }: { subdomain: string }) {
}

setMessages((prev) => [...prev, assistantMessage])

// Play audio for the response if voice mode is enabled
if ((isVoiceInput || isVoiceFirstMode) && assistantMessage.content) {
const contentString =
typeof assistantMessage.content === 'string'
? assistantMessage.content
: JSON.stringify(assistantMessage.content)

try {
await streamTextToAudio(contentString, {
voiceId: DEFAULT_VOICE_SETTINGS.voiceId,
onError: (error) => {
logger.error('Audio playback error:', error)
},
})
} catch (error) {
logger.error('TTS error:', error)
}
}
}
}
} catch (error) {
console.error('Error sending message:', error)
logger.error('Error sending message:', error)

const errorMessage: ChatMessage = {
id: crypto.randomUUID(),
Expand All @@ -367,6 +436,46 @@ export default function ChatClient({ subdomain }: { subdomain: string }) {
}
}

// Stop audio when component unmounts or when streaming is stopped
useEffect(() => {
return () => {
stopAudio()
if (audioContextRef.current && audioContextRef.current.state !== 'closed') {
audioContextRef.current.close()
}
}
}, [stopAudio])

// Voice interruption - stop audio when user starts speaking
const handleVoiceInterruption = useCallback(() => {
// Stop audio playback immediately
stopAudio()

// Stop any ongoing streaming response
if (isStreamingResponse) {
stopStreaming(setMessages)
}
}, [isStreamingResponse, stopStreaming, setMessages, stopAudio])

// Handle voice mode activation
const handleVoiceStart = useCallback(() => {
setIsVoiceFirstMode(true)
}, [])

// Handle exiting voice mode
const handleExitVoiceMode = useCallback(() => {
setIsVoiceFirstMode(false)
stopAudio() // Stop any playing audio when exiting
}, [stopAudio])

// Handle voice transcript from voice-first interface
const handleVoiceTranscript = useCallback(
(transcript: string) => {
handleSendMessage(transcript, true)
},
[handleSendMessage]
)

// If error, show error message using the extracted component
if (error) {
return <ChatErrorState error={error} starCount={starCount} />
Expand Down Expand Up @@ -405,6 +514,27 @@ export default function ChatClient({ subdomain }: { subdomain: string }) {
return <ChatLoadingState />
}

// Voice-first mode interface
if (isVoiceFirstMode) {
return (
<VoiceInterface
onCallEnd={handleExitVoiceMode}
onVoiceTranscript={handleVoiceTranscript}
onVoiceStart={() => {}}
onVoiceEnd={() => {}}
onInterrupt={handleVoiceInterruption}
isStreaming={isStreamingResponse}
isPlayingAudio={isPlayingAudio}
audioContextRef={audioContextRef}
messages={messages.map((msg) => ({
content: typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content),
type: msg.type,
}))}
/>
)
}

// Standard text-based chat interface
return (
<div className='fixed inset-0 z-[100] flex flex-col bg-background'>
{/* Header component */}
Expand All @@ -426,11 +556,13 @@ export default function ChatClient({ subdomain }: { subdomain: string }) {
<div className='relative p-4 pb-6'>
<div className='relative mx-auto max-w-3xl'>
<ChatInput
onSubmit={(value) => {
void handleSendMessage(value)
onSubmit={(value, isVoiceInput) => {
void handleSendMessage(value, isVoiceInput)
}}
isStreaming={isStreamingResponse}
onStopStreaming={() => stopStreaming(setMessages)}
onVoiceStart={handleVoiceStart}
onInterrupt={handleVoiceInterruption}
/>
</div>
</div>
Expand Down
Loading