Skip to content

feat(speech-to-speech) #463

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 121 additions & 0 deletions apps/sim/app/api/proxy/tts/stream/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import type { NextRequest } from 'next/server'
import { env } from '@/lib/env'
import { createLogger } from '@/lib/logs/console-logger'

// Route-scoped logger; the tag identifies this proxy route in log output.
const logger = createLogger('ProxyTTSStreamAPI')

/**
 * Proxies a text-to-speech request to the ElevenLabs streaming endpoint and
 * relays the audio stream back to the client.
 *
 * Expects a JSON body: `{ text, voiceId, modelId? }`. The ElevenLabs API key
 * is read from server-side env (`ELEVENLABS_API_KEY`) so clients never supply
 * or see the key.
 *
 * @param request - Incoming POST request with the TTS parameters as JSON.
 * @returns 200 with an `audio/mpeg` stream on success; 400 for a bad body,
 *          503 when the API key is not configured, the upstream status on
 *          upstream failure, 422 when upstream returns no body, 500 otherwise.
 */
export async function POST(request: NextRequest) {
  try {
    // Malformed JSON is a client error — report 400, not a generic 500.
    let body: { text?: string; voiceId?: string; modelId?: string }
    try {
      body = await request.json()
    } catch {
      return new Response('Invalid JSON body', { status: 400 })
    }

    const { text, voiceId, modelId = 'eleven_turbo_v2_5' } = body

    if (!text || !voiceId) {
      return new Response('Missing required parameters', { status: 400 })
    }

    // Use server-side API key instead of client-provided key.
    const apiKey = env.ELEVENLABS_API_KEY
    if (!apiKey) {
      logger.error('ELEVENLABS_API_KEY not configured on server')
      return new Response('ElevenLabs service not configured', { status: 503 })
    }

    const endpoint = `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}/stream`

    const response = await fetch(endpoint, {
      method: 'POST',
      headers: {
        Accept: 'audio/mpeg',
        'Content-Type': 'application/json',
        'xi-api-key': apiKey,
      },
      body: JSON.stringify({
        text,
        model_id: modelId,
        // Maximum performance settings
        optimize_streaming_latency: 4,
        output_format: 'mp3_22050_32', // Fastest format
        voice_settings: {
          stability: 0.5,
          similarity_boost: 0.8,
          style: 0.0,
          use_speaker_boost: false,
        },
        enable_ssml_parsing: false,
        apply_text_normalization: 'off',
        // Use auto mode for fastest possible streaming
        // Note: This may sacrifice some quality for speed
        use_pvc_as_ivc: false, // Use fastest voice processing
      }),
    })

    if (!response.ok) {
      logger.error(`Failed to generate Stream TTS: ${response.status} ${response.statusText}`)
      return new Response(`Failed to generate TTS: ${response.status} ${response.statusText}`, {
        status: response.status,
      })
    }

    if (!response.body) {
      logger.error('No response body received from ElevenLabs')
      return new Response('No audio stream received', { status: 422 })
    }

    // Pass the upstream ReadableStream straight through as the response body.
    // This preserves backpressure and error/cancellation propagation, unlike
    // a manual reader/writer pump with fire-and-forget writes (which drops
    // write errors and buffers without regard to the consumer).
    //
    // Note: Transfer-Encoding is intentionally NOT set — it is a forbidden
    // header for fetch Responses; the runtime manages transfer framing.
    return new Response(response.body, {
      headers: {
        'Content-Type': 'audio/mpeg',
        'Cache-Control': 'no-cache, no-store, must-revalidate',
        Pragma: 'no-cache',
        Expires: '0',
        'X-Content-Type-Options': 'nosniff',
        'Access-Control-Allow-Origin': '*',
        Connection: 'keep-alive',
        // Stream headers for better streaming
        'X-Accel-Buffering': 'no', // Disable nginx buffering
        'X-Stream-Type': 'real-time',
      },
    })
  } catch (error) {
    logger.error('Error in Stream TTS:', error)

    return new Response(
      `Internal Server Error: ${error instanceof Error ? error.message : 'Unknown error'}`,
      { status: 500 }
    )
  }
}
160 changes: 146 additions & 14 deletions apps/sim/app/chat/[subdomain]/chat-client.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import { type RefObject, useCallback, useEffect, useRef, useState } from 'react'
import { v4 as uuidv4 } from 'uuid'
import { createLogger } from '@/lib/logs/console-logger'
import { getFormattedGitHubStars } from '@/app/(landing)/actions/github'
import EmailAuth from './components/auth/email/email-auth'
import PasswordAuth from './components/auth/password/password-auth'
Expand All @@ -11,8 +12,12 @@ import { ChatInput } from './components/input/input'
import { ChatLoadingState } from './components/loading-state/loading-state'
import type { ChatMessage } from './components/message/message'
import { ChatMessageContainer } from './components/message-container/message-container'
import { VoiceInterface } from './components/voice-interface/voice-interface'
import { useAudioStreaming } from './hooks/use-audio-streaming'
import { useChatStreaming } from './hooks/use-chat-streaming'

const logger = createLogger('ChatClient')

interface ChatConfig {
id: string
title: string
Expand All @@ -26,6 +31,10 @@ interface ChatConfig {
authType?: 'public' | 'password' | 'email'
}

const DEFAULT_VOICE_SETTINGS = {
voiceId: 'EXAVITQu4vr4xnSDxMaL', // Default ElevenLabs voice (Bella)
}

function throttle<T extends (...args: any[]) => any>(func: T, delay: number): T {
let timeoutId: NodeJS.Timeout | null = null
let lastExecTime = 0
Expand Down Expand Up @@ -60,19 +69,17 @@ export default function ChatClient({ subdomain }: { subdomain: string }) {
const [starCount, setStarCount] = useState('3.4k')
const [conversationId, setConversationId] = useState('')

// Simple state for showing scroll button
const [showScrollButton, setShowScrollButton] = useState(false)

// Track if user has manually scrolled during response
const [userHasScrolled, setUserHasScrolled] = useState(false)
const isUserScrollingRef = useRef(false)

// Authentication state
const [authRequired, setAuthRequired] = useState<'password' | 'email' | null>(null)

// Use the custom streaming hook
const [isVoiceFirstMode, setIsVoiceFirstMode] = useState(false)
const { isStreamingResponse, abortControllerRef, stopStreaming, handleStreamedResponse } =
useChatStreaming()
const audioContextRef = useRef<AudioContext | null>(null)
const { isPlayingAudio, streamTextToAudio, stopAudio } = useAudioStreaming(audioContextRef)

const scrollToBottom = useCallback(() => {
if (messagesEndRef.current) {
Expand Down Expand Up @@ -193,7 +200,7 @@ export default function ChatClient({ subdomain }: { subdomain: string }) {
])
}
} catch (error) {
console.error('Error fetching chat config:', error)
logger.error('Error fetching chat config:', error)
setError('This chat is currently unavailable. Please try again later.')
}
}
Expand All @@ -208,7 +215,7 @@ export default function ChatClient({ subdomain }: { subdomain: string }) {
setStarCount(formattedStars)
})
.catch((err) => {
console.error('Failed to fetch GitHub stars:', err)
logger.error('Failed to fetch GitHub stars:', err)
})
}, [subdomain])

Expand All @@ -224,7 +231,7 @@ export default function ChatClient({ subdomain }: { subdomain: string }) {
}

// Handle sending a message
const handleSendMessage = async (messageParam?: string) => {
const handleSendMessage = async (messageParam?: string, isVoiceInput = false) => {
const messageToSend = messageParam ?? inputValue
if (!messageToSend.trim() || isLoading) return

Expand Down Expand Up @@ -278,18 +285,44 @@ export default function ChatClient({ subdomain }: { subdomain: string }) {
const contentType = response.headers.get('Content-Type') || ''

if (contentType.includes('text/plain')) {
// Handle streaming response - pass the current userHasScrolled value
// Prepare audio streaming handler if voice mode is enabled
const shouldPlayAudio = isVoiceInput || isVoiceFirstMode

const audioStreamHandler = shouldPlayAudio
? async (text: string) => {
try {
await streamTextToAudio(text, {
voiceId: DEFAULT_VOICE_SETTINGS.voiceId,
onError: (error) => {
logger.error('Audio streaming error:', error)
},
})
} catch (error) {
logger.error('TTS error:', error)
}
}
: undefined

// Handle streaming response with audio support
await handleStreamedResponse(
response,
setMessages,
setIsLoading,
scrollToBottom,
userHasScrolled
userHasScrolled,
{
voiceSettings: {
isVoiceEnabled: true,
voiceId: DEFAULT_VOICE_SETTINGS.voiceId,
autoPlayResponses: isVoiceInput || isVoiceFirstMode,
},
audioStreamHandler,
}
)
} else {
// Fallback to JSON response handling
const responseData = await response.json()
console.log('Message response:', responseData)
logger.info('Message response:', responseData)

// Handle different response formats from API
if (
Expand Down Expand Up @@ -321,6 +354,23 @@ export default function ChatClient({ subdomain }: { subdomain: string }) {

// Add all messages at once
setMessages((prev) => [...prev, ...assistantMessages])

// Play audio for the full response if voice mode is enabled
if (isVoiceInput || isVoiceFirstMode) {
const fullContent = assistantMessages.map((m: ChatMessage) => m.content).join(' ')
if (fullContent.trim()) {
try {
await streamTextToAudio(fullContent, {
voiceId: DEFAULT_VOICE_SETTINGS.voiceId,
onError: (error) => {
logger.error('Audio playback error:', error)
},
})
} catch (error) {
logger.error('TTS error:', error)
}
}
}
} else {
// Handle single output as before
let messageContent = responseData.output
Expand Down Expand Up @@ -349,10 +399,29 @@ export default function ChatClient({ subdomain }: { subdomain: string }) {
}

setMessages((prev) => [...prev, assistantMessage])

// Play audio for the response if voice mode is enabled
if ((isVoiceInput || isVoiceFirstMode) && assistantMessage.content) {
const contentString =
typeof assistantMessage.content === 'string'
? assistantMessage.content
: JSON.stringify(assistantMessage.content)

try {
await streamTextToAudio(contentString, {
voiceId: DEFAULT_VOICE_SETTINGS.voiceId,
onError: (error) => {
logger.error('Audio playback error:', error)
},
})
} catch (error) {
logger.error('TTS error:', error)
}
}
}
}
} catch (error) {
console.error('Error sending message:', error)
logger.error('Error sending message:', error)

const errorMessage: ChatMessage = {
id: crypto.randomUUID(),
Expand All @@ -367,6 +436,46 @@ export default function ChatClient({ subdomain }: { subdomain: string }) {
}
}

// Stop audio when component unmounts or when streaming is stopped
useEffect(() => {
return () => {
stopAudio()
if (audioContextRef.current && audioContextRef.current.state !== 'closed') {
audioContextRef.current.close()
}
}
}, [stopAudio])

// Voice interruption - stop audio when user starts speaking
const handleVoiceInterruption = useCallback(() => {
// Stop audio playback immediately
stopAudio()

// Stop any ongoing streaming response
if (isStreamingResponse) {
stopStreaming(setMessages)
}
}, [isStreamingResponse, stopStreaming, setMessages, stopAudio])

// Handle voice mode activation
const handleVoiceStart = useCallback(() => {
setIsVoiceFirstMode(true)
}, [])

// Handle exiting voice mode
const handleExitVoiceMode = useCallback(() => {
setIsVoiceFirstMode(false)
stopAudio() // Stop any playing audio when exiting
}, [stopAudio])

// Handle voice transcript from voice-first interface
const handleVoiceTranscript = useCallback(
(transcript: string) => {
handleSendMessage(transcript, true)
},
[handleSendMessage]
)

// If error, show error message using the extracted component
if (error) {
return <ChatErrorState error={error} starCount={starCount} />
Expand Down Expand Up @@ -405,6 +514,27 @@ export default function ChatClient({ subdomain }: { subdomain: string }) {
return <ChatLoadingState />
}

// Voice-first mode interface
if (isVoiceFirstMode) {
return (
<VoiceInterface
onCallEnd={handleExitVoiceMode}
onVoiceTranscript={handleVoiceTranscript}
onVoiceStart={() => {}}
onVoiceEnd={() => {}}
onInterrupt={handleVoiceInterruption}
isStreaming={isStreamingResponse}
isPlayingAudio={isPlayingAudio}
audioContextRef={audioContextRef}
messages={messages.map((msg) => ({
content: typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content),
type: msg.type,
}))}
/>
)
}

// Standard text-based chat interface
return (
<div className='fixed inset-0 z-[100] flex flex-col bg-background'>
{/* Header component */}
Expand All @@ -426,11 +556,13 @@ export default function ChatClient({ subdomain }: { subdomain: string }) {
<div className='relative p-4 pb-6'>
<div className='relative mx-auto max-w-3xl'>
<ChatInput
onSubmit={(value) => {
void handleSendMessage(value)
onSubmit={(value, isVoiceInput) => {
void handleSendMessage(value, isVoiceInput)
}}
isStreaming={isStreamingResponse}
onStopStreaming={() => stopStreaming(setMessages)}
onVoiceStart={handleVoiceStart}
onInterrupt={handleVoiceInterruption}
/>
</div>
</div>
Expand Down
Loading