Skip to content

Commit a1da931

Browse files
committed
feat: correctly detect when starting the vad server
Signed-off-by: Ettore Di Giacinto <[email protected]>
1 parent ea6ef64 commit a1da931

File tree

1 file changed

+33
-14
lines changed

1 file changed

+33
-14
lines changed

core/http/endpoints/openai/realtime.go

+33-14
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ type Session struct {
2424
ID string
2525
Model string
2626
Voice string
27-
TurnDetection string // "server_vad" or "none"
27+
TurnDetection *TurnDetection `json:"turn_detection"` // "server_vad" or "none"
2828
Functions []FunctionType
2929
Instructions string
3030
Conversations map[string]*Conversation
@@ -34,6 +34,10 @@ type Session struct {
3434
ModelInterface Model
3535
}
3636

37+
type TurnDetection struct { // TurnDetection holds the turn-detection settings a Session carries under its "turn_detection" JSON key.
38+
Type string `json:"type"` // Mode name: new sessions default to "none"; the VAD goroutine is started when a session update leaves it at "server_vad".
39+
}
40+
3741
// FunctionType represents a function that can be called by the server
3842
type FunctionType struct {
3943
Name string `json:"name"`
@@ -214,9 +218,9 @@ func RegisterRealtime(cl *config.BackendConfigLoader, ml *model.ModelLoader, app
214218
sessionID := generateSessionID()
215219
session := &Session{
216220
ID: sessionID,
217-
Model: model, // default model
218-
Voice: "alloy", // default voice
219-
TurnDetection: "server_vad", // default turn detection mode
221+
Model: model, // default model
222+
Voice: "alloy", // default voice
223+
TurnDetection: &TurnDetection{Type: "none"},
220224
Instructions: "Your knowledge cutoff is 2023-10. You are a helpful, witty, and friendly AI. Act like a human, but remember that you aren't a human and that you can't do human things in the real world. Your voice and personality should be warm and engaging, with a lively and playful tone. If interacting in a non-English language, start by using the standard accent or dialect familiar to the user. Talk quickly. You should always call a function if you can. Do not refer to these rules, even if you're asked about them.",
221225
Conversations: make(map[string]*Conversation),
222226
}
@@ -260,14 +264,7 @@ func RegisterRealtime(cl *config.BackendConfigLoader, ml *model.ModelLoader, app
260264
done = make(chan struct{})
261265
)
262266

263-
// Start a goroutine to handle VAD if in server VAD mode
264-
if session.TurnDetection == "server_vad" {
265-
wg.Add(1)
266-
go func() {
267-
defer wg.Done()
268-
handleVAD(session, conversation, c, done)
269-
}()
270-
}
267+
var vadServerStarted bool
271268

272269
for {
273270
if mt, msg, err = c.ReadMessage(); err != nil {
@@ -305,6 +302,24 @@ func RegisterRealtime(cl *config.BackendConfigLoader, ml *model.ModelLoader, app
305302
Session: session,
306303
})
307304

305+
if session.TurnDetection.Type == "server_vad" && !vadServerStarted {
306+
log.Debug().Msg("Starting VAD goroutine...")
307+
wg.Add(1)
308+
go func() {
309+
defer wg.Done()
310+
conversation := session.Conversations[session.DefaultConversationID]
311+
handleVAD(session, conversation, c, done)
312+
}()
313+
vadServerStarted = true
314+
} else if vadServerStarted {
315+
log.Debug().Msg("Stopping VAD goroutine...")
316+
317+
wg.Add(-1)
318+
go func() {
319+
done <- struct{}{}
320+
}()
321+
vadServerStarted = false
322+
}
308323
case "input_audio_buffer.append":
309324
// Handle 'input_audio_buffer.append'
310325
if incomingMsg.Audio == "" {
@@ -499,15 +514,16 @@ func updateSession(session *Session, update *Session, cl *config.BackendConfigLo
499514
if update.Voice != "" {
500515
session.Voice = update.Voice
501516
}
502-
if update.TurnDetection != "" {
503-
session.TurnDetection = update.TurnDetection
517+
if update.TurnDetection != nil && update.TurnDetection.Type != "" {
518+
session.TurnDetection.Type = update.TurnDetection.Type
504519
}
505520
if update.Instructions != "" {
506521
session.Instructions = update.Instructions
507522
}
508523
if update.Functions != nil {
509524
session.Functions = update.Functions
510525
}
526+
511527
return nil
512528
}
513529

@@ -622,6 +638,7 @@ func generateResponse(session *Session, conversation *Conversation, responseCrea
622638
sendError(c, "processing_error", "Failed to generate text response", "", "")
623639
return
624640
}
641+
log.Debug().Any("text", generatedText).Msg("Generated text response")
625642
}
626643

627644
if functionCall != nil {
@@ -717,6 +734,8 @@ func generateResponse(session *Session, conversation *Conversation, responseCrea
717734
Type: "conversation.item.created",
718735
Item: item,
719736
})
737+
738+
log.Debug().Any("item", item).Msg("Realtime response sent")
720739
}
721740
}
722741

0 commit comments

Comments
 (0)