# speech2action/speechaction.py at main · chiragjain0/speech2action · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import threading
import queue
import time
import json
import openai
import sounddevice as sd
import vosk
import numpy as np
import panda_py
from panda_py import libfranka
from llm_tamp import RobotController

# NOTE(review): this is a placeholder credential committed in source; a real
# key should be supplied at deployment time rather than hard-coded here.
openai.api_key = "your-openai-key"

# Queues for inter-thread communication:
#   audio_q   - listen_audio    -> speech_to_text   (recognized text, may be "")
#   text_q    - speech_to_text  -> llm_parse_text   (non-empty recognized text)
#   command_q - llm_parse_text  -> execute_command  (parsed JSON commands)
audio_q = queue.Queue()
text_q = queue.Queue()
command_q = queue.Queue()

# Audio device ID for microphone (passed as `device=` to sd.RawInputStream)
DEVICE_ID = 1

# Load Vosk model once at import time; listen_audio builds its recognizer from it
model = vosk.Model(lang="en-us")

# Voice input thread
def listen_audio():
    """Capture microphone audio and publish recognized utterances on ``audio_q``.

    Opens a 16 kHz, mono, 16-bit raw input stream on ``DEVICE_ID`` and feeds
    the captured audio to a Vosk ``KaldiRecognizer`` built from the
    module-level ``model``. Whenever the recognizer reports a finalized
    result, the result's ``"text"`` field (``""`` when absent) is put on
    ``audio_q``.

    The function never returns; it is meant to run in its own thread.
    """
    recognizer = vosk.KaldiRecognizer(model, 16000)
    # Raw audio is handed from the audio callback to this thread through a
    # private queue so that recognition does not run inside the callback.
    raw_chunks = queue.Queue()

    def callback(indata, frames, time_info, status):
        # The third parameter is named ``time_info`` so it does not shadow the
        # ``time`` module. sounddevice passes callback arguments positionally.
        if status:
            # Surface overflow/underflow flags instead of dropping them.
            print(f"Audio input status: {status}")
        # Copy the buffer: it is only valid for the duration of the callback,
        # and the recognizer expects a bytes object.
        raw_chunks.put(bytes(indata))

    with sd.RawInputStream(samplerate=16000, blocksize=8000, dtype='int16',
                           channels=1, callback=callback, device=DEVICE_ID):
        while True:
            chunk = raw_chunks.get()
            if recognizer.AcceptWaveform(chunk):
                result = json.loads(recognizer.Result())
                audio_q.put(result.get("text", ""))

# Speech to text thread
def speech_to_text():
    """Relay recognized text from ``audio_q`` to ``text_q``, skipping empty items.

    Blocks on ``audio_q`` and loops forever; intended to run in its own thread.
    """
    while True:
        utterance = audio_q.get()
        if not utterance:
            # Falsy results (e.g. the empty string) are not forwarded.
            continue
        text_q.put(utterance)

# LLM parsing thread
def llm_parse_text():
    """Turn recognized text into structured commands using the OpenAI chat API.

    Blocks on ``text_q``; for each utterance, asks the model for a JSON command
    with keys ``task``, ``location`` and ``modifiers``, parses the reply and
    puts the resulting dict on ``command_q``. Any failure (API error,
    unparsable or non-object reply) is printed and the utterance is dropped,
    so the loop keeps running. Never returns; intended to run in its own
    thread.
    """
    while True:
        user_input = text_q.get()
        try:
            # NOTE(review): ``openai.ChatCompletion`` is the pre-1.0 interface
            # of the openai package - confirm the pinned package version.
            response = openai.ChatCompletion.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "Extract structured command in JSON format with keys: task, location, modifiers."},
                    {"role": "user", "content": user_input}
                ]
            )
            content = response['choices'][0]['message']['content']
            command_q.put(_load_command_json(content))
        except Exception as e:
            print(f"LLM parsing error: {e}")


def _load_command_json(content):
    """Parse the model reply into a dict, tolerating text wrapped around the JSON.

    The reply is first parsed as-is. If that fails, the span from the first
    ``{`` to the last ``}`` is parsed instead, which covers replies wrapped in
    Markdown code fences or surrounded by explanatory prose.

    Raises:
        json.JSONDecodeError: if no parsable JSON is found.
        ValueError: if the parsed JSON is not an object (dict).
    """
    try:
        parsed = json.loads(content)
    except json.JSONDecodeError:
        start = content.find("{")
        end = content.rfind("}")
        if start == -1 or end <= start:
            # Nothing that looks like an object; report the original error.
            raise
        parsed = json.loads(content[start:end + 1])
    if not isinstance(parsed, dict):
        raise ValueError(
            f"expected a JSON object, got {type(parsed).__name__}"
        )
    return parsed

# Command execution thread
def execute_command(robot, object_poses, object_grasp_pose):
    """Consume parsed commands from ``command_q`` and run them on the robot.

    Args:
        robot: controller object providing ``prepare``, ``move_to``, ``draw``,
            ``draw_two``, ``erase`` and ``move_gen``.
        object_poses: mapping from location names (e.g.
            ``'writing_loc_top_right'``) to target poses passed to ``move_to``.
        object_grasp_pose: mapping from tool names (``'marker'``,
            ``'up_eraser_align'``, ``'down_eraser_align'``) to orientations
            passed to ``prepare`` as NumPy arrays.

    Any exception raised while handling a command is printed and the loop
    continues with the next command. Never returns; intended to run in its
    own thread.
    """
    while True:
        cmd = command_q.get()
        try:
            _dispatch_command(robot, cmd, object_poses, object_grasp_pose)
        except Exception as e:
            print(f"Execution error: {e}")


def _dispatch_command(robot, cmd, object_poses, object_grasp_pose):
    """Normalize one parsed command and issue the matching robot calls."""
    if not isinstance(cmd, dict):
        raise TypeError(
            f"command must be a JSON object, got {type(cmd).__name__}"
        )

    # The command comes from an LLM, so fields may be null, differently cased
    # or padded with whitespace; normalize before matching.
    task = str(cmd.get("task") or "").strip().lower()
    location = str(cmd.get("location") or "").strip().lower()
    modifiers = cmd.get("modifiers") or []
    if isinstance(modifiers, str):
        # A single modifier given as a bare string must not be split into
        # characters by indexing or iteration.
        modifiers = [modifiers]
    approach = modifiers[0] if modifiers else 'direct'

    if task == "draw":
        robot.prepare(task='draw', ort=np.array(object_grasp_pose['marker']))
        if location == "top right":
            target = object_poses['writing_loc_top_right']
            robot.move_to(target, modifier=approach)
            robot.draw(target)
        elif location == "top left":
            target = object_poses['writing_loc_top_left']
            robot.move_to(target, modifier=approach)
            robot.draw_two(target)
        else:
            print(f"Unrecognized draw location: {location!r}")

    elif task == "erase":
        robot.prepare(task='erase', ort=np.array(object_grasp_pose['up_eraser_align']))
        if location == "top":
            robot.move_to(object_poses['up_erasing_loc'], modifier=approach)
            robot.erase(time=5)
        elif location == "bottom":
            # NOTE(review): ``prepare`` has already been called above with the
            # up-eraser alignment, so this path prepares twice. The sequence is
            # kept as-is; confirm on hardware whether the first call is needed.
            robot.prepare(task='erase', ort=np.array(object_grasp_pose['down_eraser_align']))
            robot.move_to(object_poses['bottom_erasing_loc'], modifier=approach)
            robot.erase(time=5)
        else:
            print(f"Unrecognized erase location: {location!r}")

    elif task == "move":
        for mod in modifiers:
            robot.move_gen(mod)

    else:
        print(f"Unrecognized task: {task!r}")

# Main function
def main():
    """Connect to the robot, define the pose tables and start the worker threads.

    Builds a ``RobotController`` from a ``panda_py.Panda`` and a
    ``libfranka.Gripper`` for the configured host, calls ``move_to_study()``,
    then launches the four pipeline stages (audio capture, text relay, LLM
    parsing, command execution) as daemon threads and idles forever.
    """
    robot_host = '192.168.1.11'
    arm = panda_py.Panda(robot_host)
    hand = libfranka.Gripper(robot_host)
    robot = RobotController(panda=arm, gripper=hand)
    robot.move_to_study()

    # Three-component targets handed to robot.move_to / robot.draw
    # (presumably Cartesian positions - confirm against RobotController).
    object_poses = {
        'writing_loc_top_right': [0.711947263682881, 0.12706318489784938, 0.5052814249098799],
        'writing_loc_top_left': [0.711947263682881, 0.24706318489784938, 0.5052814249098799],
        'bottom_erasing_loc': [0.7421822003037818, 0.12347043377115799, 0.24469861149105856],
        'up_erasing_loc': [0.7668475668639307, 0.11783189894165687, 0.5541111324550737],
    }
    # Four-component orientations handed to robot.prepare as ``ort``
    # (presumably quaternions - confirm against RobotController).
    object_grasp_pose = {
        'down_eraser_align': [0.08793943573778525, 0.7047919107466506, -0.07203306288116887, 0.7002472758907934],
        'up_eraser_align': [0.01796533906041393, 0.7122159863223589, -0.03631220920003831, 0.7007903040869043],
        'marker': [0.08229516371960562, 0.7730591118216822, -0.06885761869359781, 0.6251925586127405],
    }

    # Pipeline stages in start order, each paired with its positional arguments.
    stages = (
        (listen_audio, ()),
        (speech_to_text, ()),
        (llm_parse_text, ()),
        (execute_command, (robot, object_poses, object_grasp_pose)),
    )
    for stage, stage_args in stages:
        worker = threading.Thread(target=stage, args=stage_args, daemon=True)
        worker.start()

    # All workers are daemon threads, so the main thread must stay alive.
    while True:
        time.sleep(1)


if __name__ == "__main__":
    main()