Skip to content

Commit 4870356

Browse files
Merge pull request Significant-Gravitas#1718 from gucky92/transcribe_audio_huggingface
Transcribe audio using huggingface
2 parents a3f25ca + 017371b commit 4870356

File tree

5 files changed

+49
-0
lines changed

5 files changed

+49
-0
lines changed

.env.template

+7
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,13 @@ IMAGE_PROVIDER=dalle
9393
# HUGGINGFACE_API_TOKEN - HuggingFace API token (Example: my-huggingface-api-token)
9494
HUGGINGFACE_API_TOKEN=your-huggingface-api-token
9595

96+
################################################################################
97+
### AUDIO TO TEXT PROVIDER
98+
################################################################################
99+
100+
### HUGGINGFACE
101+
HUGGINGFACE_AUDIO_TO_TEXT_MODEL=facebook/wav2vec2-base-960h
102+
96103
################################################################################
97104
### GIT Provider for repository actions
98105
################################################################################

autogpt/app.py

+3
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from autogpt.commands.write_tests import write_tests
99
from autogpt.config import Config
1010
from autogpt.commands.image_gen import generate_image
11+
from autogpt.commands.audio_text import read_audio_from_file
1112
from autogpt.commands.web_requests import scrape_links, scrape_text
1213
from autogpt.commands.execute_code import execute_python_file, execute_shell
1314
from autogpt.commands.file_operations import (
@@ -180,6 +181,8 @@ def execute_command(command_name: str, arguments):
180181
" shell commands, EXECUTE_LOCAL_COMMANDS must be set to 'True' "
181182
"in your config. Do not attempt to bypass the restriction."
182183
)
184+
elif command_name == "read_audio_from_file":
185+
return read_audio_from_file(arguments["file"])
183186
elif command_name == "generate_image":
184187
return generate_image(arguments["prompt"])
185188
elif command_name == "send_tweet":

autogpt/commands/audio_text.py

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import requests
2+
import json
3+
4+
from autogpt.config import Config
5+
from autogpt.commands.file_operations import safe_join
6+
7+
cfg = Config()
8+
9+
working_directory = "auto_gpt_workspace"
10+
11+
12+
def read_audio_from_file(audio_path):
    """Transcribe an audio file given by a workspace-relative path.

    The path is combined with ``working_directory`` through ``safe_join``,
    the file is read in binary mode, and its bytes are passed to
    ``read_audio``.

    Args:
        audio_path: Path of the audio file, relative to the workspace.

    Returns:
        Whatever ``read_audio`` returns for the file's bytes.
    """
    resolved_path = safe_join(working_directory, audio_path)
    with open(resolved_path, "rb") as handle:
        raw_bytes = handle.read()
    return read_audio(raw_bytes)
17+
18+
19+
def read_audio(audio):
    """Transcribe raw audio bytes via the Hugging Face Inference API.

    The model name and API token are read from the module-level ``cfg``.
    The audio bytes are POSTed as the request body to the model's
    inference endpoint and the ``text`` field of the JSON reply is returned.

    Args:
        audio: Raw bytes of the audio to transcribe.

    Returns:
        The string ``"The audio says: "`` followed by the transcription.

    Raises:
        ValueError: If the API token or the model name is not configured,
            if the reply is not valid JSON, or if the reply does not
            contain a ``text`` field (e.g. the API reported an error).
    """
    api_token = cfg.huggingface_api_token
    # Validate configuration before building the request, so a missing or
    # empty token is never sent out as "Bearer None" / "Bearer ".
    if not api_token:
        raise ValueError("You need to set your Hugging Face API token in the config file.")

    model = cfg.huggingface_audio_to_text_model
    # Without a model the URL would silently become ".../models/None".
    if not model:
        raise ValueError(
            "You need to set HUGGINGFACE_AUDIO_TO_TEXT_MODEL in the config file."
        )

    api_url = f"https://api-inference.huggingface.co/models/{model}"
    headers = {"Authorization": f"Bearer {api_token}"}

    response = requests.post(
        api_url,
        headers=headers,
        data=audio,
    )

    try:
        payload = json.loads(response.content.decode("utf-8"))
    except ValueError as err:
        # json.JSONDecodeError and UnicodeDecodeError both derive from
        # ValueError; report them with the HTTP status for context.
        raise ValueError(
            "Hugging Face API returned a non-JSON response "
            f"(HTTP {response.status_code})."
        ) from err

    # A successful reply is expected to be an object with a "text" field;
    # anything else (presumably an {"error": ...} object) is surfaced
    # explicitly rather than as a bare KeyError.
    if not isinstance(payload, dict) or "text" not in payload:
        detail = payload.get("error", payload) if isinstance(payload, dict) else payload
        raise ValueError(
            "Hugging Face API did not return a transcription "
            f"(HTTP {response.status_code}): {detail}"
        )

    return "The audio says: " + payload["text"]

autogpt/config/config.py

+3
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,9 @@ def __init__(self) -> None:
7272

7373
self.image_provider = os.getenv("IMAGE_PROVIDER")
7474
self.huggingface_api_token = os.getenv("HUGGINGFACE_API_TOKEN")
75+
self.huggingface_audio_to_text_model = os.getenv(
76+
"HUGGINGFACE_AUDIO_TO_TEXT_MODEL"
77+
)
7578

7679
# User agent headers to use when browsing web
7780
# Some websites might just completely deny request with an error code if

autogpt/prompt.py

+1
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ def get_prompt() -> str:
8282
),
8383
("Execute Python File", "execute_python_file", {"file": "<file>"}),
8484
("Generate Image", "generate_image", {"prompt": "<prompt>"}),
85+
("Convert Audio to text", "read_audio_from_file", {"file": "<file>"}),
8586
("Send Tweet", "send_tweet", {"text": "<text>"}),
8687

8788
]

0 commit comments

Comments
 (0)