Add gr.Dialogue component #11092
base: main
Changes from all commits
cab4441
9d3fccf
a56dd56
632535c
d18c6bd
2638540
a91c861
ea455c5
c26e950
5a38378
32616c8
d8fd6c8
3e87f24
2199902
f6537b7
bcb4a61
8999bdd
7a0156b
a4b6b1b
fd6f4eb
bc86fb1
d6266b9
50c5404
8e64b7f
9cdc4ac
3f328f9
60e8467
73737ac
3b435cb
a523deb
6418c4c
e1fda40
6038e0b
57c6582
055b2fb
14ff6ed
2578be6
b8408ce
b714ccd
9eddc37
56c53d7
54c8f2d
ed79ab4
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
--- | ||
"@gradio/dialogue": minor | ||
"@gradio/dropdown": minor | ||
"gradio": minor | ||
--- | ||
|
||
feat:Add gr.Dialogue component |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: dia_dialogue_demo"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio "]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "import httpx\n", "\n", "\n", "emotions = [\n", " \"(laughs)\",\n", " \"(clears throat)\",\n", " \"(sighs)\",\n", " \"(gasps)\",\n", " \"(coughs)\",\n", " \"(singing)\",\n", " \"(sings)\",\n", " \"(mumbles)\",\n", " \"(beep)\",\n", " \"(groans)\",\n", " \"(sniffs)\",\n", " \"(claps)\",\n", " \"(screams)\",\n", " \"(inhales)\",\n", " \"(exhales)\",\n", " \"(applause)\",\n", " \"(burps)\",\n", " \"(humming)\",\n", " \"(sneezes)\",\n", " \"(chuckle)\",\n", " \"(whistles)\",\n", "]\n", "speakers = [\"Speaker 1\", \"Speaker 2\"]\n", "\n", "client = httpx.AsyncClient(timeout=180)\n", "API_URL = \"https://router.huggingface.co/fal-ai/fal-ai/dia-tts\"\n", "\n", "\n", "async def query(dialogue: str, token: gr.OAuthToken | None):\n", " if token is None:\n", " raise gr.Error(\n", " \"No token provided. Use Sign in with Hugging Face to get a token.\"\n", " )\n", " headers = {\n", " \"Authorization\": f\"Bearer {token.token}\",\n", " }\n", " response = await client.post(API_URL, headers=headers, json={\"text\": dialogue})\n", " url = response.json()[\"audio\"][\"url\"]\n", " print(\"URL: \", url)\n", " return url\n", "\n", "\n", "def formatter(speaker, text):\n", " speaker = speaker.split(\" \")[1]\n", " return f\"[S{speaker}] {text}\"\n", "\n", "\n", "with gr.Blocks() as demo:\n", " with gr.Sidebar():\n", " login_button = gr.LoginButton()\n", " gr.HTML(\n", " \"\"\"\n", " <h1 style='text-align: center; display: flex; align-items: center; justify-content: center;'>\n", " <img src=\"https://huggingface.co/datasets/freddyaboulton/bucket/resolve/main/dancing_huggy.gif\" alt=\"Dancing Huggy\" style=\"height: 100px; margin-right: 10px\"> Dia Dialogue Generation Model\n", " </h1>\n", " <h2 style='text-align: center; display: flex; align-items: center; justify-content: center;'>Model by <a href=\"https://huggingface.co/nari-labs/Dia-1.6B\"> Nari Labs</a>. Powered by HF and <a href=\"https://fal.ai/\">Fal AI</a> API.</h2>\n", " <h3>Dia is a dialogue generation model that can generate realistic dialogue between two speakers. Use the dialogue component to create a conversation and then hit the submit button in the bottom right corner to see it come to life .</h3>\n", " \"\"\"\n", " )\n", " with gr.Row():\n", " with gr.Column():\n", " dialogue = gr.Dialogue(\n", " speakers=speakers, emotions=emotions, formatter=formatter\n", " )\n", " with gr.Column():\n", " with gr.Row():\n", " audio = gr.Audio(label=\"Audio\")\n", " with gr.Row():\n", " gr.DeepLinkButton(value=\"Share Audio via Link\")\n", " with gr.Row():\n", " gr.Examples(\n", " examples=[\n", " [\n", " [\n", " {\n", " \"speaker\": \"Speaker 1\",\n", " \"text\": \"Why did the chicken cross the road?\",\n", " },\n", " {\"speaker\": \"Speaker 2\", \"text\": \"I don't know!\"},\n", " {\n", " \"speaker\": \"Speaker 1\",\n", " \"text\": \"to get to the other side! 
(laughs)\",\n", " },\n", " ]\n", " ],\n", " [\n", " [\n", " {\n", " \"speaker\": \"Speaker 1\",\n", " \"text\": \"I am a little tired today (sighs).\",\n", " },\n", " {\"speaker\": \"Speaker 2\", \"text\": \"Hang in there!\"},\n", " ]\n", " ],\n", " ],\n", " inputs=[dialogue],\n", " cache_examples=False,\n", " )\n", "\n", " dialogue.submit(query, [dialogue], audio)\n", "\n", "if __name__ == \"__main__\":\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
import gradio as gr | ||
import httpx | ||
|
||
|
||
emotions = [ | ||
"(laughs)", | ||
"(clears throat)", | ||
"(sighs)", | ||
"(gasps)", | ||
"(coughs)", | ||
"(singing)", | ||
"(sings)", | ||
"(mumbles)", | ||
"(beep)", | ||
"(groans)", | ||
"(sniffs)", | ||
"(claps)", | ||
"(screams)", | ||
"(inhales)", | ||
"(exhales)", | ||
"(applause)", | ||
"(burps)", | ||
"(humming)", | ||
"(sneezes)", | ||
"(chuckle)", | ||
"(whistles)", | ||
] | ||
speakers = ["Speaker 1", "Speaker 2"] | ||
|
||
client = httpx.AsyncClient(timeout=180) | ||
API_URL = "https://router.huggingface.co/fal-ai/fal-ai/dia-tts" | ||
|
||
|
||
async def query(dialogue: str, token: gr.OAuthToken | None): | ||
if token is None: | ||
raise gr.Error( | ||
"No token provided. Use Sign in with Hugging Face to get a token." | ||
) | ||
headers = { | ||
"Authorization": f"Bearer {token.token}", | ||
} | ||
response = await client.post(API_URL, headers=headers, json={"text": dialogue}) | ||
url = response.json()["audio"]["url"] | ||
print("URL: ", url) | ||
return url | ||
|
||
|
||
def formatter(speaker, text): | ||
speaker = speaker.split(" ")[1] | ||
return f"[S{speaker}] {text}" | ||
|
||
|
||
with gr.Blocks() as demo: | ||
with gr.Sidebar(): | ||
login_button = gr.LoginButton() | ||
gr.HTML( | ||
""" | ||
<h1 style='text-align: center; display: flex; align-items: center; justify-content: center;'> | ||
<img src="https://huggingface.co/datasets/freddyaboulton/bucket/resolve/main/dancing_huggy.gif" alt="Dancing Huggy" style="height: 100px; margin-right: 10px"> Dia Dialogue Generation Model | ||
</h1> | ||
<h2 style='text-align: center; display: flex; align-items: center; justify-content: center;'>Model by <a href="https://huggingface.co/nari-labs/Dia-1.6B"> Nari Labs</a>. Powered by HF and <a href="https://fal.ai/">Fal AI</a> API.</h2> | ||
<h3>Dia is a dialogue generation model that can generate realistic dialogue between two speakers. Use the dialogue component to create a conversation and then hit the submit button in the bottom right corner to see it come to life .</h3> | ||
""" | ||
) | ||
with gr.Row(): | ||
with gr.Column(): | ||
dialogue = gr.Dialogue( | ||
speakers=speakers, emotions=emotions, formatter=formatter | ||
) | ||
with gr.Column(): | ||
with gr.Row(): | ||
audio = gr.Audio(label="Audio") | ||
with gr.Row(): | ||
gr.DeepLinkButton(value="Share Audio via Link") | ||
with gr.Row(): | ||
gr.Examples( | ||
examples=[ | ||
[ | ||
[ | ||
{ | ||
"speaker": "Speaker 1", | ||
"text": "Why did the chicken cross the road?", | ||
}, | ||
{"speaker": "Speaker 2", "text": "I don't know!"}, | ||
{ | ||
"speaker": "Speaker 1", | ||
"text": "to get to the other side! (laughs)", | ||
}, | ||
] | ||
], | ||
[ | ||
[ | ||
{ | ||
"speaker": "Speaker 1", | ||
"text": "I am a little tired today (sighs).", | ||
}, | ||
{"speaker": "Speaker 2", "text": "Hang in there!"}, | ||
] | ||
], | ||
], | ||
inputs=[dialogue], | ||
cache_examples=False, | ||
) | ||
|
||
dialogue.submit(query, [dialogue], audio) | ||
|
||
if __name__ == "__main__": | ||
demo.launch() |
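For reference, here is a minimal sketch (not part of this diff) of the string that the demo's `formatter`, combined with the component's default `separator` of a single space, would hand to `query` for the first example above. The joining behavior mirrors `Dialogue.preprocess` in the component file further down in this PR; the variable names below are illustrative only.

```python
# Sketch only: reproduce the demo's formatter and the component's default
# joining behavior (separator=" ") to show the text sent to the Dia TTS API.
def formatter(speaker: str, text: str) -> str:
    number = speaker.split(" ")[1]  # "Speaker 1" -> "1"
    return f"[S{number}] {text}"

lines = [
    {"speaker": "Speaker 1", "text": "Why did the chicken cross the road?"},
    {"speaker": "Speaker 2", "text": "I don't know!"},
    {"speaker": "Speaker 1", "text": "to get to the other side! (laughs)"},
]

# Join the formatted lines the same way Dialogue.preprocess does.
dialogue_text = " ".join(formatter(d["speaker"], d["text"]) for d in lines)
print(dialogue_text)
# [S1] Why did the chicken cross the road? [S2] I don't know! [S1] to get to the other side! (laughs)
```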
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,155 @@ | ||||||
from __future__ import annotations | ||||||
|
||||||
from collections.abc import Callable | ||||||
|
||||||
from gradio.components.base import server | ||||||
from gradio.components.textbox import Textbox | ||||||
from gradio.data_classes import GradioModel, GradioRootModel | ||||||
from gradio.events import Events | ||||||
|
||||||
|
||||||
class DialogueLine(GradioModel): | ||||||
speaker: str | ||||||
text: str | ||||||
|
||||||
|
||||||
class DialogueModel(GradioRootModel): | ||||||
root: list[DialogueLine] | str | ||||||
|
||||||
|
||||||
class Dialogue(Textbox): | ||||||
Review comment: What is the benefit of inheriting from `Textbox`? |
||||||
""" | ||||||
Creates a dialogue component for users to enter dialogue between speakers. | ||||||
|
||||||
Demos: dia_dialogue_demo | ||||||
""" | ||||||
|
||||||
EVENTS = [ | ||||||
Events.change, | ||||||
Events.input, | ||||||
Events.submit, | ||||||
] | ||||||
|
||||||
data_model = DialogueModel | ||||||
|
||||||
def __init__( | ||||||
self, | ||||||
value: list[dict[str, str]] | Callable | None = None, | ||||||
*, | ||||||
speakers: list[str] | None = None, | ||||||
formatter: Callable | None = None, | ||||||
emotions: list[str] | None = None, | ||||||
separator: str = " ", | ||||||
label: str | None = "Dialogue", | ||||||
info: str | ||||||
| None = "Type colon (:) in the dialogue line to see the available emotion and intonation tags", | ||||||
placeholder: str | None = "Enter dialogue here...", | ||||||
Review comment on lines +44 to +46: Do we really want to have default values for `info` and `placeholder`? |
||||||
show_label: bool | None = None, | ||||||
container: bool = True, | ||||||
scale: int | None = None, | ||||||
min_width: int = 160, | ||||||
interactive: bool | None = None, | ||||||
visible: bool = True, | ||||||
elem_id: str | None = None, | ||||||
autofocus: bool = False, | ||||||
autoscroll: bool = True, | ||||||
elem_classes: list[str] | str | None = None, | ||||||
render: bool = True, | ||||||
key: int | str | None = None, | ||||||
max_lines: int | None = None, | ||||||
show_submit_button: bool = True, | ||||||
show_copy_button: bool = True, | ||||||
): | ||||||
""" | ||||||
Parameters: | ||||||
value: Value of the dialogue. It is a list of dictionaries, each containing a 'speaker' key and a 'text' key. If a function is provided, the function will be called each time the app loads to set the initial value of this component. | ||||||
speakers: The different speakers allowed in the dialogue. | ||||||
formatter: A function that formats the dialogue line dictionary, e.g. {"speaker": "Speaker 1", "text": "Hello, how are you?"} into a string, e.g. "Speaker 1: Hello, how are you?". | ||||||
emotions: The different emotions and intonation allowed in the dialogue. Emotions are displayed in an autocomplete menu below the input textbox when the user starts typing `:`. Use the exact emotion name expected by the AI model or inference function. | ||||||
Review comment: As discussed somewhere in Slack, consider a more general name for this parameter. |
||||||
separator: The separator between the different dialogue lines used to join the formatted dialogue lines into a single string. For example, a newline character or empty string. | ||||||
max_lines: maximum number of lines allowed in the dialogue. | ||||||
placeholder: placeholder hint to provide behind textarea. | ||||||
label: the label for this component, displayed above the component if `show_label` is `True` and is also used as the header if there is a table of examples for this component. If None and used in a `gr.Interface`, the label will be the name of the parameter this component corresponds to. | ||||||
show_label: if True, will display the label. If False, the copy button is hidden as well as the label. | ||||||
container: if True, will place the component in a container - providing some extra padding around the border. | ||||||
scale: relative size compared to adjacent Components. For example if Components A and B are in a Row, and A has scale=2, and B has scale=1, A will be twice as wide as B. Should be an integer. scale applies in Rows, and to top-level Components in Blocks where fill_height=True. | ||||||
min_width: minimum pixel width, will wrap if not sufficient screen space to satisfy this value. If a certain scale value results in this Component being narrower than min_width, the min_width parameter will be respected first. | ||||||
interactive: if True, will be rendered as an editable textbox; if False, editing will be disabled. If not provided, this is inferred based on whether the component is used as an input or output. | ||||||
visible: If False, component will be hidden. | ||||||
autofocus: If True, will focus on the textbox when the page loads. Use this carefully, as it can cause usability issues for sighted and non-sighted users. | ||||||
elem_id: An optional string that is assigned as the id of this component in the HTML DOM. Can be used for targeting CSS styles. | ||||||
elem_classes: An optional list of strings that are assigned as the classes of this component in the HTML DOM. Can be used for targeting CSS styles. | ||||||
render: If False, component will not be rendered in the Blocks context. Should be used if the intention is to assign event listeners now but render the component later. | ||||||
key: if assigned, will be used to assume identity across a re-render. Components that have the same key across a re-render will have their value preserved. | ||||||
show_copy_button: If True, includes a copy button to copy the text in the textbox. Only applies if show_label is True. | ||||||
show_submit_button: If True, includes a submit button to submit the dialogue. | ||||||
autoscroll: If True, will automatically scroll to the bottom of the textbox when the value changes, unless the user scrolls up. If False, will not scroll to the bottom of the textbox when the value changes. | ||||||
""" | ||||||
super().__init__( | ||||||
value="", | ||||||
label=label, | ||||||
info=info, | ||||||
placeholder=placeholder, | ||||||
show_label=show_label, | ||||||
container=container, | ||||||
scale=scale, | ||||||
min_width=min_width, | ||||||
interactive=interactive, | ||||||
visible=visible, | ||||||
elem_id=elem_id, | ||||||
autofocus=autofocus, | ||||||
autoscroll=autoscroll, | ||||||
elem_classes=elem_classes, | ||||||
render=render, | ||||||
key=key, | ||||||
max_lines=max_lines, | ||||||
) | ||||||
self.speakers = speakers | ||||||
self.emotions = emotions or [] | ||||||
self.formatter = formatter | ||||||
self.separator = separator | ||||||
self.show_submit_button = show_submit_button | ||||||
self.show_copy_button = show_copy_button | ||||||
if isinstance(value, Callable): | ||||||
value = value() | ||||||
self.value = ( | ||||||
self.preprocess(DialogueModel(root=value)) if value is not None else value # type: ignore | ||||||
) | ||||||
|
||||||
def preprocess(self, payload: DialogueModel) -> str: # type: ignore | ||||||
formatter = self.formatter | ||||||
if not formatter: | ||||||
formatter = self.default_formatter | ||||||
if isinstance(payload.root, str): | ||||||
return payload.root | ||||||
return self.separator.join( | ||||||
[formatter(line.speaker, line.text) for line in payload.root] | ||||||
) | ||||||
|
||||||
@staticmethod | ||||||
def default_formatter(speaker: str, text: str) -> str: | ||||||
return f"[{speaker}] {text}" | ||||||
|
||||||
@server | ||||||
async def format(self, value: list[dict]): | ||||||
"""Format the dialogue in the frontend into a string that's copied to the clipboard.""" | ||||||
data = DialogueModel(root=value) # type: ignore | ||||||
return self.preprocess(data) | ||||||
|
||||||
def postprocess(self, value): | ||||||
return value | ||||||
|
||||||
def as_example(self, value): | ||||||
return self.preprocess(DialogueModel(root=value)) | ||||||
|
||||||
def example_payload(self): | ||||||
return [ | ||||||
{"speaker": "Speaker 1", "text": "Hello, how are you?"}, | ||||||
{"speaker": "Speaker 2", "text": "I'm fine, thank you!"}, | ||||||
] | ||||||
|
||||||
def example_value(self): | ||||||
return [ | ||||||
{"speaker": "Speaker 1", "text": "Hello, how are you?"}, | ||||||
{"speaker": "Speaker 2", "text": "I'm fine, thank you!"}, | ||||||
] |
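A minimal usage sketch (not part of this diff) of the default formatting pipeline: with no custom `formatter`, `preprocess` joins `default_formatter` output with `separator`. Calling `as_example` with the list-of-dicts payload runs it through `preprocess`, so it shows the string an event handler would receive. This assumes the component is exported as `gr.Dialogue`, as the demo above suggests.

```python
import gradio as gr

# Sketch only: component with two speakers, no custom formatter, newline separator.
dialogue = gr.Dialogue(speakers=["Speaker 1", "Speaker 2"], separator="\n")

# as_example() wraps the payload in DialogueModel and calls preprocess(),
# so it prints the exact string a handler such as query() would receive.
print(dialogue.as_example([
    {"speaker": "Speaker 1", "text": "Hello, how are you?"},
    {"speaker": "Speaker 2", "text": "I'm fine, thank you!"},
]))
# Expected output (default "[Speaker] text" formatter):
# [Speaker 1] Hello, how are you?
# [Speaker 2] I'm fine, thank you!
```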
For some reason I'm not seeing spaces before and after the links.
Really great demo! Btw, for some reason I'm getting some strange outputs. The output audio is always exactly 30 seconds long and has long periods of silence and other artifacts. Not sure if it's an issue with their API or something with our preprocessing.
I think it's the Fal API. The zero-gpu demo in their org is a lot better.