# Imports
import torch
from torch import autocast
from diffusers import StableDiffusionPipeline
import numpy as np
import PIL
from PIL import Image
import gradio as gr
import sys
import openai
import speech_recognition as sr
import pygame
from pygame import mixer
import cv2
from vosk import KaldiRecognizer, SetLogLevel
from vosk import Model as vosk_Model
from pydub import AudioSegment
import wave
import json
from Img_Mover.Img_Mover import Img_Mover
from Girlfriend_Obj import Girlfriend_Obj
import matplotlib.pyplot as plt
import multiprocess
import time
from copy import deepcopy
import asyncio
import threading
import os
# # Path to the custom audio model
# audio_model_path = "Audio_Generation/Generation_Scripts/saved_models/default"
# # Path to the custom audio data
# audio_data_path = "Audio_Generation/Generation_Scripts/data/albedo"
# Path to the custom model to load in
custom_model_path = "CustomModel/"
# # The initial summary is initially a basic prompt telling GPT-3 who it is
# initial_summ = "You are my female waifu girlfriend who loves me."
# # The initial prompt tells GPT-3 how to respond
# initial_prompt = "Me: Hi\nYou: Hello\n\n"\
# "Me: How are you?\nYou: Good. How are you?\n\n"\
# "Me: I'm good.\nYou: Nice to meet you.\n\n"
initial_summ = "The following is a conversation with me and my waifu girlfriend\n\n"
initial_prompt = "Me: Hello\nGirlfriend: Hello\n"\
"Me: How are you?\nGirlfriend: I am good\n"
# Optional saved-memory file used to restore a past conversation
# memory_file = "config_file.json"
memory_file = None
MyGirlfriend = Girlfriend_Obj(initial_summ, initial_prompt, False, custom_model_path=custom_model_path, saved_memory=memory_file)
# Device must be CUDA
device = torch.device("cuda:0")

# Auto-submit recorded audio for a response; if no audio has been recorded
# yet, return the last response unchanged
def audio_auto_submit(custom_audio, custom_model, text, audio_pth, GPT_key):
    if audio_pth is not None:
        return MyGirlfriend.generate_audio(custom_audio, custom_model, text, audio_pth, GPT_key)
    return MyGirlfriend.last_text
# Initialize the audio mixer
mixer.init()
mixer.music.unload()

# Handle changes to the motion switch, which turns image motion on or off
def handle_motion_switch(switch_value):
    MyGirlfriend.add_movement = switch_value
    # Ensure the image is in the default position
    MyGirlfriend.img_anim.pose *= 0
    # Force a reload of the image
    MyGirlfriend.force_gen = True
# Handles file uploads
def upload_file(file):
    # Load the image as a PIL object
    image = Image.open(file.name)
    # Load the uploaded image into the animator, temporarily disabling
    # movement while the image is swapped
    old_add_movement = MyGirlfriend.add_movement
    MyGirlfriend.add_movement = False
    MyGirlfriend.img_anim.load_new_image(img=image)
    MyGirlfriend.add_movement = old_add_movement
    # Save the image in case of errors
    MyGirlfriend.last_image = image
    # Ensure the image style vector is reset
    MyGirlfriend.img_anim.pose *= 0
    # Force the image to be regenerated
    MyGirlfriend.force_gen = True
    return file.name
# Handles image saving
def save_img():
    if not os.path.exists("saved_images"):
        os.mkdir("saved_images")
    # Name the file after the current date and time so saves don't overwrite each other
    filename = fr"./saved_images/{time.ctime().replace(' ', '-').replace(':', '.')}.png"
    if not isinstance(MyGirlfriend.last_image, PIL.Image.Image):
        # Convert a raw array to a PIL image before saving
        Image.fromarray(MyGirlfriend.last_image.clip(0, 255).astype(np.uint8)).save(filename)
    else:
        MyGirlfriend.last_image.save(filename)
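
# For reference (added note): time.ctime() returns a string like
# "Mon Jan  2 12:34:56 2023", so the file saved above ends up named
# something like "Mon-Jan--2-12.34.56-2023.png".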
# Function used to test the mouth movement
def test_mouth():
    # Make sure the mouth isn't already moving
    if MyGirlfriend.generating_mouth_movement:
        return
    # Make sure the previous thread has finished
    if MyGirlfriend.m_thread is not None:
        MyGirlfriend.m_thread.join()
    # Start the mouth movement loop on a test clip
    MyGirlfriend.m_thread = threading.Thread(target=MyGirlfriend.run_talk_loop, args=("test_audio.mp3",))
    MyGirlfriend.m_thread.start()
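
# Design note (added commentary): the talk loop runs on its own thread so the
# Gradio callback returns immediately and the UI stays responsive; joining the
# previous thread first ensures only one loop is animating the mouth at a time.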
# Loads a memory file into the model
def load_mem(file):
    # Get the filename
    filename = file.name
    # Load the file and restore the saved conversation memory
    outTxt = MyGirlfriend.load_mem(filename)
    return filename, outTxt
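
# Minimal usage sketch (assumption, not from the original code): the memory file
# is the "config_file.json" the app writes as the conversation progresses; its
# exact schema is defined by Girlfriend_Obj.load_mem, which returns a status
# string that ends up in the "Was the load successful?" textbox, e.g.:
#   status = MyGirlfriend.load_mem("config_file.json")  # hypothetical direct call
#   print(status)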
interface = gr.Blocks(css="#color_red {background-color: #f44336}")
with interface:
    with gr.Tabs():
        with gr.TabItem("Intro"):
            gr.Textbox("""
Below is an intro explaining how this app works...

Generation Tab:
Before starting, make sure to click the "Setup interface" button to set up the interface and begin using the app.
The upper-most part of the interface includes two tabs, "Voice-based Chat" and "Text-based Chat", which are used to respond to the AI. Voice-based chat lets you use your mic to talk to the AI, while text-based chat lets you type to it. Audio is auto-submitted for a response, while text requires either pressing the Enter key or clicking the "Generate Audio" button.
The next part is the "Response" text field. The latest response the AI gave will appear here.
Below the response is a section split into two parts. The left part shows the currently generated image. The right section has multiple parts:
1. The "Add motion to image?" checkbox toggles image animation. If checked, the image is animated with blinking and mouth movement, assuming the image is in the correct form.
2. "Mouth movement test" can be used to check whether mouth movement works for the current image.
3. "Save current image" saves the currently generated image to a folder named "saved_images". The filename is the current time and date so that images don't overwrite each other.
4. "Upload an image" lets you load in an image of your choosing, as opposed to generating images until one looks good. Clicking this button lets you select the image you want to display.
At the bottom of this section there are two buttons: "Generate Audio" and "Generate Image". "Generate Audio" takes the currently entered text and generates a new response from the AI. "Generate Image" generates and displays a new image.

Some notes about image animation:
The image must be in the correct form to be animated correctly. It should be a face-shot photo to ensure that blinking is done correctly. Mouth movement occurs when the face is forward-facing and audio is generated. Sometimes the mouth movement doesn't work; if that's the case, just generate images until movement works.

Settings Tab:
The settings tab has several uses, from loading in past memories to changing the style of the generated image.
The first block in this tab is the "Use custom chat model?" checkbox. If this box is checked, a free custom model is used to respond; otherwise GPT-3 responds. If the box is unchecked, an OpenAI key is required, which can be obtained by following this article: https://elephas.app/blog/how-to-create-openai-api-keys-cl5c4f21d281431po7k8fgyol0 If a key isn't provided, an error will be shown in place of the response text.
The next block is the "Settings" block, which sets the style of the image and how it's generated. Usable tags can be found at the following link (though do be warned, the site has some sus images, not my doing btw): https://danbooru.donmai.us/wiki_pages/tag_group:image_composition
The next block is "Characteristics", which is also used to style the generated images. These prompts describe how you want the generated image to look. Should it be female or male? What color hair?
The "Settings" block and "Characteristics" block are actually treated identically by the implementation, but it's nice to separate image settings from image characteristics.
The next block is the "Guidance value", which trades off fidelity (how good the image looks) against variance (roughly how creative the model is). A value greater than 1 is required, and a value that's too high will produce garbage. Keeping this value around 10 seems to work well.
The "blink time" field and button let you change the number of seconds a full blink takes.
The next part is the memory loading system. As the conversation goes on, it is saved to a memory file called "config_file.json". This file can be loaded back in through this section of the settings to replace the current conversation with a past one saved in a .json file. The text box next to the upload button indicates whether the upload was a success or a failure.
The last part is a reset button that resets the current memory to the initial prompt.
""")
with gr.TabItem("Generation"):
gen_col = gr.Column(visible=False)
with gen_col:
# Talking to the AI
with gr.Tabs():
with gr.TabItem("Voice-based Chat"):
audio = gr.Audio(source="microphone", type="filepath", label="Response", live=True)
with gr.TabItem("Text-based Chat"):
text = gr.Textbox(label="Text", value="I love you!", interactive=True)
response = gr.Textbox(label="Response", value="", interactive=False)
with gr.Row():
# Note gallery expects a 3-D array: (L, W, 3)
gallery = gr.Image(label="Generated images", show_label=False)\
.style(height=512)
with gr.Column():
# Switch to generate a new image with audio or keep the
# image static
motion_switch = gr.Checkbox(value=True, label="Add motion to image?")
motion_switch.change(fn=handle_motion_switch, inputs=[motion_switch], outputs=[])
# Button to test mouth movement
btn_mouth_test = gr.Button("Mouth movement test")
btn_mouth_test.click(fn=test_mouth, inputs=[], outputs=[])
# Button to save the currently generated image
btn_save_img = gr.Button("Save Current Image")
btn_save_img.click(fn=save_img, inputs=[], outputs=[])
# Button to load an image
upload_button = gr.UploadButton("Upload an image", file_types=["image"], file_count="single")
upload_button.upload(fn=upload_file, inputs=[upload_button])
with gr.Row():
# Button to generate new audio
btn_audio = gr.Button("Generate Audio")
# Button to generate new audio
btn_img = gr.Button("Generate Image")
# Button to load and setup the generation tab
btn_load = gr.Button("Setup interface")
btn_load.click(fn=MyGirlfriend.event_loop, inputs=[], outputs=[gallery, gen_col, btn_load], queue=True)
with gr.TabItem("Settings"):
# Switched for which model to use
custom_model = gr.Checkbox(value=True, label="Use custom chat model? (False to use GPT, True to use custom model)")
GPT_key_ = gr.Textbox(label="Key to use GPT-3 (if using GPT-3)\nNote: If you don't have one go here: https://elephas.app/blog/how-to-create-openai-api-keys-cl5c4f21d281431po7k8fgyol0", value="", interactive=True)
custom_audio = gr.Checkbox(value=False, label="Use custom audio model?")
# Settings for the image
settings = gr.Textbox(label="Settings", value= "1girl,solo focus,very wide shot,feamle focus,ratio:16:9,detailed,looking at viewer,facing viewer,facing forward,vtuber", interactive=True)
characteristics = gr.Textbox(label="Characteristics", value="waifu,female,brown hair,blue eyes,sidelocks,slight blush,happy", interactive=True)
guidance_scale = gr.Number(label="Guidance value - Tradeoff between creativity and image fidelity (greater than 1.0)", value=10.0, interactive=True, precision=1)
with gr.Row():
blink_time = gr.Number(label="Time for a full blink (in seconds) (limited between 0.5 and 2.0)", value=0.6, interactive=True, precision=2)
blink_time_btn = gr.Button(value="Change blink time").click(MyGirlfriend.change_blink_time, inputs=[blink_time])
blink_time.submit(MyGirlfriend.change_blink_time, inputs=[blink_time])
# Used to load a memory file
with gr.Column():
with gr.Row():
trash_file_output = gr.File(visible=False)
mem_load_btn = gr.UploadButton("Load memory file", file_types=["json"], file_count="single")
mem_file_success = gr.Textbox(label="Was the load successful?", value= "", interactive=False)
mem_load_btn.upload(fn=load_mem, inputs=[mem_load_btn], outputs=[trash_file_output, mem_file_success])
# Used to treset the memory of the model
reset_btn = gr.Button(value="Reset Memory", elem_id="color_red")
reset_btn.click(MyGirlfriend.reset_memory, inputs=[], outputs=[])
    # When the audio changes, auto-submit it for a response
    audio.change(fn=audio_auto_submit, inputs=[custom_audio, custom_model, text, audio, GPT_key_], outputs=[response])
    # When the button is clicked or the text is submitted, generate a new audio response
    btn_audio.click(fn=MyGirlfriend.generate_audio, inputs=[custom_audio, custom_model, text, audio, GPT_key_], outputs=[response])
    text.submit(fn=MyGirlfriend.generate_audio, inputs=[custom_audio, custom_model, text, audio, GPT_key_], outputs=[response])
    # When the image button is clicked, generate a new image
    btn_img.click(fn=MyGirlfriend.generate_img, inputs=[settings, characteristics, guidance_scale], outputs=[])
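
# Note (added commentary): queue() is needed because btn_load.click is registered
# with queue=True; MyGirlfriend.event_loop presumably streams image updates to the
# gallery, and a concurrency_count of 3 lets other callbacks (audio and image
# generation) run while that loop is active.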
interface.queue(concurrency_count=3).launch(debug=False, share=False)