Skip to content

Commit 7d2ff30

Browse files
authored
Add video demo for ChatBot (IDEA-Research#121)
* update readme * add video * Update README.md * remove video
1 parent 507fcc3 commit 7d2ff30

File tree

2 files changed

+32
-23
lines changed

2 files changed

+32
-23
lines changed

Diff for: README.md

+9-7
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ The **core idea** behind this project is to **combine the strengths of different
1515
- The combination of `BLIP + Grounding DINO + SAM` for **automatic labeling system**!
1616
- The combination of `Grounding DINO + SAM + Stable-diffusion` for **data-factory, generating new data**!
1717
- The combination of `Whisper + Grounding DINO + SAM` to **detect and segment anything with speech**!
18+
- The chatbot **for the above tools** with better reasoning!
1819

1920
**🔥 🔈Speak to edit🎨: Whisper + ChatGPT + Grounded-SAM + SD**
2021

@@ -32,10 +33,6 @@ Using BLIP to generate caption, extracting tags with ChatGPT, and using Grounded
3233

3334
![](./assets/automatic_label_output_demo3.jpg)
3435

35-
**ChatBot**
36-
![](./assets/chatbot_demo.png)
37-
38-
3936
**Imagine Space**
4037

4138
Some possible avenues for future work ...
@@ -51,13 +48,18 @@ Some possible avenues for future work ...
5148
**Tips**
5249
- If you want to detect multiple objects in one sentence with [Grounding DINO](https://github.com/IDEA-Research/GroundingDINO), we suggest separating each name with `.` . An example: `cat . dog . chair .`
5350

54-
## :fire: What's New
55-
- 🆕 Release the interactive fashion-edit playground in [here](https://github.com/IDEA-Research/Grounded-Segment-Anything/tree/humanFace). Run in the notebook, just click for annotating points for further segmentation. Enjoy it!
51+
## What's New
5652

53+
- :fire: **ChatBot** for our project is built!
54+
55+
https://user-images.githubusercontent.com/24236723/231955561-2ae4ec1a-c75f-4cc5-9b7b-517aa1432123.mp4
5756

58-
<img src="https://github.com/IDEA-Research/Grounded-Segment-Anything/blob/humanFace/assets/interactive-fashion-edit.png" width="500" height="260"/><img src="https://github.com/IDEA-Research/Grounded-Segment-Anything/blob/humanFace/assets/interactive-mark.gif" width="250" height="250"/>
5957

6058

59+
- 🆕 Release the interactive fashion-edit playground in [here](https://github.com/IDEA-Research/Grounded-Segment-Anything/tree/humanFace). Run in the notebook, just click for annotating points for further segmentation. Enjoy it!
60+
61+
62+
<img src="https://github.com/IDEA-Research/Grounded-Segment-Anything/blob/humanFace/assets/interactive-fashion-edit.png" width="500" height="260"/><img src="https://github.com/IDEA-Research/Grounded-Segment-Anything/blob/humanFace/assets/interactive-mark.gif" width="250" height="250"/>
6163

6264
- :new: Checkout our related human-face-edit branch [here](https://github.com/IDEA-Research/Grounded-Segment-Anything/tree/humanFace). We'll keep updating this branch with more interesting features. Here are some examples:
6365

Diff for: chatbot.py

+23-16
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import argparse
1313
import inspect
1414

15+
import shutil
1516
import torchvision
1617
import whisper
1718
import matplotlib.pyplot as plt
@@ -1216,7 +1217,7 @@ def inference_auto_segment_object(self, image_path):
12161217
text_prompt = generate_tags(caption, split=",")
12171218
print(f"\nCaption: {caption}")
12181219
print(f"Tags: {text_prompt}")
1219-
updated_image_path, pred_phrases = self._segment_object(image_path, text_prompt, func_name="seg-objects")
1220+
updated_image_path, pred_phrases = self._segment_object(image_path, text_prompt, func_name="auto-label")
12201221
caption = check_caption(caption, pred_phrases)
12211222
print(f"Revise caption with number: {caption}")
12221223
print(f"Processed SegmentMultiObject, Input Image: {image_path}, Caption: {caption}, "
@@ -1251,8 +1252,8 @@ def _inpainting(self, image_path, to_be_replaced_txt, replace_with_txt, func_nam
12511252
)
12521253
# inpainting pipeline
12531254
mask = masks[0][0].cpu().numpy() # simply choose the first mask, which will be refine in the future release
1254-
mask_pil = Image.fromarray(mask)
1255-
image_pil = Image.fromarray(image)
1255+
mask_pil = Image.fromarray(mask).resize((512, 512))
1256+
image_pil = Image.fromarray(image).resize((512, 512))
12561257
image = self.sd_pipe(prompt=replace_with_txt, image=image_pil, mask_image=mask_pil).images[0]
12571258
updated_image_path = get_new_image_name(image_path, func_name)
12581259
image.save(updated_image_path)
@@ -1313,18 +1314,23 @@ def run_text(self, text, state):
13131314
return state, state
13141315

13151316
def run_image(self, image, state, txt, lang):
1316-
image_filename = os.path.join('image', f"{str(uuid.uuid4())[:8]}.png")
1317-
print("======>Auto Resize Image...")
1318-
img = Image.open(image.name)
1319-
width, height = img.size
1320-
ratio = min(512 / width, 512 / height)
1321-
width_new, height_new = (round(width * ratio), round(height * ratio))
1322-
width_new = int(np.round(width_new / 64.0)) * 64
1323-
height_new = int(np.round(height_new / 64.0)) * 64
1324-
img = img.resize((width_new, height_new))
1325-
img = img.convert('RGB')
1326-
img.save(image_filename, "PNG")
1327-
print(f"Resize image form {width}x{height} to {width_new}x{height_new}")
1317+
# image_filename = os.path.join('image', f"{str(uuid.uuid4())[:8]}.png")
1318+
# print("======>Auto Resize Image...")
1319+
# img = Image.open(image.name)
1320+
# width, height = img.size
1321+
# ratio = min(512 / width, 512 / height)
1322+
# width_new, height_new = (round(width * ratio), round(height * ratio))
1323+
# width_new = int(np.round(width_new / 64.0)) * 64
1324+
# height_new = int(np.round(height_new / 64.0)) * 64
1325+
# img = img.resize((width_new, height_new))
1326+
# img = img.convert('RGB')
1327+
# img.save(image_filename)
1328+
# img.save(image_filename, "PNG")
1329+
# print(f"Resize image form {width}x{height} to {width_new}x{height_new}")
1330+
## Directly use original image for better results
1331+
suffix = image.name.split('.')[-1]
1332+
image_filename = os.path.join('image', f"{str(uuid.uuid4())[:8]}.{suffix}")
1333+
shutil.copy(image.name, image_filename)
13281334
if 'Grounded_dino_sam_inpainting' in self.models:
13291335
description = self.models['Grounded_dino_sam_inpainting'].inference_caption(image_filename)
13301336
else:
@@ -1388,7 +1394,7 @@ def speech_recognition(speech_file):
13881394

13891395
if __name__ == '__main__':
13901396
load_dict = {'Grounded_dino_sam_inpainting': 'cuda:0'}
1391-
# load_dict = {'ImageCaptioning': 'cuda:0'}
1397+
# load_dict = {'ImageCaptioning': 'cuda:0'}
13921398

13931399
bot = ConversationBot(load_dict)
13941400

@@ -1451,3 +1457,4 @@ def speech_recognition(speech_file):
14511457
clear.click(lambda: [], None, state)
14521458

14531459
demo.launch(server_name="0.0.0.0", server_port=10010)
1460+

0 commit comments

Comments
 (0)