Add video demo for ChatBot (IDEA-Research#121)

Andy1621 · web-flow · commit 7d2ff303ddf1 · 2023-04-14T14:15:03.000+08:00
* update readme

* add video

* Update README.md

* remove video
diff --git a/README.md b/README.md
@@ -15,6 +15,7 @@ The **core idea** behind this project is to **combine the strengths of different
 - The combination of `BLIP + Grounding DINO + SAM` for **automatic labeling system**!
 - The combination of `Grounding DINO + SAM + Stable-diffusion` for **data-factory, generating new data**!
 - The combination of `Whisper + Grounding DINO + SAM` to **detect and segment anything with speech**!
+- The chatbot **for the above tools** with better reasoning!
 
 **🔥 🔈Speak to edit🎨: Whisper + ChatGPT + Grounded-SAM + SD**
 
@@ -32,10 +33,6 @@ Using BLIP to generate caption, extracting tags with ChatGPT, and using Grounded
 
 ![](./assets/automatic_label_output_demo3.jpg)
 
-**ChatBot**
-![](./assets/chatbot_demo.png)
-
-
 **Imagine Space**
 
 Some possible avenues for future work ...
@@ -51,13 +48,18 @@ Some possible avenues for future work ...
 **Tips**
 - If you want to detect multiple objects in one sentence with [Grounding DINO](https://github.com/IDEA-Research/GroundingDINO), we suggest separating each name with `.` . An example: `cat . dog . chair .`
 
-## :fire: What's New 
-- 🆕 Release the interactive fashion-edit playground in [here](https://github.com/IDEA-Research/Grounded-Segment-Anything/tree/humanFace). Run in the notebook, just click for annotating points for further segmentation. Enjoy it! 
+## What's New
 
+- :fire: **ChatBot** for our project is built!
+
+https://user-images.githubusercontent.com/24236723/231955561-2ae4ec1a-c75f-4cc5-9b7b-517aa1432123.mp4
 
-  <img src="https://github.com/IDEA-Research/Grounded-Segment-Anything/blob/humanFace/assets/interactive-fashion-edit.png" width="500" height="260"/><img src="https://github.com/IDEA-Research/Grounded-Segment-Anything/blob/humanFace/assets/interactive-mark.gif" width="250" height="250"/>
 
 
+- 🆕 Released the interactive fashion-edit playground [here](https://github.com/IDEA-Research/Grounded-Segment-Anything/tree/humanFace). Run it in the notebook and just click to annotate points for further segmentation. Enjoy it!
+
+
+  <img src="https://github.com/IDEA-Research/Grounded-Segment-Anything/blob/humanFace/assets/interactive-fashion-edit.png" width="500" height="260"/><img src="https://github.com/IDEA-Research/Grounded-Segment-Anything/blob/humanFace/assets/interactive-mark.gif" width="250" height="250"/>
 
 - :new: Check out our related human-face-edit branch [here](https://github.com/IDEA-Research/Grounded-Segment-Anything/tree/humanFace). We'll keep updating this branch with more interesting features. Here are some examples:
 
diff --git a/chatbot.py b/chatbot.py
@@ -12,6 +12,7 @@
 import argparse
 import inspect
 
+import shutil
 import torchvision
 import whisper
 import matplotlib.pyplot as plt
@@ -1216,7 +1217,7 @@ def inference_auto_segment_object(self, image_path):
         text_prompt = generate_tags(caption, split=",")
         print(f"\nCaption: {caption}")
         print(f"Tags: {text_prompt}")
-        updated_image_path, pred_phrases = self._segment_object(image_path, text_prompt, func_name="seg-objects")
+        updated_image_path, pred_phrases = self._segment_object(image_path, text_prompt, func_name="auto-label")
         caption = check_caption(caption, pred_phrases)
         print(f"Revise caption with number: {caption}")
         print(f"Processed SegmentMultiObject, Input Image: {image_path}, Caption: {caption}, "
@@ -1251,8 +1252,8 @@ def _inpainting(self, image_path, to_be_replaced_txt, replace_with_txt, func_nam
         )
         # inpainting pipeline
         mask = masks[0][0].cpu().numpy() # simply choose the first mask, which will be refine in the future release
-        mask_pil = Image.fromarray(mask)
-        image_pil = Image.fromarray(image)
+        mask_pil = Image.fromarray(mask).resize((512, 512))
+        image_pil = Image.fromarray(image).resize((512, 512))
         image = self.sd_pipe(prompt=replace_with_txt, image=image_pil, mask_image=mask_pil).images[0]
         updated_image_path = get_new_image_name(image_path, func_name)
         image.save(updated_image_path)
@@ -1313,18 +1314,23 @@ def run_text(self, text, state):
         return state, state
 
     def run_image(self, image, state, txt, lang):
-        image_filename = os.path.join('image', f"{str(uuid.uuid4())[:8]}.png")
-        print("======>Auto Resize Image...")
-        img = Image.open(image.name)
-        width, height = img.size
-        ratio = min(512 / width, 512 / height)
-        width_new, height_new = (round(width * ratio), round(height * ratio))
-        width_new = int(np.round(width_new / 64.0)) * 64
-        height_new = int(np.round(height_new / 64.0)) * 64
-        img = img.resize((width_new, height_new))
-        img = img.convert('RGB')
-        img.save(image_filename, "PNG")
-        print(f"Resize image form {width}x{height} to {width_new}x{height_new}")
+        # image_filename = os.path.join('image', f"{str(uuid.uuid4())[:8]}.png")
+        # print("======>Auto Resize Image...")
+        # img = Image.open(image.name)
+        # width, height = img.size
+        # ratio = min(512 / width, 512 / height)
+        # width_new, height_new = (round(width * ratio), round(height * ratio))
+        # width_new = int(np.round(width_new / 64.0)) * 64
+        # height_new = int(np.round(height_new / 64.0)) * 64
+        # img = img.resize((width_new, height_new))
+        # img = img.convert('RGB')
+        # img.save(image_filename)
+        # img.save(image_filename, "PNG")
+        # print(f"Resize image form {width}x{height} to {width_new}x{height_new}")
+        ## Directly use original image for better results
+        suffix = image.name.split('.')[-1] 
+        image_filename = os.path.join('image', f"{str(uuid.uuid4())[:8]}.{suffix}")
+        shutil.copy(image.name, image_filename)
         if 'Grounded_dino_sam_inpainting' in self.models:
             description = self.models['Grounded_dino_sam_inpainting'].inference_caption(image_filename)
         else:
@@ -1388,7 +1394,7 @@ def speech_recognition(speech_file):
 
 if __name__ == '__main__':
     load_dict = {'Grounded_dino_sam_inpainting': 'cuda:0'}
-#     load_dict = {'ImageCaptioning': 'cuda:0'}
+    # load_dict = {'ImageCaptioning': 'cuda:0'}
     
     bot = ConversationBot(load_dict)
 
@@ -1451,3 +1457,4 @@ def speech_recognition(speech_file):
         clear.click(lambda: [], None, state)
 
     demo.launch(server_name="0.0.0.0", server_port=10010)
+