
Commit c1fc690: introduce edit anything
1 parent: 6f90bfd

21 files changed: +133, -20 lines

main.py (+3, -2)

@@ -9,10 +9,11 @@
 parser.add_argument('--gpt_version', choices=['gpt-3.5-turbo', 'gpt4'], default='gpt-3.5-turbo')
 parser.add_argument('--image_caption', action='store_true', dest='image_caption', default=True, help='Set this flag to True if you want to use BLIP2 Image Caption')
 parser.add_argument('--dense_caption', action='store_true', dest='dense_caption', default=True, help='Set this flag to True if you want to use Dense Caption')
-parser.add_argument('--semantic_segment', action='store_true', dest='semantic_segment', default=False, help='Set this flag to True if you want to use semantic segmentation')
+parser.add_argument('--semantic_segment', action='store_true', dest='semantic_segment', default=True, help='Set this flag to True if you want to use semantic segmentation')
+parser.add_argument('--region_classify_model', choices=['ssa', 'edit_anything'], dest='region_classify_model', default='edit_anything', help='Select the region classification model: semantic segment anything or edit anything')
 parser.add_argument('--image_caption_device', choices=['cuda', 'cpu'], default='cuda', help='Select the device: cuda or cpu, gpu memory larger than 14G is recommended')
 parser.add_argument('--dense_caption_device', choices=['cuda', 'cpu'], default='cuda', help='Select the device: cuda or cpu, < 6G GPU is not recommended>')
-parser.add_argument('--semantic_segment_device', choices=['cuda', 'cpu'], default='cuda', help='Select the device: cuda or cpu, gpu memory larger than 14G is recommended')
+parser.add_argument('--semantic_segment_device', choices=['cuda', 'cpu'], default='cpu', help='Select the device: cuda or cpu, gpu memory larger than 14G is recommended. Make sure this model and the image_caption model are on the same device.')
 parser.add_argument('--contolnet_device', choices=['cuda', 'cpu'], default='cuda', help='Select the device: cuda or cpu, <6G GPU is not recommended>')

 args = parser.parse_args()
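The same options are added to main_gradio.py below. A minimal sketch of how the new flags parse (the command-line values are illustrative only; note that with action='store_true' combined with default=True, --semantic_segment is effectively always on unless the default is changed in code):

```python
# Sketch only: mirrors the argparse options added in this commit.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--semantic_segment', action='store_true', default=True)
parser.add_argument('--region_classify_model', choices=['ssa', 'edit_anything'],
                    default='edit_anything')
parser.add_argument('--semantic_segment_device', choices=['cuda', 'cpu'], default='cpu')

args = parser.parse_args(['--region_classify_model', 'ssa'])
print(args.semantic_segment, args.region_classify_model, args.semantic_segment_device)
# True ssa cpu
```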

main_gradio.py (+3, -2)

@@ -11,10 +11,11 @@
 parser.add_argument('--gpt_version', choices=['gpt-3.5-turbo', 'gpt4'], default='gpt-3.5-turbo')
 parser.add_argument('--image_caption', action='store_true', dest='image_caption', default=True, help='Set this flag to True if you want to use BLIP2 Image Caption')
 parser.add_argument('--dense_caption', action='store_true', dest='dense_caption', default=True, help='Set this flag to True if you want to use Dense Caption')
-parser.add_argument('--semantic_segment', action='store_true', dest='semantic_segment', default=False, help='Set this flag to True if you want to use semantic segmentation')
+parser.add_argument('--semantic_segment', action='store_true', dest='semantic_segment', default=True, help='Set this flag to True if you want to use semantic segmentation')
+parser.add_argument('--region_classify_model', choices=['ssa', 'edit_anything'], dest='region_classify_model', default='edit_anything', help='Select the region classification model: semantic segment anything or edit anything')
 parser.add_argument('--image_caption_device', choices=['cuda', 'cpu'], default='cpu', help='Select the device: cuda or cpu, gpu memory larger than 14G is recommended')
 parser.add_argument('--dense_caption_device', choices=['cuda', 'cpu'], default='cuda', help='Select the device: cuda or cpu, < 6G GPU is not recommended>')
-parser.add_argument('--semantic_segment_device', choices=['cuda', 'cpu'], default='cpu', help='Select the device: cuda or cpu, gpu memory larger than 14G is recommended')
+parser.add_argument('--semantic_segment_device', choices=['cuda', 'cpu'], default='cpu', help='Select the device: cuda or cpu, gpu memory larger than 14G is recommended. Make sure this model and the image_caption model are on the same device.')
 parser.add_argument('--contolnet_device', choices=['cuda', 'cpu'], default='cuda', help='Select the device: cuda or cpu, <6G GPU is not recommended>')

 args = parser.parse_args()
Several binary files changed (not shown).

models/blip2_model.py (+3, -1)

@@ -2,6 +2,7 @@
 import requests
 from transformers import Blip2Processor, Blip2ForConditionalGeneration, BlipProcessor, BlipForConditionalGeneration
 import torch
+from utils.util import resize_long_edge


 class ImageCaptioning:
@@ -20,12 +21,13 @@ def initialize_model(self):
         # )
         # for gpu with small memory
         processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-        model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+        model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", torch_dtype=self.data_type)
         model.to(self.device)
         return processor, model

     def image_caption(self, image_src):
         image = Image.open(image_src)
+        image = resize_long_edge(image, 384)
         inputs = self.processor(images=image, return_tensors="pt").to(self.device, self.data_type)
         generated_ids = self.model.generate(**inputs)
         generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
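For reference, a self-contained sketch of what the updated captioning path now does: load the BLIP base checkpoint in the configured dtype and resize the input to a 384-pixel long edge before encoding. The checkpoint name comes from the diff; the image path is a placeholder and a CUDA device with half precision is assumed.

```python
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

def resize_long_edge(image, target_size=384):
    # same resizing rule as the helper added in utils/util.py
    w, h = image.size
    scale = target_size / max(w, h)
    return image.resize((int(w * scale), int(h * scale)))

device, dtype = "cuda", torch.float16  # assumption: a GPU is available
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base", torch_dtype=dtype).to(device)

image = resize_long_edge(Image.open("example.jpg").convert("RGB"))  # placeholder path
inputs = processor(images=image, return_tensors="pt").to(device, dtype)
caption = processor.batch_decode(model.generate(**inputs), skip_special_tokens=True)[0].strip()
print(caption)
```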

models/grit_src/image_dense_captions.py (+2)

@@ -16,6 +16,7 @@

 from models.grit_src.grit.predictor import VisualizationDemo
 import json
+from utils.util import resize_long_edge_cv2


 # constants
@@ -62,6 +63,7 @@ def image_caption_api(image_src, device):
     demo = VisualizationDemo(cfg)
     if image_src:
         img = read_image(image_src, format="BGR")
+        img = resize_long_edge_cv2(img, 384)
         predictions, visualized_output = demo.run_on_image(img)
         new_caption = dense_pred_to_caption(predictions)
     return new_caption

models/image_text_transformation.py (+4, -2)

@@ -3,7 +3,7 @@
 from models.gpt_model import ImageToText
 from models.controlnet_model import TextToImage
 from models.region_semantic import RegionSemantic
-from utils.util import read_image_width_height, display_images_and_text
+from utils.util import read_image_width_height, display_images_and_text, resize_long_edge
 import argparse
 from PIL import Image
 import base64
@@ -33,13 +33,15 @@ def init_models(self):
         self.dense_caption_model = DenseCaptioning(device=self.args.dense_caption_device)
         self.gpt_model = ImageToText(openai_key)
         self.controlnet_model = TextToImage(device=self.args.contolnet_device)
-        self.region_semantic_model = RegionSemantic(device=self.args.semantic_segment_device)
+        self.region_semantic_model = RegionSemantic(device=self.args.semantic_segment_device, image_caption_model=self.image_caption_model, region_classify_model=self.args.region_classify_model)
         print('\033[1;32m' + "Model initialization finished!".center(50, '-') + '\033[0m')


     def image_to_text(self, img_src):
         # the information to generate paragraph based on the context
         self.ref_image = Image.open(img_src)
+        # resize image to long edge 384
+        self.ref_image = resize_long_edge(self.ref_image, 384)
         width, height = read_image_width_height(img_src)
         print(self.args)
         if self.args.image_caption:

models/region_semantic.py (+28, -9)

@@ -1,38 +1,57 @@
 from models.segment_models.semgent_anything_model import SegmentAnything
 from models.segment_models.semantic_segment_anything_model import SemanticSegment
+from models.segment_models.edit_anything_model import EditAnything


 class RegionSemantic():
-    def __init__(self, device):
+    def __init__(self, device, image_caption_model, region_classify_model='edit_anything'):
         self.device = device
+        self.image_caption_model = image_caption_model
+        self.region_classify_model = region_classify_model
         self.init_models()

     def init_models(self):
         self.segment_model = SegmentAnything(self.device)
-        self.semantic_segment_model = SemanticSegment(self.device)
-
-    def semantic_prompt_gen(self, anns):
+        if self.region_classify_model == 'ssa':
+            self.semantic_segment_model = SemanticSegment(self.device)
+        elif self.region_classify_model == 'edit_anything':
+            self.edit_anything_model = EditAnything(self.device, self.image_caption_model)
+            print('initialize edit anything model')
+        else:
+            raise ValueError("semantic_class_model must be 'ssa' or 'edit_anything'")
+
+    def semantic_prompt_gen(self, anns, topk=5):
         """
         filter too small objects and objects with low stability score
         anns: [{'class_name': 'person', 'bbox': [0.0, 0.0, 0.0, 0.0], 'size': [0, 0], 'stability_score': 0.0}, ...]
         semantic_prompt: "person: [0.0, 0.0, 0.0, 0.0]; ..."
         """
         # Sort annotations by area in descending order
         sorted_annotations = sorted(anns, key=lambda x: x['area'], reverse=True)
+        anns_len = len(sorted_annotations)
         # Select the top 10 largest regions
-        top_10_largest_regions = sorted_annotations[:10]
+        top_10_largest_regions = sorted_annotations[:min(anns_len, topk)]
         semantic_prompt = ""
-        print('\033[1;35m' + '*' * 100 + '\033[0m')
-        print("\nStep3, Semantic Prompt:")
         for region in top_10_largest_regions:
             semantic_prompt += region['class_name'] + ': ' + str(region['bbox']) + "; "
         print(semantic_prompt)
         print('\033[1;35m' + '*' * 100 + '\033[0m')
         return semantic_prompt

-    def region_semantic(self, img_src):
+    def region_semantic(self, img_src, region_classify_model='edit_anything'):
+        print('\033[1;35m' + '*' * 100 + '\033[0m')
+        print("\nStep3, Semantic Prompt:")
         anns = self.segment_model.generate_mask(img_src)
-        anns_w_class = self.semantic_segment_model.semantic_class_w_mask(img_src, anns)
+        if region_classify_model == 'ssa':
+            print('generate region supervision with blip2 model....\n')
+            anns_w_class = self.semantic_segment_model.semantic_class_w_mask(img_src, anns)
+            print('finished...\n')
+        elif region_classify_model == 'edit_anything':
+            print('generate region supervision with edit anything model....\n')
+            anns_w_class = self.edit_anything_model.semantic_class_w_mask(img_src, anns)
+            print('finished...\n')
+        else:
+            raise ValueError("semantic_class_model must be 'ssa' or 'edit_anything'")
         return self.semantic_prompt_gen(anns_w_class)

     def region_semantic_debug(self, img_src):
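A usage sketch of the reworked class, based only on what the diff shows. It assumes the caption-model object exposes .processor, .model, .device and .data_type (which is what EditAnything reads) and that ImageCaptioning takes a device argument; the image path is a placeholder.

```python
# Sketch only; constructor details beyond the diff are assumptions.
from models.blip2_model import ImageCaptioning
from models.region_semantic import RegionSemantic

caption_model = ImageCaptioning(device='cuda')  # assumed constructor signature
region_model = RegionSemantic(device='cuda',
                              image_caption_model=caption_model,
                              region_classify_model='edit_anything')

# segments the image with SAM, labels each region with BLIP,
# and returns a prompt of the form "class_name: [bbox]; ..."
prompt = region_model.region_semantic('examples/1.jpg')  # placeholder path
print(prompt)
```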
models/segment_models/edit_anything_model.py (new file, +50)

@@ -0,0 +1,50 @@
+import cv2
+import torch
+import mmcv
+import numpy as np
+from PIL import Image
+from utils.util import resize_long_edge
+
+class EditAnything:
+    def __init__(self, device, image_caption_model):
+        self.device = image_caption_model.device
+        self.data_type = image_caption_model.data_type
+        self.image_caption_model = image_caption_model
+
+    # working on parallelizing these images now
+    def region_classify_w_blip2(self, image):
+        inputs = self.image_caption_model.processor(images=image, return_tensors="pt").to(self.device, self.data_type)
+        generated_ids = self.image_caption_model.model.generate(**inputs)
+        generated_text = self.image_caption_model.processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+        return generated_text
+
+    def region_level_semantic_api(self, image, anns, topk=5):
+        """
+        rank regions by area, and classify each region with blip2
+        Args:
+            image: numpy array
+            topk: int
+        Returns:
+            topk_region_w_class_label: list of dict with key 'class_label'
+        """
+        topk_region_w_class_label = []
+        if len(anns) == 0:
+            return []
+        sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
+        for i in range(min(topk, len(sorted_anns))):
+            ann = anns[i]
+            m = ann['segmentation']
+            m_3c = m[:, :, np.newaxis]
+            m_3c = np.concatenate((m_3c, m_3c, m_3c), axis=2)
+            bbox = ann['bbox']
+            region = mmcv.imcrop(image * m_3c, np.array([bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]]), scale=1)
+            region_class_label = self.region_classify_w_blip2(region)
+            ann['class_name'] = region_class_label
+            # print(ann['class_label'], str(bbox))
+            topk_region_w_class_label.append(ann)
+        return topk_region_w_class_label
+
+    def semantic_class_w_mask(self, img_src, anns):
+        image = Image.open(img_src)
+        image = resize_long_edge(image, 384)
+        return self.region_level_semantic_api(image, anns)
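The core of region_level_semantic_api is masking the image with each SAM segmentation and cropping to the region's bounding box before captioning that crop. A numpy-only stand-in for this step (array slicing instead of mmcv.imcrop), with a toy mask, just to illustrate the shapes involved:

```python
import numpy as np

def crop_masked_region(image, ann):
    """Zero out pixels outside the mask, then crop to the region's bbox ([x, y, w, h])."""
    mask = ann['segmentation']                            # HxW boolean mask (SAM format)
    masked = image * np.repeat(mask[:, :, np.newaxis], 3, axis=2)
    x, y, w, h = (int(v) for v in ann['bbox'])
    return masked[y:y + h, x:x + w]

image = np.random.randint(0, 255, (384, 512, 3), dtype=np.uint8)
mask = np.zeros((384, 512), dtype=bool)
mask[100:200, 50:150] = True
region = crop_masked_region(image, {'segmentation': mask, 'bbox': [50, 100, 100, 100]})
print(region.shape)  # (100, 100, 3); this crop is what gets sent to BLIP for a label
```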

models/segment_models/semantic_segment_anything_model.py (+2)

@@ -10,6 +10,7 @@
 import pycocotools.mask as maskUtils
 from models.segment_models.configs.ade20k_id2label import CONFIG as CONFIG_ADE20K_ID2LABEL
 from models.segment_models.configs.coco_id2label import CONFIG as CONFIG_COCO_ID2LABEL
+from utils.util import resize_long_edge, resize_long_edge_cv2
 # from mmdet.core.visualization.image import imshow_det_bboxes # comment this line if you don't use mmdet

 nlp = spacy.load('en_core_web_sm')
@@ -113,6 +114,7 @@ def semantic_class_w_mask(self, img_src, anns, out_file_name="output/test.json",
         :return: dict('segmentation', 'area', 'bbox', 'predicted_iou', 'point_coords', 'stability_score', 'crop_box', "class_name", "class_proposals"})
         """
         img = mmcv.imread(img_src)
+        img = resize_long_edge_cv2(img, 384)
         oneformer_coco_seg = self.oneformer_segmentation(Image.fromarray(img), self.oneformer_coco_processor, self.oneformer_coco_model)
         oneformer_ade20k_seg = self.oneformer_segmentation(Image.fromarray(img), self.oneformer_ade20k_processor, self.oneformer_ade20k_model)
         bitmasks, class_names = [], []

models/segment_models/semgent_anything_model.py (+2, -1)

@@ -1,6 +1,6 @@
 import cv2
 from segment_anything import SamAutomaticMaskGenerator, sam_model_registry
-import torch
+from utils.util import resize_long_edge_cv2

 class SegmentAnything:
     def __init__(self, device, arch="vit_h", pretrained_weights="pretrained_models/sam_vit_h_4b8939.pth"):
@@ -16,5 +16,6 @@ def initialize_model(self, arch, pretrained_weights):
     def generate_mask(self, img_src):
         image = cv2.imread(img_src)
         image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        image = resize_long_edge_cv2(image, 384)
         anns = self.model.generate(image)
         return anns
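For context, a sketch of what generate_mask() now does end to end with the segment_anything package: load the ViT-H checkpoint named in the diff, resize the RGB image so its long edge is 384, and run the automatic mask generator. The image path is a placeholder and it assumes the weights have already been downloaded to the path shown.

```python
# Sketch only; checkpoint path comes from the diff, image path is a placeholder.
import cv2
from segment_anything import SamAutomaticMaskGenerator, sam_model_registry

sam = sam_model_registry["vit_h"](checkpoint="pretrained_models/sam_vit_h_4b8939.pth")
sam.to("cuda")
mask_generator = SamAutomaticMaskGenerator(sam)

image = cv2.cvtColor(cv2.imread("examples/1.jpg"), cv2.COLOR_BGR2RGB)
h, w = image.shape[:2]
scale = 384 / max(h, w)                      # the long-edge resize added in this commit
image = cv2.resize(image, (int(w * scale), int(h * scale)))

anns = mask_generator.generate(image)
print(len(anns), sorted(anns[0].keys()))     # each ann carries 'segmentation', 'bbox', 'area', ...
```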

output/1_result.jpg: binary image changed (-51.2 KB), not shown.

readme.md (+5, -3)

@@ -43,7 +43,8 @@
 <img src="examples/icon/news.gif" alt="Your Image Description" width=100> <strong><span style="font-size: 24px;">News</span></strong>
 </p>

-
+- 17/April/2023. In addition to semantic segment anything, we use [Edit Anything](https://github.com/sail-sg/EditAnything) to get region-level semantics.
+- 17/April/2023. Our project is online on Hugging Face. Have a try! [huggingface](https://huggingface.co/spaces/Awiny/Image2Paragraph/tree/main)
 - 14/April/2023. Our project is very popular on Twitter. See [the posted tweet](https://twitter.com/awinyimgprocess/status/1646225454599372800?s=46&t=HvOe9T2n35iFuCHP5aIHpQ) for details.

 ### To Do List
@@ -57,9 +58,10 @@
 - [x] Integrate GRIT into our code.
 - [x] Support GPT4 API.
 - [x] Notebook/Huggingface Space.
+- [x] Region Semantic Classification from Edit-Anything.
+- [x] Make the model lightweight.

 #### Doing
-- [ ] Make the model lightweight.
 - [ ] Replace ChatGPT with own trained LLM.
 - [ ] Other grounding text2image models instead of Canny ControlNet.
 - [ ] Show retrieval result in gradio.
@@ -179,4 +181,4 @@ If you have more suggestions or functions need to be implemented in this codebas

 ## Acknowledgment

-This work is based on [ChatGPT](http://chat.openai.com), [BLIP2](https://huggingface.co/spaces/Salesforce/BLIP2), [GRIT](https://github.com/JialianW/GRiT), [OFA](https://github.com/OFA-Sys/OFA), [Segment-Anything](https://segment-anything.com), [Semantic-Segment-Anything](https://github.com/fudan-zvg/Semantic-Segment-Anything), [ControlNet](https://github.com/lllyasviel/ControlNet).
+This work is based on [ChatGPT](http://chat.openai.com), [Edit_Anything](https://github.com/sail-sg/EditAnything), [BLIP2](https://huggingface.co/spaces/Salesforce/BLIP2), [GRIT](https://github.com/JialianW/GRiT), [OFA](https://github.com/OFA-Sys/OFA), [Segment-Anything](https://segment-anything.com), [Semantic-Segment-Anything](https://github.com/fudan-zvg/Semantic-Segment-Anything), [ControlNet](https://github.com/lllyasviel/ControlNet).

utils/__pycache__/util.cpython-38.pyc: binary file (677 Bytes), not shown.

utils/util.py (+31)

@@ -14,6 +14,37 @@ def read_image_width_height(image_path):
     width, height = image.size
     return width, height

+def resize_long_edge(image, target_size=384):
+    # Calculate the aspect ratio
+    width, height = image.size
+    aspect_ratio = float(width) / float(height)
+
+    # Determine the new dimensions
+    if width > height:
+        new_width = target_size
+        new_height = int(target_size / aspect_ratio)
+    else:
+        new_width = int(target_size * aspect_ratio)
+        new_height = target_size
+
+    # Resize the image
+    resized_image = image.resize((new_width, new_height), Image.ANTIALIAS)
+    return resized_image
+
+def resize_long_edge_cv2(image, target_size=384):
+    height, width = image.shape[:2]
+    aspect_ratio = float(width) / float(height)
+
+    if height > width:
+        new_height = target_size
+        new_width = int(target_size * aspect_ratio)
+    else:
+        new_width = target_size
+        new_height = int(target_size / aspect_ratio)
+
+    resized_image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_AREA)
+    return resized_image
+
 def display_images_and_text(source_image_path, generated_image, generated_paragraph, outfile_name):
     source_image = Image.open(source_image_path)
     # Create a new image that can fit the images and the text
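A quick check of the two new helpers (the image path is a placeholder). One caveat worth noting: Image.ANTIALIAS was removed in Pillow 10, so on newer Pillow versions Image.LANCZOS would be the drop-in replacement in resize_long_edge.

```python
# Sketch only; "examples/1.jpg" is a placeholder path.
import cv2
from PIL import Image
from utils.util import resize_long_edge, resize_long_edge_cv2

pil_img = Image.open("examples/1.jpg")
print(resize_long_edge(pil_img, 384).size)        # long edge is now 384

cv_img = cv2.imread("examples/1.jpg")
print(resize_long_edge_cv2(cv_img, 384).shape)    # long edge is now 384
```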
