MegEngine
diff --git a/‎hubconf.py
+19-20 b/‎hubconf.py
+19-20
diff --git a/‎official/assets/norway_sample_2687.png
873 KB b/‎official/assets/norway_sample_2687.png
873 KB
diff --git a/‎official/assets/norway_segmentation.png
10.4 KB b/‎official/assets/norway_segmentation.png
10.4 KB
diff --git a/‎official/assets/test_000009.png
124 KB b/‎official/assets/test_000009.png
124 KB
diff --git a/‎official/assets/test_000010.png
113 KB b/‎official/assets/test_000010.png
113 KB
diff --git a/‎official/assets/test_depth.png
19.5 KB b/‎official/assets/test_depth.png
19.5 KB
diff --git a/‎official/assets/test_sample_255.png
135 KB b/‎official/assets/test_sample_255.png
135 KB
diff --git a/‎official/assets/total.png
1.94 MB b/‎official/assets/total.png
1.94 MB
diff --git a/‎official/multimodal/__init__.py
+1 b/‎official/multimodal/__init__.py
+1
diff --git a/‎official/multimodal/clip/simple_tokenizer.py
+23 b/‎official/multimodal/clip/simple_tokenizer.py
+23
diff --git a/‎official/multimodal/dalle/README.md
+57 b/‎official/multimodal/dalle/README.md
+57
diff --git a/‎official/multimodal/dalle/__init__.py
+12 b/‎official/multimodal/dalle/__init__.py
+12
@@ -9,7 +9,18 @@
     vit_b_16,
     vit_b_32,
     vit_l_14,
-    vit_l_14_336px
+    vit_l_14_336px,
+)
+from official.multimodal.dalle import (
+    Generator,
+    OpenAIDiscreteVAE,
+    OpenAIDiscreteVAEDecoder,
+    OpenAIDiscreteVAEEncoder,
+    VQGanVAE,
+    coco_512_16_16d_16h_80tsl,
+    openai_discrete_VAE_decoder,
+    openai_discrete_VAE_encoder,
+    vqgan_vae_1024,
 )
 from official.multimodal.taming_transformer import (
     ConditionalSampler,
@@ -20,7 +31,7 @@
     s_flckr_transformer,
     vqgan_gumbel_f8,
     vqgan_imagenet_f16_1024,
-    vqgan_imagenet_f16_16384
+    vqgan_imagenet_f16_16384,
 )
 from official.nlp.bert.model import (
     cased_L_12_H_768_A_12,
@@ -30,7 +41,7 @@
     uncased_L_12_H_768_A_12,
     uncased_L_24_H_1024_A_16,
     wwm_cased_L_24_H_1024_A_16,
-    wwm_uncased_L_24_H_1024_A_16
+    wwm_uncased_L_24_H_1024_A_16,
 )
 from official.quantization.models import quantized_resnet18
 from official.vision.classification.resnet.model import (
@@ -43,13 +54,13 @@
     resnet101,
     resnet152,
     resnext50_32x4d,
-    resnext101_32x8d
+    resnext101_32x8d,
 )
 from official.vision.classification.shufflenet.model import (
     shufflenet_v2_x0_5,
     shufflenet_v2_x1_0,
     shufflenet_v2_x1_5,
-    shufflenet_v2_x2_0
+    shufflenet_v2_x2_0,
 )
 from official.vision.detection.configs import (
     atss_res18_coco_3x_800size,
@@ -76,30 +87,18 @@
     retinanet_res34_coco_3x_800size,
     retinanet_res50_coco_3x_800size,
     retinanet_res101_coco_3x_800size,
-    retinanet_resx101_coco_2x_800size
+    retinanet_resx101_coco_2x_800size,
 )
 from official.vision.detection.models import ATSS, FCOS, FasterRCNN, FreeAnchor, RetinaNet
 from official.vision.detection.tools.utils import DetEvaluator
 from official.vision.keypoints.inference import KeypointEvaluator
 from official.vision.keypoints.models import (
     simplebaseline_res50,
     simplebaseline_res101,
-    simplebaseline_res152
+    simplebaseline_res152,
 )
 from official.vision.segmentation.configs import (
     deeplabv3plus_res101_cityscapes_768size,
-    deeplabv3plus_res101_voc_512size
+    deeplabv3plus_res101_voc_512size,
 )
 from official.vision.segmentation.models import DeepLabV3Plus
-from official.multimodal.clip.models import (
-    rn50,
-    rn101,
-    rn50x4,
-    rn50x16,
-    rn50x64,
-    vit_b_32,
-    vit_b_16,
-    vit_l_14,
-    vit_l_14_336px,
-)
-from official.multimodal.clip.inference_utils import ClipInferenceUtils
@@ -0,0 +1 @@
+from .dalle.dalle import DALLE
@@ -141,6 +141,29 @@ def decode(self, tokens):
             'utf-8', errors="replace").replace('</w>', ' ')
         return text
 
+    def tokenize(  # encode text(s) into one zero-padded int32 id matrix, returned via mge.tensor
+        self,
+        texts: Union[str, List[str]],  # a single string or a list of strings
+        context_length: int = 77,  # number of columns (token slots) in each output row
+        truncate_text: bool = False  # True: cut over-long inputs; False: raise RuntimeError
+    ):
+        if isinstance(texts, str):
+            texts = [texts]  # treat a lone string as a batch of one
+
+        all_tokens = [self.encode(text) for text in texts]
+        result = np.zeros((len(all_tokens), context_length), dtype=np.int32)  # zeros act as padding
+
+        for i, tokens in enumerate(all_tokens):
+            if len(tokens) > context_length:
+                if truncate_text:
+                    tokens = tokens[:context_length]  # keep only the leading context_length ids
+                else:
+                    raise RuntimeError(
+                        f"Input {texts[i]} is too long for context length {context_length}")
+            result[i, :len(tokens)] = tokens  # left-aligned; the rest of the row stays 0
+
+        return mge.tensor(result)
+
 
 def tokenize(texts: Union[str, List[str]], context_length: int = 77, truncate: bool = False):
     """
 
@@ -0,0 +1,57 @@
+# DALLE
+
+此仓库包含MegEngine实现的多模态模型DALLE以及文生图代码，但不包含训练代码。
+
+## 图像重建
+
+对于给定的大小为256x256的归一化四维输入（即下方代码中的`input`），可以使用如下方式进行重建：
+
+```python
+from official.multimodal.dalle.vae import OpenAIDiscreteVAE
+from official.multimodal.big_sleep.big_sleep import save_images
+
+
+vae = OpenAIDiscreteVAE(True)
+
+img_seq = vae.get_codebook_indices(input)
+
+reconstructed_image = vae.decode(img_seq)
+
+save_images(reconstructed_image, './image.png')
+
+```
+
+
+
+## 文生图
+
+可以使用以下代码体验文生图的功能。运行前需要先下载[dalle_new_variety.bpe](https://data.megengine.org.cn/research/multimodality/dalle_new_variety.bpe)文件，并通过`bpe_path`参数指定其路径。
+
+```python
+from official.multimodal.dalle import coco_512_16_16d_16h_80tsl
+from official.multimodal.dalle import Generator
+
+dalle = coco_512_16_16d_16h_80tsl()
+
+generator = Generator(
+    dalle,
+    texts = ['A tower has a clock on it on a day with a blue sky'],
+    num_images=64,
+    batch_size=4,
+    bpe_path = './dalle_new_variety.bpe',
+    root='./dalle'
+)
+
+generator()
+```
+
+生成结果如下所示：
+
+![res](../../assets/total.png)
+
+
+## 参考
+
+[DALLE-pytorch](https://github.com/lucidrains/DALLE-pytorch)
+
+[DALLE-pytorch-discussions](https://github.com/lucidrains/DALLE-pytorch/discussions/335)
@@ -0,0 +1,12 @@
+from .dalle import DALLE
+from .generate import Generator
+from .pretrained import coco_512_16_16d_16h_80tsl
+from .vae import (
+    OpenAIDiscreteVAE,
+    OpenAIDiscreteVAEDecoder,
+    OpenAIDiscreteVAEEncoder,
+    VQGanVAE,
+    openai_discrete_VAE_decoder,
+    openai_discrete_VAE_encoder
+)
+from .vae.vqgan_vae import vqgan_vae_1024