Removes SV3D video_decoder, keeps SV3D image_decoder

Stability-AI · Mar 18, 2024
commit 30e4d32 · 1 parent b41860f
Show file tree

Hide file tree

Showing 11 changed files with 400 additions and 568 deletions.
diff --git a/configs/inference/sv3d_p.yaml b/configs/inference/sv3d_p.yaml
@@ -103,17 +103,16 @@ model:
         encoder_config:
           target: torch.nn.Identity
         decoder_config:
-          target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
+          target: sgm.modules.diffusionmodules.model.Decoder
           params:
-            attn_type: vanilla
+            attn_type: vanilla-xformers
             double_z: True
             z_channels: 4
             resolution: 256
             in_channels: 3
             out_ch: 3
             ch: 128
-            ch_mult: [1, 2, 4, 4]
+            ch_mult: [ 1, 2, 4, 4 ]
             num_res_blocks: 2
-            attn_resolutions: []
-            dropout: 0.0
-            video_kernel_size: [3, 1, 1]
+            attn_resolutions: [ ]
+            dropout: 0.0
diff --git a/configs/inference/sv3d_p_image_decoder.yaml b/configs/inference/sv3d_p_image_decoder.yaml
diff --git a/configs/inference/sv3d_u.yaml b/configs/inference/sv3d_u.yaml
@@ -91,17 +91,16 @@ model:
         encoder_config:
           target: torch.nn.Identity
         decoder_config:
-          target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
+          target: sgm.modules.diffusionmodules.model.Decoder
           params:
-            attn_type: vanilla
+            attn_type: vanilla-xformers
             double_z: True
             z_channels: 4
             resolution: 256
             in_channels: 3
             out_ch: 3
             ch: 128
-            ch_mult: [1, 2, 4, 4]
+            ch_mult: [ 1, 2, 4, 4 ]
             num_res_blocks: 2
-            attn_resolutions: []
-            dropout: 0.0
-            video_kernel_size: [3, 1, 1]
+            attn_resolutions: [ ]
+            dropout: 0.0
diff --git a/configs/inference/sv3d_u_image_decoder.yaml b/configs/inference/sv3d_u_image_decoder.yaml
diff --git a/scripts/demo/video_sampling.py b/scripts/demo/video_sampling.py
@@ -109,26 +109,6 @@
             "decoding_t": 14,
         },
     },
-    "sv3d_u_image_decoder": {
-        "T": 21,
-        "H": 576,
-        "W": 576,
-        "C": 4,
-        "f": 8,
-        "config": "configs/inference/sv3d_u_image_decoder.yaml",
-        "ckpt": "checkpoints/sv3d_u_image_decoder.safetensors",
-        "options": {
-            "discretization": 1,
-            "cfg": 2.5,
-            "sigma_min": 0.002,
-            "sigma_max": 700.0,
-            "rho": 7.0,
-            "guider": 3,
-            "force_uc_zero_embeddings": ["cond_frames", "cond_frames_without_noise"],
-            "num_steps": 50,
-            "decoding_t": 14,
-        },
-    },
     "sv3d_p": {
         "T": 21,
         "H": 576,
@@ -149,26 +129,6 @@
             "decoding_t": 14,
         },
     },
-    "sv3d_p_image_decoder": {
-        "T": 21,
-        "H": 576,
-        "W": 576,
-        "C": 4,
-        "f": 8,
-        "config": "configs/inference/sv3d_p_image_decoder.yaml",
-        "ckpt": "checkpoints/sv3d_p_image_decoder.safetensors",
-        "options": {
-            "discretization": 1,
-            "cfg": 2.5,
-            "sigma_min": 0.002,
-            "sigma_max": 700.0,
-            "rho": 7.0,
-            "guider": 3,
-            "force_uc_zero_embeddings": ["cond_frames", "cond_frames_without_noise"],
-            "num_steps": 50,
-            "decoding_t": 14,
-        },
-    },
 }
 
 

diff --git a/scripts/sampling/configs/sv3d_p.yaml b/scripts/sampling/configs/sv3d_p.yaml
@@ -3,7 +3,7 @@ model:
   params:
     scale_factor: 0.18215
     disable_first_stage_autocast: True
-    ckpt_path: checkpoints/sv3d_p.safetensors
+    ckpt_path: checkpoints/sv3d_p_image_decoder.safetensors
 
     denoiser_config:
       target: sgm.modules.diffusionmodules.denoiser.Denoiser
@@ -104,20 +104,19 @@ model:
         encoder_config:
           target: torch.nn.Identity
         decoder_config:
-          target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
+          target: sgm.modules.diffusionmodules.model.Decoder
           params:
-            attn_type: vanilla
+            attn_type: vanilla-xformers
             double_z: True
             z_channels: 4
             resolution: 256
             in_channels: 3
             out_ch: 3
             ch: 128
-            ch_mult: [1, 2, 4, 4]
+            ch_mult: [ 1, 2, 4, 4 ]
             num_res_blocks: 2
-            attn_resolutions: []
+            attn_resolutions: [ ]
             dropout: 0.0
-            video_kernel_size: [3, 1, 1]
 
     sampler_config:
       target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler