@@ -231,7 +231,10 @@ def __init__(
         dropout: float = 0.5,
         activation: nn.Module = nn.SiLU,
         verbose: bool = False,
-        init_channels: int = 64
+        init_channels: int = 64,
+        attention: bool = True,
+        attention_heads: int = 4,
+        attention_ff_dim: int = None
     ) -> None:
         """Constructor of UNet.

@@ -251,6 +254,14 @@ def __init__(
             activation function to be used
         verbose
             verbose printing of tensor shapes for debugging
+        init_channels
+            number of channels the input is initially transformed to (usually 64, 128, ...)
+        attention
+            whether to use self-attention layers
+        attention_heads
+            number of attention heads to be used
+        attention_ff_dim
+            hidden dimension of the feedforward layer in the self-attention module; None defaults to the input dimension
         """
         super().__init__()
         self.num_layers = num_encoding_blocks
@@ -263,6 +274,9 @@ def __init__(
         self.activation = activation
         self.verbose = verbose
         self.init_channels = init_channels
+        self.attention = attention
+        self.attention_heads = attention_heads
+        self.attention_ff_dim = attention_ff_dim

         self.encoding_channels, self.decoding_channels = self._get_channel_lists(init_channels, num_encoding_blocks)

@@ -273,7 +287,10 @@ def __init__(
             nn.Dropout(self.dropout)
         )

-        self.encoder = nn.ModuleList([EncodingBlock(self.encoding_channels[i], self.encoding_channels[i + 1], time_emb_size, kernel_size, dropout, self.activation, verbose) for i in range(len(self.encoding_channels[:-1]))])
+        if attention:
+            self.encoder = nn.ModuleList([AttentionEncodingBlock(self.encoding_channels[i], self.encoding_channels[i + 1], time_emb_size, kernel_size, dropout, self.activation, verbose, attention_heads, attention_ff_dim) for i in range(len(self.encoding_channels[:-1]))])
+        else:
+            self.encoder = nn.ModuleList([EncodingBlock(self.encoding_channels[i], self.encoding_channels[i + 1], time_emb_size, kernel_size, dropout, self.activation, verbose) for i in range(len(self.encoding_channels[:-1]))])

         self.bottleneck = nn.Sequential(
             nn.Conv2d(self.encoding_channels[-1], self.encoding_channels[-1] * 2, kernel_size=self.kernel_size, padding="same"),
@@ -286,8 +303,11 @@ def __init__(
             nn.Dropout(self.dropout)
         )

-        self.decoder = nn.ModuleList([DecodingBlock(self.decoding_channels[i], self.decoding_channels[i + 1], time_emb_size, kernel_size, dropout, self.activation, verbose) for i in range(len(self.encoding_channels[:-1]))])
-
+        if attention:
+            self.decoder = nn.ModuleList([AttentionDecodingBlock(self.decoding_channels[i], self.decoding_channels[i + 1], time_emb_size, kernel_size, dropout, self.activation, verbose, attention_heads, attention_ff_dim) for i in range(len(self.encoding_channels[:-1]))])
+        else:
+            self.decoder = nn.ModuleList([DecodingBlock(self.decoding_channels[i], self.decoding_channels[i + 1], time_emb_size, kernel_size, dropout, self.activation, verbose) for i in range(len(self.encoding_channels[:-1]))])
+
         self.out_conv = nn.Conv2d(init_channels, in_channels, kernel_size=kernel_size, padding="same")

     def _get_channel_lists(self, start_channels, num_layers):
@@ -367,4 +387,107 @@ def _check_sizes(self, x):
         heights = [(elem.is_integer() and (elem % 2 == 0)) for elem in heights]
         if (False in widths) or (False in heights):
             return False
-        return True
+        return True
+
+class SelfAttention(nn.Module):
+    def __init__(
+        self,
+        channels: int,
+        num_heads: int,
+        dropout: float,
+        dim_feedforward: int = None,
+        activation: nn.Module = nn.SiLU
+    ) -> None:
+        """Constructor of SelfAttention module.
+
+        Implementation of a self-attention layer for image data.
+
+        Parameters
+        ----------
+        channels
+            number of input channels
+        num_heads
+            number of desired attention heads
+        dropout
+            dropout probability value
+        dim_feedforward
+            dimension of the hidden layer in the feedforward NN, defaults to the number of input channels
+        activation
+            activation function to be used, as uninstantiated nn.Module
+        """
+        super().__init__()
+        self.channels = channels
+        self.num_heads = num_heads
+        self.dropout = dropout
+        if dim_feedforward is not None:
+            self.dim_feedforward = dim_feedforward
+        else:
+            self.dim_feedforward = channels
+        self.activation = activation()
+        self.attention_layer = nn.TransformerEncoderLayer(
+            channels,
+            num_heads,
+            self.dim_feedforward,
+            dropout,
+            self.activation,
+            batch_first=True
+        )
+
+    def forward(self, x: Float[Tensor, "batch channels height width"]) -> Float[Tensor, "batch channels height width"]:
+        """Forward method of SelfAttention module.
+
+        Parameters
+        ----------
+        x
+            input tensor
+
+        Returns
+        -------
+        out
+            output tensor
+        """
+        # flatten the spatial grid into a token sequence and put the feature dimension (channels) last
+        orig_size = x.size()
+        x = x.view(-1, x.shape[1], x.shape[2] * x.shape[3]).swapaxes(1, 2)
+        x = self.attention_layer(x)
+        return x.swapaxes(1, 2).view(*orig_size)
+
+class AttentionEncodingBlock(EncodingBlock):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        time_embedding_size: int,
+        kernel_size: int = 3,
+        dropout: float = 0.5,
+        activation: nn.Module = nn.SiLU,
+        verbose: bool = False,
+        attention_heads: int = 4,
+        attention_ff_dim: int = None
+    ) -> None:
+        super().__init__(in_channels, out_channels, time_embedding_size, kernel_size, dropout, activation, verbose)
+        self.sa = SelfAttention(out_channels, attention_heads, dropout, attention_ff_dim, activation)
+
+    def forward(self, x: Tensor, time_embedding: Tensor) -> Tuple[Tensor, Tensor]:
+        out, skip = super().forward(x, time_embedding)
+        return self.sa(out), skip
+
+class AttentionDecodingBlock(DecodingBlock):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        time_embedding_size: int,
+        kernel_size: int = 3,
+        dropout: float = 0.5,
+        activation: nn.Module = nn.SiLU,
+        verbose: bool = False,
+        attention_heads: int = 4,
+        attention_ff_dim: int = None
+    ) -> None:
+        super().__init__(in_channels, out_channels, time_embedding_size, kernel_size, dropout, activation, verbose)
+        self.sa = SelfAttention(out_channels, attention_heads, dropout, attention_ff_dim, activation)
+
+    def forward(self, x: Tensor, skip: Tensor, time_embedding: Tensor = None) -> Tensor:
+        out = super().forward(x, skip, time_embedding)
+        return self.sa(out)
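
Note on the reshaping in SelfAttention.forward: each spatial position is treated as a token, so a (batch, channels, height, width) feature map is flattened to a (batch, height*width, channels) sequence for nn.TransformerEncoderLayer (hence batch_first=True) and folded back afterwards. Below is a minimal sketch of that round-trip with a standalone layer; the tensor shapes and hyperparameters are illustrative assumptions, not values from this PR.

import torch
from torch import nn

# dummy feature map: batch=2, channels=64, 16x16 spatial grid (illustrative only)
x = torch.randn(2, 64, 16, 16)

# same layer configuration SelfAttention builds internally:
# d_model=channels, nhead=num_heads, dim_feedforward defaulting to channels
layer = nn.TransformerEncoderLayer(64, 4, 64, dropout=0.1, activation=nn.SiLU(), batch_first=True)

orig_size = x.size()
tokens = x.view(-1, x.shape[1], x.shape[2] * x.shape[3]).swapaxes(1, 2)  # (2, 256, 64): one token per pixel
out = layer(tokens).swapaxes(1, 2).view(*orig_size)                      # back to (2, 64, 16, 16)
assert out.shape == x.shape

Since nn.MultiheadAttention splits the embedding across heads, channels must be divisible by attention_heads (64 and 4 above).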
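
At the model level, the new flags are plumbed through the UNet constructor. A hedged construction sketch follows; the other arguments (in_channels, time_emb_size, num_encoding_blocks, kernel_size) only appear as context in this diff, so the keyword usage and the values chosen below are assumptions, not the project's defaults.

# hypothetical instantiation; values are illustrative
model = UNet(
    in_channels=3,
    time_emb_size=256,
    num_encoding_blocks=4,
    init_channels=64,
    attention=True,         # use AttentionEncodingBlock / AttentionDecodingBlock
    attention_heads=4,
    attention_ff_dim=None,  # feedforward width falls back to each block's channel count
)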