
Commit 2ae9ec1 — Add titok

1 parent d0754ac commit 2ae9ec1

File tree: 13 files changed, +857 −58 lines

Diff for: README.md (+1 −1)

@@ -19,7 +19,7 @@ Implement visual tokenizers with PyTorch.
 - [ ] Index Backpropagation Quantization (IBQ)
 - [ ] Grouped Spherical Quantization (GSQ)
 
-**ImageNet 256x256 Re-implementation**:
+**ImageNet 256x256 Reproduction**:
 
 - [x] VQGAN (Taming-Transformers)
 - [x] VQGAN (LlamaGen)

Diff for: configs/imagenet256/vqgan-titok.yaml (new file, +95 lines)

seed: 8888

data:
  name: imagenet
  root: ~/data/ImageNet/ILSVRC2012/Images
  img_size: 256
  crop: random

dataloader:
  num_workers: 4
  pin_memory: true
  prefetch_factor: 2

encoder:
  target: models.autoencoder.titok_net.Encoder
  params:
    in_channels: 3
    image_size: 256
    patch_size: 16
    embed_dim: 768  # base
    n_heads: 12     # base
    n_layers: 12    # base
    n_tokens: 64

decoder:
  target: models.autoencoder.titok_net.Decoder
  params:
    out_channels: 3
    image_size: 256
    patch_size: 16
    embed_dim: 768  # base
    n_heads: 12     # base
    n_layers: 12    # base
    n_tokens: 64

quantizer:
  target: models.quantizer.VectorQuantizer
  params:
    codebook_num: 4096
    codebook_dim: 12
    l2_norm: True

disc:
  target: models.discriminator.TitokGANDiscriminator

train:
  n_steps: 1000000
  batch_size: 256
  micro_batch_size: ~

  type_rec: l2
  coef_rec: 1.0

  coef_lpips: 1.0

  type_perc: convnext_s
  coef_perc: 0.1

  coef_commit: 0.25
  coef_vq: 1.0

  coef_adv: 0.1
  start_adv: 200000
  coef_lecam_reg: 0.001

  ema:
    decay: 0.9999
    ema_warmup_type: crowsonkb

  clip_grad_norm: 1.0

  print_freq: 500
  sample_freq: 10000
  save_freq: 50000

  optim:
    target: torch.optim.AdamW
    params:
      lr: 0.0001
      betas: [0.9, 0.999]
      weight_decay: 0.0001

  optim_d:
    target: torch.optim.AdamW
    params:
      lr: 0.0001
      betas: [0.9, 0.999]
      weight_decay: 0.0001

  sched:
    target: utils.scheduler.CosineMinimumWarmupLR
    params:
      warmup_steps: 10000
      training_steps: 1000000
      min_lr: 0.00001
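The config follows a `target`/`params` convention: each component names a class by its import path and passes keyword arguments to its constructor. Below is a minimal sketch of how such a file could be consumed, assuming it is run from the repository root, using OmegaConf for loading and a hypothetical `instantiate_from_config` helper (the repo's actual builder may differ):

```python
import importlib

from omegaconf import OmegaConf


def instantiate_from_config(conf):
    """Import `conf.target` and construct it with `conf.params` (if any)."""
    module_name, cls_name = conf['target'].rsplit('.', 1)
    cls = getattr(importlib.import_module(module_name), cls_name)
    params = OmegaConf.to_container(conf.get('params', OmegaConf.create({})), resolve=True)
    return cls(**params)


cfg = OmegaConf.load('configs/imagenet256/vqgan-titok.yaml')
encoder = instantiate_from_config(cfg.encoder)      # ViT-Base encoder producing 64 latent tokens
decoder = instantiate_from_config(cfg.decoder)
quantizer = instantiate_from_config(cfg.quantizer)  # 4096-entry codebook, 12-dim codes, L2-normalized
```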

Diff for: docs/benchmark-imagenet256.md (+18 −3)

@@ -8,9 +8,9 @@ This benchmark aims to reproduce the results reported in the papers as closely a
 
-## Quantitative results
+## VQGAN (Taming Transformers)
 
-Using hyperparameters from ["Taming Transformers"](http://arxiv.org/abs/2012.09841) paper (see [config](../configs/imagenet256/vqgan-taming.yaml)):
+[[paper]](http://arxiv.org/abs/2012.09841) [[config]](../configs/imagenet256/vqgan-taming.yaml)
 
 | Downsample ratio | Codebook dim. | Codebook size | Codebook usage↑ | PSNR↑ | SSIM↑ | LPIPS↓ | rFID↓ |
 |:----------------:|:-------------:|:-------------:|:---------------:|:-----:|:-----:|:------:|:-----:|

@@ -22,11 +22,26 @@
 
-Using hyperparameters from ["LlamaGen"](http://arxiv.org/abs/2406.06525) paper (see [config](../configs/imagenet256/vqgan-llamagen.yaml)):
+## VQGAN (LlamaGen)
+
+[[paper]](http://arxiv.org/abs/2406.06525) [[config]](../configs/imagenet256/vqgan-llamagen.yaml)
 
 | Downsample ratio | Codebook dim. | Codebook size | Codebook usage↑ | PSNR↑ | SSIM↑ | LPIPS↓ | rFID↓ |
 |:----------------:|:-------------:|:-------------:|:---------------:|:-----:|:-----:|:------:|:-----:|
 | 16 | 8 | 16384 | 100% | 20.7201 | 0.5509 | 0.1385 | 2.1073 |
 
 - 🌱 The PSNR is close to the result reported in the paper (20.79).
 - 🌱 The rFID is even slightly better than the result reported in the paper (2.19).
+
+## TiTok
+
+[[paper]](https://arxiv.org/abs/2406.07550) [[project page]](https://yucornetto.github.io/projects/titok.html) [[config]](../configs/imagenet256/vqgan-titok.yaml)
+
+| \# tokens | Codebook dim. | Codebook size | Codebook usage↑ | PSNR↑ | SSIM↑ | LPIPS↓ | rFID↓ |
+|:---------:|:-------------:|:-------------:|:---------------:|:-----:|:-----:|:------:|:-----:|
+| 64 | 12 | 4096 | 100% | 17.8995 | 0.4022 | 0.2681 | 4.6691 |
+
+- ⚠️ The model is trained with a single-stage strategy, unlike the two-stage training used in the paper.
+- ⚠️ The results are not good: reconstructed images contain repeated patterns and artifacts; further investigation is needed.
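For context on how the table columns above are usually produced: PSNR, SSIM, and LPIPS are averaged over reconstructions of the validation set, while rFID compares feature statistics of reconstructions against the reference images (typically with a separate FID tool). A rough sketch using torchmetrics — an assumption for illustration, the repository's own evaluation script may compute these differently:

```python
import torch
from torchmetrics.image import (
    LearnedPerceptualImagePatchSimilarity,
    PeakSignalNoiseRatio,
    StructuralSimilarityIndexMeasure,
)

psnr = PeakSignalNoiseRatio(data_range=1.0)
ssim = StructuralSimilarityIndexMeasure(data_range=1.0)
lpips = LearnedPerceptualImagePatchSimilarity(net_type='vgg', normalize=True)  # normalize=True: inputs in [0, 1]


@torch.no_grad()
def update_metrics(recon: torch.Tensor, real: torch.Tensor) -> None:
    """recon, real: (B, 3, H, W) tensors in [0, 1]."""
    psnr.update(recon, real)
    ssim.update(recon, real)
    lpips.update(recon, real)


# After calling update_metrics over the whole validation set:
# print(psnr.compute(), ssim.compute(), lpips.compute())
```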

Diff for: losses/__init__.py (+1)

@@ -1,2 +1,3 @@
 from .adversarial import AdversarialLoss
 from .lpips import LPIPS as LPIPSLoss
+from .perceptual_loss import PerceptualLoss

Diff for: losses/adversarial.py (+25 −1)

@@ -15,14 +15,35 @@ class AdversarialLoss(nn.Module):
     Objective of the discriminator: min E[max(0, 1-D(x))] + E[max(0, 1+D(G(z)))]
     Objective of the generator: min -E[D(G(z))]
 
+    Supports LeCam regularization on the discriminator.
+
     """
-    def __init__(self, discriminator: nn.Module, loss_type: str):
+    def __init__(
+            self,
+            discriminator: nn.Module,
+            loss_type: str,
+            coef_lecam_reg: float = 0.0,
+            lecam_reg_ema_decay: float = 0.999,
+    ):
         super().__init__()
         assert loss_type in ['ns', 'hinge']
 
         self.discriminator = discriminator
         self.loss_type = loss_type
 
+        self.coef_lecam_reg = coef_lecam_reg
+        if self.coef_lecam_reg > 0.0:
+            self.lecam_reg_ema_decay = lecam_reg_ema_decay
+            self.register_buffer('ema_real_logits_mean', torch.zeros(1))
+            self.register_buffer('ema_fake_logits_mean', torch.zeros(1))
+
+    def lecam_reg(self, real_logits_mean: Tensor, fake_logits_mean: Tensor):
+        lecam_loss = (torch.mean(torch.pow(F.relu(real_logits_mean - self.ema_fake_logits_mean), 2)) +
+                      torch.mean(torch.pow(F.relu(self.ema_real_logits_mean - fake_logits_mean), 2)))
+        self.ema_real_logits_mean = self.ema_real_logits_mean * self.lecam_reg_ema_decay + real_logits_mean.detach() * (1 - self.lecam_reg_ema_decay)  # noqa
+        self.ema_fake_logits_mean = self.ema_fake_logits_mean * self.lecam_reg_ema_decay + fake_logits_mean.detach() * (1 - self.lecam_reg_ema_decay)  # noqa
+        return lecam_loss
+
     def forward_G(self, fake_data: Tensor, *args, **kwargs):
         fake_logits = self.discriminator(fake_data, *args, **kwargs)
         if self.loss_type == 'ns':

@@ -45,6 +66,9 @@ def forward_D(self, fake_data: Tensor, real_data: Tensor, *args, **kwargs):
         else:
             raise ValueError(f'Unknown loss type: {self.loss_type}')
 
+        if self.coef_lecam_reg > 0.0:
+            loss = loss + self.coef_lecam_reg * self.lecam_reg(real_logits.mean(), fake_logits.mean())
+
         return loss
 
     def forward(self, mode: str, fake_data: Tensor, real_data: Tensor = None, *args, **kwargs):
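To make the new LeCam path concrete, here is a minimal sketch of how the regularized AdversarialLoss could be driven in a training step. The discriminator class and the 0.001 coefficient mirror the config added in this commit; the hinge loss type and the surrounding step functions are assumptions for illustration, not the repo's actual trainer.

```python
import torch

from losses import AdversarialLoss
from models.discriminator import TitokGANDiscriminator

# loss_type='hinge' is an assumption; the class also supports 'ns'.
adv_loss = AdversarialLoss(
    discriminator=TitokGANDiscriminator(),
    loss_type='hinge',
    coef_lecam_reg=0.001,  # matches train.coef_lecam_reg in vqgan-titok.yaml
)


def discriminator_step(real_images: torch.Tensor, fake_images: torch.Tensor) -> torch.Tensor:
    # Hinge loss on real/fake logits; forward_D also adds the LeCam regularizer
    # and updates the EMA logit statistics.
    return adv_loss.forward_D(fake_images.detach(), real_images)


def generator_step(fake_images: torch.Tensor, coef_adv: float = 0.1) -> torch.Tensor:
    # Generator adversarial term, weighted by coef_adv from the config.
    return coef_adv * adv_loss.forward_G(fake_images)
```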

Diff for: losses/perceptual_loss.py (new file, +49 lines)

"""Perceptual loss.

References:
  - https://github.com/bytedance/1d-tokenizer/blob/main/modeling/modules/perceptual_loss.py
  - https://github.com/markweberdev/maskbit/blob/main/modeling/modules/perceptual_loss.py
"""

import torch
import torch.nn.functional as F
from torchvision import models


class PerceptualLoss(torch.nn.Module):
    def __init__(self, model_name: str = 'resnet50'):
        super().__init__()
        if model_name == 'resnet50':
            self.model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1).eval()
        elif model_name == 'convnext_s':
            self.model = models.convnext_small(weights=models.ConvNeXt_Small_Weights.IMAGENET1K_V1).eval()
        else:
            raise ValueError(f'Unsupported model name: {model_name}')

        self.register_buffer('imagenet_mean', torch.Tensor([0.485, 0.456, 0.406])[None, :, None, None])
        self.register_buffer('imagenet_std', torch.Tensor([0.229, 0.224, 0.225])[None, :, None, None])

        for param in self.parameters():
            param.requires_grad = False

    def forward(self, image1: torch.Tensor, image2: torch.Tensor):
        """Computes the perceptual loss.

        Args:
            image1: A tensor of shape (B, C, H, W) in range [0, 1].
            image2: A tensor of shape (B, C, H, W) in range [0, 1].

        Returns:
            A scalar tensor, the perceptual loss.
        """
        image1 = F.interpolate(image1, size=224, mode='bilinear', align_corners=False, antialias=True)
        image2 = F.interpolate(image2, size=224, mode='bilinear', align_corners=False, antialias=True)

        image1 = (image1 - self.imagenet_mean) / self.imagenet_std
        image2 = (image2 - self.imagenet_mean) / self.imagenet_std

        pred1 = self.model(image1)
        pred2 = self.model(image2)

        loss = F.mse_loss(pred1, pred2, reduction='mean')
        return loss
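A short usage sketch for the new loss, matching the `type_perc: convnext_s` and `coef_perc: 0.1` settings in the TiTok config above (the random tensors are placeholders for real reconstructions and targets):

```python
import torch

from losses import PerceptualLoss

perc_loss_fn = PerceptualLoss(model_name='convnext_s')

recon = torch.rand(4, 3, 256, 256)   # stand-in for decoder reconstructions, values in [0, 1]
target = torch.rand(4, 3, 256, 256)  # stand-in for ground-truth images, values in [0, 1]

# MSE between the frozen ConvNeXt-S logits of the two batches, weighted by coef_perc.
loss = 0.1 * perc_loss_fn(recon, target)
print(loss.item())
```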
