Commit c1e796b

[Flux] Enabling Flux specific CI (#1173)
## Context
1. Add two tests for the Flux-specific changes, and set up CI:
   - Integration tests: run basic, parallelism, checkpointing, and other integration tests.
   - Unit tests: currently only one test, covering flux_dataloader().
2. Create a small dataset (sample = 10) for the offline CI test environment.
3. Download the t5 and clip encoder configurations (`config.json`) and save them in torchtitan/experiments/flux/tests/assets.
4. For test purposes, use llama3's testing tokenizer (TikTokenizer) to perform end-to-end tests, without introducing multiple tokenizer files to run offline.

## Next step
- Add image-generation scripts (needs basic parallelism), and add a test that generates images.
1 parent 0354f22 commit c1e796b

33 files changed (+709 / -61 lines)

.ci/docker/common/install_conda.sh

Lines changed: 2 additions & 1 deletion
@@ -39,8 +39,9 @@ install_python() {
 install_pip_dependencies() {
   pushd /opt/conda
   # Install all Python dependencies
-  pip_install -r /opt/conda/dev-requirements.txt
+  pip_install -r /opt/conda/requirements-dev.txt
   pip_install -r /opt/conda/requirements.txt
+  pip_install -r /opt/conda/requirements-flux.txt
   popd
 }

File renamed without changes.
Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
 transformers>=4.51.1
 einops
 sentencepiece
+pillow

.ci/docker/ubuntu/Dockerfile

Lines changed: 3 additions & 2 deletions
@@ -29,12 +29,13 @@ ARG MINICONDA_VERSION
 ARG PYTHON_VERSION
 ENV PYTHON_VERSION=$PYTHON_VERSION
 ENV PATH /opt/conda/envs/py_$PYTHON_VERSION/bin:/opt/conda/bin:$PATH
-COPY dev-requirements.txt /opt/conda/
+COPY requirements-dev.txt /opt/conda/
 COPY requirements.txt /opt/conda/
+COPY requirements-flux.txt /opt/conda/
 COPY conda-env-ci.txt /opt/conda/
 COPY ./common/install_conda.sh install_conda.sh
 COPY ./common/utils.sh utils.sh
-RUN bash ./install_conda.sh && rm install_conda.sh utils.sh /opt/conda/dev-requirements.txt /opt/conda/requirements.txt /opt/conda/conda-env-ci.txt
+RUN bash ./install_conda.sh && rm install_conda.sh utils.sh /opt/conda/requirements-dev.txt /opt/conda/requirements.txt /opt/conda/requirements-flux.txt /opt/conda/conda-env-ci.txt

 USER ci-user
 CMD ["bash"]

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
+name: Flux 8 GPU Integration Test
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - 'torchtitan/experiments/flux/**'
+  pull_request:
+    paths:
+      - 'torchtitan/experiments/flux/**'
+  schedule:
+    # Runs every 6 hours
+    - cron: '0 */6 * * *'
+concurrency:
+  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+defaults:
+  run:
+    shell: bash -l -eo pipefail {0}
+
+jobs:
+  build-test:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.g5.48xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.6"
+      # This image is faster to clone than the default, but it lacks CC needed by triton
+      # (1m25s vs 2m37s).
+      docker-image: torchtitan-ubuntu-20.04-clang12
+      repository: pytorch/torchtitan
+      upload-artifact: outputs
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        pip config --user set global.progress_bar off
+
+        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
+
+        mkdir artifacts-to-be-uploaded
+        python -m torchtitan.experiments.flux.tests.integration_tests artifacts-to-be-uploaded --ngpu 8

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+name: Flux Model CPU Unit Test
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - 'torchtitan/experiments/flux/**'
+  pull_request:
+    paths:
+      - 'torchtitan/experiments/flux/**'
+
+
+concurrency:
+  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build-test:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      docker-image: torchtitan-ubuntu-20.04-clang12
+      repository: pytorch/torchtitan
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        pip config --user set global.progress_bar off
+
+        pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
+        pytest torchtitan/experiments/flux/tests/unit_tests/ --cov=. --cov-report=xml --durations=20 -vv

.gitignore

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ wandb

 torchtitan/datasets/**/*.model
 assets/**/*.model
-torchtitan/experiments/**/assets/*
+torchtitan/experiments/flux/assets/*

 # temp files
 *.log

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ possible. Contributions should follow the [Contributing Guidelines](#contributin

 ### Setup
 ```
-pip install -r dev-requirements.txt
+pip install -r requirements-dev.txt
 ```

 ### Pull Requests

dev-requirements.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.

requirements-dev.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+.ci/docker/requirements-dev.txt

tests/integration_tests.py

Lines changed: 0 additions & 3 deletions
@@ -31,7 +31,6 @@ class OverrideDefinitions:
     test_descr: str = "default"
     test_name: str = "default"
     ngpu: int = 4
-    model_flavor: str = "debugmodel"

     def __repr__(self):
         return self.test_descr
@@ -495,7 +494,6 @@ def run_test(test_flavor: OverrideDefinitions, full_path: str, output_dir: str):
     # run_test supports sequence of tests.
     test_name = test_flavor.test_name
     dump_folder_arg = f"--job.dump_folder {output_dir}/{test_name}"
-    model_flavor_arg = f"--model.flavor {test_flavor.model_flavor}"
     all_ranks = ",".join(map(str, range(test_flavor.ngpu)))

     for idx, override_arg in enumerate(test_flavor.override_args):
@@ -508,7 +506,6 @@ def run_test(test_flavor: OverrideDefinitions, full_path: str, output_dir: str):
                "./scripts/estimate/run_memory_estimation.sh"
            )
        cmd += " " + dump_folder_arg
-       cmd += " " + model_flavor_arg
        if override_arg:
            cmd += " " + " ".join(override_arg)
        logger.info(
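With the `model_flavor` field and the automatic `--model.flavor` argument removed, a test that needs a non-default model flavor would presumably supply it through `override_args` instead. A hypothetical entry in the test list, for illustration only (flag values assumed, not from this commit):

```python
OverrideDefinitions(
    [
        [
            "--model.flavor debugmodel",  # flavor now passed per test, not injected globally
            "--training.steps 10",
        ],
    ],
    "hypothetical test with an explicit model flavor",
    "explicit_flavor",
)
```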

torchtitan/components/checkpoint.py

Lines changed: 2 additions & 2 deletions
@@ -561,8 +561,8 @@ def _save_last_step(self, curr_step: int) -> None:

         # For now, we will manually pop the freqs_cis buffer, as we made this permanent
         # temporarily and we don't want to include it in the exported state_dict.
-        # Context: https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/llama/model.py#L348
-        self.states.pop("freqs_cis")
+        # Context: https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/llama3/model.py#L404
+        self.states.pop("freqs_cis", None)

         if self.export_dtype != torch.float32:
             self.states = {
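Switching to `dict.pop` with a default makes the cleanup a no-op when no `freqs_cis` buffer was registered (presumably the case for the FLUX model), instead of raising `KeyError`. A minimal illustration:

```python
states = {"model": object(), "optimizer": object()}  # no "freqs_cis" entry

states.pop("freqs_cis", None)   # no-op: returns None when the key is missing
# states.pop("freqs_cis")       # without a default, this would raise KeyError
```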

torchtitan/experiments/flux/README.md

Lines changed: 26 additions & 2 deletions
@@ -1,8 +1,16 @@
 # FLUX model in torchtitan
+[![integration tests](https://github.com/pytorch/torchtitan/actions/workflows/flux_integration_test_8gpu.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/flux_integration_test_8gpu.yaml?query=branch%3Amain)
+

 ## Overview
 This directory contains the implementation of the [FLUX](https://github.com/black-forest-labs/flux/tree/main) model in torchtitan. In torchtitan, we showcase the pre-training process of text-to-image part of the FLUX model.

+## Prerequisites
+Install the required dependencies:
+```bash
+pip install -r requirements-flux.txt
+```
+
 ## Usage
 First, download the autoencoder model from HuggingFace with your own access token:
 ```bash
@@ -22,15 +30,31 @@ If you want to train with other model config, run the following command:
 CONFIG_FILE="./torchtitan/experiments/flux/train_configs/flux_schnell_model.toml" ./torchtitan/experiments/flux/run_train.sh
 ```

+## Running Tests
+
+### Unit Tests
+To run the unit tests for the FLUX model, use the following command:
+```bash
+pytest -s torchtitan/experiments/flux/tests/
+```
+
+### Integration Tests
+To run the integration tests for the FLUX model, use the following command:
+```bash
+python -m torchtitan.experiments.flux.tests.integration_tests <output_dir>
+```
+
+
 ## Supported Features
 - Parallelism: The model supports FSDP, HSDP for training on multiple GPUs.
 - Activation checkpointing: The model uses activation checkpointing to reduce memory usage during training.
 - Distributed checkpointing and loading.
-  - Notes on the current checkpointing implementation: TO keep the model wieghts are sharded the same way as checkpointing, we need to shard the model weights before saving the checkpoint. This is done by checking each module at the end of envaluation, and sharding the weights of the module if it is a FSDPModule.
+  - Notes on the current checkpointing implementation: To keep the model weights sharded the same way as in checkpointing, we need to shard the model weights before saving the checkpoint. This is done by checking each module at the end of evaluation, and sharding the weights of the module if it is an FSDPModule.
+- CI for FLUX model: periodically run integration tests on 8 GPUs, plus unit tests.



 ## TODO
 - [ ] More parallelism support (Tensor Parallelism, Context Parallelism, etc)
 - [ ] Implement the num_flops_per_token calculation in get_nparams_and_flops() function
-- [ ] Implement test cases in CI for FLUX model. Adding more unit tests for FLUX model (eg, unit test for preprocessor, etc)
+- [ ] Add `torch.compile` support
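The checkpointing note under Supported Features describes walking the modules after evaluation and resharding any FSDP-wrapped ones before saving. A minimal sketch of that pattern, assuming FSDP2's `FSDPModule`/`reshard()` API (an illustration, not the torchtitan implementation; the function name is hypothetical):

```python
import torch.nn as nn
from torch.distributed.fsdp import FSDPModule  # FSDP2; exact import path may vary by PyTorch version


def reshard_fsdp_modules(model: nn.Module) -> None:
    # Re-shard every FSDP-wrapped submodule so the in-memory parameters are
    # laid out the way the distributed checkpoint expects before saving.
    for module in model.modules():
        if isinstance(module, FSDPModule):
            module.reshard()
```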

torchtitan/experiments/flux/dataset/flux_dataset.py

Lines changed: 18 additions & 14 deletions
@@ -20,8 +20,12 @@
 from torch.utils.data import IterableDataset
 from torchtitan.components.dataloader import ParallelAwareDataloader

+from torchtitan.components.tokenizer import Tokenizer
 from torchtitan.config_manager import JobConfig
-from torchtitan.experiments.flux.dataset.tokenizer import FluxTokenizer
+from torchtitan.experiments.flux.dataset.tokenizer import (
+    build_flux_tokenizer,
+    FluxTokenizer,
+)
 from torchtitan.tools.logging import logger


@@ -115,6 +119,13 @@ class TextToImageDatasetConfig:
         loader=lambda path: load_dataset(path, split="train", streaming=True),
         data_processor=_cc12m_wds_data_processor,
     ),
+    "cc12m-test": TextToImageDatasetConfig(
+        path="torchtitan/experiments/flux/tests/assets/cc12m_test",
+        loader=lambda path: load_dataset(
+            path, split="train", data_files={"train": "*.tar"}, streaming=True
+        ),
+        data_processor=_cc12m_wds_data_processor,
+    ),
 }


@@ -150,8 +161,8 @@ def __init__(
         self,
         dataset_name: str,
         dataset_path: Optional[str],
-        t5_tokenizer: FluxTokenizer,
-        clip_tokenizer: FluxTokenizer,
+        t5_tokenizer: Tokenizer,
+        clip_tokenizer: Tokenizer,
         job_config: Optional[JobConfig] = None,
         dp_rank: int = 0,
         dp_world_size: int = 1,
@@ -243,6 +254,7 @@ def __iter__(self):
            self._sample_idx += 1

            labels = sample_dict.pop("image")
+
            yield sample_dict, labels

    def load_state_dict(self, state_dict):
@@ -267,21 +279,13 @@ def build_flux_dataloader(
     dataset_path = job_config.training.dataset_path
     batch_size = job_config.training.batch_size

-    t5_encoder_name = job_config.encoder.t5_encoder
-    clip_encoder_name = job_config.encoder.clip_encoder
-    max_t5_encoding_len = job_config.encoder.max_t5_encoding_len
+    t5_tokenizer, clip_tokenizer = build_flux_tokenizer(job_config)

     ds = FluxDataset(
         dataset_name=dataset_name,
         dataset_path=dataset_path,
-        t5_tokenizer=FluxTokenizer(
-            t5_encoder_name,
-            max_length=max_t5_encoding_len,
-        ),
-        clip_tokenizer=FluxTokenizer(
-            clip_encoder_name,
-            max_length=77,
-        ),  # fix max_length for CLIP
+        t5_tokenizer=t5_tokenizer,
+        clip_tokenizer=clip_tokenizer,
         job_config=job_config,
         dp_rank=dp_rank,
         dp_world_size=dp_world_size,
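For reference, the new `cc12m-test` registry entry resolves to roughly the following when its loader lambda runs; this just restates the call above rather than adding anything new, and it streams the small webdataset tar shards checked into the test assets:

```python
from datasets import load_dataset

path = "torchtitan/experiments/flux/tests/assets/cc12m_test"
# Stream the local *.tar shards as the "train" split; the test dataset is tiny
# (about 10 samples), so CI can iterate it without network access.
ds = load_dataset(path, split="train", data_files={"train": "*.tar"}, streaming=True)
first_sample = next(iter(ds))
```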

torchtitan/experiments/flux/dataset/tokenizer.py

Lines changed: 74 additions & 1 deletion
@@ -10,10 +10,54 @@

 from typing import List

+import torch
 from torchtitan.components.tokenizer import Tokenizer
+from torchtitan.config_manager import JobConfig
+from torchtitan.datasets.tokenizer.tiktoken import TikTokenizer
 from transformers import CLIPTokenizer, T5Tokenizer


+class FluxTestTokenizer(Tokenizer):
+    """
+    Flux Tokenizer for test purpose. This is a simple wrapper around the TikTokenizer,
+    to make it has same interface as the T5 and CLIP tokenizer used for Flux.
+    """
+
+    def __init__(self, model_path: str = "t5-small", max_length: int = 77, **hf_kwargs):
+        self.tiktokenizer = TikTokenizer(model_path, **hf_kwargs)
+        self._max_length = max_length
+        self.pad_id = 0
+
+    def _pad_and_chunk_tokens(
+        self, tokens: List[int], max_length: int, pad_token: int
+    ) -> List[int]:
+        # Pad the token sequence to max_length
+        if len(tokens) < max_length:
+            # If tokens are shorter than max_length, pad with pad_id or eos_id if pad_id is not defined
+            padding = [pad_token] * (max_length - len(tokens))
+            tokens = tokens + padding
+
+        # Chunk the token sequence to max_length
+        if len(tokens) > max_length:
+            tokens = tokens[:max_length]
+
+        return tokens
+
+    def encode(self, text: str) -> torch.Tensor:
+        """
+        Use TikTokenizer to encode the text into tokens, and then pad and chunk the tokens to max_length.
+        """
+        tokens = self.tiktokenizer.encode(text, bos=True, eos=True)
+        tokens = self._pad_and_chunk_tokens(tokens, self._max_length, self.pad_id)
+        return torch.tensor(tokens)
+
+    def decode(self, t: List[int]) -> str:
+        """
+        Decode function. This function will not be called.
+        """
+        return self.tiktokenizer.decode(t)
+
+
 class FluxTokenizer(Tokenizer):
     """
     Tokenizing and encoding/decoding text using the T5 or Clip tokenizer.
@@ -42,7 +86,7 @@ def __init__(self, model_path: str = "t5-small", max_length: int = 77, **hf_kwar
     def encode(
         self,
         s: str,
-    ) -> List[int]:
+    ) -> torch.Tensor:
         """
         Encode the prompt text into tokens.
         """
@@ -62,3 +106,32 @@ def decode(self, t: List[int]) -> str:
         Decode function. This function will not be called.
         """
         return self._tokenizer.decode(t)
+
+
+def build_flux_tokenizer(job_config: JobConfig) -> tuple[Tokenizer, Tokenizer]:
+    """
+    Build the tokenizer for Flux.
+    """
+    t5_tokenizer_path = job_config.encoder.t5_encoder
+    clip_tokenzier_path = job_config.encoder.clip_encoder
+    max_t5_encoding_len = job_config.encoder.max_t5_encoding_len
+
+    # NOTE: This tokenizer is used for offline CI and testing only, borrowed from llama3 tokenizer
+    if job_config.training.test_mode:
+        tokenizer_class = FluxTestTokenizer
+        t5_tokenizer_path = clip_tokenzier_path = job_config.model.tokenizer_path
+    else:
+        tokenizer_class = FluxTokenizer
+
+    # T5 tokenzier will pad the token sequence to max_t5_encoding_len,
+    # and CLIP tokenizer will pad the token sequence to 77 (fixed number).
+    t5_tokenizer = tokenizer_class(
+        t5_tokenizer_path,
+        max_length=max_t5_encoding_len,
+    )
+    clip_tokenizer = tokenizer_class(
+        clip_tokenzier_path,
+        max_length=77,
+    )
+
+    return t5_tokenizer, clip_tokenizer
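A hypothetical usage sketch of the test-mode path (the tokenizer file path below is assumed for illustration; in CI it comes from `job_config.model.tokenizer_path` and points at llama3's testing TikToken model):

```python
from torchtitan.experiments.flux.dataset.tokenizer import FluxTestTokenizer

# Sketch only: FluxTestTokenizer wraps TikTokenizer and always returns a
# fixed-length tensor, matching the interface of the real T5/CLIP tokenizers.
tokenizer = FluxTestTokenizer(
    "tests/assets/test_tiktoken.model",  # assumed path to the llama3 test tokenizer
    max_length=77,
)
tokens = tokenizer.encode("a photo of a cat")
assert tokens.shape == (77,)  # padded or truncated to max_length
```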
