Migrate VAE example to Flax NNX #5077
`main.py`:

```diff
@@ -25,7 +25,7 @@
 import jax
 from ml_collections import config_flags
 import tensorflow as tf
+import time
 import train
@@ -38,6 +38,8 @@
     lock_config=True,
 )
+
+flags.DEFINE_string('workdir', None, 'Directory to store logs and checkpoints.')

 def main(argv):
   if len(argv) > 1:
@@ -55,9 +57,9 @@ def main(argv):
       f'process_index: {jax.process_index()}, '
       f'process_count: {jax.process_count()}'
   )
+  start = time.time()
   train.train_and_evaluate(FLAGS.config)
+  logging.info('Total training time: %.2f seconds', time.time() - start)


 if __name__ == '__main__':
   app.run(main)
```
**Collaborator:** @sanepunk why do you remove abseil app and the usage of the config file?
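For context, the pattern the reviewer is referring to is the standard abseil + `ml_collections` setup used across the Flax examples. Here is a minimal sketch of it; the `configs/default.py` path and the bare `import train` are illustrative assumptions, not necessarily this PR's final layout:

```python
# Sketch of the abseil app + ml_collections config-file pattern.
from absl import app
from absl import flags
from ml_collections import config_flags

import train  # the example's training module

FLAGS = flags.FLAGS

# Allows command-line overrides such as:
#   python main.py --config=configs/default.py --config.learning_rate=1e-3
config_flags.DEFINE_config_file(
    'config',
    'configs/default.py',
    'File path to the training hyperparameter configuration.',
    lock_config=True,
)


def main(argv):
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')
  train.train_and_evaluate(FLAGS.config)


if __name__ == '__main__':
  app.run(main)
```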
`requirements.txt`:

```diff
@@ -1,7 +1,9 @@
 absl-py==1.4.0
-flax==0.6.9
-numpy==1.23.5
+flax~=0.10
+numpy>=1.26.4
 optax==0.1.5
 Pillow==10.2.0
-tensorflow==2.12.0
-tensorflow-datasets==4.9.2
+tensorflow~=2.16.0
+tensorflow-datasets==4.9.2
+clu==0.0.12
+ml-collections>=0.1.1
```
`train.py`:

```diff
@@ -14,11 +14,10 @@
 """Training and evaluation logic."""

 from absl import logging
-from flax import linen as nn
+from flax import nnx
 import input_pipeline
 import models
 import utils as vae_utils
-from flax.training import train_state
 import jax
 from jax import random
 import jax.numpy as jnp
@@ -34,7 +33,7 @@ def kl_divergence(mean, logvar):
 @jax.vmap
 def binary_cross_entropy_with_logits(logits, labels):
-  logits = nn.log_sigmoid(logits)
+  logits = nnx.log_sigmoid(logits)
   return -jnp.sum(
       labels * logits + (1.0 - labels) * jnp.log(-jnp.expm1(logits))
   )
```
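An aside on the loss numerics, which this PR leaves unchanged apart from the `nn` → `nnx` alias: after the reassignment, `logits` holds $s = \log \sigma(x)$, and the second term of the sum relies on the identity

$$\log\bigl(1 - \sigma(x)\bigr) = \log\bigl(1 - e^{s}\bigr) = \log\bigl(-\operatorname{expm1}(s)\bigr),$$

so `jnp.log(-jnp.expm1(logits))` evaluates $\log(1 - \sigma(x))$ without ever forming $1 - \sigma(x)$, which would underflow for large positive $x$.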
The `train_step` and `eval_f` rewrite:

```diff
@@ -45,36 +44,34 @@ def compute_metrics(recon_x, x, mean, logvar):
   kld_loss = kl_divergence(mean, logvar).mean()
   return {'bce': bce_loss, 'kld': kld_loss, 'loss': bce_loss + kld_loss}


-def train_step(state, batch, z_rng, latents):
-  def loss_fn(params):
-    recon_x, mean, logvar = models.model(latents).apply(
-        {'params': params}, batch, z_rng
-    )
+@nnx.jit
+def train_step(optimizer: nnx.Optimizer, model: nnx.Module, batch, z_rng, latents):
+  """Single training step for the VAE model."""
+  def loss_fn(model):
+    recon_x, mean, logvar = model(batch, z_rng)
     bce_loss = binary_cross_entropy_with_logits(recon_x, batch).mean()
     kld_loss = kl_divergence(mean, logvar).mean()
     loss = bce_loss + kld_loss
     return loss

-  grads = jax.grad(loss_fn)(state.params)
-  return state.apply_gradients(grads=grads)
+  loss, grads = nnx.value_and_grad(loss_fn)(model)
+  optimizer.update(grads)
+  return loss


-def eval_f(params, images, z, z_rng, latents):
-  def eval_model(vae):
-    recon_images, mean, logvar = vae(images, z_rng)
-    comparison = jnp.concatenate([
-        images[:8].reshape(-1, 28, 28, 1),
-        recon_images[:8].reshape(-1, 28, 28, 1),
-    ])
-
-    generate_images = vae.generate(z)
-    generate_images = generate_images.reshape(-1, 28, 28, 1)
-    metrics = compute_metrics(recon_images, images, mean, logvar)
-    return metrics, comparison, generate_images
-
-  return nn.apply(eval_model, models.model(latents))({'params': params})
+@nnx.jit
+def eval_f(model: nnx.Module, images, z, z_rng, latents):
+  """Evaluation function for the VAE model."""
+  recon_images, mean, logvar = model(images, z_rng)
+  comparison = jnp.concatenate([
+      images[:8].reshape(-1, 28, 28, 1),
+      recon_images[:8].reshape(-1, 28, 28, 1),
+  ])
+  generate_images = model.generate(z)
+  generate_images = generate_images.reshape(-1, 28, 28, 1)
+  metrics = compute_metrics(recon_images, images, mean, logvar)
+  return metrics, comparison, generate_images


 def train_and_evaluate(config: ml_collections.ConfigDict):
```

Review thread on the `@nnx.jit` line of `train_step`:

**Collaborator:** Let's use donate args to donate model and optimizer to reduce GPU memory usage.

**Author:** I tried adding `donate_argnums` to `nnx.jit` in the `train_step`, but was getting NaN loss and KL divergence.

**Author:** What to do about this?
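One plausible explanation for the NaNs, offered as a hypothesis rather than a confirmed diagnosis: donation marks the input buffers as reusable, but under `nnx.jit` the `optimizer` and `model` arguments are the same live objects the training loop reads again on the next step, so donated state can be clobbered after the call. The attempted change presumably looked something like the following sketch; the argument positions (0 = optimizer, 1 = model) are an assumption on my part:

```python
import functools

from flax import nnx

# Presumed form of the donation attempt discussed in the thread above;
# donating in-place-updated NNX objects is exactly what seems to go wrong.
@functools.partial(nnx.jit, donate_argnums=(0, 1))
def train_step(optimizer: nnx.Optimizer, model: nnx.Module, batch, z_rng, latents):
  ...  # body unchanged from the diff above
```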
The remaining hunks in `train_and_evaluate`:

```diff
@@ -90,14 +87,9 @@ def train_and_evaluate(config: ml_collections.ConfigDict):
   test_ds = input_pipeline.build_test_set(ds_builder)

   logging.info('Initializing model.')
-  init_data = jnp.ones((config.batch_size, 784), jnp.float32)
-  params = models.model(config.latents).init(key, init_data, rng)['params']
-
-  state = train_state.TrainState.create(
-      apply_fn=models.model(config.latents).apply,
-      params=params,
-      tx=optax.adam(config.learning_rate),
-  )
+  rngs = nnx.Rngs(0)
+  model = models.model(784, config.latents, rngs=rngs)
+  optimizer = nnx.Optimizer(model, optax.adam(config.learning_rate))

   rng, z_key, eval_rng = random.split(rng, 3)
   z = random.normal(z_key, (64, config.latents))
@@ -110,10 +102,10 @@
     for _ in range(steps_per_epoch):
       batch = next(train_ds)
       rng, key = random.split(rng)
-      state = train_step(state, batch, key, config.latents)
+      loss_val = train_step(optimizer, model, batch, key, config.latents)

     metrics, comparison, sample = eval_f(
-        state.params, test_ds, z, eval_rng, config.latents
+        model, test_ds, z, eval_rng, config.latents
     )
     vae_utils.save_image(
         comparison, f'results/reconstruction_{epoch}.png', nrow=8
```
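For readers following the migration: `models.model(784, config.latents, rngs=rngs)` implies a factory returning an eagerly initialized `nnx.Module`, with the input size passed explicitly (NNX has no lazy shape inference via `init`), and `eval_f` additionally relies on a `generate` method. Below is a minimal sketch consistent with those call sites; the layer structure and names are assumptions, not the PR's actual `models.py`:

```python
from flax import nnx
import jax
import jax.numpy as jnp


class VAE(nnx.Module):
  """Sketch of an NNX VAE consistent with the call sites in train.py."""

  def __init__(self, input_size: int, latents: int, *, rngs: nnx.Rngs):
    # Single-layer encoder/decoder for brevity; the real models.py
    # presumably uses hidden layers.
    self.encoder = nnx.Linear(input_size, 2 * latents, rngs=rngs)
    self.decoder = nnx.Linear(latents, input_size, rngs=rngs)

  def __call__(self, x, z_rng):
    mean, logvar = jnp.split(self.encoder(x), 2, axis=-1)
    # Reparameterization trick: z = mean + eps * std.
    eps = jax.random.normal(z_rng, mean.shape)
    z = mean + eps * jnp.exp(0.5 * logvar)
    return self.decoder(z), mean, logvar

  def generate(self, z):
    # Used by eval_f to decode samples drawn from the prior.
    return nnx.sigmoid(self.decoder(z))


def model(input_size, latents, *, rngs):
  return VAE(input_size, latents, rngs=rngs)
```

Note also the change in update semantics visible in the last hunk: `train_step` no longer returns a new `state`. `nnx.value_and_grad` differentiates with respect to the module itself, and `optimizer.update(grads)` mutates the model and optimizer in place, so the loop keeps reusing the same `model` object and only the scalar loss is returned.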