From 3b8ee6c42508ba0da9e554650a23f8d53ba6ce19 Mon Sep 17 00:00:00 2001 From: Lionel Peer Date: Fri, 22 Sep 2023 18:25:57 +0200 Subject: [PATCH] gpu transfer not working --- .gitignore | 1 + README.md | 9 +++++++++ diffusion_models/models/unet.py | 2 ++ tests/job.sh | 6 ++++-- tests/job_singlegpu.sh | 13 +++++++++++++ tests/train_generative.py | 10 +++++----- 6 files changed, 34 insertions(+), 7 deletions(-) create mode 100644 tests/job_singlegpu.sh diff --git a/.gitignore b/.gitignore index fc3f90c..42f5fdf 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ tests/data # logging tests/wandb +tests/log # sphinx build docs/source/_autosummary diff --git a/README.md b/README.md index 30e4c31..f49e387 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,15 @@ The goal of this repo is to unite models and approaches for conditioning DDPMs ( ## Getting Started Everything to get you started you will find in the documentation of the [DiffusionMRI documentation](https://liopeer.github.io/diffusionmodels/index.html). Feel free to raise issues or get in touch if you would like to contribute. 
+## Cluster Commands +Make sure that the `log` folder exists before submitting a job: +```bash +sbatch --job-name=NAME --output=log/%j.out --gres=gpu:1 --mem=10G subscript.sh SCRIPT_PARAMS +``` +```bash +srun --time 10 --partition=gpu.debug --gres=gpu:1 --pty bash -i +``` + ## VSCode Remote Troubleshooting ### Repeated Password Query [UChicago-VSCode Remote](https://howto.cs.uchicago.edu/techstaff:vscode) \ No newline at end of file diff --git a/diffusion_models/models/unet.py b/diffusion_models/models/unet.py index b1503a3..2cc306b 100644 --- a/diffusion_models/models/unet.py +++ b/diffusion_models/models/unet.py @@ -75,6 +75,8 @@ def forward( skip convoluted but non-downscaled tensor for skip connection """ + print(self.conv1[0].weight.device, x.device) + assert False x = self.conv1(x) if time_embedding is not None: time_embedding = self.time_embedding_fc(time_embedding) diff --git a/tests/job.sh b/tests/job.sh index 56ef245..320df89 100644 --- a/tests/job.sh +++ b/tests/job.sh @@ -1,11 +1,13 @@ #!/bin/bash +#SBATCH --account=student #SBATCH --output=log/%j.out +#SBATCH --error=log/%j.err #SBATCH --gres=gpu:2 -#SBATCH --mem=10G +#SBATCH --mem=32G #SBATCH --job-name=mnist_double #SBATCH --constraint='titan_xp' source /scratch_net/biwidl311/peerli/conda/etc/profile.d/conda.sh conda activate liotorch mkdir log -python -u train_parallel.py "$@" +python -u train_discriminative.py "$@" diff --git a/tests/job_singlegpu.sh b/tests/job_singlegpu.sh new file mode 100644 index 0000000..c692183 --- /dev/null +++ b/tests/job_singlegpu.sh @@ -0,0 +1,13 @@ +#!/bin/bash +#SBATCH --account=student +#SBATCH --output=log/%j.out +#SBATCH --error=log/%j.err +#SBATCH --gres=gpu:1 +#SBATCH --mem=32G +#SBATCH --job-name=mnist_double +#SBATCH --constraint='titan_xp' + +source /scratch_net/biwidl311/peerli/conda/etc/profile.d/conda.sh +conda activate liotorch +mkdir log +python -u train_parallel.py "$@" diff --git a/tests/train_generative.py b/tests/train_generative.py index 16a8e0a..94146ab 100644 --- 
a/tests/train_generative.py +++ b/tests/train_generative.py @@ -21,7 +21,7 @@ total_epochs = 2, batch_size = 1000, learning_rate = 0.001, - device_type = "cpu", + device_type = "cuda", dataset = MNISTTrainDataset, architecture = DiffusionModel, backbone = UNet, @@ -36,10 +36,10 @@ schedule_type = "linear", time_enc_dim = 256, optimizer = torch.optim.Adam, - data_path = os.path.abspath("./data"), - checkpoint_folder = os.path.abspath(os.path.join("./data/checkpoints")), - #data_path = "/itet-stor/peerli/net_scratch", - #checkpoint_folder = "/itet-stor/peerli/net_scratch/mnist_checkpoints", + #data_path = os.path.abspath("./data"), + #checkpoint_folder = os.path.abspath(os.path.join("./data/checkpoints")), + data_path = "/itet-stor/peerli/net_scratch", + checkpoint_folder = "/itet-stor/peerli/net_scratch/mnist_checkpoints", save_every = 10, loss_func = F.mse_loss, log_wandb = False