Commit 3b8ee6c: gpu transfer not working

liopeer committed Sep 22, 2023
1 parent 2d9c488 commit 3b8ee6c
Showing 6 changed files with 34 additions and 7 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -7,6 +7,7 @@ tests/data
 
 # logging
 tests/wandb
+tests/log
 
 # sphinx build
 docs/source/_autosummary
9 changes: 9 additions & 0 deletions README.md
@@ -12,6 +12,15 @@ The goal of this repo is to unite models and approaches for conditioning DDPMs (
## Getting Started
Everything you need to get started is in the [DiffusionMRI documentation](https://liopeer.github.io/diffusionmodels/index.html). Feel free to raise issues or get in touch if you would like to contribute.

## Cluster Commands
Make sure the `log` folder exists before submitting:
```bash
sbatch --job-name=NAME --output=log/%j.out --gres=gpu:1 --mem=10G subscript.sh SCRIPT_PARAMS
```
```bash
srun --time 10 --partition=gpu.debug --gres=gpu:1 --pty bash -i
```
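If `log/` is missing, Slurm cannot create the `--output`/`--error` files, so the job leaves no visible trace. Creating the directory idempotently before every submission avoids this; a minimal sketch:

```shell
# -p makes this a no-op when log/ already exists, so it is safe to run
# unconditionally before each sbatch call.
mkdir -p log
```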

## VSCode Remote Troubleshooting
### Repeated Password Query
[UChicago-VSCode Remote](https://howto.cs.uchicago.edu/techstaff:vscode)
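Beyond the linked guide, repeated password prompts from VSCode Remote are often silenced with SSH connection multiplexing, so all sessions reuse one authenticated connection. A sketch for `~/.ssh/config` (the `Host` pattern and socket path are assumptions; adapt them to your cluster):

```
Host biwidl*
    # Reuse one authenticated connection for all subsequent sessions
    ControlMaster auto
    ControlPath ~/.ssh/sockets/%r@%h-%p
    ControlPersist 10m
```

The `ControlPath` directory must exist before connecting (`mkdir -p ~/.ssh/sockets`).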
2 changes: 2 additions & 0 deletions diffusion_models/models/unet.py
@@ -75,6 +75,8 @@ def forward(
         skip
             convoluted but non-downscaled tensor for skip connection
         """
+        print(self.conv1[0].weight.device, x.device)
+        assert False
         x = self.conv1(x)
         if time_embedding is not None:
             time_embedding = self.time_embedding_fc(time_embedding)
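The `print`/`assert False` pair above is instrumentation for the device mismatch named in the commit message: the input tensor and the convolution weights end up on different devices. A minimal sketch of the usual fix, moving the input onto the weights' device inside `forward` (hypothetical `Block` module, not the repository's UNet):

```python
import torch

class Block(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # Sequential mirrors the diff's self.conv1[0] indexing
        self.conv1 = torch.nn.Sequential(torch.nn.Conv2d(1, 8, 3, padding=1))

    def forward(self, x):
        # Move the input to wherever the weights live; convolving tensors
        # on different devices raises a RuntimeError in PyTorch.
        x = x.to(self.conv1[0].weight.device)
        return self.conv1(x)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Block().to(device)
out = model(torch.randn(1, 1, 28, 28))  # input deliberately created on the CPU
```

In practice the cleaner fix is to move the whole model and each batch to the device once in the training loop, rather than patching individual layers.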
6 changes: 4 additions & 2 deletions tests/job.sh
@@ -1,11 +1,13 @@
 #!/bin/bash
 #SBATCH --account=student
+#SBATCH --output=log/%j.out
+#SBATCH --error=log/%j.err
 #SBATCH --gres=gpu:2
-#SBATCH --mem=10G
+#SBATCH --mem=32G
 #SBATCH --job-name=mnist_double
 #SBATCH --constraint='titan_xp'
 
 source /scratch_net/biwidl311/peerli/conda/etc/profile.d/conda.sh
 conda activate liotorch
 mkdir log
-python -u train_parallel.py "$@"
+python -u train_discriminative.py "$@"
13 changes: 13 additions & 0 deletions tests/job_singlegpu.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+#SBATCH --account=student
+#SBATCH --output=log/%j.out
+#SBATCH --error=log/%j.err
+#SBATCH --gres=gpu:1
+#SBATCH --mem=32G
+#SBATCH --job-name=mnist_double
+#SBATCH --constraint='titan_xp'
+
+source /scratch_net/biwidl311/peerli/conda/etc/profile.d/conda.sh
+conda activate liotorch
+mkdir log
+python -u train_parallel.py "$@"
10 changes: 5 additions & 5 deletions tests/train_generative.py
@@ -21,7 +21,7 @@
     total_epochs = 2,
     batch_size = 1000,
     learning_rate = 0.001,
-    device_type = "cpu",
+    device_type = "cuda",
     dataset = MNISTTrainDataset,
     architecture = DiffusionModel,
     backbone = UNet,
@@ -36,10 +36,10 @@
     schedule_type = "linear",
     time_enc_dim = 256,
     optimizer = torch.optim.Adam,
-    data_path = os.path.abspath("./data"),
-    checkpoint_folder = os.path.abspath(os.path.join("./data/checkpoints")),
-    #data_path = "/itet-stor/peerli/net_scratch",
-    #checkpoint_folder = "/itet-stor/peerli/net_scratch/mnist_checkpoints",
+    #data_path = os.path.abspath("./data"),
+    #checkpoint_folder = os.path.abspath(os.path.join("./data/checkpoints")),
+    data_path = "/itet-stor/peerli/net_scratch",
+    checkpoint_folder = "/itet-stor/peerli/net_scratch/mnist_checkpoints",
     save_every = 10,
     loss_func = F.mse_loss,
     log_wandb = False
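Hard-coding `device_type = "cuda"` as in the diff above will crash on a CPU-only machine. A guarded selection (a sketch, not code from the repository) degrades gracefully:

```python
import torch

# Prefer the GPU when one is visible to this process, otherwise fall back
# to the CPU so the same config runs on login nodes and laptops.
device_type = "cuda" if torch.cuda.is_available() else "cpu"
```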
