From 3b8ee6c42508ba0da9e554650a23f8d53ba6ce19 Mon Sep 17 00:00:00 2001
From: Lionel Peer <lionel.peer@gmail.com>
Date: Fri, 22 Sep 2023 18:25:57 +0200
Subject: [PATCH] gpu transfer not working

---
 .gitignore                      |  1 +
 README.md                       |  9 +++++++++
 diffusion_models/models/unet.py |  2 ++
 tests/job.sh                    |  6 ++++--
 tests/job_singlegpu.sh          | 13 +++++++++++++
 tests/train_generative.py       | 10 +++++-----
 6 files changed, 34 insertions(+), 7 deletions(-)
 create mode 100644 tests/job_singlegpu.sh

diff --git a/.gitignore b/.gitignore
index fc3f90c..42f5fdf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@ tests/data
 
 # logging
 tests/wandb
+tests/log
 
 # sphinx build
 docs/source/_autosummary
diff --git a/README.md b/README.md
index 30e4c31..f49e387 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,15 @@ The goal of this repo is to unite models and approaches for conditioning DDPMs (
 ## Getting Started
 Everything to get you started you will find in the documentation of the [DiffusionMRI documentation](https://liopeer.github.io/diffusionmodels/index.html). Feel free to raise issues or get in touch if you would like to contribute.
 
+## Cluster Commands
+Make sure that the `log` folder exists before submitting a job (the `--output` path below points into it):
+```bash
+sbatch --job-name=NAME --output=log/%j.out --gres=gpu:1 --mem=10G subscript.sh SCRIPT_PARAMS
+```
+```bash
+srun --time 10 --partition=gpu.debug --gres=gpu:1 --pty bash -i
+```
+
 ## VSCode Remote Troubleshooting
 ### Repeated Password Query
 [UChicago-VSCode Remote](https://howto.cs.uchicago.edu/techstaff:vscode)
\ No newline at end of file
diff --git a/diffusion_models/models/unet.py b/diffusion_models/models/unet.py
index b1503a3..2cc306b 100644
--- a/diffusion_models/models/unet.py
+++ b/diffusion_models/models/unet.py
@@ -75,6 +75,8 @@ def forward(
         skip
             convoluted but non-downscaled tensor for skip connection
         """
+        print(self.conv1[0].weight.device, x.device)
+        assert False
         x = self.conv1(x)
         if time_embedding is not None:
             time_embedding = self.time_embedding_fc(time_embedding)
diff --git a/tests/job.sh b/tests/job.sh
index 56ef245..320df89 100644
--- a/tests/job.sh
+++ b/tests/job.sh
@@ -1,11 +1,13 @@
 #!/bin/bash
+#SBATCH  --account=student
 #SBATCH  --output=log/%j.out
+#SBATCH  --error=log/%j.err
 #SBATCH  --gres=gpu:2
-#SBATCH  --mem=10G
+#SBATCH  --mem=32G
 #SBATCH  --job-name=mnist_double
 #SBATCH  --constraint='titan_xp'
 
 source /scratch_net/biwidl311/peerli/conda/etc/profile.d/conda.sh
 conda activate liotorch
 mkdir log
-python -u train_parallel.py "$@"
+python -u train_discriminative.py "$@"
diff --git a/tests/job_singlegpu.sh b/tests/job_singlegpu.sh
new file mode 100644
index 0000000..c692183
--- /dev/null
+++ b/tests/job_singlegpu.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+#SBATCH  --account=student
+#SBATCH  --output=log/%j.out
+#SBATCH  --error=log/%j.err
+#SBATCH  --gres=gpu:1
+#SBATCH  --mem=32G
+#SBATCH  --job-name=mnist_single
+#SBATCH  --constraint='titan_xp'
+
+source /scratch_net/biwidl311/peerli/conda/etc/profile.d/conda.sh  # defines the `conda` shell function for this batch shell
+conda activate liotorch
+mkdir -p log  # -p: no "File exists" error in the job's stderr log when log/ is already there
+python -u train_parallel.py "$@"  # -u: unbuffered output; "$@" forwards the arguments passed after the script name
diff --git a/tests/train_generative.py b/tests/train_generative.py
index 16a8e0a..94146ab 100644
--- a/tests/train_generative.py
+++ b/tests/train_generative.py
@@ -21,7 +21,7 @@
     total_epochs = 2,
     batch_size = 1000,
     learning_rate = 0.001,
-    device_type = "cpu",
+    device_type = "cuda",
     dataset = MNISTTrainDataset,
     architecture = DiffusionModel,
     backbone = UNet,
@@ -36,10 +36,10 @@
     schedule_type = "linear",
     time_enc_dim = 256,
     optimizer = torch.optim.Adam,
-    data_path = os.path.abspath("./data"),
-    checkpoint_folder = os.path.abspath(os.path.join("./data/checkpoints")),
-    #data_path = "/itet-stor/peerli/net_scratch",
-    #checkpoint_folder = "/itet-stor/peerli/net_scratch/mnist_checkpoints",
+    #data_path = os.path.abspath("./data"),
+    #checkpoint_folder = os.path.abspath(os.path.join("./data/checkpoints")),
+    data_path = "/itet-stor/peerli/net_scratch",
+    checkpoint_folder = "/itet-stor/peerli/net_scratch/mnist_checkpoints",
     save_every = 10,
     loss_func = F.mse_loss,
     log_wandb = False