a-r-j
diff --git a/‎CHANGELOG.md
+8 b/‎CHANGELOG.md
+8
diff --git a/‎docs/source/conf.py
+1-1 b/‎docs/source/conf.py
+1-1
diff --git a/‎poetry.lock
+4-4 b/‎poetry.lock
+4-4
diff --git a/‎proteinworkshop/config/dataset/antibody_developability.yaml
+1 b/‎proteinworkshop/config/dataset/antibody_developability.yaml
+1
diff --git a/‎proteinworkshop/config/dataset/cath.yaml
+1 b/‎proteinworkshop/config/dataset/cath.yaml
+1
diff --git a/‎proteinworkshop/config/dataset/ccpdb_ligands.yaml
+1 b/‎proteinworkshop/config/dataset/ccpdb_ligands.yaml
+1
diff --git a/‎proteinworkshop/config/dataset/ccpdb_metal.yaml
+1 b/‎proteinworkshop/config/dataset/ccpdb_metal.yaml
+1
diff --git a/‎proteinworkshop/config/dataset/ccpdb_nucleic.yaml
+1 b/‎proteinworkshop/config/dataset/ccpdb_nucleic.yaml
+1
diff --git a/‎proteinworkshop/config/dataset/ccpdb_nucleotides.yaml
+1 b/‎proteinworkshop/config/dataset/ccpdb_nucleotides.yaml
+1
diff --git a/‎proteinworkshop/config/dataset/deep_sea_proteins.yaml
+1 b/‎proteinworkshop/config/dataset/deep_sea_proteins.yaml
+1
diff --git a/‎proteinworkshop/config/dataset/dummy.yaml
+1 b/‎proteinworkshop/config/dataset/dummy.yaml
+1
diff --git a/‎proteinworkshop/config/dataset/ec_reaction.yaml
+1 b/‎proteinworkshop/config/dataset/ec_reaction.yaml
+1
diff --git a/‎proteinworkshop/config/dataset/fold_family.yaml
+1 b/‎proteinworkshop/config/dataset/fold_family.yaml
+1
diff --git a/‎proteinworkshop/config/dataset/fold_fold.yaml
+1 b/‎proteinworkshop/config/dataset/fold_fold.yaml
+1
diff --git a/‎proteinworkshop/config/dataset/fold_superfamily.yaml
+1 b/‎proteinworkshop/config/dataset/fold_superfamily.yaml
+1
diff --git a/‎proteinworkshop/config/dataset/go-bp.yaml
+1 b/‎proteinworkshop/config/dataset/go-bp.yaml
+1
diff --git a/‎proteinworkshop/config/dataset/go-cc.yaml
+1 b/‎proteinworkshop/config/dataset/go-cc.yaml
+1
diff --git a/‎proteinworkshop/config/dataset/go-mf.yaml
+1 b/‎proteinworkshop/config/dataset/go-mf.yaml
+1
diff --git a/‎proteinworkshop/config/dataset/masif_site.yaml
+1 b/‎proteinworkshop/config/dataset/masif_site.yaml
+1
diff --git a/‎proteinworkshop/config/dataset/metal_3d.yaml
+1 b/‎proteinworkshop/config/dataset/metal_3d.yaml
+1
diff --git a/‎proteinworkshop/config/dataset/pdb.yaml
+1 b/‎proteinworkshop/config/dataset/pdb.yaml
+1
diff --git a/‎proteinworkshop/config/dataset/ptm.yaml
+1 b/‎proteinworkshop/config/dataset/ptm.yaml
+1
diff --git a/‎proteinworkshop/datasets/antibody_developability.py
+6 b/‎proteinworkshop/datasets/antibody_developability.py
+6
diff --git a/‎proteinworkshop/datasets/astral.py
+4-2 b/‎proteinworkshop/datasets/astral.py
+4-2
diff --git a/‎proteinworkshop/datasets/base.py
+21-2 b/‎proteinworkshop/datasets/base.py
+21-2
diff --git a/‎proteinworkshop/datasets/cath.py
+7 b/‎proteinworkshop/datasets/cath.py
+7
@@ -1,3 +1,11 @@
+### 2.0.2 (Unreleased)
+
+* Fixes raw data download being triggered by the absence of raw PDB files when using pre-processed datasets ([#24](https://github.com/a-r-j/ProteinWorkshop/pull/24))
+* Fixes bug where batches created from `in_memory=True` data were not correctly formatted ([#24](https://github.com/a-r-j/ProteinWorkshop/pull/24))
+* Consistently exposes the `overwrite` argument for datamodules to users ([#24](https://github.com/a-r-j/ProteinWorkshop/pull/24))
+* Fixes bug where downloading FoldComp datasets into directories with the same name as the dataset throws an error ([#24](https://github.com/a-r-j/ProteinWorkshop/pull/24))
+* Increments `graphein` dependency to `1.7.3` ([#24](https://github.com/a-r-j/ProteinWorkshop/pull/24))
+
 ### 2.0.1 (29/08/2023)
 
 * Fixes incorrect lookup of `DATA_PATH` env var ([#19](https://github.com/a-r-j/ProteinWorkshop/pull/19))
 
@@ -9,7 +9,7 @@
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 project = "Protein Workshop"
 author = "Arian R. Jamasb"
-release = "0.2.1"
+release = "0.2.2"
 copyright = f"{datetime.datetime.now().year}, {author}"
 
 # -- General configuration ---------------------------------------------------
 
@@ -9,4 +9,5 @@ datamodule:
   format: "mmtf" # Format of the structure files
   obsolete_strategy: "drop" # What to do with obsolete PDB entries
   transforms: ${transforms} # Transforms to apply to dataset examples
+  overwrite: False
 num_classes: 2 # Number of classes
@@ -8,4 +8,5 @@ datamodule:
   batch_size: 32 # Batch size for dataloader
   dataset_fraction: 1.0 # Fraction of the dataset to use
   transforms: ${transforms} # Transforms to apply to dataset examples
+  overwrite: False # Whether to overwrite the dataset if it already exists
 num_classes: 23 # Number of classes
@@ -14,5 +14,6 @@ datamodule:
   val_fraction: 0.1 # Fraction of the dataset to use for validation
   test_fraction: 0.1 # Fraction of the dataset to use for testing
   transforms: ${transforms}
+  overwrite: False # Whether to overwrite the dataset if it already exists
 
 num_classes: 7 # Number of classes
@@ -14,5 +14,6 @@ datamodule:
   val_fraction: 0.1 # Fraction of the dataset to use for validation
   test_fraction: 0.1 # Fraction of the dataset to use for testing
   transforms: ${transforms}
+  overwrite: False # Whether to overwrite the dataset if it already exists
 
 num_classes: 7 # Number of classes
@@ -14,5 +14,6 @@ datamodule:
   val_fraction: 0.1 # Fraction of the dataset to use for validation
   test_fraction: 0.1 # Fraction of the dataset to use for testing
   transforms: ${transforms}
+  overwrite: False # Whether to overwrite the dataset if it already exists
 
 num_classes: 2 # Number of classes
@@ -14,5 +14,6 @@ datamodule:
   val_fraction: 0.1 # Fraction of the dataset to use for validation
   test_fraction: 0.1 # Fraction of the dataset to use for testing
   transforms: ${transforms}
+  overwrite: False # Whether to overwrite the dataset if it already exists
 
 num_classes: 8 # Number of classes
@@ -9,4 +9,5 @@ datamodule:
   obsolete_strategy: "drop"
   format: "mmtf" # Format of the raw PDB/MMTF files
   transforms: ${transforms}
+  overwrite: False
 num_classes: 2
@@ -8,5 +8,6 @@ datamodule:
   obsolete_strategy: "drop"
   format: "mmtf.gz" # Format of the raw PDB/MMTF files
   transforms: ${transforms}
+  overwrite: True
 
 num_classes: 2 # Number of classes in the dataset
@@ -9,4 +9,5 @@ datamodule:
   dataset_fraction: 1.0 # Fraction of the dataset to use
   shuffle_labels: False # Whether to shuffle labels for permutation testing
   transforms: ${transforms}
+  overwrite: False
 num_classes: 384
@@ -8,4 +8,5 @@ datamodule:
   dataset_fraction: 1.0 # Fraction of dataset to use
   shuffle_labels: False # Whether to shuffle labels for permutation testing
   transforms: ${transforms} # Transforms to apply to dataset examples
+  overwrite: False # Whether to overwrite existing dataset files
 num_classes: 1195 # Number of classes
@@ -8,4 +8,5 @@ datamodule:
   dataset_fraction: 1.0 # Fraction of dataset to use
   shuffle_labels: False # Whether to shuffle labels for permutation testing
   transforms: ${transforms} # Transforms to apply to dataset examples
+  overwrite: False # Whether to overwrite existing dataset files
 num_classes: 1195 # Number of classes
@@ -8,4 +8,5 @@ datamodule:
   dataset_fraction: 1.0 # Fraction of dataset to use
   shuffle_labels: False # Whether to shuffle labels for permutation testing
   transforms: ${transforms} # Transforms to apply to dataset examples
+  overwrite: False # Whether to overwrite existing dataset files
 num_classes: 1195 # Number of classes
@@ -10,4 +10,5 @@ datamodule:
   num_workers: 8 # Number of workers for dataloader
   split: "BP" # Split of the dataset to use (`BP`, `MF`, `CC`)
   transforms: ${transforms} # Transforms to apply to dataset examples
+  overwrite: False # Whether to overwrite existing dataset files
 num_classes: 2 # Number of classes
@@ -10,4 +10,5 @@ datamodule:
   num_workers: 8 # Number of workers for dataloader
   split: "CC" # Split of the dataset to use (`BP`, `MF`, `CC`)
   transforms: ${transforms} # Transforms to apply to dataset examples
+  overwrite: False # Whether to overwrite existing dataset files
 num_classes: 2 # Number of classes
@@ -10,4 +10,5 @@ datamodule:
   num_workers: 8 # Number of workers for dataloader
   split: "MF" # Split of the dataset to use (`BP`, `MF`, `CC`)
   transforms: ${transforms} # Transforms to apply to dataset examples
+  overwrite: False # Whether to overwrite existing dataset files
 num_classes: 2 # Number of classes
@@ -9,4 +9,5 @@ datamodule:
   dataset_fraction: 1.0 # Fraction of the dataset to use
   shuffle_labels: False # Whether to shuffle labels for permutation testing
   transforms: ${transforms} # Transforms to apply to dataset examples
+  overwrite: False # Whether to overwrite existing dataset files
 num_classes: 2 # Number of classes
@@ -9,4 +9,5 @@ datamodule:
   num_workers: 8 # Number of workers for dataloader
   transforms: ${transforms}
   obsolete_strategy: "drop"  # Or replace
+  overwrite: False
 num_classes: 2
@@ -5,6 +5,7 @@ datamodule:
   num_workers: 4 # Number of workers for dataloader
   pin_memory: True # Pin memory for dataloader
   transforms: ${transforms} # Transforms to apply to dataset examples
+  overwrite: False # Whether to overwrite existing dataset files
 
   pdb_dataset:
     _target_: "proteinworkshop.datasets.pdb_dataset.PDBData"
 
@@ -7,4 +7,5 @@ datamodule:
   pin_memory: True # Pin memory for dataloader
   num_workers: 16 # Number of workers for dataloader
   transforms: ${transforms} # Transforms to apply to dataset examples
+  overwrite: False # Whether to overwrite existing dataset files
 num_classes: 13 # Number of classes
@@ -27,6 +27,7 @@ def __init__(
         format: Literal["mmtf", "pdb"] = "mmtf",
         obsolete_strategy: str = "drop",
         transforms: Optional[List[Callable]] = None,
+        overwrite: bool = False,
     ) -> None:
         """
         Data module for antibody developability dataset from Chen et al.
@@ -49,6 +50,9 @@ def __init__(
         :type obsolete_strategy: str
         :param transforms: List of transforms to apply to dataset.
         :type transforms: Optional[List[Callable]]
+        :param overwrite: Whether or not to overwrite existing processed data.
+            Defaults to ``False``.
+        :type overwrite: bool
         """
         super().__init__()
         self.root = pathlib.Path(path)
@@ -64,6 +68,7 @@ def __init__(
 
         self.format = format
         self.obsolete_strategy = obsolete_strategy
+        self.overwrite = overwrite
 
         if transforms is not None:
             self.transform = self.compose_transforms(
@@ -136,6 +141,7 @@ def _get_dataset(self, split: str) -> ProteinDataset:
             format=self.format,
             transform=self.transform,
             in_memory=self.in_memory,
+            overwrite=self.overwrite,
         )
 
     def train_dataset(self) -> ProteinDataset:
 
@@ -2,7 +2,7 @@
 import pathlib
 import random
 import tarfile
-from typing import Callable, Dict, Iterable, List, Optional, Literal
+from typing import Callable, Dict, Iterable, List, Literal, Optional
 
 import omegaconf
 import pandas as pd
@@ -140,7 +140,9 @@ def parse_class_map(self) -> Dict[str, str]:
     def setup(self, stage: Optional[str] = None):
         self.download()
 
-    def parse_dataset(self, split: Literal["train", "val", "test"]) -> List[str]:
+    def parse_dataset(
+        self, split: Literal["train", "val", "test"]
+    ) -> List[str]:
         """Parses ASTRAL dataset. Returns a list of IDs for each split.
 
         :param split: Split to parse.
 
@@ -296,6 +296,18 @@ def __init__(
         self.store_het = store_het
         self.out_names = out_names
 
+        # Determine whether to download raw structures
+        if not self.overwrite and all(
+            os.path.exists(Path(self.root) / "processed" / p)
+            for p in self.processed_file_names
+        ):
+            logger.info(
+                f"All structures already processed and overwrite=False. Skipping download."
+            )
+            self._skip_download = True
+        else:
+            self._skip_download = False
+
         super().__init__(root, transform, pre_transform, pre_filter, log)
         self.structures = pdb_codes if pdb_codes is not None else pdb_paths
         if self.in_memory:
@@ -319,6 +331,11 @@ def download(self):
 
         Downloaded files are stored in ``self.raw_dir``.
         """
+        if self._skip_download:
+            logger.info(
+                "All structures already processed and overwrite=False. Skipping download."
+            )
+            return
         if self.pdb_codes is not None:
             to_download = (
                 self.pdb_codes
@@ -366,6 +383,8 @@ def raw_file_names(self) -> List[str]:
         :return: List of raw file names.
         :rtype: List[str]
         """
+        if self._skip_download:
+            return []
         if self.pdb_paths is None:
             return [f"{pdb}.{format}" for pdb in self.pdb_codes]
         else:
@@ -419,7 +438,7 @@ def process(self):
             pdb_codes = self.pdb_codes
 
         raw_dir = Path(self.raw_dir)
-        for i, pdb in tqdm(pdb_codes):
+        for i, pdb in enumerate(tqdm(pdb_codes)):
             try:
                 path = raw_dir / f"{pdb}.{self.format}"
                 if path.exists():
@@ -473,7 +492,7 @@ def get(self, idx: int) -> Data:
         :return: PyTorch Geometric Data object.
         """
         if self.in_memory:
-            return self.data[idx]
+            return self._batch_format(self.data[idx])
 
         if self.out_names is not None:
             fname = f"{self.out_names[idx]}.pt"
 
@@ -25,6 +25,7 @@ def __init__(
         num_workers: int = 16,
         dataset_fraction: float = 1.0,
         transforms: Optional[Iterable[Callable]] = None,
+        overwrite: bool = False,
     ) -> None:
         """Data module for CATH dataset.
 
@@ -46,6 +47,9 @@ def __init__(
         :type dataset_fraction: float
         :param transforms: List of transforms to apply to dataset.
         :type transforms: Optional[List[Callable]]
+        :param overwrite: Whether to overwrite existing data.
+            Defaults to ``False``.
+        :type overwrite: bool
         """
         super().__init__()
 
@@ -166,6 +170,7 @@ def train_dataset(self) -> ProteinDataset:
             transform=self.transform,
             format=self.format,
             in_memory=self.in_memory,
+            overwrite=self.overwrite,
         )
 
     def val_dataset(self) -> ProteinDataset:
@@ -188,6 +193,7 @@ def val_dataset(self) -> ProteinDataset:
             transform=self.transform,
             format=self.format,
             in_memory=self.in_memory,
+            overwrite=self.overwrite,
         )
 
     def test_dataset(self) -> ProteinDataset:
@@ -209,6 +215,7 @@ def test_dataset(self) -> ProteinDataset:
             transform=self.transform,
             format=self.format,
             in_memory=self.in_memory,
+            overwrite=self.overwrite,
         )
 
     def train_dataloader(self) -> ProteinDataLoader: