fixed dataset downloads (#70)
* fixed downloads for all datasets

* fixed sen1floods11

* Update download signature

---------

Co-authored-by: gle-bellier <[email protected]>
VMarsocci and gle-bellier authored Sep 27, 2024
1 parent 7aa6e09 commit b4d5663
Showing 12 changed files with 41 additions and 92 deletions.
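
Note: the recurring change below is the signature of each dataset's download hook. A minimal before/after sketch of the pattern (dict access vs. instance attributes, as in the hunks that follow):

```python
# Before: a staticmethod that read paths out of a config dict.
@staticmethod
def download(dataset_config: dict, silent=False):
    root_path = dataset_config["root_path"]

# After: the first parameter is the dataset instance itself, so the
# hook reads self.root_path / self.download_url instead of a dict.
@staticmethod
def download(self, silent=False):
    root_path = self.root_path
```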
2 changes: 1 addition & 1 deletion .github/CONTRIBUTING.md
@@ -234,7 +234,7 @@ We have designed the repo to allow for using your own datasets with minimal effort
         }

     @staticmethod
-    def download(dataset_config, silent=False):
+    def download(self, silent=False):
         # Implement if your dataset requires downloading
         pass
 ```
18 changes: 9 additions & 9 deletions pangaea/datasets/ai4smallfarms.py
@@ -82,13 +82,6 @@ def __init__(
             auto_download=auto_download,
         )

-        self.root_path = pathlib.Path(root_path)
-        self.split = split
-        self.image_dir = self.root_path.joinpath(f"sentinel-2-asia/{split}/images")
-        self.mask_dir = self.root_path.joinpath(f"sentinel-2-asia/{split}/masks")
-        self.image_list = sorted(glob(str(self.image_dir.joinpath("*.tif"))))
-        self.mask_list = sorted(glob(str(self.mask_dir.joinpath("*.tif"))))
-
         self.data_mean = data_mean
         self.data_std = data_std
         self.data_min = data_min
@@ -101,6 +94,13 @@ def __init__(
         self.download_url = download_url
         self.auto_download = auto_download

+        self.root_path = pathlib.Path(root_path)
+        self.split = split
+        self.image_dir = self.root_path.joinpath(f"sentinel-2-asia/{split}/images")
+        self.mask_dir = self.root_path.joinpath(f"sentinel-2-asia/{split}/masks")
+        self.image_list = sorted(glob(str(self.image_dir.joinpath("*.tif"))))
+        self.mask_list = sorted(glob(str(self.mask_dir.joinpath("*.tif"))))
+
     def __len__(self):
         return len(self.image_list)

@@ -133,8 +133,8 @@ def __getitem__(self, index):
         }

     @staticmethod
-    def download(dataset_config: dict, silent=False):
-        root_path = pathlib.Path(dataset_config["root_path"])
+    def download(self, silent=False):
+        root_path = pathlib.Path(self.root_path)

         # Create the root directory if it does not exist
         if not root_path.exists():
5 changes: 4 additions & 1 deletion pangaea/datasets/base.py
@@ -1,6 +1,6 @@
 import torch
 from torch.utils.data import Dataset
-
+import os

 class GeoFMDataset(Dataset):
     """Base class for all datasets."""
@@ -72,6 +72,9 @@ def __init__(
         self.download_url = download_url
         self.auto_download = auto_download

+        if not os.path.exists(self.root_path):
+            self.download(self)
+
     def __len__(self) -> int:
         """Returns the length of the dataset.
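
Note: the subclasses keep their @staticmethod decorators while renaming the first parameter to self, which is why this base constructor calls self.download(self) rather than self.download(). A self-contained sketch of how that call resolves (illustrative classes, not the repo's real ones):

```python
import os

class Base:
    def __init__(self, root_path: str):
        self.root_path = root_path
        if not os.path.exists(self.root_path):
            # download is a staticmethod on the subclass, so attribute
            # lookup yields a plain function and the instance has to be
            # passed explicitly as its first argument.
            self.download(self)

class Child(Base):
    @staticmethod
    def download(self, silent=False):
        # 'self' is an ordinary parameter name here; it holds the
        # instance only because the caller passed it in.
        print(f"downloading into {self.root_path}")

Child("/path/that/does/not/exist")  # triggers the download hook
```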
9 changes: 1 addition & 8 deletions pangaea/datasets/biomassters.py
@@ -176,12 +176,5 @@ def __getitem__(self, index):
         }

-    # @staticmethod
-    # def get_splits(dataset_config):
-    #     dataset_train = BioMassters(cfg=dataset_config, split='train')
-    #     dataset_val = BioMassters(cfg=dataset_config, split='val')
-    #     dataset_test = BioMassters(cfg=dataset_config, split='test')
-    #     return dataset_train, dataset_val, dataset_test
-
     # @staticmethod
-    # def download(dataset_config:dict, silent=False):
+    # def download(self, silent=False):
     #     pass
15 changes: 4 additions & 11 deletions pangaea/datasets/croptypemapping.py
@@ -222,24 +222,17 @@ def pad_or_crop(self, tensor):
         # else:
         #     tensor = tensor[..., :self.grid_size]
         return tensor

-    # @staticmethod
-    # def get_splits(dataset_config):
-    #     dataset_train = CropTypeMappingSouthSudan(cfg=dataset_config, split="train")
-    #     dataset_val = CropTypeMappingSouthSudan(cfg=dataset_config, split="val")
-    #     dataset_test = CropTypeMappingSouthSudan(cfg=dataset_config, split="test")
-    #     return dataset_train, dataset_val, dataset_test
-
     @staticmethod
-    def download(dataset_config: dict, silent=False):
-        if os.path.exists(dataset_config["root_path"]):
+    def download(self, silent=False):
+        if os.path.exists(self.root_path):
             if not silent:
                 print("CropTypeMapping Dataset folder exists, skipping downloading dataset.")
             return

-        output_path = dataset_config["root_path"]
+        output_path = self.root_path
         os.makedirs(output_path, exist_ok=True)
-        url = dataset_config["download_url"]
+        url = self.download_url

         temp_file = os.path.join(output_path, "archive.tar.gz")

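
Note: the remainder of CropTypeMappingSouthSudan.download is collapsed above; judging from the temp_file setup, it fetches a tar.gz archive into root_path and unpacks it. A hedged sketch of that pattern (fetch_archive is a hypothetical helper, not the repo's code):

```python
import os
import tarfile
import urllib.request

def fetch_archive(url: str, output_path: str) -> None:
    # Download the archive into the dataset root, unpack it, then
    # remove the temporary file.
    os.makedirs(output_path, exist_ok=True)
    temp_file = os.path.join(output_path, "archive.tar.gz")
    urllib.request.urlretrieve(url, temp_file)
    with tarfile.open(temp_file, "r:gz") as tar:
        tar.extractall(output_path)
    os.remove(temp_file)
```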
2 changes: 0 additions & 2 deletions pangaea/datasets/fivebillionpixels.py
@@ -20,9 +20,7 @@
 from pangaea.datasets.utils import DownloadProgressBar

 from pangaea.datasets.base import GeoFMDataset
-# from utils.registry import DATASET_REGISTRY

-# @DATASET_REGISTRY.register()
 class FiveBillionPixels(GeoFMDataset):
     def __init__(
         self,
14 changes: 5 additions & 9 deletions pangaea/datasets/hlsburnscars.py
@@ -110,14 +110,13 @@ def __init__(
         self.download_url = download_url
         self.auto_download = auto_download

-        # ISSUE
         self.split_mapping = {'train': 'training', 'val': 'validation', 'test': 'validation'}

         all_files = sorted(glob(os.path.join(self.root_path, self.split_mapping[self.split], '*merged.tif')))
         all_targets = sorted(glob(os.path.join(self.root_path, self.split_mapping[self.split], '*mask.tif')))

         if self.split != "test":
-            split_indices = self.get_stratified_train_val_split(all_files)
+            split_indices = self.get_train_val_split(all_files)
             if self.split == "train":
                 indices = split_indices["train"]
             else:
@@ -130,16 +129,13 @@ def __init__(


     @staticmethod
-    def get_stratified_train_val_split(all_files, split) -> Tuple[Sequence[int], Sequence[int]]:
+    def get_train_val_split(all_files) -> Tuple[Sequence[int], Sequence[int]]:

         # Fixed stratified sample to split data into train/val.
         # This keeps 90% of datapoints belonging to an individual event in the training set and puts the remaining 10% in the validation set.
-        # disaster_names = list(
-        #     map(lambda path: pathlib.Path(path).name.split("_")[0], all_files))
         train_idxs, val_idxs = train_test_split(np.arange(len(all_files)),
                                                 test_size=0.1,
                                                 random_state=23,
-                                                # stratify=disaster_names
                                                 )
         return {"train": train_idxs, "val": val_idxs}

@@ -185,9 +181,9 @@ def get_stratified_train_val_split(all_files) -> Tuple[Sequence[int], Sequence[int]]:
         return {"train": train_idxs, "val": val_idxs}

     @staticmethod
-    def download(dataset_config:dict, silent=False):
-        output_path = pathlib.Path(dataset_config["root_path"])
-        url = dataset_config["download_url"]
+    def download(self, silent=False):
+        output_path = pathlib.Path(self.root_path)
+        url = self.download_url

         try:
             os.makedirs(output_path, exist_ok=False)
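
Note: with the stratify argument commented out, the renamed get_train_val_split reduces to a plain 90/10 random split with a fixed seed. An equivalent standalone snippet (assuming scikit-learn and NumPy, as the hunk does; the file list is a stand-in):

```python
import numpy as np
from sklearn.model_selection import train_test_split

all_files = [f"event_{i}.tif" for i in range(100)]  # stand-in file list
train_idxs, val_idxs = train_test_split(
    np.arange(len(all_files)), test_size=0.1, random_state=23
)
split_indices = {"train": train_idxs, "val": val_idxs}
print(len(split_indices["train"]), len(split_indices["val"]))  # 90 10
```

Since stratification is disabled, the 90/10 proportion applies to the file list as a whole rather than per event; the surviving comment about individual events describes the stratified variant.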
15 changes: 4 additions & 11 deletions pangaea/datasets/mados.py
@@ -181,9 +181,9 @@ def get_band(path):
         return int(path.split('_')[-2])

     @staticmethod
-    def download(dataset_config: dict, silent=False):
-        output_path = pathlib.Path(dataset_config["root_path"])
-        url = dataset_config["download_url"]
+    def download(self, silent=False):
+        output_path = pathlib.Path(self.root_path)
+        url = self.download_url

         existing_dirs = list(output_path.glob("Scene_*"))
         if existing_dirs:
@@ -219,11 +219,4 @@ def download(dataset_config: dict, silent=False):
                 zip_ref.extractall(output_path, members)
             print("done.")

-            (output_path / temp_file_name).unlink()
-
-    # @staticmethod
-    # def get_splits(dataset_config):
-    #     dataset_train = MADOS(cfg=dataset_config, split="train")
-    #     dataset_val = MADOS(cfg=dataset_config, split="val")
-    #     dataset_test = MADOS(cfg=dataset_config, split="test")
-    #     return dataset_train, dataset_val, dataset_test
+            (output_path / temp_file_name).unlink()
19 changes: 6 additions & 13 deletions pangaea/datasets/sen1floods11.py
@@ -10,7 +10,6 @@
 from pangaea.datasets.utils import download_bucket_concurrently
 from pangaea.datasets.base import GeoFMDataset

-# @DATASET_REGISTRY.register()
 class Sen1Floods11(GeoFMDataset):

     def __init__(
@@ -65,6 +64,9 @@ def __init__(
             auto_download (bool): whether to download the dataset automatically.
             gcs_bucket (str): subset for downloading the dataset.
         """
+
+        self.gcs_bucket = gcs_bucket
+
         super(Sen1Floods11, self).__init__(
             split=split,
             dataset_name=dataset_name,
@@ -83,7 +85,6 @@ def __init__(
             data_max=data_max,
             download_url=download_url,
             auto_download=auto_download,
-            # gcs_bucket=gcs_bucket,
         )

         self.root_path = root_path
@@ -101,7 +102,6 @@ def __init__(
         self.ignore_index = ignore_index
         self.download_url = download_url
         self.auto_download = auto_download
-        self.gcs_bucket = gcs_bucket

         self.split_mapping = {'train': 'train', 'val': 'valid', 'test': 'test'}

@@ -167,19 +167,12 @@ def __getitem__(self, index):
         }
         return output

-    # @staticmethod
-    # def get_splits(dataset_config):
-    #     dataset_train = Sen1Floods11(dataset_config, split="train")
-    #     dataset_val = Sen1Floods11(dataset_config, split="val")
-    #     dataset_test = Sen1Floods11(dataset_config, split="test")
-    #     return dataset_train, dataset_val, dataset_test
-
     @staticmethod
-    def download(dataset_config: dict, silent=False):
-        if os.path.exists(dataset_config["root_path"]):
+    def download(self, silent=False):
+        if os.path.exists(self.root_path):
             if not silent:
                 print("Sen1Floods11 Dataset folder exists, skipping downloading dataset.")
             return
-        download_bucket_concurrently(dataset_config["gcs_bucket"], dataset_config["root_path"])
+        download_bucket_concurrently(self.gcs_bucket, self.root_path)


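Note: self.gcs_bucket now has to be assigned before super().__init__, because the base constructor may call download(), which reads self.gcs_bucket; assigning it afterwards would raise AttributeError whenever root_path is missing. A sketch of the ordering constraint (constructor simplified to the two relevant arguments):

```python
from pangaea.datasets.base import GeoFMDataset

class Sen1Floods11Sketch(GeoFMDataset):
    def __init__(self, root_path, gcs_bucket, **kwargs):
        # Must be set before super().__init__, which can trigger
        # download(self) when root_path does not exist yet.
        self.gcs_bucket = gcs_bucket
        super().__init__(root_path=root_path, **kwargs)
```
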
6 changes: 3 additions & 3 deletions pangaea/datasets/spacenet7.py
@@ -244,8 +244,8 @@ def get_band(path):
         return int(path.split('_')[-2])

     @staticmethod
-    def download(dataset_config: dict, silent=False):
-        output_path = Path(dataset_config["root_path"])
+    def download(self, silent=False):
+        output_path = Path(self.root_path)

         if not output_path.exists():
             output_path.mkdir()
@@ -255,7 +255,7 @@ def download(dataset_config: dict, silent=False):
             return

         # download from Google Drive
-        url = dataset_config["download_url"]
+        url = self.download_url
         tar_file = output_path / f'spacenet7.tar.gz'
         gdown.download(url, str(tar_file), quiet=False)

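
Note: gdown.download takes the source URL and an output path, and quiet=False keeps its progress bar. A minimal usage example (the Drive URL is a placeholder, not the dataset's actual download_url):

```python
from pathlib import Path

import gdown

output_path = Path("spacenet7_data")
output_path.mkdir(exist_ok=True)
url = "https://drive.google.com/uc?id=FILE_ID"  # placeholder
tar_file = output_path / "spacenet7.tar.gz"
gdown.download(url, str(tar_file), quiet=False)
```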
2 changes: 1 addition & 1 deletion pangaea/datasets/utae_dynamicen.py
@@ -229,5 +229,5 @@ def __getitem__(self, index):
     #     return dataset_train, dataset_val, dataset_test

     @staticmethod
-    def download(dataset_config: dict, silent=False):
+    def download(self, silent=False):
         pass
26 changes: 3 additions & 23 deletions pangaea/datasets/xview2.py
@@ -174,31 +174,11 @@ def __getitem__(self, idx: int) -> Dict[str, Union[torch.Tensor, Any, str]]:
             'metadata': {"filename":fn}
         }

-        # return {
-        #     'image': {
-        #         't0' : {
-        #             'optical': img_pre,
-        #         },
-        #         't1': {
-        #             'optical': img_post,
-        #         },
-        #     },
-        #     'target': msk,
-        #     'metadata': {"filename":fn}
-        # }
-
-    # @staticmethod
-    # def get_splits(dataset_config):
-    #     dataset_train = xView2(cfg=dataset_config, split="train")
-    #     dataset_val = xView2(cfg=dataset_config, split="val")
-    #     dataset_test = xView2(cfg=dataset_config, split="test")
-    #     return dataset_train, dataset_val, dataset_test
-

     @staticmethod
-    def download(dataset_config:dict, silent=False):
-        output_path = pathlib.Path(dataset_config["root_path"])
-        url = dataset_config["download_url"]
+    def download(self, silent=False):
+        output_path = pathlib.Path(self.root_path)
+        url = self.download_url

         try:
             os.makedirs(output_path, exist_ok=False)
