RoPE, Fourier RoPE, Fourier learnable embeddings #96

Open
wants to merge 81 commits into base: main
Changes from all commits (81 commits)
16d7bc4
create notebook for dev
shaikh58 Jul 31, 2024
6f2c7ad
test update of notebook
shaikh58 Jul 31, 2024
b82c4d4
implement rope embedding
shaikh58 Aug 2, 2024
a07ea57
minor changes - add batch job file to repo
shaikh58 Aug 5, 2024
6d135fb
add local train run script, minor changes
shaikh58 Aug 5, 2024
4714aea
Update rope.ipynb
shaikh58 Aug 5, 2024
9c64789
refactor transformer encoder
shaikh58 Aug 6, 2024
67bf6e4
further changes for rope
shaikh58 Aug 6, 2024
fa61af0
complete encoder section of rope
shaikh58 Aug 6, 2024
55f5f25
setup batch training
shaikh58 Aug 7, 2024
9cec3a2
remove batch run commands from repo
shaikh58 Aug 7, 2024
f02a173
remove batch training script
shaikh58 Aug 7, 2024
287c475
Update base.yaml
shaikh58 Aug 7, 2024
a7e3a56
Merge branch 'mustafa-rope' of https://github.com/talmolab/dreem into…
shaikh58 Aug 7, 2024
5d4bf5e
Update run_trainer.py
shaikh58 Aug 7, 2024
f23ef5c
Update .gitignore
shaikh58 Aug 7, 2024
785df8f
comments for tracker.py
shaikh58 Aug 7, 2024
3d3f2ca
embedding bug fixes for encoder
shaikh58 Aug 8, 2024
6af9e17
implement rope for decoder
shaikh58 Aug 9, 2024
6928078
final attn head supports stack embeddings
shaikh58 Aug 9, 2024
c4b1124
Update tests, add new unit tests for rope
shaikh58 Aug 10, 2024
62f2c03
rope bug fixes
shaikh58 Aug 12, 2024
9292bbc
minor update to previous commit
shaikh58 Aug 12, 2024
3751de0
fix device mismatch in mlp module
shaikh58 Aug 15, 2024
3d1a35e
support for adding embedding to instance
shaikh58 Aug 15, 2024
c4abac2
bug fixes to pass unit tests
shaikh58 Aug 16, 2024
5a7e86b
minor updates from PR review
shaikh58 Aug 16, 2024
9eddead
allow batch eval/inference flexibility rather than just different mod…
aaprasad Aug 16, 2024
d5993a9
linting
shaikh58 Aug 19, 2024
bcb661a
add cross attn for rope-stack before final asso matrix output
shaikh58 Aug 26, 2024
fd77ded
minor bug fix in rope embedding for single instance clips
shaikh58 Aug 27, 2024
41454f7
use `sleap-io` as video backend instead of imageio
aaprasad Aug 30, 2024
64c970b
lint
aaprasad Aug 30, 2024
b63f24f
create notebook for dev
shaikh58 Jul 31, 2024
c320eea
test update of notebook
shaikh58 Jul 31, 2024
21035fb
implement rope embedding
shaikh58 Aug 2, 2024
4d27914
minor changes - add batch job file to repo
shaikh58 Aug 5, 2024
be5e630
add local train run script, minor changes
shaikh58 Aug 5, 2024
dba9f08
Update rope.ipynb
shaikh58 Aug 5, 2024
0dd6a60
refactor transformer encoder
shaikh58 Aug 6, 2024
e492909
further changes for rope
shaikh58 Aug 6, 2024
4140524
complete encoder section of rope
shaikh58 Aug 6, 2024
a1ca23e
setup batch training
shaikh58 Aug 7, 2024
b5fa58d
remove batch run commands from repo
shaikh58 Aug 7, 2024
c721e90
Update base.yaml
shaikh58 Aug 7, 2024
6711697
remove batch training script
shaikh58 Aug 7, 2024
20fd4a7
Update run_trainer.py
shaikh58 Aug 7, 2024
9ac41a8
Update .gitignore
shaikh58 Aug 7, 2024
c43ee75
comments for tracker.py
shaikh58 Aug 7, 2024
fe1eeca
embedding bug fixes for encoder
shaikh58 Aug 8, 2024
2da8c09
implement rope for decoder
shaikh58 Aug 9, 2024
65a4ae0
final attn head supports stack embeddings
shaikh58 Aug 9, 2024
7c38ad4
Update tests, add new unit tests for rope
shaikh58 Aug 10, 2024
8b552ef
rope bug fixes
shaikh58 Aug 12, 2024
8fdfba1
minor update to previous commit
shaikh58 Aug 12, 2024
03df33f
fix device mismatch in mlp module
shaikh58 Aug 15, 2024
1d2f5a5
support for adding embedding to instance
shaikh58 Aug 15, 2024
5a5f75f
bug fixes to pass unit tests
shaikh58 Aug 16, 2024
3ff1ab0
minor updates from PR review
shaikh58 Aug 16, 2024
fe2c88e
linting
shaikh58 Aug 19, 2024
de2ace9
add cross attn for rope-stack before final asso matrix output
shaikh58 Aug 26, 2024
9b29171
minor bug fix in rope embedding for single instance clips
shaikh58 Aug 27, 2024
3bc9fef
Merge branch 'mustafa-rope' of https://github.com/talmolab/dreem into…
shaikh58 Sep 27, 2024
1998f6f
- Started implementation for post processing fixes; no logic changes
shaikh58 Oct 9, 2024
511161d
- Add support for learned Fourier spatial/temporal embeddings
shaikh58 Oct 15, 2024
054147d
updates to fix pos emb bugs in encoder/decoder
shaikh58 Oct 15, 2024
28d5f4f
- fixed rope concat to only use t,x,y not t,x,y,orig
shaikh58 Oct 15, 2024
38f7798
linting for readability
shaikh58 Oct 15, 2024
1c1a340
bug fixes to fourier rope implementation - working version
shaikh58 Oct 16, 2024
e703bdf
- Bug fix in pre-encoder fourier - create coeffs array based on d_mod…
shaikh58 Oct 16, 2024
77a9437
Add support for choosing num fourier components using n_components
shaikh58 Oct 16, 2024
acc4c13
- Fix in n _components for fourier embeddings
shaikh58 Oct 23, 2024
15fec60
- bug fix to fourier embedding pre-encoder; was creating a new instan…
shaikh58 Oct 25, 2024
137c96e
- update test modules for rope + fourier implementations
shaikh58 Oct 25, 2024
711028a
linting
shaikh58 Oct 25, 2024
cf9453e
- Added comments to dataset classes
shaikh58 Nov 2, 2024
a2aeea9
removed debugging scripts
shaikh58 Nov 5, 2024
7783d6b
bug in logging model.metrics in eval
shaikh58 Nov 5, 2024
632481c
add comments to loss, no functional change to attention_head; set up …
shaikh58 Nov 7, 2024
8e40ba9
- switch off embeddings between decoder self attn and cross attn
shaikh58 Nov 7, 2024
0b70f6e
undo changes to decoder layer embeddings
shaikh58 Nov 7, 2024
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -142,3 +142,8 @@ dreem/training/models/*

# docs
site/
*.xml
dreem/training/configs/base.yaml
dreem/training/configs/override.yaml
dreem/training/configs/override.yaml
dreem/training/configs/base.yaml
1 change: 1 addition & 0 deletions dreem/datasets/base_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ def create_chunks(self) -> None:
if self.chunk:
self.chunked_frame_idx, self.label_idx = [], []
for i, frame_idx in enumerate(self.frame_idx):
# splits frame indices into chunks of length clip_length
frame_idx_split = torch.split(frame_idx, self.clip_length)
self.chunked_frame_idx.extend(frame_idx_split)
self.label_idx.extend(len(frame_idx_split) * [i])
Expand Down
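For context, the chunking that the commented `torch.split` call performs can be sketched in pure Python (a stand-in for the tensor version, not the repo's code — the helper name and list-based inputs are illustrative):

```python
def create_chunks(frame_idx_per_video, clip_length):
    """Split each video's frame indices into clips of at most clip_length.

    frame_idx_per_video: one sublist of frame indices per video.
    Returns (chunked_frame_idx, label_idx), where label_idx[i] records which
    video chunk i came from, mirroring BaseDataset.create_chunks above.
    """
    chunked_frame_idx, label_idx = [], []
    for i, frame_idx in enumerate(frame_idx_per_video):
        # torch.split(t, n) yields consecutive pieces of size n (last may be shorter)
        splits = [frame_idx[j:j + clip_length]
                  for j in range(0, len(frame_idx), clip_length)]
        chunked_frame_idx.extend(splits)
        label_idx.extend(len(splits) * [i])
    return chunked_frame_idx, label_idx

chunks, labels = create_chunks([[0, 1, 2, 3, 4], [0, 1, 2]], clip_length=2)
# chunks -> [[0, 1], [2, 3], [4], [0, 1], [2]]
# labels -> [0, 0, 0, 1, 1]
```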
28 changes: 14 additions & 14 deletions dreem/datasets/sleap_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,12 +106,13 @@ def __init__(
# if self.seed is not None:
# np.random.seed(self.seed)
self.labels = [sio.load_slp(slp_file) for slp_file in self.slp_files]
self.videos = [imageio.get_reader(vid_file) for vid_file in self.vid_files]
self.vid_readers = {}
# do we need this? would need to update with sleap-io

# for label in self.labels:
# label.remove_empty_instances(keep_empty_frames=False)

# list of lists, each sublist is a list of frame indices for a given video
self.frame_idx = [torch.arange(len(labels)) for labels in self.labels]
# Method in BaseDataset. Creates label_idx and chunked_frame_idx to be
# used in call to get_instances()
Expand All @@ -123,6 +124,7 @@ def get_indices(self, idx: int) -> tuple:
Args:
idx: the index of the batch.
"""
# self.label_idx is a list of indices specifying which video each chunk belongs to
return self.label_idx[idx], self.chunked_frame_idx[idx]

def get_instances(self, label_idx: list[int], frame_idx: list[int]) -> list[Frame]:
Expand All @@ -136,19 +138,17 @@ def get_instances(self, label_idx: list[int], frame_idx: list[int]) -> list[Fram
A list of `dreem.io.Frame` objects containing metadata and instance data for the batch/clip.

"""
video = self.labels[label_idx]
# each entry in self.labels is a sleap Labels object (which is a list of LabeledFrames)
video = self.labels[label_idx] # label_idx is the

video_name = self.video_files[label_idx]

vid_reader = self.videos[label_idx]

# img = vid_reader.get_data(0)

skeleton = video.skeletons[-1]

frames = []
for i, frame_ind in enumerate(frame_idx):
(
( # frame_idx is a list of frame indices for a given video
instances,
gt_track_ids,
poses,
Expand All @@ -159,15 +159,15 @@ def get_instances(self, label_idx: list[int], frame_idx: list[int]) -> list[Fram

frame_ind = int(frame_ind)

lf = video[frame_ind]
lf = video[frame_ind] # video is a sleap Labels object for a given file

try:
img = vid_reader.get_data(int(lf.frame_idx))
except IndexError as e:
logger.warning(
f"Could not read frame {frame_ind} from {video_name} due to {e}"
)
continue
img = lf.image # a single frame from the video
except FileNotFoundError as e:
if video_name not in self.vid_readers:
self.vid_readers[video_name] = sio.load_video(video_name)
vid_reader = self.vid_readers[video_name]
img = vid_reader[lf.frame_idx]

if len(img.shape) == 2:
img = img.expand_dims(-1)
Expand Down Expand Up @@ -370,5 +370,5 @@ def get_instances(self, label_idx: list[int], frame_idx: list[int]) -> list[Fram

def __del__(self):
"""Handle file closing before garbage collection."""
for reader in self.videos:
for reader in self.vid_readers:
⚠️ Potential issue

Fix video reader cleanup in del

The current cleanup implementation has several issues:

  1. It's iterating over dict keys instead of values
  2. Missing error handling for cleanup failures
  3. No type checking before calling close()

Apply this diff to improve the cleanup:

-    for reader in self.vid_readers:
-        reader.close()
+    for video_name, reader in self.vid_readers.items():
+        try:
+            if hasattr(reader, 'close'):
+                reader.close()
+        except Exception as e:
+            logger.warning(f"Failed to close video reader for {video_name}: {str(e)}")
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
for reader in self.vid_readers:
for video_name, reader in self.vid_readers.items():
try:
if hasattr(reader, 'close'):
reader.close()
except Exception as e:
logger.warning(f"Failed to close video reader for {video_name}: {str(e)}")

reader.close()
33 changes: 22 additions & 11 deletions dreem/inference/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,31 +26,42 @@ def run(cfg: DictConfig) -> dict[int, sio.Labels]:
"""
eval_cfg = Config(cfg)

if "checkpoints" in cfg.keys():
# update with parameters for batch train job
if "batch_config" in cfg.keys():
⚠️ Potential issue

Simplify dictionary membership test by removing .keys()

In Python, you can check if a key exists in a dictionary directly without calling .keys(). Replace if "batch_config" in cfg.keys(): with if "batch_config" in cfg: for cleaner and more idiomatic code.

Apply this diff to fix the issue:

-if "batch_config" in cfg.keys():
+if "batch_config" in cfg:
📝 Committable suggestion


Suggested change
if "batch_config" in cfg.keys():
if "batch_config" in cfg:
🧰 Tools
🪛 Ruff

30-30: Use key in dict instead of key in dict.keys()

Remove .keys()

(SIM118)
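The SIM118 point above is easy to verify: for any dict (or dict-like mapping such as the Hydra `DictConfig` used here — a plain dict stands in for it in this sketch), `key in d` and `key in d.keys()` are equivalent, so the `.keys()` call is redundant:

```python
# Plain dict standing in for the OmegaConf/Hydra DictConfig in eval.py
cfg = {"batch_config": "runs.csv", "ckpt_path": "model.ckpt"}

# Both membership tests agree for present and absent keys
assert ("batch_config" in cfg) == ("batch_config" in cfg.keys())
assert ("missing" in cfg) == ("missing" in cfg.keys())
```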

try:
index = int(os.environ["POD_INDEX"])
# For testing without deploying a job on runai
except KeyError:
index = input("Pod Index Not found! Please choose a pod index: ")

logger.info(f"Pod Index: {index}")

checkpoints = pd.read_csv(cfg.checkpoints)
checkpoint = checkpoints.iloc[index]
except KeyError as e:
index = int(
input(f"{e}. Assuming single run!\nPlease input task index to run:")
)
Comment on lines +34 to +36
⚠️ Potential issue

Add input validation to handle non-integer user input

When prompting the user for the task index, if a non-integer value is entered, a ValueError will be raised. Consider adding error handling to manage invalid inputs gracefully.

Apply this diff to implement input validation:

 except KeyError as e:
-    index = int(
-        input(f"{e}. Assuming single run!\nPlease input task index to run:")
-    )
+    while True:
+        try:
+            index = int(
+                input(f"{e}. Assuming single run!\nPlease input task index to run:")
+            )
+            break
+        except ValueError:
+            print("Invalid input. Please enter an integer.")
📝 Committable suggestion


Suggested change
index = int(
input(f"{e}. Assuming single run!\nPlease input task index to run:")
)
while True:
try:
index = int(
input(f"{e}. Assuming single run!\nPlease input task index to run:")
)
break
except ValueError:
print("Invalid input. Please enter an integer.")


hparams_df = pd.read_csv(cfg.batch_config)
hparams = hparams_df.iloc[index].to_dict()
_ = hparams.pop("Unnamed: 0", None)
Comment on lines +38 to +40
⚠️ Potential issue

Add CSV file validation and error handling.

The code assumes the CSV file exists and contains the expected columns. Consider adding validation:

  1. Check if the CSV file exists
  2. Verify required columns are present
  3. Validate the index is within bounds of the DataFrame
+    if not os.path.exists(cfg.batch_config):
+        raise FileNotFoundError(f"Batch config file {cfg.batch_config} not found")
     hparams_df = pd.read_csv(cfg.batch_config)
+    if len(hparams_df) <= index:
+        raise IndexError(f"Task index {index} exceeds number of configurations {len(hparams_df)}")
     hparams = hparams_df.iloc[index].to_dict()
     _ = hparams.pop("Unnamed: 0", None)
📝 Committable suggestion


Suggested change
hparams_df = pd.read_csv(cfg.batch_config)
hparams = hparams_df.iloc[index].to_dict()
_ = hparams.pop("Unnamed: 0", None)
if not os.path.exists(cfg.batch_config):
raise FileNotFoundError(f"Batch config file {cfg.batch_config} not found")
hparams_df = pd.read_csv(cfg.batch_config)
if len(hparams_df) <= index:
raise IndexError(f"Task index {index} exceeds number of configurations {len(hparams_df)}")
hparams = hparams_df.iloc[index].to_dict()
_ = hparams.pop("Unnamed: 0", None)


if eval_cfg.set_hparams(hparams):
logger.info("Updated the following hparams to the following values")
logger.info(hparams)
else:
checkpoint = eval_cfg.cfg.ckpt_path
hparams = {}

checkpoint = eval_cfg.cfg.ckpt_path

logger.info(f"Testing model saved at {checkpoint}")
model = GTRRunner.load_from_checkpoint(checkpoint)

model.tracker_cfg = eval_cfg.cfg.tracker
model.tracker = Tracker(**model.tracker_cfg)

logger.info(f"Using the following tracker:")
⚠️ Potential issue

Remove unnecessary f prefix in string

The log message does not contain any placeholders, so the f prefix is unnecessary. Replace logger.info(f"Using the following tracker:") with logger.info("Using the following tracker:").

Apply this diff to fix the issue:

-logger.info(f"Using the following tracker:")
+logger.info("Using the following tracker:")
📝 Committable suggestion


Suggested change
logger.info(f"Using the following tracker:")
logger.info("Using the following tracker:")
🧰 Tools
🪛 Ruff

56-56: f-string without any placeholders

Remove extraneous f prefix

(F541)


print(model.tracker)
⚠️ Potential issue

Replace print with logger.info for consistent logging

Using logger.info instead of print ensures that all output is managed by the logging system, providing better control over log levels and formatting.

Apply this diff to replace print with logging:

-print(model.tracker)
+logger.info(model.tracker)

Committable suggestion was skipped due to low confidence.

model.metrics["test"] = eval_cfg.cfg.runner.metrics.test
model.persistent_tracking["test"] = eval_cfg.cfg.tracker.get(
"persistent_tracking", False
)
logger.info(f"Computing the following metrics:")
logger.info(model.metrics.test)
logger.info(model.metrics['test'])
model.test_results["save_path"] = eval_cfg.cfg.runner.save_path
logger.info(f"Saving results to {model.test_results['save_path']}")

Expand Down
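The batch-config step in this diff — select one hyperparameter row by task index and drop the `"Unnamed: 0"` index column that `pandas.to_csv` writes by default — can be sketched with only the stdlib `csv` module (the CSV contents and `index` value here are made up for illustration; the real script gets the index from `POD_INDEX` and uses pandas):

```python
import csv
import io

# In-memory stand-in for the cfg.batch_config CSV file
batch_config = io.StringIO(
    "Unnamed: 0,lr,batch_size\n"
    "0,0.001,8\n"
    "1,0.0005,16\n"
)

index = 1  # would come from the POD_INDEX environment variable
rows = list(csv.DictReader(batch_config))
hparams = dict(rows[index])
hparams.pop("Unnamed: 0", None)  # discard the pandas index column
# hparams -> {"lr": "0.0005", "batch_size": "16"}
```

Note `csv.DictReader` leaves values as strings; pandas would infer numeric dtypes, which is one reason the real code uses `pd.read_csv`.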
12 changes: 9 additions & 3 deletions dreem/inference/post_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,8 @@ def filter_max_center_dist(
k_boxes: torch.Tensor | None = None,
nonk_boxes: torch.Tensor | None = None,
id_inds: torch.Tensor | None = None,
h: int = None,
w: int = None,
Comment on lines +129 to +130
⚠️ Potential issue

Implement the usage of newly added parameters.

The h and w parameters have been added but are not utilized in the implementation. These parameters are intended to scale the distances from fractions to pixel units as noted in the TODO comments.

Consider implementing the scaling by applying this change:

-        norm_dist = dist.mean(axis=-1)  # n_k x Np
+        # Scale distances from fractions to pixels
+        if h is not None and w is not None:
+            scale = torch.tensor([w, h], device=dist.device)
+            dist = dist * (scale ** 2).mean()  # Use mean of w^2 and h^2 for scaling
+        norm_dist = dist.mean(axis=-1)  # n_k x Np

Committable suggestion was skipped due to low confidence.

) -> torch.Tensor:
"""Filter trajectory score by distances between objects across frames.

Expand All @@ -135,6 +137,8 @@ def filter_max_center_dist(
k_boxes: The bounding boxes in the current frame
nonk_boxes: the boxes not in the current frame
id_inds: track ids
h: height of image
w: width of image

Returns:
An N_t x N association matrix
Expand All @@ -147,13 +151,15 @@ def filter_max_center_dist(
k_s = ((k_boxes[:, :, 2:] - k_boxes[:, :, :2]) ** 2).sum(dim=2) # n_k

nonk_ct = (nonk_boxes[:, :, :2] + nonk_boxes[:, :, 2:]) / 2

# TODO: nonk_boxes should be only from previous frame rather than entire window
dist = ((k_ct[:, None, :, :] - nonk_ct[None, :, :, :]) ** 2).sum(
dim=-1
) # n_k x Np

norm_dist = dist / (k_s[:, None, :] + 1e-8)
# TODO: note that dist is in units of fraction of the height and width of the image;
# TODO: need to scale it by the original image size so that its in units of pixels
# norm_dist = dist / (k_s[:, None, :] + 1e-8)
norm_dist = dist.mean(axis=-1) # n_k x Np
# norm_dist =

valid = norm_dist < max_center_dist # n_k x Np
valid_assn = (
Expand Down
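The gating that `filter_max_center_dist` implements — zero out association scores for instance pairs whose box centers are too far apart — can be illustrated with a pure-Python stand-in for the torch version (function and variable names here are illustrative; boxes are `(x1, y1, x2, y2)` and the threshold is in squared units, sidestepping the fraction-vs-pixel scaling flagged in the TODOs above):

```python
def center(box):
    x1, y1, x2, y2 = box
    return ((x1 + x2) / 2, (y1 + y2) / 2)

def filter_by_center_dist(scores, k_boxes, nonk_boxes, max_center_dist):
    """Zero scores[i][j] when the squared center distance between
    k_boxes[i] and nonk_boxes[j] is at least max_center_dist."""
    filtered = []
    for i, kb in enumerate(k_boxes):
        cx, cy = center(kb)
        row = []
        for j, nb in enumerate(nonk_boxes):
            nx, ny = center(nb)
            dist2 = (cx - nx) ** 2 + (cy - ny) ** 2
            row.append(scores[i][j] if dist2 < max_center_dist else 0.0)
        filtered.append(row)
    return filtered

scores = [[0.9, 0.8]]
k_boxes = [(0, 0, 2, 2)]          # center (1, 1)
nonk_boxes = [(0, 0, 2, 2),       # center (1, 1), dist^2 = 0   -> kept
              (10, 10, 12, 12)]   # center (11, 11), dist^2 = 200 -> zeroed
print(filter_by_center_dist(scores, k_boxes, nonk_boxes, max_center_dist=4.0))
# -> [[0.9, 0.0]]
```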
34 changes: 23 additions & 11 deletions dreem/inference/track.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,25 +96,35 @@ def run(cfg: DictConfig) -> dict[int, sio.Labels]:
"""
pred_cfg = Config(cfg)

if "checkpoints" in cfg.keys():
# update with parameters for batch train job
if "batch_config" in cfg.keys():
⚠️ Potential issue

Simplify dictionary key check by removing .keys()

In Python, checking for a key in a dictionary does not require the .keys() method. Removing it makes the code more readable and efficient.

Apply this diff to fix the issue:

-if "batch_config" in cfg.keys():
+if "batch_config" in cfg:
📝 Committable suggestion


Suggested change
if "batch_config" in cfg.keys():
if "batch_config" in cfg:
🧰 Tools
🪛 Ruff

100-100: Use key in dict instead of key in dict.keys()

Remove .keys()

(SIM118)

try:
index = int(os.environ["POD_INDEX"])
# For testing without deploying a job on runai
except KeyError:
index = input("Pod Index Not found! Please choose a pod index: ")

logger.info(f"Pod Index: {index}")

checkpoints = pd.read_csv(cfg.checkpoints)
checkpoint = checkpoints.iloc[index]
except KeyError as e:
index = int(
input(f"{e}. Assuming single run!\nPlease input task index to run:")
)
Comment on lines +103 to +106
🛠️ Refactor suggestion

Avoid using input() for robustness in non-interactive environments

Using input() can cause the script to hang in non-interactive or automated environments. Consider providing a default index or handling the absence of POD_INDEX differently.

Modify the exception handling to set a default index and log a warning:

-except KeyError as e:
-    index = int(
-        input(f"{e}. Assuming single run!\nPlease input task index to run:")
-    )
+except KeyError:
+    logger.warning("Environment variable 'POD_INDEX' not found. Using default index 0.")
+    index = 0
📝 Committable suggestion


Suggested change
except KeyError as e:
index = int(
input(f"{e}. Assuming single run!\nPlease input task index to run:")
)
except KeyError:
logger.warning("Environment variable 'POD_INDEX' not found. Using default index 0.")
index = 0


hparams_df = pd.read_csv(cfg.batch_config)
hparams = hparams_df.iloc[index].to_dict()
_ = hparams.pop("Unnamed: 0", None)

if pred_cfg.set_hparams(hparams):
logger.info("Updated the following hparams to the following values")
logger.info(hparams)
else:
checkpoint = pred_cfg.cfg.ckpt_path
hparams = {}

checkpoint = pred_cfg.cfg.ckpt_path

logger.info(f"Running inference with model from {checkpoint}")
model = GTRRunner.load_from_checkpoint(checkpoint)

tracker_cfg = pred_cfg.get_tracker_cfg()
logger.info("Updating tracker hparams")

model.tracker_cfg = tracker_cfg
model.tracker = Tracker(**model.tracker_cfg)

logger.info(f"Using the following tracker:")
logger.info(model.tracker)

Expand All @@ -124,12 +134,14 @@ def run(cfg: DictConfig) -> dict[int, sio.Labels]:
os.makedirs(outdir, exist_ok=True)

for label_file, vid_file in zip(labels_files, vid_files):
logger.info(f"Tracking {label_file} - {vid_file}...")
dataset = pred_cfg.get_dataset(
label_files=[label_file], vid_files=[vid_file], mode="test"
)
dataloader = pred_cfg.get_dataloader(dataset, mode="test")
preds = track(model, trainer, dataloader)
outpath = os.path.join(outdir, f"{Path(label_file).stem}.dreem_inference.slp")
logger.info(f"Saving results to {outpath}...")
preds.save(outpath)

return preds
Expand Down
29 changes: 22 additions & 7 deletions dreem/inference/tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,10 @@ def track(
# asso_preds, pred_boxes, pred_time, embeddings = self.model(
# instances, reid_features
# )
# get reference and query instances from TrackQueue and calls _run_global_tracker()
instances_pred = self.sliding_inference(model, frames)

# e.g. during train/val, don't track across batches so persistent_tracking is switched off
if not self.persistent_tracking:
logger.debug(f"Clearing Queue after tracking")
self.track_queue.end_tracks()
Expand All @@ -164,7 +166,9 @@ def sliding_inference(
# H: height.
# W: width.

# frames is untracked clip for inference
for batch_idx, frame_to_track in enumerate(frames):
# tracked_frames is a list of reference frames that have been tracked (associated)
tracked_frames = self.track_queue.collate_tracks(
device=frame_to_track.frame_id.device
)
Expand All @@ -188,19 +192,21 @@ def sliding_inference(
)

curr_track_id = 0
# if track ids exist from another tracking program i.e. sleap, init with those
for i, instance in enumerate(frames[batch_idx].instances):
instance.pred_track_id = instance.gt_track_id
curr_track_id = max(curr_track_id, instance.pred_track_id)

# if no track ids, then assign new ones
for i, instance in enumerate(frames[batch_idx].instances):
if instance.pred_track_id == -1:
curr_track += 1
curr_track_id += 1
Comment on lines +195 to +202
⚠️ Potential issue

Unused loop variable i in loops.

In both loops on lines 196 and 200, the loop variable i is not used within the loop body. Renaming i to _ improves code readability by indicating that the variable is intentionally unused.

Apply this diff to rename i to _ in both loops:

- for i, instance in enumerate(frames[batch_idx].instances):
+ for _, instance in enumerate(frames[batch_idx].instances):
    instance.pred_track_id = instance.gt_track_id
    curr_track_id = max(curr_track_id, instance.pred_track_id)
- for i, instance in enumerate(frames[batch_idx].instances):
+ for _, instance in enumerate(frames[batch_idx].instances):
    if instance.pred_track_id == -1:
        curr_track_id += 1
        instance.pred_track_id = curr_track_id
📝 Committable suggestion


Suggested change
# if track ids exist from another tracking program i.e. sleap, init with those
for i, instance in enumerate(frames[batch_idx].instances):
instance.pred_track_id = instance.gt_track_id
curr_track_id = max(curr_track_id, instance.pred_track_id)
# if no track ids, then assign new ones
for i, instance in enumerate(frames[batch_idx].instances):
if instance.pred_track_id == -1:
curr_track += 1
curr_track_id += 1
# if track ids exist from another tracking program i.e. sleap, init with those
for _, instance in enumerate(frames[batch_idx].instances):
instance.pred_track_id = instance.gt_track_id
curr_track_id = max(curr_track_id, instance.pred_track_id)
# if no track ids, then assign new ones
for _, instance in enumerate(frames[batch_idx].instances):
if instance.pred_track_id == -1:
curr_track_id += 1
🧰 Tools
🪛 Ruff

196-196: Loop control variable i not used within loop body

Rename unused i to _i

(B007)


200-200: Loop control variable i not used within loop body

Rename unused i to _i

(B007)

instance.pred_track_id = curr_track_id

else:
if (
frame_to_track.has_instances()
): # Check if there are detections. If there are skip and increment gap count
# combine the tracked frames with the latest frame; inference pipeline uses latest frame as pred
frames_to_track = tracked_frames + [
frame_to_track
] # better var name?
Expand All @@ -217,7 +223,7 @@ def sliding_inference(
self.track_queue.add_frame(frame_to_track)
else:
self.track_queue.increment_gaps([])

# update the frame object from the input inference untracked clip
frames[batch_idx] = frame_to_track
return frames

Expand Down Expand Up @@ -252,7 +258,7 @@ def _run_global_tracker(
# E.g.: instances_per_frame: [4, 5, 6, 7]; window of length 4 with 4 detected instances in the first frame of the window.

_ = model.eval()

# get the last frame in the clip to perform inference on
query_frame = frames[query_ind]

query_instances = query_frame.instances
Expand All @@ -279,8 +285,10 @@ def _run_global_tracker(

# (L=1, n_query, total_instances)
with torch.no_grad():
# GTR knows this is for inference since query_instances is not None
asso_matrix = model(all_instances, query_instances)

# GTR output is n_query x n_instances - split this into per-frame to softmax each frame separately
asso_output = asso_matrix[-1].matrix.split(
instances_per_frame, dim=1
) # (window_size, n_query, N_i)
Expand All @@ -296,7 +304,7 @@ def _run_global_tracker(

asso_output_df.index.name = "Instances"
asso_output_df.columns.name = "Instances"

# save the association matrix to the Frame object
query_frame.add_traj_score("asso_output", asso_output_df)
query_frame.asso_output = asso_matrix[-1]

Expand Down Expand Up @@ -343,6 +351,8 @@ def _run_global_tracker(

query_frame.add_traj_score("asso_nonquery", asso_nonquery_df)

# need frame height and width to scale boxes during post-processing
_, h, w = query_frame.img_shape.flatten()
pred_boxes = model_utils.get_boxes(all_instances)
query_boxes = pred_boxes[query_inds] # n_k x 4
nonquery_boxes = pred_boxes[nonquery_inds] # n_nonquery x 4
Expand Down Expand Up @@ -374,7 +384,7 @@ def _run_global_tracker(

query_frame.add_traj_score("decay_time", decay_time_traj_score)
################################################################################

# reduce association matrix - aggregating reference instance association scores by tracks
# (n_query x n_nonquery) x (n_nonquery x n_traj) --> n_query x n_traj
traj_score = torch.mm(traj_score, id_inds.cpu()) # (n_query, n_traj)

Expand All @@ -387,6 +397,7 @@ def _run_global_tracker(

query_frame.add_traj_score("traj_score", traj_score_df)
################################################################################
# IOU-based post-processing; add a weighted IOU across successive frames to association scores

# with iou -> combining with location in tracker, they set to True
# todo -> should also work without pos_embed
Expand Down Expand Up @@ -421,11 +432,12 @@ def _run_global_tracker(

query_frame.add_traj_score("weight_iou", iou_traj_score)
################################################################################
# filters association matrix such that instances too far from each other get scores=0

# threshold for continuing a tracking or starting a new track -> they use 1.0
# todo -> should also work without pos_embed
traj_score = post_processing.filter_max_center_dist(
traj_score, self.max_center_dist, query_boxes, nonquery_boxes, id_inds
traj_score, self.max_center_dist, query_boxes, nonquery_boxes, id_inds, h, w
)

if self.max_center_dist is not None and self.max_center_dist > 0:
Expand All @@ -439,6 +451,7 @@ def _run_global_tracker(
query_frame.add_traj_score("max_center_dist", max_center_dist_traj_score)

################################################################################
# softmax along tracks for each instance, for interpretability
scaled_traj_score = torch.softmax(traj_score, dim=1)
scaled_traj_score_df = pd.DataFrame(
scaled_traj_score.numpy(), columns=unique_ids.cpu().numpy()
Expand All @@ -449,6 +462,7 @@ def _run_global_tracker(
query_frame.add_traj_score("scaled", scaled_traj_score_df)
################################################################################

# hungarian matching
match_i, match_j = linear_sum_assignment((-traj_score))

track_ids = instance_ids.new_full((n_query,), -1)
Expand All @@ -462,6 +476,7 @@ def _run_global_tracker(
thresh = (
overlap_thresh * id_inds[:, j].sum() if mult_thresh else overlap_thresh
)
# if the association score for a query instance is lower than the threshold, create a new track for it
if n_traj >= self.max_tracks or traj_score[i, j] > thresh:
logger.debug(
f"Assigning instance {i} to track {j} with id {unique_ids[j]}"
Expand Down
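The `linear_sum_assignment((-traj_score))` call in this diff picks the one-to-one instance-to-track pairing that maximizes total trajectory score (scipy minimizes cost, hence the negation). A brute-force equivalent for tiny matrices makes the objective explicit (names here are illustrative; this assumes at least as many tracks as query instances):

```python
from itertools import permutations

def best_assignment(traj_score):
    """Return (rows, cols) maximizing sum of traj_score[i][cols[i]]
    over all one-to-one column choices."""
    n = len(traj_score)
    best, best_cols = float("-inf"), None
    for cols in permutations(range(len(traj_score[0])), n):
        total = sum(traj_score[i][c] for i, c in enumerate(cols))
        if total > best:
            best, best_cols = total, cols
    return list(range(n)), list(best_cols)

traj_score = [[0.9, 0.1],
              [0.2, 0.8]]
# scipy.optimize.linear_sum_assignment(-traj_score) yields the same pairs
print(best_assignment(traj_score))  # -> ([0, 1], [0, 1])
```

The brute force is O(n!) and only useful for intuition; the Hungarian algorithm behind `linear_sum_assignment` solves the same problem in polynomial time.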