from os.path import exists
from random import shuffle
from typing import Dict, List, Optional

import torch
from commode_utils.filesystem import get_lines_offsets, get_line_by_offset
from omegaconf import DictConfig
from torch.utils.data import Dataset

from code2seq.data.path_context import LabeledPathContext, Path
from code2seq.data.vocabulary import Vocabulary


class PathContextDataset(Dataset):
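    """Map-style dataset that lazily reads code2seq path contexts from a text file.

    Each line is expected to hold a target label followed by whitespace-separated
    path contexts of the form ``from_token,path_nodes,to_token``, with subtokens
    joined by ``|`` (this layout is inferred from the parsing code below).
    """
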
    _log_file = "bad_samples.log"
    _separator = "|"

    def __init__(self, data_file: str, config: DictConfig, vocabulary: Vocabulary, random_context: bool):
        if not exists(data_file):
            raise ValueError(f"Can't find file with data: {data_file}")
        self._data_file = data_file
        self._config = config
        self._vocab = vocabulary
        self._random_context = random_context

        self._label_unk = vocabulary.label_to_id[vocabulary.UNK]

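        # Pre-compute the byte offset of every line so samples can be read lazily by index.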
        self._line_offsets = get_lines_offsets(data_file)
        self._n_samples = len(self._line_offsets)

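        # Truncate the log of malformed samples from any previous run.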
        open(self._log_file, "w").close()

    def __len__(self):
        return self._n_samples

    def __getitem__(self, index) -> Optional[LabeledPathContext]:
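        # Read one raw line by its stored offset and parse it; malformed lines are
        # logged to the bad-samples file and yield None instead of raising.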
        raw_sample = get_line_by_offset(self._data_file, self._line_offsets[index])
        try:
            raw_label, *raw_path_contexts = raw_sample.split()
        except ValueError as e:
            with open(self._log_file, "a") as f_out:
                f_out.write(f"Error reading sample from line #{index}: {e}\n")
            return None

        # Choose paths for current data sample
        n_contexts = min(len(raw_path_contexts), self._config.max_context)
        if self._random_context:
            shuffle(raw_path_contexts)
        raw_path_contexts = raw_path_contexts[:n_contexts]

        # Tokenize label
        label = self._get_label(raw_label)

        # Tokenize paths
        try:
            paths = [self._get_path(raw_path.split(",")) for raw_path in raw_path_contexts]
        except ValueError as e:
            with open(self._log_file, "a") as f_out:
                f_out.write(f"Error parsing sample from line #{index}: {e}\n")
            return None

        return LabeledPathContext(label, paths)

    def _get_label(self, raw_label: str) -> torch.Tensor:
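        # Encode the label as a [max_label_parts + 1, 1] column: <SOS>, then up to
        # max_label_parts subtoken ids (unknown ones map to <UNK>), an <EOS> if there
        # is room, and <PAD> for the rest.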
        label = torch.full((self._config.max_label_parts + 1, 1), self._vocab.label_to_id[self._vocab.PAD])
        label[0, 0] = self._vocab.label_to_id[self._vocab.SOS]
        sublabels = raw_label.split(self._separator)[: self._config.max_label_parts]
        label[1 : len(sublabels) + 1, 0] = torch.tensor(
            [self._vocab.label_to_id.get(sl, self._label_unk) for sl in sublabels]
        )
        if len(sublabels) < self._config.max_label_parts:
            label[len(sublabels) + 1, 0] = self._vocab.label_to_id[self._vocab.EOS]
        return label

    def _tokenize_token(self, token: str, vocab: Dict[str, int], max_parts: Optional[int]) -> torch.Tensor:
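        # Split the token into subtokens on "|" and map each to its id, truncating or
        # padding with <PAD> to max_parts; unknown subtokens fall back to <UNK>.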
        sub_tokens = token.split(self._separator)
        max_parts = max_parts or len(sub_tokens)
        token_unk = vocab[self._vocab.UNK]

        result = torch.full((max_parts,), vocab[self._vocab.PAD], dtype=torch.long)
        sub_tokens_ids = [vocab.get(st, token_unk) for st in sub_tokens[:max_parts]]
        result[: len(sub_tokens_ids)] = torch.tensor(sub_tokens_ids)
        return result

    def _get_path(self, raw_path: List[str]) -> Path:
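        # A raw path context is [from_token, path_nodes, to_token]; the terminal tokens
        # are clipped to max_token_parts, while the node sequence keeps its full length.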
        return Path(
            from_token=self._tokenize_token(raw_path[0], self._vocab.token_to_id, self._config.max_token_parts),
            path_node=self._tokenize_token(raw_path[1], self._vocab.node_to_id, None),
            to_token=self._tokenize_token(raw_path[2], self._vocab.token_to_id, self._config.max_token_parts),
        )
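

# Illustrative sketch (an addition, not part of the original class): one plausible
# way to wire this dataset into a DataLoader. The config keys mirror the fields
# read above; the batch size, file name, and how `vocabulary` is obtained are
# assumptions that will differ per project.
def build_example_loader(data_file: str, vocabulary: Vocabulary):
    from omegaconf import OmegaConf
    from torch.utils.data import DataLoader

    config = OmegaConf.create({"max_context": 200, "max_label_parts": 7, "max_token_parts": 5})
    dataset = PathContextDataset(data_file, config, vocabulary, random_context=True)

    def keep_valid(samples: List[Optional[LabeledPathContext]]) -> List[LabeledPathContext]:
        # __getitem__ returns None for malformed lines, so drop them before batching.
        return [s for s in samples if s is not None]

    return DataLoader(dataset, batch_size=512, collate_fn=keep_valid)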