Skip to content

Commit

Permalink
Avoid redundant memory consumption
Browse files Browse the repository at this point in the history
Patching can be removed once LLM Foundry uses a `datasets` version that
has huggingface/datasets#7136 integrated.
  • Loading branch information
janEbert committed Sep 4, 2024
1 parent a86dbc6 commit 20154ee
Showing 1 changed file with 11 additions and 0 deletions.
11 changes: 11 additions & 0 deletions llm-foundry-env/py-scripts/convert_dataset_parquet_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,16 @@ class ConcatMode(Enum):
CONCAT_TOKENS = 'CONCAT_TOKENS'


def patch_dataset_sharding():
    """Monkey-patch ``StepExamplesIterable.__iter__`` on ``hf_datasets``.

    The replacement iterator strides over the wrapped ``ex_iterable``
    lazily: it starts at ``self.offset`` and then takes every
    ``self.step``-th item, delegating the skipping to
    ``itertools.islice``.

    According to the accompanying commit message, the purpose is to
    avoid redundant memory consumption, and the patch can be removed
    once the `datasets` version in use includes
    huggingface/datasets#7136.

    The patch is applied class-wide, so it affects every
    ``StepExamplesIterable`` instance iterated after this call.
    """
    from itertools import islice

    def _strided_iter(self):
        # Equivalent to indexing positions offset, offset + step, ...
        # of the underlying iterable, without materializing anything.
        source = iter(self.ex_iterable)
        return islice(source, self.offset, None, self.step)

    step_iterable_cls = hf_datasets.iterable_dataset.StepExamplesIterable
    step_iterable_cls.__iter__ = _strided_iter


def build_hf_dataset(
path: str,
split: str,
Expand Down Expand Up @@ -335,6 +345,7 @@ def parse_args() -> Namespace:


if __name__ == '__main__':
patch_dataset_sharding()
args = parse_args()
convert_dataset_parquet_from_args(
path=args.path,
Expand Down

0 comments on commit 20154ee

Please sign in to comment.