diff --git a/setup.py b/setup.py index 66552348432..399aad04395 100644 --- a/setup.py +++ b/setup.py @@ -141,7 +141,7 @@ AUDIO_REQUIRE = [ "soundfile>=0.12.1", "librosa", - "soxr>=0.4.0; python_version>='3.9'", # Supports numpy-2 + "soxr>=0.4.0", # Supports numpy-2 ] VISION_REQUIRE = [ diff --git a/src/datasets/formatting/formatting.py b/src/datasets/formatting/formatting.py index 165f178f317..54af4fc61f6 100644 --- a/src/datasets/formatting/formatting.py +++ b/src/datasets/formatting/formatting.py @@ -22,9 +22,7 @@ import numpy as np import pandas as pd import pyarrow as pa -from packaging import version -from .. import config from ..features import Features from ..features.features import _ArrayXDExtensionType, _is_zero_copy_only, decode_nested_example, pandas_types_mapper from ..table import Table @@ -304,49 +302,46 @@ def __repr__(self): self._format_all() return repr(self.data) - if config.PY_VERSION >= version.parse("3.9"): - # merging with the union ("|") operator is supported in Python 3.9+ - - def __or__(self, other): - if isinstance(other, LazyDict): - inst = self.copy() - other = other.copy() - other._format_all() - inst.keys_to_format -= other.data.keys() - inst.data = inst.data | other.data - return inst - if isinstance(other, dict): - inst = self.copy() - inst.keys_to_format -= other.keys() - inst.data = inst.data | other - return inst - return NotImplemented - - def __ror__(self, other): - if isinstance(other, LazyDict): - inst = self.copy() - other = other.copy() - other._format_all() - inst.keys_to_format -= other.data.keys() - inst.data = other.data | inst.data - return inst - if isinstance(other, dict): - inst = self.copy() - inst.keys_to_format -= other.keys() - inst.data = other | inst.data - return inst - return NotImplemented - - def __ior__(self, other): - if isinstance(other, LazyDict): - other = other.copy() - other._format_all() - self.keys_to_format -= other.data.keys() - self.data |= other.data - else: - self.keys_to_format -= other.keys() - self.data |= other - return self + def __or__(self, other): + if isinstance(other, LazyDict): + inst = self.copy() + other = other.copy() + other._format_all() + inst.keys_to_format -= other.data.keys() + inst.data = inst.data | other.data + return inst + if isinstance(other, dict): + inst = self.copy() + inst.keys_to_format -= other.keys() + inst.data = inst.data | other + return inst + return NotImplemented + + def __ror__(self, other): + if isinstance(other, LazyDict): + inst = self.copy() + other = other.copy() + other._format_all() + inst.keys_to_format -= other.data.keys() + inst.data = other.data | inst.data + return inst + if isinstance(other, dict): + inst = self.copy() + inst.keys_to_format -= other.keys() + inst.data = other | inst.data + return inst + return NotImplemented + + def __ior__(self, other): + if isinstance(other, LazyDict): + other = other.copy() + other._format_all() + self.keys_to_format -= other.data.keys() + self.data |= other.data + else: + self.keys_to_format -= other.keys() + self.data |= other + return self def __copy__(self): # Identical to `UserDict.__copy__` diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py index 20fab1962e4..e2d773a7689 100644 --- a/tests/test_arrow_dataset.py +++ b/tests/test_arrow_dataset.py @@ -3175,12 +3175,11 @@ def test_tf_dataset_options(self, in_memory): self.assertEqual(len(tf_dataset), 2) # One batch of 3 and one batch of 1 self.assertEqual(len(tf_dataset_with_drop), 1) # Incomplete batch of 1 is dropped # Test that `NotImplementedError` is raised `batch_size` is None and `num_workers` is > 0 - if sys.version_info >= (3, 8): - with self._create_dummy_dataset(in_memory, tmp_dir.name, multiple_columns=True) as dset: - with self.assertRaisesRegex( - NotImplementedError, "`batch_size` must be specified when using multiple workers" - ): - dset.to_tf_dataset(columns="col_1", batch_size=None, num_workers=2) + with self._create_dummy_dataset(in_memory, tmp_dir.name, multiple_columns=True) as dset: + with self.assertRaisesRegex( + NotImplementedError, "`batch_size` must be specified when using multiple workers" + ): + dset.to_tf_dataset(columns="col_1", batch_size=None, num_workers=2) del tf_dataset # For correct cleanup del tf_dataset_with_drop