Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@
# Requirement specifiers for the audio-related dependencies.
AUDIO_REQUIRE = [
    "soundfile>=0.12.1",
    "librosa",
    # A single unconditional pin is enough: an additional
    # `python_version>='3.9'`-guarded entry for the same version range would be
    # subsumed by this one and only duplicate the requirement.
    "soxr>=0.4.0",  # Supports numpy-2
]

VISION_REQUIRE = [
Expand Down
85 changes: 40 additions & 45 deletions src/datasets/formatting/formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@
import numpy as np
import pandas as pd
import pyarrow as pa
from packaging import version

from .. import config
from ..features import Features
from ..features.features import _ArrayXDExtensionType, _is_zero_copy_only, decode_nested_example, pandas_types_mapper
from ..table import Table
Expand Down Expand Up @@ -304,49 +302,46 @@ def __repr__(self):
self._format_all()
return repr(self.data)

if config.PY_VERSION >= version.parse("3.9"):
    # merging with the union ("|") operator is supported in Python 3.9+

    def __or__(self, other):
        """Return ``self | other`` as a new instance; entries of ``other`` win.

        A ``LazyDict`` operand is copied and fully formatted before its data is
        merged. Keys supplied by ``other`` are removed from the result's
        ``keys_to_format`` because their values are replaced by ``other``'s.
        Operands that are neither ``LazyDict`` nor ``dict`` yield
        ``NotImplemented``.
        """
        if not isinstance(other, (LazyDict, dict)):
            return NotImplemented
        merged = self.copy()
        if isinstance(other, LazyDict):
            rhs = other.copy()
            rhs._format_all()
            incoming = rhs.data
        else:
            incoming = other
        merged.keys_to_format -= incoming.keys()
        merged.data = merged.data | incoming
        return merged

    def __ror__(self, other):
        """Return ``other | self`` as a new instance; entries of ``self`` win.

        A ``LazyDict`` operand is copied and fully formatted before its data is
        merged. Operands that are neither ``LazyDict`` nor ``dict`` yield
        ``NotImplemented``.
        """
        if not isinstance(other, (LazyDict, dict)):
            return NotImplemented
        merged = self.copy()
        if isinstance(other, LazyDict):
            lhs = other.copy()
            lhs._format_all()
            base = lhs.data
        else:
            base = other
        # `self` is the right-hand operand, so for every shared key the merged
        # data holds the entry coming from `self`. Those entries may still be
        # pending, so `keys_to_format` must be left untouched: dropping the
        # shared keys here would mark entries as formatted that never were.
        merged.data = base | merged.data
        return merged

    def __ior__(self, other):
        """Update ``self`` in place with ``other`` (``self |= other``) and return it.

        A ``LazyDict`` operand is copied and fully formatted first; any other
        operand is used as given and must provide ``keys()``. Keys supplied by
        ``other`` are removed from ``keys_to_format``.
        """
        incoming = other
        if isinstance(other, LazyDict):
            formatted = other.copy()
            formatted._format_all()
            incoming = formatted.data
        self.keys_to_format -= incoming.keys()
        self.data |= incoming
        return self
def __or__(self, other):
    """Return ``self | other`` as a new instance; entries of ``other`` win.

    A ``LazyDict`` operand is copied and fully formatted before its data is
    merged. Keys supplied by ``other`` are removed from the result's
    ``keys_to_format`` because their values are replaced by ``other``'s.
    Operands that are neither ``LazyDict`` nor ``dict`` yield
    ``NotImplemented``.
    """
    if not isinstance(other, (LazyDict, dict)):
        return NotImplemented
    merged = self.copy()
    if isinstance(other, LazyDict):
        rhs = other.copy()
        rhs._format_all()
        incoming = rhs.data
    else:
        incoming = other
    merged.keys_to_format -= incoming.keys()
    merged.data = merged.data | incoming
    return merged

def __ror__(self, other):
    """Return ``other | self`` as a new instance; entries of ``self`` win.

    A ``LazyDict`` operand is copied and fully formatted before its data is
    merged. Operands that are neither ``LazyDict`` nor ``dict`` yield
    ``NotImplemented``.
    """
    if not isinstance(other, (LazyDict, dict)):
        return NotImplemented
    merged = self.copy()
    if isinstance(other, LazyDict):
        lhs = other.copy()
        lhs._format_all()
        base = lhs.data
    else:
        base = other
    # `self` is the right-hand operand, so for every shared key the merged
    # data holds the entry coming from `self`. Those entries may still be
    # pending, so `keys_to_format` must be left untouched: dropping the
    # shared keys here would mark entries as formatted that never were.
    merged.data = base | merged.data
    return merged

def __ior__(self, other):
    """Update ``self`` in place with ``other`` (``self |= other``) and return it.

    A ``LazyDict`` operand is copied and fully formatted first; any other
    operand is used as given and must provide ``keys()``. Keys supplied by
    ``other`` are removed from ``keys_to_format``.
    """
    incoming = other
    if isinstance(other, LazyDict):
        formatted = other.copy()
        formatted._format_all()
        incoming = formatted.data
    self.keys_to_format -= incoming.keys()
    self.data |= incoming
    return self

def __copy__(self):
# Identical to `UserDict.__copy__`
Expand Down
11 changes: 5 additions & 6 deletions tests/test_arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3175,12 +3175,11 @@ def test_tf_dataset_options(self, in_memory):
self.assertEqual(len(tf_dataset), 2) # One batch of 3 and one batch of 1
self.assertEqual(len(tf_dataset_with_drop), 1) # Incomplete batch of 1 is dropped
# Test that `NotImplementedError` is raised when `batch_size` is None and `num_workers` is > 0
if sys.version_info >= (3, 8):
with self._create_dummy_dataset(in_memory, tmp_dir.name, multiple_columns=True) as dset:
with self.assertRaisesRegex(
NotImplementedError, "`batch_size` must be specified when using multiple workers"
):
dset.to_tf_dataset(columns="col_1", batch_size=None, num_workers=2)
with self._create_dummy_dataset(in_memory, tmp_dir.name, multiple_columns=True) as dset:
with self.assertRaisesRegex(
NotImplementedError, "`batch_size` must be specified when using multiple workers"
):
dset.to_tf_dataset(columns="col_1", batch_size=None, num_workers=2)
del tf_dataset # For correct cleanup
del tf_dataset_with_drop

Expand Down
Loading