Skip to content
Open
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
0d54326
Change IntervalDtype to inherit from _BaseDtype
vyasr Jan 14, 2026
0357100
Add _recursively_replace_fields method to IntervalDtype
vyasr Jan 14, 2026
ebdc5ea
Fix type annotations and imports for IntervalDtype
vyasr Jan 14, 2026
243ae71
Change IntervalColumn to inherit from ColumnBase
vyasr Jan 15, 2026
c76e792
Add infrastructure methods to IntervalColumn
vyasr Jan 15, 2026
f3840c6
Add _with_type_metadata method to IntervalColumn
vyasr Jan 15, 2026
7c8fe07
Fix element_indexing and imports in IntervalColumn
vyasr Jan 15, 2026
35fd074
Add __cuda_array_interface__ property to IntervalColumn
vyasr Jan 15, 2026
017623b
Fix as_column to handle list of pd.Interval objects
vyasr Jan 15, 2026
c8e20b4
Remove IntervalDtype handling from StructColumn._with_type_metadata
vyasr Jan 15, 2026
e47a7aa
Simplify StructColumn._validate_args dtype check
vyasr Jan 15, 2026
dfd6a8c
Fix index type dispatch for StructColumn
vyasr Jan 15, 2026
9261a64
Fix IntervalDtype conversion in as_column
vyasr Jan 15, 2026
add9078
Add IntervalDtype handling to dtype_to_pylibcudf_type
vyasr Jan 15, 2026
34ed654
Handle IntervalDtype in column_empty like StructDtype
vyasr Jan 15, 2026
e87fd75
Preserve column type metadata in _gather operation
vyasr Jan 15, 2026
3c2b2aa
Adapt interval/struct split to completely_remove_children_v2 architec…
vyasr Jan 16, 2026
5786fb2
Remove unused _get_sliced_child method from IntervalColumn
vyasr Jan 21, 2026
6571d76
Fix find_common_type to handle IntervalDtype
vyasr Jan 21, 2026
8da1713
Fix outstanding tests
vyasr Jan 21, 2026
a17ad0b
Remove now passing test
vyasr Jan 21, 2026
b39b174
Address PR review comments
vyasr Jan 21, 2026
825d8f4
One more CoW test
vyasr Jan 22, 2026
fec7b1a
Merge remote-tracking branch 'upstream/main' into refactor/split_inte…
vyasr Jan 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 15 additions & 5 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -2646,9 +2646,9 @@ def column_empty(
dtype : Dtype
Type of the column.
"""
if (is_struct := isinstance(dtype, StructDtype)) or isinstance(
dtype, ListDtype
):
if (
is_struct := isinstance(dtype, (StructDtype, IntervalDtype))
) or isinstance(dtype, ListDtype):
if is_struct:
children = tuple(
column_empty(row_count, field_dtype)
Expand Down Expand Up @@ -2844,7 +2844,15 @@ def as_column(
dtype=dtype,
length=length,
)
if (
if isinstance(arbitrary.dtype, pd.IntervalDtype):
# Wrap StructColumn as IntervalColumn with proper metadata
result = result._with_type_metadata(
IntervalDtype(
subtype=cudf.dtype(arbitrary.dtype.subtype),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: cudf.dtype will already be called in the IntervalDtype constructor, so this call is redundant; it would be nice to have fewer places use cudf.dtype

closed=arbitrary.dtype.closed,
)
)
elif (
cudf.get_option("mode.pandas_compatible")
and isinstance(arbitrary.dtype, pd.CategoricalDtype)
and is_pandas_nullable_extension_dtype(
Expand Down Expand Up @@ -3266,12 +3274,14 @@ def as_column(
length=length,
)
elif (
isinstance(element, (pd.Timestamp, pd.Timedelta))
isinstance(element, (pd.Timestamp, pd.Timedelta, pd.Interval))
or element is pd.NaT
):
# TODO: Remove this after
# https://github.com/apache/arrow/issues/26492
# is fixed.
# Note: pd.Interval also requires pandas Series conversion
# because PyArrow cannot infer interval type from raw list
return as_column(
pd.Series(arbitrary),
dtype=dtype,
Expand Down
79 changes: 77 additions & 2 deletions python/cudf/cudf/core/column/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,18 @@

import cudf
from cudf.core.column.column import ColumnBase, _handle_nulls, as_column
from cudf.core.column.struct import StructColumn
from cudf.core.dtypes import IntervalDtype, _dtype_to_metadata
from cudf.utils.dtypes import is_dtype_obj_interval
from cudf.utils.scalar import maybe_nested_pa_scalar_to_py

if TYPE_CHECKING:
from cudf._typing import DtypeObj
from cudf.core.buffer import Buffer


class IntervalColumn(StructColumn):
class IntervalColumn(ColumnBase):
_VALID_PLC_TYPES = {plc.TypeId.STRUCT}

@classmethod
def _validate_args( # type: ignore[override]
cls, plc_column: plc.Column, dtype: IntervalDtype
Expand All @@ -48,6 +51,39 @@ def _validate_args( # type: ignore[override]
raise ValueError("dtype must be a IntervalDtype.")
return plc_column, dtype

def _with_type_metadata(self, dtype: DtypeObj) -> ColumnBase:
"""
Apply IntervalDtype metadata to this column.

Creates new children with the subtype metadata applied and
reconstructs the plc.Column.
"""
if isinstance(dtype, IntervalDtype):
new_children = tuple(
ColumnBase.from_pylibcudf(child).astype(dtype.subtype)
for child in self.plc_column.children()
)
new_plc_column = plc.Column(
plc.DataType(plc.TypeId.STRUCT),
self.plc_column.size(),
self.plc_column.data(),
self.plc_column.null_mask(),
self.plc_column.null_count(),
self.plc_column.offset(),
[child.plc_column for child in new_children],
)
return type(self)._from_preprocessed(
plc_column=new_plc_column,
dtype=dtype,
)
# For pandas dtypes, store them directly in the column's dtype property
elif isinstance(dtype, pd.ArrowDtype) and isinstance(
dtype.pyarrow_dtype, pa.lib.StructType
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's probably More Correct if dtype.pyarrow_dtype was an instance of ArrowIntervalType (we have similar handling for this in ColumnBase.from_arrow)

):
self._dtype = dtype

return self

@classmethod
def from_arrow(cls, array: pa.Array | pa.ChunkedArray) -> Self:
if not isinstance(array, pa.ExtensionArray):
Expand Down Expand Up @@ -76,6 +112,36 @@ def to_arrow(self) -> pa.Array:
struct_arrow = pa.array([], typ.storage_type)
return pa.ExtensionArray.from_storage(typ, struct_arrow)

@classmethod
def _deserialize_plc_column(
    cls,
    header: dict,
    dtype: DtypeObj,
    data: Buffer | None,
    mask: Buffer | None,
    children: list[plc.Column],
) -> plc.Column:
    """Rebuild the pylibcudf column for a deserialized interval column.

    Interval columns are physically stored as structs, so the
    reconstructed column always carries a STRUCT type id regardless of
    the logical ``dtype``.
    """
    size = header["size"]
    offset = header.get("offset", 0)
    # A missing mask means no nulls; otherwise count them over the
    # sliced range [offset, size + offset).
    null_count = (
        0
        if mask is None
        else plc.null_mask.null_count(mask, offset, size + offset)
    )
    return plc.Column(
        plc.DataType(plc.TypeId.STRUCT),
        size,
        data,
        mask,
        null_count,
        offset,
        children,
        validate=False,
    )

def copy(self, deep: bool = True) -> Self:
    """Copy this column, then re-apply its interval dtype metadata."""
    copied = super().copy(deep=deep)
    return copied._with_type_metadata(self.dtype)  # type: ignore[return-value]

Expand Down Expand Up @@ -134,6 +200,12 @@ def right(self) -> ColumnBase:
self.plc_column.children()[1]
)._with_type_metadata(self.dtype.subtype) # type: ignore[union-attr]

@property
def __cuda_array_interface__(self) -> dict[str, Any]:
    """Always raises: interval columns have no flat device-array view."""
    msg = "Intervals are not yet supported via `__cuda_array_interface__`"
    raise NotImplementedError(msg)

def overlaps(self, other) -> ColumnBase:
    """Elementwise check whether intervals overlap ``other``.

    Not yet implemented. Note: the previous signature omitted ``self``,
    so instance calls failed with a confusing TypeError (wrong argument
    count) instead of this NotImplementedError.
    """
    raise NotImplementedError("overlaps is not currently implemented.")

Expand Down Expand Up @@ -176,6 +248,9 @@ def element_indexing(
self, index: int
) -> pd.Interval | dict[Any, Any] | None:
result = super().element_indexing(index)
if isinstance(result, pa.Scalar):
py_element = maybe_nested_pa_scalar_to_py(result)
result = self.dtype._recursively_replace_fields(py_element) # type: ignore[union-attr]
if isinstance(result, dict) and cudf.get_option(
"mode.pandas_compatible"
):
Expand Down
41 changes: 30 additions & 11 deletions python/cudf/cudf/core/column/struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,17 +53,14 @@ def _validate_args( # type: ignore[override]
cls, plc_column: plc.Column, dtype: StructDtype
) -> tuple[plc.Column, StructDtype]:
plc_column, dtype = super()._validate_args(plc_column, dtype) # type: ignore[assignment]
# IntervalDtype is a subclass of StructDtype, so compare types exactly
if (
not cudf.get_option("mode.pandas_compatible")
and type(dtype) is not StructDtype
and not isinstance(dtype, StructDtype)
) or (
cudf.get_option("mode.pandas_compatible")
and not is_dtype_obj_struct(dtype)
):
raise ValueError(
f"{type(dtype).__name__} must be a StructDtype exactly."
)
raise ValueError(f"{type(dtype).__name__} must be a StructDtype.")
return plc_column, dtype

def _get_sliced_child(self, idx: int) -> ColumnBase:
Expand Down Expand Up @@ -148,15 +145,15 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]:
"Structs are not yet supported via `__cuda_array_interface__`"
)

def _with_type_metadata(
self: StructColumn, dtype: DtypeObj
) -> StructColumn:
from cudf.core.column import IntervalColumn
def _with_type_metadata(self: StructColumn, dtype: DtypeObj) -> ColumnBase:
from cudf.core.dtypes import IntervalDtype

# Check IntervalDtype first because it's a subclass of StructDtype
if isinstance(dtype, IntervalDtype):
# TODO: Rewrite this to avoid needing to round-trip via ColumnBase
# Dispatch to IntervalColumn when given IntervalDtype
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this entire branch just call return IntervalColumn._with_type_metadata?

from cudf.core.column.interval import IntervalColumn

# Apply subtype metadata to children and reconstruct as IntervalColumn
new_children = tuple(
ColumnBase.from_pylibcudf(child).astype(dtype.subtype)
for child in self.plc_column.children()
Expand All @@ -175,7 +172,29 @@ def _with_type_metadata(
dtype=dtype,
)
elif isinstance(dtype, StructDtype):
self._dtype = dtype
new_children = tuple(
ColumnBase.from_pylibcudf(child)._with_type_metadata(
dtype.fields[f]
)
for child, f in zip(
self.plc_column.children(),
dtype.fields.keys(),
strict=True,
)
)
new_plc_column = plc.Column(
plc.DataType(plc.TypeId.STRUCT),
self.plc_column.size(),
self.plc_column.data(),
self.plc_column.null_mask(),
self.plc_column.null_count(),
self.plc_column.offset(),
[child.plc_column for child in new_children],
)
return StructColumn._from_preprocessed(
plc_column=new_plc_column,
dtype=dtype,
)
# For pandas dtypes, store them directly in the column's dtype property
elif isinstance(dtype, pd.ArrowDtype) and isinstance(
dtype.pyarrow_dtype, pa.StructType
Expand Down
94 changes: 85 additions & 9 deletions python/cudf/cudf/core/dtypes.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION.
# SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations

Expand Down Expand Up @@ -969,7 +969,7 @@ class Decimal128Dtype(DecimalDtype):
ITEMSIZE = 16


class IntervalDtype(StructDtype):
class IntervalDtype(_BaseDtype):
"""
A data type for Interval data.

Expand Down Expand Up @@ -997,7 +997,7 @@ def __init__(
raise ValueError(f"{closed=} is not valid")
if subtype is None:
self._subtype = None
dtypes = {}
self._fields = {}
else:
self._subtype = cudf.dtype(subtype)
if isinstance(
Expand All @@ -1007,13 +1007,33 @@ def __init__(
"category, object, and string subtypes are not supported "
"for IntervalDtype"
)
dtypes = {"left": self._subtype, "right": self._subtype}
super().__init__(dtypes)
self._fields = {"left": self._subtype, "right": self._subtype}

@property
def subtype(self) -> DtypeObj | None:
    """Dtype of the interval bounds, or None for an untyped interval."""
    return self._subtype

@property
def fields(self) -> dict[str, DtypeObj]:
    """
    Returns an ordered dict of column name and dtype key-value.

    For a typed IntervalDtype this is ``{"left": subtype, "right": subtype}``;
    when ``subtype`` is None (the untyped "interval" dtype) it is empty.
    """
    return self._fields

@property
def type(self):
    """The scalar type for elements of this dtype: ``pd.Interval``."""
    # TODO: we should change this to return something like an
    # IntervalDtypeType, once we figure out what that should look like
    return pd.Interval

@cached_property
def itemsize(self) -> int:
    """Bytes per element: the summed itemsizes of the two bound fields
    (0 for an untyped interval)."""
    if self._subtype is None:
        return 0
    return sum(
        field_dtype.itemsize for field_dtype in self._fields.values()
    )

def __repr__(self) -> str:
if self.subtype is None:
return "interval"
Expand Down Expand Up @@ -1046,7 +1066,6 @@ def __eq__(self, other) -> bool:
# This means equality isn't transitive but mimics pandas
return other in (self.name, str(self))
elif type(self) is not type(other):
# Avoid isinstance because this subclasses StructDtype
return False
elif other.subtype is None:
# Equivalent to the string "interval"
Expand All @@ -1056,6 +1075,44 @@ def __eq__(self, other) -> bool:
def __hash__(self) -> int:
    """Hash consistent with ``__eq__``: derived from (subtype, closed)."""
    identity = (self.subtype, self.closed)
    return hash(identity)

def _recursively_replace_fields(self, result: dict) -> dict:
    """
    Return a new dict with ``result``'s values keyed as "left" and "right".

    Intended for results coming from pylibcudf, which does not preserve
    nested field names (keys may be positional like 0/1 or "0"/"1").
    Nested StructDtype and ListDtype subtypes are renamed recursively.
    """
    values = list(result.values())
    if len(values) != 2:
        raise ValueError(
            f"Expected 2 fields for IntervalDtype, got {len(values)}"
        )

    def _rename(value):
        # Recurse only when the subtype knows how to rename its own
        # nested fields and the value has the matching container shape;
        # otherwise pass the value through unchanged.
        if self._subtype is None:
            return value
        if isinstance(self._subtype, StructDtype) and isinstance(
            value, dict
        ):
            return self._subtype._recursively_replace_fields(value)
        if isinstance(self._subtype, ListDtype) and isinstance(
            value, list
        ):
            return self._subtype._recursively_replace_fields(value)
        return value

    return {"left": _rename(values[0]), "right": _rename(values[1])}

def serialize(self) -> tuple[dict, list]:
header = {
"fields": (
Expand Down Expand Up @@ -1307,11 +1364,25 @@ def recursively_update_struct_names(
dtype: DtypeObj, child_names: Mapping[Any, Any]
) -> DtypeObj:
"""
Update dtype's field names (namely StructDtype) recursively with child_names.
Update dtype's field names (namely StructDtype and IntervalDtype) recursively with child_names.

Needed for nested types that come from libcudf which do not carry struct field names.
"""
if isinstance(dtype, StructDtype):
if isinstance(dtype, IntervalDtype):
# For IntervalDtype, child_names should have "left" and "right" keys
# But we need to recursively update the subtype if it's nested
if dtype.subtype is None:
return dtype
# child_names should be {"left": {...}, "right": {...}}
left_names = child_names.get("left", {})
# Since left and right have the same dtype, we only need one of them
if isinstance(dtype.subtype, (StructDtype, ListDtype)):
new_subtype = recursively_update_struct_names(
dtype.subtype, left_names
)
return IntervalDtype(subtype=new_subtype, closed=dtype.closed)
return dtype
elif isinstance(dtype, StructDtype):
return StructDtype(
{
new_name: recursively_update_struct_names(
Expand All @@ -1337,7 +1408,12 @@ def recursively_update_struct_names(
def _dtype_to_metadata(dtype: DtypeObj) -> plc.interop.ColumnMetadata:
# Convert a cudf or pandas dtype to pylibcudf ColumnMetadata for arrow conversion
cm = plc.interop.ColumnMetadata()
if isinstance(dtype, StructDtype):
if isinstance(dtype, IntervalDtype):
# IntervalDtype is stored as a struct with "left" and "right" fields
for name, field_dtype in dtype.fields.items():
cm.children_meta.append(_dtype_to_metadata(field_dtype))
cm.children_meta[-1].name = name
elif isinstance(dtype, StructDtype):
for name, dtype in dtype.fields.items():
cm.children_meta.append(_dtype_to_metadata(dtype))
cm.children_meta[-1].name = name
Expand Down
Loading
Loading