Commit 63459ca

Add ConfigOptions convenience class to cudf-polars (#18137)
I propose that we add a simple management utility for `GPUEngine` configuration options in cudf-polars. In order to support a variety of configurable groupby/shuffle/sort/join options for multi-GPU execution, we will want to make it easy to pass these options through to `IR` nodes at translation time. We will also want it to be easy to set and get specific options at traversal/evaluation time.

Authors:
- Richard (Rick) Zamora (https://github.com/rjzamora)
- Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
- Lawrence Mitchell (https://github.com/wence-)

URL: #18137
1 parent c2c6d1f commit 63459ca

7 files changed: +179 -97 lines
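
The `ConfigOptions` class itself lives in `cudf_polars/utils/config.py`, presumably one of the seven changed files, but its diff is not expanded on this page. As a rough, hypothetical sketch of the API implied by the call sites in the diffs below (a dotted-key `get` with a `default` keyword, a copy-on-set `set` that returns a new instance, and hashability, since the diff drops `json.dumps` from the `get_hashable` tuples), it might look like this; every detail is inferred rather than taken from the actual implementation:

# Hypothetical sketch only: inferred from call sites in this diff,
# not the real cudf_polars.utils.config implementation.
from __future__ import annotations

import copy
import json
from typing import Any


class ConfigOptions:
    """Wrap a GPUEngine config dict with dotted-key get/set access."""

    def __init__(self, options: dict[str, Any] | None = None) -> None:
        self._options = options or {}

    def get(self, name: str, *, default: Any = None) -> Any:
        # Walk nested dicts along dot-separated keys, e.g.
        # "executor_options.parquet_blocksize" reads
        # options["executor_options"]["parquet_blocksize"].
        node: Any = self._options
        for key in name.split("."):
            if not isinstance(node, dict) or key not in node:
                return default
            node = node[key]
        return node

    def set(self, name: str, value: Any) -> ConfigOptions:
        # Return a new ConfigOptions with one option replaced, leaving the
        # original untouched (see the SplitScan change in experimental/io.py).
        options = copy.deepcopy(self._options)
        node = options
        *parents, leaf = name.split(".")
        for key in parents:
            node = node.setdefault(key, {})
        node[leaf] = value
        return ConfigOptions(options)

    def __hash__(self) -> int:
        # The diff removes json.dumps(...) around config_options in
        # get_hashable tuples, so the real class must be hashable; a JSON
        # digest of the options is one plausible implementation.
        return hash(json.dumps(self._options, sort_keys=True))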

python/cudf_polars/cudf_polars/callback.py (-41 lines)

@@ -202,46 +202,6 @@ def _callback(
         raise ValueError(f"Unknown executor '{executor}'")
 
 
-def validate_config_options(config: dict) -> None:
-    """
-    Validate the configuration options for the GPU engine.
-
-    Parameters
-    ----------
-    config
-        Configuration options to validate.
-
-    Raises
-    ------
-    ValueError
-        If the configuration contains unsupported options.
-    """
-    if unsupported := (
-        config.keys()
-        - {"raise_on_fail", "parquet_options", "executor", "executor_options"}
-    ):
-        raise ValueError(
-            f"Engine configuration contains unsupported settings: {unsupported}"
-        )
-    assert {"chunked", "chunk_read_limit", "pass_read_limit"}.issuperset(
-        config.get("parquet_options", {})
-    )
-
-    # Validate executor_options
-    executor = config.get("executor", "pylibcudf")
-    if executor == "dask-experimental":
-        unsupported = config.get("executor_options", {}).keys() - {
-            "max_rows_per_partition",
-            "parquet_blocksize",
-            "cardinality_factor",
-            "groupby_n_ary",
-        }
-    else:
-        unsupported = config.get("executor_options", {}).keys()
-    if unsupported:
-        raise ValueError(f"Unsupported executor_options for {executor}: {unsupported}")
-
-
 def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
     """
     A post optimization callback that attempts to execute the plan with cudf.
@@ -269,7 +229,6 @@ def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
     memory_resource = config.memory_resource
     raise_on_fail = config.config.get("raise_on_fail", False)
     executor = config.config.get("executor", None)
-    validate_config_options(config.config)
 
     with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
         translator = Translator(nt, config)

python/cudf_polars/cudf_polars/dsl/ir.py (+17 -27 lines)

@@ -39,6 +39,7 @@
     from polars.polars import _expr_nodes as pl_expr
 
     from cudf_polars.typing import Schema, Slice as Zlice
+    from cudf_polars.utils.config import ConfigOptions
 
 
 __all__ = [
@@ -284,7 +285,7 @@ class Scan(IR):
     """Reader-specific options, as dictionary."""
     cloud_options: dict[str, Any] | None
     """Cloud-related authentication options, currently ignored."""
-    config_options: dict[str, Any]
+    config_options: ConfigOptions
     """GPU-specific configuration options"""
     paths: list[str]
     """List of paths to read from."""
@@ -308,7 +309,7 @@ def __init__(
         typ: str,
         reader_options: dict[str, Any],
         cloud_options: dict[str, Any] | None,
-        config_options: dict[str, Any],
+        config_options: ConfigOptions,
         paths: list[str],
         with_columns: list[str] | None,
         skip_rows: int,
@@ -413,7 +414,7 @@ def get_hashable(self) -> Hashable:
             self.typ,
             json.dumps(self.reader_options),
             json.dumps(self.cloud_options),
-            json.dumps(self.config_options),
+            self.config_options,
             tuple(self.paths),
             tuple(self.with_columns) if self.with_columns is not None else None,
             self.skip_rows,
@@ -428,7 +429,7 @@ def do_evaluate(
         schema: Schema,
         typ: str,
         reader_options: dict[str, Any],
-        config_options: dict[str, Any],
+        config_options: ConfigOptions,
         paths: list[str],
         with_columns: list[str] | None,
         skip_rows: int,
@@ -516,8 +517,7 @@ def do_evaluate(
                 colnames[0],
             )
         elif typ == "parquet":
-            parquet_options = config_options.get("parquet_options", {})
-            if parquet_options.get("chunked", True):
+            if config_options.get("parquet_options.chunked", default=True):
                 options = plc.io.parquet.ParquetReaderOptions.builder(
                     plc.io.SourceInfo(paths)
                 ).build()
@@ -534,11 +534,13 @@ def do_evaluate(
                     options.set_columns(with_columns)
                 reader = plc.io.parquet.ChunkedParquetReader(
                     options,
-                    chunk_read_limit=parquet_options.get(
-                        "chunk_read_limit", cls.PARQUET_DEFAULT_CHUNK_SIZE
+                    chunk_read_limit=config_options.get(
+                        "parquet_options.chunk_read_limit",
+                        default=cls.PARQUET_DEFAULT_CHUNK_SIZE,
                     ),
-                    pass_read_limit=parquet_options.get(
-                        "pass_read_limit", cls.PARQUET_DEFAULT_PASS_LIMIT
+                    pass_read_limit=config_options.get(
+                        "parquet_options.pass_read_limit",
+                        default=cls.PARQUET_DEFAULT_PASS_LIMIT,
                     ),
                 )
                 chk = reader.read_chunk()
@@ -702,15 +704,15 @@ class DataFrameScan(IR):
     """Polars LazyFrame object."""
     projection: tuple[str, ...] | None
     """List of columns to project out."""
-    config_options: dict[str, Any]
+    config_options: ConfigOptions
    """GPU-specific configuration options"""
 
     def __init__(
         self,
         schema: Schema,
         df: Any,
         projection: Sequence[str] | None,
-        config_options: dict[str, Any],
+        config_options: ConfigOptions,
     ):
         self.schema = schema
         self.df = df
@@ -736,7 +738,7 @@ def get_hashable(self) -> Hashable:
             schema_hash,
             id(self.df),
             self.projection,
-            json.dumps(self.config_options),
+            self.config_options,
         )
 
     @classmethod
@@ -876,7 +878,7 @@ def __init__(self, polars_groupby_options: Any):
     """Preserve order in groupby."""
     options: GroupbyOptions
     """Arbitrary options."""
-    config_options: dict[str, Any]
+    config_options: ConfigOptions
     """GPU-specific configuration options"""
 
     def __init__(
@@ -886,7 +888,7 @@ def __init__(
         agg_requests: Sequence[expr.NamedExpr],
         maintain_order: bool,  # noqa: FBT001
         options: Any,
-        config_options: dict[str, Any],
+        config_options: ConfigOptions,
         df: IR,
     ):
         self.schema = schema
@@ -912,18 +914,6 @@ def __init__(
             self.AggInfos(self.agg_requests),
         )
 
-    def get_hashable(self) -> Hashable:
-        """Hashable representation of the node."""
-        return (
-            type(self),
-            tuple(self.schema.items()),
-            self.keys,
-            self.maintain_order,
-            self.options,
-            json.dumps(self.config_options),
-            self.children,
-        )
-
     @staticmethod
     def check_agg(agg: expr.Expr) -> int:
         """

python/cudf_polars/cudf_polars/dsl/translate.py (+8 -7 lines)

@@ -5,6 +5,7 @@
 
 from __future__ import annotations
 
+import copy
 import functools
 import json
 from contextlib import AbstractContextManager, nullcontext
@@ -23,7 +24,7 @@
 from cudf_polars.dsl import expr, ir
 from cudf_polars.dsl.to_ast import insert_colrefs
 from cudf_polars.typing import NodeTraverser
-from cudf_polars.utils import dtypes, sorting
+from cudf_polars.utils import config, dtypes, sorting
 
 if TYPE_CHECKING:
     from polars import GPUEngine
@@ -41,13 +42,13 @@ class Translator:
     ----------
     visitor
         Polars NodeTraverser object
-    config
+    engine
         GPU engine configuration.
     """
 
-    def __init__(self, visitor: NodeTraverser, config: GPUEngine):
+    def __init__(self, visitor: NodeTraverser, engine: GPUEngine):
         self.visitor = visitor
-        self.config = config
+        self.config_options = config.ConfigOptions(copy.deepcopy(engine.config))
         self.errors: list[Exception] = []
 
     def translate_ir(self, *, n: int | None = None) -> ir.IR:
@@ -233,7 +234,7 @@ def _(
         typ,
         reader_options,
         cloud_options,
-        translator.config.config.copy(),
+        translator.config_options,
         node.paths,
         with_columns,
         skip_rows,
@@ -260,7 +261,7 @@ def _(
         schema,
         node.df,
         node.projection,
-        translator.config.config.copy(),
+        translator.config_options,
     )
 
 
@@ -288,7 +289,7 @@ def _(
         aggs,
         node.maintain_order,
         node.options,
-        translator.config.config.copy(),
+        translator.config_options,
         inp,
     )
 

python/cudf_polars/cudf_polars/experimental/groupby.py (+4 -4 lines)

@@ -166,9 +166,9 @@ def _(
     groupby_key_columns = [ne.name for ne in ir.keys]
     cardinality_factor = {
         c: min(f, 1.0)
-        for c, f in ir.config_options.get("executor_options", {})
-        .get("cardinality_factor", {})
-        .items()
+        for c, f in ir.config_options.get(
+            "executor_options.cardinality_factor", default={}
+        ).items()
         if c in groupby_key_columns
     }
     if cardinality_factor:
@@ -278,7 +278,7 @@ def _(
 
     # Simple N-ary tree reduction
     j = 0
-    n_ary = ir.config_options.get("executor_options", {}).get("groupby_n_ary", 32)
+    n_ary = ir.config_options.get("executor_options.groupby_n_ary", default=32)
     graph: MutableMapping[Any, Any] = {}
     name = get_key_name(ir)
     keys: list[Any] = [(child_name, i) for i in range(child_count)]
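
Both options above are supplied through `GPUEngine` as `executor_options`. A hedged configuration sketch for the dask-experimental executor follows; the option names come from the allow-list in the removed `validate_config_options`, and the values simply mirror the defaults read by the code in this diff:

# Usage sketch: configuring the dask-experimental executor.
import polars as pl

engine = pl.GPUEngine(
    executor="dask-experimental",
    executor_options={
        "max_rows_per_partition": 1_000_000,  # rows per DataFrameScan partition
        "parquet_blocksize": 1024**3,         # target bytes per parquet Scan partition
        "cardinality_factor": {"id": 0.01},   # assumed distinct fraction per key column
        "groupby_n_ary": 32,                  # fan-in of the groupby tree reduction
    },
)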

python/cudf_polars/cudf_polars/experimental/io.py (+12 -10 lines)

@@ -22,14 +22,16 @@
     from cudf_polars.dsl.expr import NamedExpr
     from cudf_polars.experimental.dispatch import LowerIRTransformer
     from cudf_polars.typing import Schema
+    from cudf_polars.utils.config import ConfigOptions
 
 
 @lower_ir_node.register(DataFrameScan)
 def _(
     ir: DataFrameScan, rec: LowerIRTransformer
 ) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
-    rows_per_partition = ir.config_options.get("executor_options", {}).get(
-        "max_rows_per_partition", 1_000_000
+    rows_per_partition = ir.config_options.get(
+        "executor_options.max_rows_per_partition",
+        default=1_000_000,
     )
 
     nrows = max(ir.df.shape()[0], 1)
@@ -91,8 +93,10 @@ def from_scan(ir: Scan) -> ScanPartitionPlan:
     """Extract the partitioning plan of a Scan operation."""
     if ir.typ == "parquet":
         # TODO: Use system info to set default blocksize
-        parallel_options = ir.config_options.get("executor_options", {})
-        blocksize: int = parallel_options.get("parquet_blocksize", 1024**3)
+        blocksize: int = ir.config_options.get(
+            "executor_options.parquet_blocksize",
+            default=1024**3,
+        )
         stats = _sample_pq_statistics(ir)
         file_size = sum(float(stats[column]) for column in ir.schema)
         if file_size > 0:
@@ -168,7 +172,7 @@ def do_evaluate(
         schema: Schema,
         typ: str,
         reader_options: dict[str, Any],
-        config_options: dict[str, Any],
+        config_options: ConfigOptions,
         paths: list[str],
         with_columns: list[str] | None,
         skip_rows: int,
@@ -270,11 +274,9 @@ def _(
     paths = list(ir.paths)
     if plan.flavor == ScanPartitionFlavor.SPLIT_FILES:
         # Disable chunked reader when splitting files
-        config_options = ir.config_options.copy()
-        config_options["parquet_options"] = config_options.get(
-            "parquet_options", {}
-        ).copy()
-        config_options["parquet_options"]["chunked"] = False
+        config_options = ir.config_options.set(
+            name="parquet_options.chunked", value=False
+        )
 
     slices: list[SplitScan] = []
     for path in paths:
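
The `set` call replaces the old copy-the-dict-then-mutate dance: it returns a new options object and leaves the `IR` node's own options untouched. A small demonstration of the assumed copy-on-set semantics, reusing the hypothetical `ConfigOptions` sketch from the top of this page:

# Demo of the assumed copy-on-set semantics (hypothetical sketch class above).
opts = ConfigOptions({"parquet_options": {"chunked": True}})
split_opts = opts.set(name="parquet_options.chunked", value=False)

assert opts.get("parquet_options.chunked", default=True) is True
assert split_opts.get("parquet_options.chunked", default=True) is False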
