Skip to content

Commit 8b28f97

Browse files
mpolson64 authored and facebook-github-bot committed
Back Data with DataRow object (#4773)
Summary: NOTE: This is much slower than the implementation which is backed by a dataframe. For clarity, I've put this naive implementation up as its own diff, and the next diff hunts for speedups. Creates a new source of truth for Data: the DataRow. The df is now a cached property which is dynamically generated from these rows. In the future, these will become a Base object in SQLAlchemy, s.t. Data will have a SQLAlchemy relationship to a list of DataRows which live in their own table. RFC: 1. I'm renaming sem -> se here (but keeping sem in the df for now, since this could be an incredibly involved cleanup). Do we have alignment that this is a positive change? If so, I can either start or backlog the cleanup across the codebase. cc Balandat, with whom I've talked about this a while back. 2. This removes the ability for Data to contain arbitrary columns, which was added in D83682740 and, afaik, unused. Arbitrary new columns would not be compatible with the new storage setup (it was easy in the old setup, which is why we added it), and I think we should take a careful look at how to store contextual data in the future in a structured way. Differential Revision: D90605846
1 parent 0fde92e commit 8b28f97

File tree

12 files changed

+122
-48
lines changed

12 files changed

+122
-48
lines changed

ax/analysis/plotly/tests/test_marginal_effects.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def setUp(self) -> None:
4343
self.experiment.trials[i].mark_running(no_runner_required=True)
4444
self.experiment.attach_data(
4545
Data(
46-
pd.DataFrame(
46+
df=pd.DataFrame(
4747
{
4848
"trial_index": [i] * num_arms,
4949
"arm_name": [f"0_{j}" for j in range(num_arms)],

ax/core/base_trial.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from typing import Any, TYPE_CHECKING
1616

1717
from ax.core.arm import Arm
18-
from ax.core.data import Data, sort_by_trial_index_and_arm_name
18+
from ax.core.data import Data
1919
from ax.core.evaluations_to_data import raw_evaluations_to_data
2020
from ax.core.generator_run import GeneratorRun, GeneratorRunType
2121
from ax.core.metric import Metric, MetricFetchResult
@@ -442,8 +442,6 @@ def fetch_data(self, metrics: list[Metric] | None = None, **kwargs: Any) -> Data
442442
data = Metric._unwrap_trial_data_multi(
443443
results=self.fetch_data_results(metrics=metrics, **kwargs)
444444
)
445-
if not data.has_step_column:
446-
data.full_df = sort_by_trial_index_and_arm_name(data.full_df)
447445

448446
return data
449447

ax/core/data.py

Lines changed: 108 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,36 @@
3838
MAP_KEY = "step"
3939

4040

41+
class DataRow:
42+
def __init__(
43+
self,
44+
trial_index: int,
45+
arm_name: str,
46+
metric_name: str,
47+
metric_signature: str,
48+
mean: float,
49+
se: float,
50+
step: float | None = None,
51+
start_time: int | None = None,
52+
end_time: int | None = None,
53+
n: int | None = None,
54+
) -> None:
55+
self.trial_index: int = trial_index
56+
self.arm_name: str = arm_name
57+
58+
self.metric_name: str = metric_name
59+
self.metric_signature: str = metric_signature
60+
61+
self.mean: float = mean
62+
self.se: float = se
63+
64+
self.step: float | None = step
65+
66+
self.start_time: int | None = start_time
67+
self.end_time: int | None = end_time
68+
self.n: int | None = n
69+
70+
4171
class Data(Base, SerializationMixin):
4272
"""Class storing numerical data for an experiment.
4373
@@ -101,8 +131,6 @@ class Data(Base, SerializationMixin):
101131
"start_time": pd.Timestamp,
102132
"end_time": pd.Timestamp,
103133
"n": int,
104-
"frac_nonnull": np.float64,
105-
"random_split": int,
106134
MAP_KEY: float,
107135
}
108136

@@ -115,16 +143,19 @@ class Data(Base, SerializationMixin):
115143
"metric_signature",
116144
]
117145

118-
full_df: pd.DataFrame
146+
_data_rows: list[DataRow]
119147

120148
def __init__(
121149
self,
150+
data_rows: Iterable[DataRow] | None = None,
122151
df: pd.DataFrame | None = None,
123152
_skip_ordering_and_validation: bool = False,
124153
) -> None:
125154
"""Initialize a ``Data`` object from the given DataFrame.
126155
127156
Args:
157+
data_rows: Iterable of DataRows. If provided, this will be used as the
158+
source of truth for Data, over df.
128159
df: DataFrame with underlying data, and required columns. Data must
129160
be unique at the level of ("trial_index", "arm_name",
130161
"metric_name"), plus "step" if a "step" column is present. A
@@ -135,32 +166,86 @@ def __init__(
135166
Intended only for use in `Data.filter`, where the contents
136167
of the DataFrame are known to be ordered and valid.
137168
"""
138-
if df is None:
139-
# Initialize with barebones DF with expected dtypes
140-
self.full_df = pd.DataFrame.from_dict(
169+
if data_rows is not None:
170+
if isinstance(data_rows, pd.DataFrame):
171+
raise ValueError(
172+
"data_rows must be an iterable of DataRows, not a DataFrame."
173+
)
174+
self._data_rows = [*data_rows]
175+
elif df is not None:
176+
# Unroll the df into a list of DataRows
177+
if missing_columns := self.REQUIRED_COLUMNS - {*df.columns}:
178+
raise ValueError(
179+
f"Dataframe must contain required columns {list(missing_columns)}."
180+
)
181+
182+
self._data_rows = [
183+
DataRow(
184+
trial_index=row["trial_index"],
185+
arm_name=row["arm_name"],
186+
metric_name=row["metric_name"],
187+
metric_signature=row["metric_signature"],
188+
mean=row["mean"],
189+
se=row["sem"],
190+
step=row.get(MAP_KEY),
191+
start_time=row.get("start_time"),
192+
end_time=row.get("end_time"),
193+
n=row.get("n"),
194+
)
195+
for _, row in df.iterrows()
196+
]
197+
else:
198+
self._data_rows = []
199+
200+
self._memo_df: pd.DataFrame | None = None
201+
self.has_step_column: bool = any(
202+
row.step is not None for row in self._data_rows
203+
)
204+
205+
@cached_property
206+
def full_df(self) -> pd.DataFrame:
207+
"""
208+
Convert the DataRows into a pandas DataFrame. If step, start_time, or end_time
209+
is None for all rows the column will be elided.
210+
"""
211+
if len(self._data_rows) == 0:
212+
return pd.DataFrame.from_dict(
141213
{
142214
col: pd.Series([], dtype=self.COLUMN_DATA_TYPES[col])
143215
for col in self.REQUIRED_COLUMNS
144216
}
145217
)
146-
elif _skip_ordering_and_validation:
147-
self.full_df = df
148-
else:
149-
columns = set(df.columns)
150-
missing_columns = self.REQUIRED_COLUMNS - columns
151-
if missing_columns:
152-
raise ValueError(
153-
f"Dataframe must contain required columns {list(missing_columns)}."
154-
)
155-
# Drop rows where every input is null. Since `dropna` can be slow, first
156-
# check trial index to see if dropping nulls might be needed.
157-
if df["trial_index"].isnull().any():
158-
df = df.dropna(axis=0, how="all", ignore_index=True)
159-
df = self._safecast_df(df=df)
160-
self.full_df = self._get_df_with_cols_in_expected_order(df=df)
161218

162-
self._memo_df: pd.DataFrame | None = None
163-
self.has_step_column: bool = MAP_KEY in self.full_df.columns
219+
# Detect whether any of the optional attributes are present and should be
220+
# included as columns in the full DataFrame.
221+
include_step = any(row.step is not None for row in self._data_rows)
222+
include_start_time = any(row.start_time is not None for row in self._data_rows)
223+
include_end_time = any(row.end_time is not None for row in self._data_rows)
224+
include_n = any(row.n is not None for row in self._data_rows)
225+
226+
records = [
227+
{
228+
"trial_index": row.trial_index,
229+
"arm_name": row.arm_name,
230+
"metric_name": row.metric_name,
231+
"metric_signature": row.metric_signature,
232+
"mean": row.mean,
233+
"sem": row.se,
234+
**({"step": row.step} if include_step else {}),
235+
**({"start_time": row.start_time} if include_start_time else {}),
236+
**({"end_time": row.end_time} if include_end_time else {}),
237+
**({"n": row.n} if include_n else {}),
238+
}
239+
for row in self._data_rows
240+
]
241+
242+
return sort_by_trial_index_and_arm_name(
243+
df=self._get_df_with_cols_in_expected_order(
244+
df=self._safecast_df(
245+
df=pd.DataFrame.from_records(records),
246+
),
247+
)
248+
)
164249

165250
@classmethod
166251
def _get_df_with_cols_in_expected_order(cls, df: pd.DataFrame) -> pd.DataFrame:

ax/core/tests/test_data.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -187,14 +187,6 @@ def test_from_multiple_with_generator(self) -> None:
187187
self.assertEqual(len(data.full_df), 2 * len(self.data_with_df.full_df))
188188
self.assertFalse(data.has_step_column)
189189

190-
def test_extra_columns(self) -> None:
191-
value = 3
192-
extra_col_df = self.df.assign(foo=value)
193-
data = Data(df=extra_col_df)
194-
self.assertIn("foo", data.full_df.columns)
195-
self.assertIn("foo", data.df.columns)
196-
self.assertTrue((data.full_df["foo"] == value).all())
197-
198190
def test_get_df_with_cols_in_expected_order(self) -> None:
199191
with self.subTest("Wrong order"):
200192
df = pd.DataFrame(columns=["mean", "trial_index", "hat"], data=[[0] * 3])

ax/core/tests/test_experiment.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -673,7 +673,7 @@ def test_fetch_and_store_data(self) -> None:
673673

674674
# Verify we do get the stored data if there are an unimplemented metrics.
675675
# Remove attached data for nonexistent metric.
676-
exp.data.full_df = exp.data.full_df.loc[lambda x: x["metric_name"] != "z"]
676+
exp.data = Data(df=exp.data.full_df.loc[lambda x: x["metric_name"] != "z"])
677677

678678
# Remove implemented metric that is `available_while_running`
679679
# (and therefore not pulled from cache).
@@ -685,7 +685,9 @@ def test_fetch_and_store_data(self) -> None:
685685
looked_up_df = looked_up_data.full_df
686686
self.assertFalse((looked_up_df["metric_name"] == "z").any())
687687
self.assertTrue(
688-
batch.fetch_data().full_df.equals(
688+
batch.fetch_data()
689+
.full_df.sort_values(["arm_name", "metric_name"], ignore_index=True)
690+
.equals(
689691
looked_up_df.loc[lambda x: (x["trial_index"] == 0)].sort_values(
690692
["arm_name", "metric_name"], ignore_index=True
691693
)

ax/plot/pareto_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ def get_observed_pareto_frontiers(
207207
):
208208
# Make sure status quo is always included, for derelativization
209209
arm_names.append(experiment.status_quo.name)
210-
data = Data(data.df[data.df["arm_name"].isin(arm_names)])
210+
data = Data(df=data.df[data.df["arm_name"].isin(arm_names)])
211211
adapter = get_tensor_converter_adapter(experiment=experiment, data=data)
212212
pareto_observations = observed_pareto_frontier(adapter=adapter)
213213
# Convert to ParetoFrontierResults

ax/plot/scatter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1731,7 +1731,7 @@ def tile_observations(
17311731
if data is None:
17321732
data = experiment.fetch_data()
17331733
if arm_names is not None:
1734-
data = Data(data.df[data.df["arm_name"].isin(arm_names)])
1734+
data = Data(df=data.df[data.df["arm_name"].isin(arm_names)])
17351735
m_ts = Generators.THOMPSON(
17361736
data=data,
17371737
search_space=experiment.search_space,

ax/plot/tests/test_fitted_scatter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def test_fitted_scatter(self) -> None:
3333
model = Generators.BOTORCH_MODULAR(
3434
# Adapter kwargs
3535
experiment=exp,
36-
data=Data.from_multiple_data([data, Data(df)]),
36+
data=Data.from_multiple_data([data, Data(df=df)]),
3737
)
3838
# Assert that each type of plot can be constructed successfully
3939
scalarized_metric_config = [

ax/plot/tests/test_pareto_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ def test_get_observed_pareto_frontiers(self) -> None:
107107
# For the check below, compute which arms are better than SQ
108108
df = experiment.fetch_data().df
109109
df["sem"] = np.nan
110-
data = Data(df)
110+
data = Data(df=df)
111111
sq_val = df[(df["arm_name"] == "status_quo") & (df["metric_name"] == "m1")][
112112
"mean"
113113
].values[0]

ax/storage/json_store/tests/test_json_store.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -705,10 +705,6 @@ def test_decode_map_data_backward_compatible(self) -> None:
705705
class_decoder_registry=CORE_CLASS_DECODER_REGISTRY,
706706
)
707707
self.assertEqual(len(map_data.full_df), 2)
708-
# Even though the "epoch" and "timestamps" columns have not been
709-
# renamed to "step", they are present
710-
self.assertEqual(map_data.full_df["epoch"].tolist(), [0.0, 1.0])
711-
self.assertEqual(map_data.full_df["timestamps"].tolist(), [3.0, 4.0])
712708
self.assertIsInstance(map_data, Data)
713709

714710
with self.subTest("Single map key"):
@@ -729,8 +725,8 @@ def test_decode_map_data_backward_compatible(self) -> None:
729725
decoder_registry=CORE_DECODER_REGISTRY,
730726
class_decoder_registry=CORE_CLASS_DECODER_REGISTRY,
731727
)
732-
self.assertIn("epoch", map_data.full_df.columns)
733-
self.assertEqual(map_data.full_df["epoch"].tolist(), [0.0, 1.0])
728+
self.assertEqual(len(map_data.full_df), 2)
729+
self.assertIsInstance(map_data, Data)
734730

735731
with self.subTest("No map key"):
736732
data_json = {

0 commit comments

Comments
 (0)