Allow truncation of hurdle distributions

ricardoV94 · ricardoV94 · commit 9c65ca1a3180 · 2025-06-04T16:02:18.000+02:00
diff --git a/notebooks/xmodel.ipynb b/notebooks/xmodel.ipynb
@@ -0,0 +1,169 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8b46af09bc772f64",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pytensor.tensor as pt\n",
+    "import pytensor.xtensor as px\n",
+    "\n",
+    "import pymc as pm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eaca7be1e40a81c6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class XModel(pm.Model):\n",
+    "    def register_rv(self, rv, *args, dims=None, **kwargs):\n",
+    "        rv = super().register_rv(rv, *args, dims=dims, **kwargs)\n",
+    "        if dims is not None:\n",
+    "            rv = px.as_xtensor(rv, dims=dims)\n",
+    "        return rv\n",
+    "\n",
+    "    def add_named_variable(self, var, dims=None):\n",
+    "        if isinstance(var.type, px.type.XTensorType):\n",
+    "            if dims is None:\n",
+    "                dims = var.dims\n",
+    "            else:\n",
+    "                if dims != var.dims:\n",
+    "                    raise ValueError(\n",
+    "                        f\"Provided dims {dims} do not match variable pre-existing {var.dims}. \"\n",
+    "                        \"Use rename and/or transpose to match new dims\"\n",
+    "                    )\n",
+    "        super().add_named_variable(var, dims)\n",
+    "\n",
+    "\n",
+    "def XData(name, x, *args, **kwargs):\n",
+    "    x = pm.Data(name, x, *args, **kwargs)\n",
+    "    model = pm.modelcontext(None)\n",
+    "    if (dims := model.named_vars_to_dims.get(x.name, None)) is not None:\n",
+    "        x = px.as_xtensor(x, dims=dims)\n",
+    "    return x"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "efeb5d5820e2efe7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "N = 100\n",
+    "seed = sum(map(ord, \"xarray>=numpy?\"))\n",
+    "rng = np.random.default_rng(seed)\n",
+    "\n",
+    "x_np = np.linspace(0, 10, N)\n",
+    "y_np = np.piecewise(\n",
+    "    x_np,\n",
+    "    [x_np <= 3, (x_np > 3) & (x_np <= 7), x_np > 7],\n",
+    "    [lambda x: 0.5 * x, lambda x: 1.5 + 0.2 * (x - 3), lambda x: 2.3 - 0.1 * (x - 7)],\n",
+    ")\n",
+    "y_np += rng.normal(0, 0.2, size=N)\n",
+    "group_idx = rng.choice(3, size=N)\n",
+    "\n",
+    "N_knots = 13\n",
+    "knots_np = np.linspace(0, 10, num=N_knots)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6f5476abb800b402",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "coords = {\n",
+    "    \"group\": range(3),\n",
+    "    \"knots\": range(N_knots),\n",
+    "    \"obs\": range(N),\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ca734923d4d51c4c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with pm.Model(coords=coords) as model:\n",
+    "    x = pm.Data(\"x\", x_np, dims=\"obs\")\n",
+    "    knots = pm.Data(\"knots\", knots_np, dims=\"knot\")\n",
+    "\n",
+    "    sigma = pm.HalfCauchy(\"sigma\", beta=1)\n",
+    "    sigma_beta0 = pm.HalfNormal(\"sigma_beta0\", sigma=10)\n",
+    "    beta0 = pm.HalfNormal(\"beta_0\", sigma=sigma_beta0, dims=\"group\")\n",
+    "    z = pm.Normal(\"z\", dims=(\"group\", \"knot\"))\n",
+    "\n",
+    "    delta_factors = pt.special.softmax(z, axis=-1)  # (groups, knot)\n",
+    "    slope_factors = 1 - pt.cumsum(delta_factors[:, :-1], axis=-1)  # (groups, knot-1)\n",
+    "    spline_slopes = pt.join(-1, beta0[:, None], beta0[:, None] * slope_factors)  # (groups, knot)\n",
+    "    beta = pt.join(-1, beta0[:, None], pt.diff(spline_slopes, axis=-1))  # (groups, knot)\n",
+    "\n",
+    "    beta = pm.Deterministic(\"beta\", beta, dims=(\"group\", \"knot\"))\n",
+    "\n",
+    "    X = pt.maximum(0, x[:, None] - knots[None, :])  # (n, knot)\n",
+    "    mu = (X * beta[group_idx]).sum(-1)  # ((n, knot) * (n, knot)).sum(-1) = (n,)\n",
+    "    y = pm.Normal(\"y\", mu=mu, sigma=sigma, observed=y_np, dims=\"obs\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "48d4d69fcc838be3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with XModel(coords=coords) as xmodel:\n",
+    "    x = XData(\"x\", x_np, dims=\"obs\")\n",
+    "    knots = XData(\"knots\", knots_np, dims=\"knot\")\n",
+    "\n",
+    "    sigma = pm.HalfCauchy(\"sigma\", beta=1)\n",
+    "    sigma_beta0 = pm.HalfNormal(\"sigma_beta0\", sigma=10)\n",
+    "    beta0 = pm.HalfNormal(\"beta_0\", sigma=sigma_beta0, dims=\"group\")\n",
+    "    z = pm.Normal(\"z\", dims=(\"group\", \"knot\"))\n",
+    "\n",
+    "    delta_factors = px.special.softmax(z, dim=\"knot\")\n",
+    "    slope_factors = 1 - delta_factors.isel(knot=slice(None, -1)).cumsum(\"knot\")\n",
+    "    spline_slopes = px.concat([beta0, beta0 * slope_factors], dim=\"knot\")\n",
+    "    beta = px.concat([beta0, spline_slopes.diff(\"knot\")], dim=\"knot\")\n",
+    "\n",
+    "    beta = pm.Deterministic(\"beta\", beta, dims=(\"group\", \"knot\"))\n",
+    "\n",
+    "    X = px.math.scalar_maximum(0, x - knots)\n",
+    "    mu = (X * beta.isel(group=group_idx).rename(group=\"obs\")).sum(\"knot\")\n",
+    "    y_obs = pm.Normal(\"y_obs\", mu=mu.values, sigma=sigma, observed=y_np, dims=\"obs\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "da17a5c329187db6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(f\"{model.compile_logp()(model.initial_point()):,}\")\n",
+    "print(f\"{xmodel.compile_logp()(xmodel.initial_point()):,}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "85841107447a1ddd",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/pymc/distributions/mixture.py b/pymc/distributions/mixture.py
@@ -36,7 +36,6 @@
 )
 from pymc.distributions.shape_utils import _change_dist_size, change_dist_size, rv_size_is_none
 from pymc.distributions.transforms import _default_transform
-from pymc.distributions.truncated import Truncated
 from pymc.logprob.abstract import _logcdf, _logcdf_helper, _logprob
 from pymc.logprob.basic import logp
 from pymc.logprob.transforms import IntervalTransform
@@ -809,13 +808,13 @@ def dist(cls, psi, mu=None, alpha=None, p=None, n=None, **kwargs):
         )
 
 
-class _MarginalHurdleRV(_BaseMixtureRV):
+class _HurdleRV(_BaseMixtureRV):
     pass
 
 
 class _Hurdle(_BaseMixtureDistribution):
-    rv_type = _MarginalHurdleRV
-    rv_op = _MarginalHurdleRV.rv_op
+    rv_type = _HurdleRV
+    rv_op = _HurdleRV.rv_op
 
     @classmethod
     def _create(cls, *, name, nonzero_p, nonzero_dist, max_n_steps=10_000, **kwargs):
@@ -826,6 +825,8 @@ def _create(cls, *, name, nonzero_p, nonzero_dist, max_n_steps=10_000, **kwargs)
         In hurdle models, the zeros come from a completely different process than the rest of the data.
         In other words, the zeros are not inflated, they come from a different process.
         """
+        from pymc.distributions.truncated import Truncated
+
         dtype = nonzero_dist.dtype
 
         if dtype.startswith("int"):
@@ -848,12 +849,12 @@ def _create(cls, *, name, nonzero_p, nonzero_dist, max_n_steps=10_000, **kwargs)
             return cls.dist(weights, comp_dists, **kwargs)
 
 
-@_logprob.register(_MarginalHurdleRV)
+@_logprob.register(_HurdleRV)
 def marginal_hurdle_logprob(op, values, rng, weights, *components, **kwargs):
     (value,) = values
 
     if len(components) != 2:
-        raise TypeError(
+        raise NotImplementedError(
             f"MarginalHurdleRV logp only supports 2 components, got {(len(components))}"
         )
 
diff --git a/pymc/distributions/truncated.py b/pymc/distributions/truncated.py
@@ -30,11 +30,13 @@
 from pymc.distributions.continuous import TruncatedNormal, bounded_cont_transform
 from pymc.distributions.dist_math import check_parameters
 from pymc.distributions.distribution import (
+    DiracDeltaRV,
     Distribution,
     SymbolicRandomVariable,
     _support_point,
     support_point,
 )
+from pymc.distributions.mixture import _HurdleRV
 from pymc.distributions.shape_utils import (
     _change_dist_size,
     change_dist_size,
@@ -79,7 +81,9 @@ def rv_op(cls, dist, lower, upper, max_n_steps, *, size=None):
 
         # Try to use specialized Op
         try:
-            return _truncated(dist.owner.op, lower, upper, size, *dist.owner.inputs)
+            return _truncated(
+                dist.owner.op, lower, upper, size, *dist.owner.inputs, max_n_steps=max_n_steps
+            )
         except NotImplementedError:
             pass
 
@@ -222,7 +226,7 @@ def update(self, node: Apply):
 
 
 @singledispatch
-def _truncated(op: Op, lower, upper, size, *params):
+def _truncated(op: Op, lower, upper, size, *params, max_n_steps: int):
     """Return the truncated equivalent of another `RandomVariable`."""
     raise NotImplementedError(f"{op} does not have an equivalent truncated version implemented")
 
@@ -307,13 +311,14 @@ def dist(cls, dist, lower=None, upper=None, max_n_steps: int = 10_000, **kwargs)
                 f"Truncation dist must be a distribution created via the `.dist()` API, got {type(dist)}"
             )
 
-        if (
-            isinstance(dist.owner.op, SymbolicRandomVariable)
-            and "[size]" not in dist.owner.op.extended_signature
+        if isinstance(dist.owner.op, SymbolicRandomVariable) and not (
+            "[size]" in dist.owner.op.extended_signature
+            # If there's a specific _truncated dispatch for this RV, that's also fine
+            or _truncated.dispatch(type(dist.owner.op)) is not _truncated.dispatch(object)
         ):
             # Truncation needs to wrap the underlying dist, but not all SymbolicRandomVariables encapsulate the whole
             # random graph and as such we don't know where the actual inputs begin. This happens mostly for
-            # distribution factories like `Censored` and `Mixture` which would have a very complex signature if they
+            # distribution factories like `Censored` which would have a very complex signature if they
             # encapsulated the random components instead of taking them as inputs like they do now.
             # SymbolicRandomVariables that encapsulate the whole random graph can be identified for having a size parameter.
             raise NotImplementedError(f"Truncation not implemented for {dist.owner.op}")
@@ -462,7 +467,7 @@ def truncated_logcdf(op: TruncatedRV, value, *inputs, **kwargs):
 
 
 @_truncated.register(NormalRV)
-def _truncated_normal(op, lower, upper, size, rng, old_size, mu, sigma):
+def _truncated_normal(op, lower, upper, size, rng, old_size, mu, sigma, *, max_n_steps):
     return TruncatedNormal.dist(
         mu=mu,
         sigma=sigma,
@@ -472,3 +477,34 @@ def _truncated_normal(op, lower, upper, size, rng, old_size, mu, sigma):
         size=size,
         dtype=op.dtype,
     )
+
+
+@_truncated.register(_HurdleRV)
+def _truncated_hurdle(op: _HurdleRV, lower, upper, size, rng, weights, *components, max_n_steps):
+    if len(components) != 2:
+        raise TypeError("Truncated HurdleRV only supports two components")
+
+    dirac_delta_dist, other_dist = components
+
+    if not isinstance(dirac_delta_dist.owner.op, DiracDeltaRV):
+        raise TypeError("First component of HurdleRV must be a DiracDeltaRV")
+
+    # If the DiracDelta value is outside the truncation bounds, this is effectively a non-hurdle distribution
+    # We achieve this by adjusting the weights of the DiracDelta component, so it's never selected in that case
+    [dirac_delta_value] = dirac_delta_dist.owner.op.dist_params(dirac_delta_dist.owner)
+    nonzero_p = weights[..., 1]
+    lower_check = np.array(True) if lower is None else lower <= dirac_delta_value
+    upper_check = np.array(True) if upper is None else dirac_delta_value <= upper
+    adjusted_nonzero_p = pt.where(
+        lower_check & upper_check,
+        nonzero_p,
+        1,
+    )
+    adjusted_weights = pt.stack([1 - adjusted_nonzero_p, adjusted_nonzero_p], axis=-1)
+
+    # The only remaining step is to truncate the other distribution
+    truncated_dist = Truncated.dist(other_dist, lower=lower, upper=upper, max_n_steps=max_n_steps)
+
+    # Creating a hurdle with the adjusted weights and the truncated distribution
+    # Should be equivalent to truncating the original hurdle distribution
+    return op.rv_op(adjusted_weights, dirac_delta_dist, truncated_dist, size=size)
diff --git a/tests/distributions/test_mixture.py b/tests/distributions/test_mixture.py
@@ -49,6 +49,7 @@
     Poisson,
     StickBreakingWeights,
     Triangular,
+    Truncated,
     Uniform,
     ZeroInflatedBinomial,
     ZeroInflatedNegativeBinomial,
@@ -1710,3 +1711,30 @@ def logp_fn(value, psi, mu, sigma):
                 return np.log(psi) + st.lognorm.logpdf(value, sigma, 0, np.exp(mu))
 
         check_logp(HurdleLogNormal, Rplus, {"psi": Unit, "mu": R, "sigma": Rplusbig}, logp_fn)
+
+    @pytest.mark.parametrize("lower", (-np.inf, 0, None, 1))
+    def test_truncated_hurdle_lognormal(self, lower):
+        psi = 0.7
+        x = HurdleLogNormal.dist(psi=psi, mu=3, sigma=1)
+        x_trunc = Truncated.dist(x, lower=lower, upper=30, size=(1000,))
+
+        x_trunc_draws = draw(x_trunc)
+        assert ((x_trunc_draws >= (lower or -np.inf)) & (x_trunc_draws <= 30)).all()
+
+        x_trunc = Truncated.dist(x, lower=lower, upper=30, size=(4,))
+        x_trunc_logp = logp(x_trunc, [0, 5.5, 30.0, 30.1]).eval()
+        effective_psi = psi if (lower or -np.inf) <= 0 else 1
+        np.testing.assert_allclose(
+            x_trunc_logp,
+            [
+                np.log(1 - effective_psi),  # 0 is not in the support of the distribution
+                *(
+                    np.log(effective_psi)
+                    + logp(
+                        Truncated.dist(LogNormal.dist(mu=3, sigma=1), lower=lower, upper=30),
+                        [5.5, 30.0],
+                    )
+                ).eval(),
+                -np.inf,  # 30.1 is outside the upper bound
+            ],
+        )