Skip to content

Commit 2fb8d88

Browse files
committed
Remove need for explicit left expand_dims in inputs of Elemwise
1 parent 55b00d1 commit 2fb8d88

28 files changed

+1068
-848
lines changed

README.rst

Lines changed: 21 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -57,36 +57,35 @@ Getting started
5757
d = a/a + (M + a).dot(v)
5858
5959
pytensor.dprint(d)
60-
# Add [id A]
61-
# ├─ ExpandDims{axis=0} [id B]
62-
# └─ True_div [id C]
63-
# ├─ a [id D]
64-
# └─ a [id D]
65-
# └─ dot [id E]
66-
# ├─ Add [id F]
67-
# │ ├─ M [id G]
68-
# │ └─ ExpandDims{axes=[0, 1]} [id H]
69-
# └─ a [id D]
70-
# └─ v [id I]
60+
# Add [id A]
61+
# ├─ True_div [id B]
62+
# ├─ a [id C]
63+
# └─ a [id C]
64+
# └─ Squeeze{axis=1} [id D]
65+
# └─ Dot [id E]
66+
# ├─ Add [id F]
67+
# │ ├─ M [id G]
68+
# │ └─ a [id C]
69+
# └─ ExpandDims{axis=1} [id H]
70+
# └─ v [id I]
7171
7272
f_d = pytensor.function([a, v, M], d)
7373
7474
# `a/a` -> `1` and the dot product is replaced with a BLAS function
7575
# (i.e. CGemv)
7676
pytensor.dprint(f_d)
77-
# Add [id A] 5
78-
# ├─ [1.] [id B]
79-
# └─ CGemv{inplace} [id C] 4
80-
# ├─ AllocEmpty{dtype='float64'} [id D] 3
81-
# │ └─ Shape_i{0} [id E] 2
77+
# Add [id A] 4
78+
# ├─ 1.0 [id B]
79+
# └─ CGemv{inplace} [id C] 3
80+
# ├─ AllocEmpty{dtype='float64'} [id D] 2
81+
# │ └─ Shape_i{0} [id E] 1
8282
# │ └─ M [id F]
83-
# ├─ 1.0 [id G]
84-
# ├─ Add [id H] 1
83+
# ├─ 1.0 [id B]
84+
# ├─ Add [id G] 0
8585
# │ ├─ M [id F]
86-
# │ └─ ExpandDims{axes=[0, 1]} [id I] 0
87-
# │ └─ a [id J]
88-
# ├─ v [id K]
89-
# └─ 0.0 [id L]
86+
# │ └─ a [id H]
87+
# ├─ v [id I]
88+
# └─ 0.0 [id J]
9089
9190
See `the PyTensor documentation <https://pytensor.readthedocs.io/en/latest/>`__ for in-depth tutorials.
9291

pytensor/link/numba/dispatch/elemwise.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -383,7 +383,9 @@ def impl(*inputs):
383383
type(op),
384384
tuple(op.inplace_pattern.items()),
385385
input_bc_patterns,
386+
output_bc_patterns,
386387
scalar_cache_key,
388+
2, # cache version
387389
)
388390
)
389391
elemwise_key = sha256(elemwise_key.encode()).hexdigest()

pytensor/link/numba/dispatch/vectorize_codegen.py

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ def _vectorized(
125125
raise TypeError("allow_core_scalar must be literal.")
126126
allow_core_scalar = allow_core_scalar.literal_value
127127

128-
batch_ndim = len(input_bc_patterns[0])
128+
batch_ndim = len(output_bc_patterns[0])
129129
nin = len(constant_inputs_types) + len(input_types)
130130
nout = len(output_bc_patterns)
131131

@@ -138,13 +138,6 @@ def _vectorized(
138138
if not all(isinstance(input, types.Array) for input in input_types):
139139
raise TypingError("Vectorized inputs must be arrays.")
140140

141-
if not all(
142-
len(pattern) == batch_ndim for pattern in input_bc_patterns + output_bc_patterns
143-
):
144-
raise TypingError(
145-
"Vectorized broadcastable patterns must have the same length."
146-
)
147-
148141
core_input_types = []
149142
for input_type, bc_pattern in zip(input_types, input_bc_patterns, strict=True):
150143
core_ndim = input_type.ndim - len(bc_pattern)
@@ -291,16 +284,21 @@ def compute_itershape(
291284
size: list[ir.Instruction] | None,
292285
):
293286
one = ir.IntType(64)(1)
294-
batch_ndim = len(broadcast_pattern[0])
287+
batch_ndim = max((len(p) for p in broadcast_pattern), default=0)
295288
shape = [None] * batch_ndim
296289
if size is not None:
297290
shape = size
298291
for i in range(batch_ndim):
299292
for j, (bc, in_shape) in enumerate(
300293
zip(broadcast_pattern, in_shapes, strict=True)
301294
):
302-
length = in_shape[i]
303-
if bc[i]:
295+
# Offset for inputs with fewer dims than batch_ndim
296+
offset = batch_ndim - len(bc)
297+
if i < offset:
298+
# Implicit broadcast dim — no array dim to check
299+
continue
300+
length = in_shape[i - offset]
301+
if bc[i - offset]:
304302
with builder.if_then(
305303
builder.icmp_unsigned("!=", length, one), likely=False
306304
):
@@ -336,8 +334,11 @@ def compute_itershape(
336334
for j, (bc, in_shape) in enumerate(
337335
zip(broadcast_pattern, in_shapes, strict=True)
338336
):
339-
length = in_shape[i]
340-
if bc[i]:
337+
offset = batch_ndim - len(bc)
338+
if i < offset:
339+
continue
340+
length = in_shape[i - offset]
341+
if bc[i - offset]:
341342
with builder.if_then(
342343
builder.icmp_unsigned("!=", length, one), likely=False
343344
):
@@ -452,6 +453,7 @@ def make_loop_call(
452453
# output_scope_set = mod.add_metadata([input_scope, output_scope])
453454

454455
zero = ir.Constant(ir.IntType(64), 0)
456+
batch_ndim = len(iter_shape)
455457

456458
# Setup loops and initialize accumulators for outputs
457459
# This part corresponds to opening the loops
@@ -480,9 +482,12 @@ def make_loop_call(
480482
for input, input_type, bc in zip(inputs, input_types, input_bc, strict=True):
481483
core_ndim = input_type.ndim - len(bc)
482484

483-
idxs_bc = [zero if bc else idx for idx, bc in zip(idxs, bc, strict=True)] + [
484-
zero
485-
] * core_ndim
485+
# For inputs with fewer batch dims than the loop, skip leading loop indices
486+
offset = batch_ndim - len(bc)
487+
idxs_bc = [
488+
zero if bc_dim else idx
489+
for idx, bc_dim in zip(idxs[offset:], bc, strict=True)
490+
] + [zero] * core_ndim
486491
ptr = cgutils.get_item_pointer2(
487492
context,
488493
builder,

pytensor/scan/rewriting.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
Alloc,
5757
AllocEmpty,
5858
atleast_Nd,
59+
expand_dims,
5960
get_scalar_constant_value,
6061
)
6162
from pytensor.tensor.elemwise import DimShuffle, Elemwise
@@ -504,6 +505,21 @@ def add_to_replace(y):
504505

505506
to_remove_set.add(nd)
506507

508+
# When inner Elemwise inputs have different ndims, lower-ndim
509+
# inputs are implicitly left-padded. Outer equivalents of inputs
510+
# with a time dimension need broadcast dims inserted right after
511+
# the time dim (position 0) to match that implicit padding.
512+
# E.g., inner (v,) broadcasting with (a, v) → outer (t, v)
513+
# must become (t, 1, v) so it broadcasts with (a, v) to (t, a, v).
514+
inner_max_ndim = max(x.type.ndim for x in nd.inputs)
515+
for i, x in enumerate(nd.inputs):
516+
has_time = x in inner_seqs_set or x in to_replace_set
517+
n_pad = inner_max_ndim - x.type.ndim
518+
if has_time and n_pad > 0:
519+
outside_ins[i] = expand_dims(
520+
outside_ins[i], axis=tuple(range(1, 1 + n_pad))
521+
)
522+
507523
# Do not call make_node for test_value
508524
nw_outer_node = nd.op.make_node(*outside_ins)
509525

pytensor/sparse/rewriting.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from pytensor.sparse.basic import csm_properties
1717
from pytensor.sparse.math import usmm
1818
from pytensor.tensor import blas
19-
from pytensor.tensor.basic import as_tensor_variable, cast
19+
from pytensor.tensor.basic import as_tensor_variable, atleast_Nd, cast
2020
from pytensor.tensor.math import mul, neg, sub
2121
from pytensor.tensor.rewriting.basic import register_canonicalize, register_specialize
2222
from pytensor.tensor.shape import shape, specify_shape
@@ -957,6 +957,9 @@ def local_usmm_csx(fgraph, node):
957957
if y.type.dtype != dtype_out:
958958
return False
959959

960+
# UsmmCscDense requires alpha to be 2-d with shape (1, 1)
961+
if alpha.ndim < 2:
962+
alpha = atleast_Nd(alpha, n=2)
960963
return [usmm_csc_dense(alpha, x_val, x_ind, x_ptr, x_nsparse, y, z)]
961964
return False
962965

pytensor/tensor/basic.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -379,7 +379,7 @@ def _get_underlying_scalar_constant_value(
379379
ret = [[None]]
380380
v.owner.op.perform(v.owner, const, ret)
381381
return np.asarray(ret[0][0].copy())
382-
# In fast_compile, we don't enable local_fill_to_alloc, so
382+
# In fast_compile, we don't enable local_second_to_alloc, so
383383
# we need to investigate Second as Alloc. So elemwise
384384
# don't disable the check for Second.
385385
elif isinstance(op, Elemwise):

0 commit comments

Comments
 (0)