Skip to content

Commit 40f7972

Browse files
authored
fixing dtype promotion in where (#1734)
1 parent 52ec9a5 commit 40f7972

File tree

2 files changed

+26
-3
lines changed

2 files changed

+26
-3
lines changed

thunder/executors/nvfuserex_impl.py

+12-1
Original file line numberDiff line numberDiff line change
@@ -1965,7 +1965,18 @@ def where(
19651965
nva = getnv(a, fd, lc_to_nv_map)
19661966
nvb = getnv(b, fd, lc_to_nv_map)
19671967

1968-
return fd.ops.where(nvpred, nva, nvb)
1968+
# explicit type promotion is necessary, since nvfuser can't do this properly with scalar inputs. See
1969+
# issue: https://github.com/NVIDIA/Fuser/issues/3816
1970+
# Determine the result dtype
1971+
numbertype, tensordtype = utils.check_same_dtype(a, b)
1972+
dtype = tensordtype if tensordtype is not None else numbertype
1973+
1974+
# NOTE: for scalar inputs, dtype mapping is different. e.g. float -> double. We convert dtypes to strong
1975+
# type if the output is supposed to be a tensor proxy
1976+
if any(map(lambda x: isinstance(x, TensorProxy), (pred, a, b))):
1977+
dtype = dtypes.to_strong_dtype(dtype)
1978+
1979+
return fd.ops.cast(fd.ops.where(nvpred, nva, nvb), lcdtype_to_nvdtype(dtype))
19691980

19701981

19711982
register_supported(PrimIDs.WHERE, where, _elementwise_ternary_check)

thunder/tests/opinfos.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -2537,18 +2537,19 @@ def div_sample_generator(op, device, dtype, requires_grad, **kwargs):
25372537
torch_reference=torch.div,
25382538
test_directives=(
25392539
# NOTE: PyTorch doesn't support boolean division
2540-
# TODO: fix dtype mismatch when using nvfuser executors
25412540
DecorateInfo(
25422541
pytest.mark.xfail,
25432542
"test_core_vs_torch_consistency",
25442543
dtypes=(datatypes.bool8,),
25452544
devicetypes=(devices.DeviceType.CPU, devices.DeviceType.CUDA),
25462545
),
2546+
# NOTE: bfloat16 and float16 are skipped
2547+
# See: https://github.com/Lightning-AI/lightning-thunder/issues/1724
25472548
DecorateInfo(
25482549
pytest.mark.xfail,
25492550
"test_core_vs_torch_consistency",
25502551
executors=("nvfuser",),
2551-
dtypes=(datatypes.bool8, datatypes.bfloat16, datatypes.float16, datatypes.float32),
2552+
dtypes=(datatypes.bool8, datatypes.bfloat16, datatypes.float16),
25522553
),
25532554
DecorateInfo(pytest.mark.xfail, "test_vjp_correctness"),
25542555
),
@@ -2718,6 +2719,17 @@ def where_sample_generator(op, device, dtype, requires_grad, **kwargs):
27182719
pred, a, b = make(pred_shape, dtype=torch.bool, requires_grad=False), make(a_shape), make(b_shape)
27192720
yield SampleInput(pred, a, b)
27202721

2722+
# NOTE: when requires_grad is set, the non-pred arguments must be tensor inputs.
2723+
if not requires_grad:
2724+
# generate scalar inputs
2725+
dtypes = [float, int, bool, complex]
2726+
2727+
for dtype in dtypes:
2728+
pred = make([2, 3], dtype=torch.bool, requires_grad=False)
2729+
a = dtype(1.0)
2730+
b = dtype(0.0)
2731+
yield SampleInput(pred, a, b)
2732+
27212733

27222734
def where_error_generator(op, device, dtype=torch.float32, **kwargs):
27232735
make = partial(make_tensor, device=device, dtype=dtype)

0 commit comments

Comments
 (0)