diff --git a/docs/source/kernels.rst b/docs/source/kernels.rst
index 5c7ae0945..5fa89b916 100644
--- a/docs/source/kernels.rst
+++ b/docs/source/kernels.rst
@@ -9,7 +9,7 @@ gpytorch.kernels
 
 If you don't know what kernel to use, we recommend that you start out with a
-:code:`gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())`.
+:code:`gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel()) + gpytorch.kernels.ConstantKernel()`.
 
 
 Kernel
@@ -22,6 +22,13 @@ Kernel
 Standard Kernels
 -----------------------------
 
+:hidden:`ConstantKernel`
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: ConstantKernel
+   :members:
+
+
 :hidden:`CosineKernel`
 ~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/gpytorch/kernels/__init__.py b/gpytorch/kernels/__init__.py
index cc85fe624..1d87e764b 100644
--- a/gpytorch/kernels/__init__.py
+++ b/gpytorch/kernels/__init__.py
@@ -2,6 +2,7 @@
 from . import keops
 from .additive_structure_kernel import AdditiveStructureKernel
 from .arc_kernel import ArcKernel
+from .constant_kernel import ConstantKernel
 from .cosine_kernel import CosineKernel
 from .cylindrical_kernel import CylindricalKernel
 from .distributional_input_kernel import DistributionalInputKernel
@@ -38,6 +39,7 @@
     "ArcKernel",
     "AdditiveKernel",
     "AdditiveStructureKernel",
+    "ConstantKernel",
     "CylindricalKernel",
     "MultiDeviceKernel",
     "CosineKernel",
diff --git a/gpytorch/kernels/constant_kernel.py b/gpytorch/kernels/constant_kernel.py
new file mode 100644
index 000000000..98a3560e2
--- /dev/null
+++ b/gpytorch/kernels/constant_kernel.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+
+from typing import Optional, Tuple
+
+import torch
+from torch import Tensor
+
+from ..constraints import Interval, Positive
+from ..priors import Prior
+from .kernel import Kernel
+
+
+class ConstantKernel(Kernel):
+    """
+    Constant covariance kernel for the probabilistic inference of constant coefficients.
+
+    ConstantKernel represents the prior variance `k(x1, x2) = var(c)` of a constant `c`.
+    The prior variance of the constant is optimized during the GP hyper-parameter
+    optimization stage. The actual value of the constant is computed (implicitly) using
+    the linear algebraic approaches for the computation of GP samples and posteriors.
+
+    The constant kernel `k_constant` is most useful as a modification of an arbitrary
+    base kernel `k_base`:
+    1) Additive constants: The modification `k_base + k_constant` allows the GP to
+    infer a non-zero asymptotic value far from the training data, which generally
+    leads to more accurate extrapolation. Notably, the uncertainty in this constant
+    value affects the posterior covariances through the posterior inference equations.
+    This is not the case when a constant prior mean is used instead, since the prior
+    mean does not show up in the posterior covariance and is not regularized by the
+    log-determinant during the optimization of the marginal likelihood.
+    2) Multiplicative constants: The modification `k_base * k_constant` allows the GP to
+    modulate the variance of the kernel `k_base`, and is mathematically identical to
+    `ScaleKernel(base_kernel)` with the same constant.
+    """
+
+    has_lengthscale = False
+
+    def __init__(
+        self,
+        batch_shape: Optional[torch.Size] = None,
+        constant_prior: Optional[Prior] = None,
+        constant_constraint: Optional[Interval] = None,
+        active_dims: Optional[Tuple[int, ...]] = None,
+    ):
+        """Constructor of ConstantKernel.
+
+        Args:
+            batch_shape: The batch shape of the kernel.
+            constant_prior: Prior over the constant parameter.
+            constant_constraint: Constraint to place on the constant parameter.
+            active_dims: The dimensions of the input with which to evaluate the kernel.
+                This has no effect for the constant kernel, but is included for
+                compatibility with the Kernel API.
+        """
+        super().__init__(batch_shape=batch_shape, active_dims=active_dims)
+
+        self.register_parameter(
+            name="raw_constant",
+            parameter=torch.nn.Parameter(torch.zeros(*self.batch_shape, 1)),
+        )
+
+        if constant_prior is not None:
+            if not isinstance(constant_prior, Prior):
+                raise TypeError("Expected gpytorch.priors.Prior but got " + type(constant_prior).__name__)
+            self.register_prior(
+                "constant_prior",
+                constant_prior,
+                lambda m: m.constant,
+                lambda m, v: m._set_constant(v),
+            )
+
+        if constant_constraint is None:
+            constant_constraint = Positive()
+        self.register_constraint("raw_constant", constant_constraint)
+
+    @property
+    def constant(self) -> Tensor:
+        return self.raw_constant_constraint.transform(self.raw_constant)
+
+    @constant.setter
+    def constant(self, value: Tensor) -> None:
+        self._set_constant(value)
+
+    def _set_constant(self, value: Tensor) -> None:
+        value = value.view(*self.batch_shape, 1)
+        self.initialize(raw_constant=self.raw_constant_constraint.inverse_transform(value))
+
+    def forward(
+        self,
+        x1: Tensor,
+        x2: Tensor,
+        diag: Optional[bool] = False,
+        last_dim_is_batch: Optional[bool] = False,
+    ) -> Tensor:
+        """Evaluates the constant kernel.
+
+        Args:
+            x1: First input tensor of shape (batch_shape x n1 x d).
+            x2: Second input tensor of shape (batch_shape x n2 x d).
+            diag: If True, returns the diagonal of the covariance matrix.
+            last_dim_is_batch: If True, the last dimension of size `d` of the input
+                tensors is treated as a batch dimension.
+
+        Returns:
+            A (batch_shape x n1 x n2)-dim tensor of constant covariance values if
+            diag is False, or a (batch_shape x n1)-dim tensor if diag is True.
+ """ + if last_dim_is_batch: + x1 = x1.transpose(-1, -2).unsqueeze(-1) + x2 = x2.transpose(-1, -2).unsqueeze(-1) + + dtype = torch.promote_types(x1.dtype, x2.dtype) + batch_shape = torch.broadcast_shapes(x1.shape[:-2], x2.shape[:-2]) + shape = batch_shape + (x1.shape[-2],) + (() if diag else (x2.shape[-2],)) + constant = self.constant.to(dtype=dtype, device=x1.device) + + if not diag: + constant = constant.unsqueeze(-1) + + if last_dim_is_batch: + constant = constant.unsqueeze(-1) + + return constant.expand(shape) diff --git a/gpytorch/test/base_kernel_test_case.py b/gpytorch/test/base_kernel_test_case.py index 5301ce2d9..88f6afbd5 100644 --- a/gpytorch/test/base_kernel_test_case.py +++ b/gpytorch/test/base_kernel_test_case.py @@ -122,23 +122,21 @@ def test_no_batch_kernel_double_batch_x_ard(self): actual_diag = actual_covar_mat.diagonal(dim1=-1, dim2=-2) self.assertAllClose(kernel_diag, actual_diag, rtol=1e-3, atol=1e-5) - def test_smoke_double_batch_kernel_double_batch_x_no_ard(self): + def test_smoke_double_batch_kernel_double_batch_x_no_ard(self) -> None: kernel = self.create_kernel_no_ard(batch_shape=torch.Size([3, 2])) x = self.create_data_double_batch() - batch_covar_mat = kernel(x).evaluate_kernel().to_dense() + kernel(x).evaluate_kernel().to_dense() kernel(x, diag=True) - return batch_covar_mat - def test_smoke_double_batch_kernel_double_batch_x_ard(self): + def test_smoke_double_batch_kernel_double_batch_x_ard(self) -> None: try: kernel = self.create_kernel_ard(num_dims=2, batch_shape=torch.Size([3, 2])) except NotImplementedError: return x = self.create_data_double_batch() - batch_covar_mat = kernel(x).evaluate_kernel().to_dense() + kernel(x).evaluate_kernel().to_dense() kernel(x, diag=True) - return batch_covar_mat def test_kernel_getitem_single_batch(self): kernel = self.create_kernel_no_ard(batch_shape=torch.Size([2])) diff --git a/setup.py b/setup.py index df580c8aa..d5a05fbe9 100644 --- a/setup.py +++ b/setup.py @@ -82,6 +82,7 @@ def find_version(*file_paths): "nbclient<=0.7.3", "nbformat<=5.8.0", "nbsphinx<=0.9.1", + "lxml_html_clean", "platformdirs<=3.2.0", "setuptools_scm<=7.1.0", "sphinx<=6.2.1", diff --git a/test/kernels/test_constant_kernel.py b/test/kernels/test_constant_kernel.py new file mode 100644 index 000000000..849ec3996 --- /dev/null +++ b/test/kernels/test_constant_kernel.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 + +import itertools +import unittest + +import torch + +from torch import Tensor + +from gpytorch.kernels import AdditiveKernel, ConstantKernel, MaternKernel, ProductKernel, ScaleKernel +from gpytorch.lazy import LazyEvaluatedKernelTensor +from gpytorch.priors.torch_priors import GammaPrior +from gpytorch.test.base_kernel_test_case import BaseKernelTestCase + + +class TestConstantKernel(unittest.TestCase, BaseKernelTestCase): + def create_kernel_no_ard(self, **kwargs): + return ConstantKernel(**kwargs) + + def test_constant_kernel(self): + with self.subTest(device="cpu"): + self._test_constant_kernel(torch.device("cpu")) + + if torch.cuda.is_available(): + with self.subTest(device="cuda"): + self._test_constant_kernel(torch.device("cuda")) + + def _test_constant_kernel(self, device: torch.device): + n, d = 3, 5 + dtypes = [torch.float, torch.double] + batch_shapes = [(), (2,), (7, 2)] + torch.manual_seed(123) + for dtype, batch_shape in itertools.product(dtypes, batch_shapes): + tkwargs = {"dtype": dtype, "device": device} + places = 6 if dtype == torch.float else 12 + X = torch.rand(*batch_shape, n, d, **tkwargs) + + constant_kernel = 
+            KL = constant_kernel(X)
+            self.assertIsInstance(KL, LazyEvaluatedKernelTensor)
+            KM = KL.to_dense()
+            self.assertIsInstance(KM, Tensor)
+            self.assertEqual(KM.shape, (*batch_shape, n, n))
+            self.assertEqual(KM.dtype, dtype)
+            self.assertEqual(KM.device.type, device.type)
+            # standard deviation is zero iff KM is constant
+            self.assertAlmostEqual(KM.std().item(), 0, places=places)
+
+            # testing last_dim_is_batch
+            with self.subTest(last_dim_is_batch=True):
+                KD = constant_kernel(X, last_dim_is_batch=True).to(device=device)
+                self.assertIsInstance(KD, LazyEvaluatedKernelTensor)
+                KM = KD.to_dense()
+                self.assertIsInstance(KM, Tensor)
+                self.assertEqual(KM.shape, (*batch_shape, d, n, n))
+                self.assertAlmostEqual(KM.std().item(), 0, places=places)
+                self.assertEqual(KM.dtype, dtype)
+                self.assertEqual(KM.device.type, device.type)
+
+            # testing diag
+            with self.subTest(diag=True):
+                KD = constant_kernel(X, diag=True)
+                self.assertIsInstance(KD, Tensor)
+                self.assertEqual(KD.shape, (*batch_shape, n))
+                self.assertAlmostEqual(KD.std().item(), 0, places=places)
+                self.assertEqual(KD.dtype, dtype)
+                self.assertEqual(KD.device.type, device.type)
+
+            # testing diag and last_dim_is_batch
+            with self.subTest(diag=True, last_dim_is_batch=True):
+                KD = constant_kernel(X, diag=True, last_dim_is_batch=True)
+                self.assertIsInstance(KD, Tensor)
+                self.assertEqual(KD.shape, (*batch_shape, d, n))
+                self.assertAlmostEqual(KD.std().item(), 0, places=places)
+                self.assertEqual(KD.dtype, dtype)
+                self.assertEqual(KD.device.type, device.type)
+
+            # testing AD
+            with self.subTest(requires_grad=True):
+                X.requires_grad = True
+                constant_kernel(X).to_dense().sum().backward()
+                self.assertIsNone(X.grad)  # constant kernel is not dependent on X
+
+            # testing algebraic combinations with another kernel
+            base_kernel = MaternKernel().to(device=device)
+
+            with self.subTest(additive=True):
+                sum_kernel = base_kernel + constant_kernel
+                self.assertIsInstance(sum_kernel, AdditiveKernel)
+                self.assertAllClose(
+                    sum_kernel(X).to_dense(),
+                    base_kernel(X).to_dense() + constant_kernel.constant.unsqueeze(-1),
+                )
+
+            # product with constant is equivalent to scale kernel
+            with self.subTest(product=True):
+                product_kernel = base_kernel * constant_kernel
+                self.assertIsInstance(product_kernel, ProductKernel)
+
+                scale_kernel = ScaleKernel(base_kernel, batch_shape=batch_shape)
+                scale_kernel.to(device=device)
+                self.assertAllClose(scale_kernel(X).to_dense(), product_kernel(X).to_dense())
+
+            # setting constant
+            pies = torch.full_like(constant_kernel.constant, torch.pi)
+            constant_kernel.constant = pies
+            self.assertAllClose(constant_kernel.constant, pies)
+
+            # specifying prior
+            constant_kernel = ConstantKernel(constant_prior=GammaPrior(concentration=2.4, rate=2.7))
+
+            with self.assertRaisesRegex(TypeError, "Expected gpytorch.priors.Prior but got"):
+                ConstantKernel(constant_prior=1)
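
Usage sketch (illustrative, not part of the diff): the snippet below shows the additive pattern described in the ConstantKernel docstring, using the standard gpytorch ExactGP API. The model name ConstantOffsetGP and the toy data are hypothetical.

# A minimal sketch of k_base + k_constant: the GP infers a non-zero
# asymptotic value through the covariance, so the uncertainty in that
# value propagates into the posterior covariance.
import torch
import gpytorch


class ConstantOffsetGP(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        # Zero mean: the constant offset is modeled by the kernel instead.
        self.mean_module = gpytorch.means.ZeroMean()
        self.covar_module = (
            gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())
            + gpytorch.kernels.ConstantKernel()
        )

    def forward(self, x):
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x), self.covar_module(x)
        )


train_x = torch.linspace(0, 1, 20).unsqueeze(-1)
train_y = torch.sin(2 * torch.pi * train_x.squeeze(-1)) + 5.0  # offset data
likelihood = gpytorch.likelihoods.GaussianLikelihood()
model = ConstantOffsetGP(train_x, train_y, likelihood)

# The multiplicative form is mathematically identical to ScaleKernel(base):
product_kernel = gpytorch.kernels.MaternKernel() * gpytorch.kernels.ConstantKernel()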