diff --git a/Cargo.lock b/Cargo.lock
index 5adfd6c34940..045e1268677e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -809,6 +809,7 @@ name = "cranelift-entity"
 version = "0.128.0"
 dependencies = [
  "cranelift-bitset",
+ "im-rc",
  "serde",
  "serde_derive",
 ]
diff --git a/cranelift/codegen/src/egraph.rs b/cranelift/codegen/src/egraph.rs
index 9993f7941f10..93961bc7876a 100644
--- a/cranelift/codegen/src/egraph.rs
+++ b/cranelift/codegen/src/egraph.rs
@@ -603,7 +603,7 @@ where
         // The initial best choice is "no simplification, just use the original
         // instruction" which has the original instruction's cost.
         let mut best = None;
-        let mut best_cost = cost::Cost::of_skeleton_op(
+        let mut best_cost = cost::ScalarCost::of_skeleton_op(
             ctx.func.dfg.insts[inst].opcode(),
             ctx.func.dfg.inst_args(inst).len(),
         );
@@ -682,7 +682,7 @@ where
 
             // Our best simplification is the one with the least cost. Update
             // `best` if necessary.
-            let cost = cost::Cost::of_skeleton_op(
+            let cost = cost::ScalarCost::of_skeleton_op(
                 ctx.func.dfg.insts[new_inst].opcode(),
                 ctx.func.dfg.inst_args(new_inst).len(),
             );
diff --git a/cranelift/codegen/src/egraph/cost.rs b/cranelift/codegen/src/egraph/cost.rs
index 1ff56fcd61f9..abc8e6fc5854 100644
--- a/cranelift/codegen/src/egraph/cost.rs
+++ b/cranelift/codegen/src/egraph/cost.rs
@@ -1,6 +1,97 @@
 //! Cost functions for egraph representation.
 
-use crate::ir::Opcode;
+use crate::ir::{DataFlowGraph, Inst, Opcode};
+use cranelift_entity::ImmutableEntitySet;
+
+/// The compound cost of an expression.
+///
+/// Tracks the set of instructions that make up this expression and sums their
+/// costs, avoiding "double counting" the costs of values that were defined by
+/// the same instruction and values that appear multiple times within the
+/// expression (i.e. the expression is a DAG and not a tree).
+#[derive(Clone, Debug)]
+pub(crate) struct ExprCost {
+    /// The total cost of this expression: the summed opcode costs of `insts`.
+    total: ScalarCost,
+    /// The set of instructions that must be evaluated to produce the associated
+    /// expression.
+    insts: ImmutableEntitySet<Inst>,
+}
+
+impl Ord for ExprCost {
+    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
+        self.total.cmp(&other.total) // Order by total cost only; `insts` is ignored.
+    }
+}
+
+impl PartialOrd for ExprCost {
+    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
+        Some(self.cmp(other)) // Defer to `Ord` so the two orderings always agree.
+    }
+}
+
+impl PartialEq for ExprCost {
+    fn eq(&self, other: &Self) -> bool {
+        self.total == other.total // Equal totals are equal even if `insts` differ.
+    }
+}
+
+impl Eq for ExprCost {}
+
+impl ExprCost {
+    /// The cost of the empty expression: zero total and no instructions.
+    pub fn zero() -> Self {
+        let total = ScalarCost::zero();
+        let insts = ImmutableEntitySet::default();
+        Self { total, insts }
+    }
+
+    /// The cost of evaluating `inst` alone, ignoring its operands.
+    pub fn for_inst(dfg: &DataFlowGraph, inst: Inst) -> Self {
+        let total = ScalarCost::of_opcode(dfg.insts[inst].opcode());
+        let insts = ImmutableEntitySet::unit(inst);
+        Self { total, insts }
+    }
+
+    /// Merge `other` into `self`, unioning the two instruction sets.
+    ///
+    /// The total only grows by the opcode cost of instructions that were not
+    /// already present, so shared instructions are never counted twice.
+    pub fn add(&mut self, dfg: &DataFlowGraph, other: &Self) {
+        let opcode_cost = |inst: Inst| ScalarCost::of_opcode(dfg.insts[inst].opcode());
+        let (ours, theirs) = (self.insts.len(), other.insts.len());
+
+        // An empty `other` contributes nothing.
+        if theirs == 0 {
+            return;
+        }
+
+        // An empty `self` becomes a clone of `other`, reusing its allocations.
+        if ours == 0 {
+            *self = other.clone();
+            return;
+        }
+
+        if ours < theirs {
+            // `self` is the smaller set: fold it into a clone of `other` so we
+            // iterate over fewer instructions, then adopt the result.
+            let mut merged = other.clone();
+            for inst in self.insts.iter() {
+                if merged.insts.insert(inst) {
+                    merged.total = merged.total + opcode_cost(inst);
+                }
+            }
+            *self = merged;
+        } else {
+            // `other` is no larger than `self`: fold it into `self` in place.
+            for inst in other.insts.iter() {
+                if self.insts.insert(inst) {
+                    self.total = self.total + opcode_cost(inst);
+                }
+            }
+        }
+    }
+}
 
 /// A cost of computing some value in the program.
 ///
@@ -31,11 +122,11 @@ use crate::ir::Opcode;
 /// that cannot be computed, or otherwise serve as a sentinel when
 /// performing search for the lowest-cost representation of a value.
 #[derive(Clone, Copy, PartialEq, Eq)]
-pub(crate) struct Cost(u32);
+pub(crate) struct ScalarCost(u32);
 
-impl core::fmt::Debug for Cost {
+impl core::fmt::Debug for ScalarCost {
     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
-        if *self == Cost::infinity() {
+        if *self == ScalarCost::infinity() {
             write!(f, "Cost::Infinite")
         } else {
             f.debug_struct("Cost::Finite")
@@ -46,7 +137,7 @@ impl core::fmt::Debug for Cost {
     }
 }
 
-impl Ord for Cost {
+impl Ord for ScalarCost {
     #[inline]
     fn cmp(&self, other: &Self) -> std::cmp::Ordering {
         // We make sure that the high bits are the op cost and the low bits are
@@ -63,38 +154,38 @@ impl Ord for Cost {
     }
 }
 
-impl PartialOrd for Cost {
+impl PartialOrd for ScalarCost {
     #[inline]
     fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
         Some(self.cmp(other))
     }
 }
 
-impl Cost {
+impl ScalarCost {
     const DEPTH_BITS: u8 = 8;
     const DEPTH_MASK: u32 = (1 << Self::DEPTH_BITS) - 1;
     const OP_COST_MASK: u32 = !Self::DEPTH_MASK;
     const MAX_OP_COST: u32 = Self::OP_COST_MASK >> Self::DEPTH_BITS;
 
-    pub(crate) fn infinity() -> Cost {
+    pub(crate) fn infinity() -> ScalarCost {
         // 2^32 - 1 is, uh, pretty close to infinite... (we use `Cost`
         // only for heuristics and always saturate so this suffices!)
-        Cost(u32::MAX)
+        ScalarCost(u32::MAX)
     }
 
-    pub(crate) fn zero() -> Cost {
-        Cost(0)
+    pub(crate) fn zero() -> ScalarCost {
+        ScalarCost(0)
     }
 
     /// Construct a new `Cost` from the given parts.
     ///
     /// If the opcode cost is greater than or equal to the maximum representable
     /// opcode cost, then the resulting `Cost` saturates to infinity.
-    fn new(opcode_cost: u32, depth: u8) -> Cost {
+    fn new(opcode_cost: u32, depth: u8) -> ScalarCost {
         if opcode_cost >= Self::MAX_OP_COST {
             Self::infinity()
         } else {
-            Cost(opcode_cost << Self::DEPTH_BITS | u32::from(depth))
+            ScalarCost(opcode_cost << Self::DEPTH_BITS | u32::from(depth))
         }
     }
 
@@ -108,17 +199,17 @@ impl Cost {
     }
 
     /// Return the cost of an opcode.
-    fn of_opcode(op: Opcode) -> Cost {
+    pub(crate) fn of_opcode(op: Opcode) -> ScalarCost {
         match op {
             // Constants.
-            Opcode::Iconst | Opcode::F32const | Opcode::F64const => Cost::new(1, 0),
+            Opcode::Iconst | Opcode::F32const | Opcode::F64const => ScalarCost::new(1, 0),
 
             // Extends/reduces.
             Opcode::Uextend
             | Opcode::Sextend
             | Opcode::Ireduce
             | Opcode::Iconcat
-            | Opcode::Isplit => Cost::new(1, 0),
+            | Opcode::Isplit => ScalarCost::new(1, 0),
 
             // "Simple" arithmetic.
             Opcode::Iadd
@@ -129,27 +220,27 @@ impl Cost {
             | Opcode::Bnot
             | Opcode::Ishl
             | Opcode::Ushr
-            | Opcode::Sshr => Cost::new(3, 0),
+            | Opcode::Sshr => ScalarCost::new(3, 0),
 
             // "Expensive" arithmetic.
-            Opcode::Imul => Cost::new(10, 0),
+            Opcode::Imul => ScalarCost::new(10, 0),
 
             // Everything else.
             _ => {
                 // By default, be slightly more expensive than "simple"
                 // arithmetic.
-                let mut c = Cost::new(4, 0);
+                let mut c = ScalarCost::new(4, 0);
 
                 // And then get more expensive as the opcode does more side
                 // effects.
                 if op.can_trap() || op.other_side_effects() {
-                    c = c + Cost::new(10, 0);
+                    c = c + ScalarCost::new(10, 0);
                 }
                 if op.can_load() {
-                    c = c + Cost::new(20, 0);
+                    c = c + ScalarCost::new(20, 0);
                 }
                 if op.can_store() {
-                    c = c + Cost::new(50, 0);
+                    c = c + ScalarCost::new(50, 0);
                 }
 
                 c
@@ -157,40 +248,32 @@ impl Cost {
         }
     }
 
-    /// Compute the cost of the operation and its given operands.
-    ///
-    /// Caller is responsible for checking that the opcode came from an instruction
-    /// that satisfies `inst_predicates::is_pure_for_egraph()`.
-    pub(crate) fn of_pure_op(op: Opcode, operand_costs: impl IntoIterator<Item = Self>) -> Self {
-        let c = Self::of_opcode(op) + operand_costs.into_iter().sum();
-        Cost::new(c.op_cost(), c.depth().saturating_add(1))
-    }
-
     /// Compute the cost of an operation in the side-effectful skeleton.
     pub(crate) fn of_skeleton_op(op: Opcode, arity: usize) -> Self {
-        Cost::of_opcode(op) + Cost::new(u32::try_from(arity).unwrap(), (arity != 0) as _)
+        ScalarCost::of_opcode(op)
+            + ScalarCost::new(u32::try_from(arity).unwrap(), (arity != 0) as _)
     }
 }
 
-impl std::iter::Sum<Cost> for Cost {
-    fn sum<I: Iterator<Item = Cost>>(iter: I) -> Self {
+impl std::iter::Sum<ScalarCost> for ScalarCost {
+    fn sum<I: Iterator<Item = ScalarCost>>(iter: I) -> Self {
         iter.fold(Self::zero(), |a, b| a + b)
     }
 }
 
-impl std::default::Default for Cost {
-    fn default() -> Cost {
-        Cost::zero()
+impl std::default::Default for ScalarCost {
+    fn default() -> ScalarCost {
+        ScalarCost::zero()
     }
 }
 
-impl std::ops::Add<Cost> for Cost {
-    type Output = Cost;
+impl std::ops::Add<ScalarCost> for ScalarCost {
+    type Output = ScalarCost;
 
-    fn add(self, other: Cost) -> Cost {
+    fn add(self, other: ScalarCost) -> ScalarCost {
         let op_cost = self.op_cost().saturating_add(other.op_cost());
         let depth = std::cmp::max(self.depth(), other.depth());
-        Cost::new(op_cost, depth)
+        ScalarCost::new(op_cost, depth)
     }
 }
 
@@ -198,41 +281,51 @@ impl std::ops::Add<Cost> for Cost {
 mod tests {
     use super::*;
 
+    impl ScalarCost {
+        fn of_opcode_and_operands(
+            op: Opcode,
+            operand_costs: impl IntoIterator<Item = Self>,
+        ) -> Self {
+            let c = Self::of_opcode(op) + operand_costs.into_iter().sum();
+            ScalarCost::new(c.op_cost(), c.depth().saturating_add(1))
+        }
+    }
+
     #[test]
     fn add_cost() {
-        let a = Cost::new(5, 2);
-        let b = Cost::new(37, 3);
-        assert_eq!(a + b, Cost::new(42, 3));
-        assert_eq!(b + a, Cost::new(42, 3));
+        let a = ScalarCost::new(5, 2);
+        let b = ScalarCost::new(37, 3);
+        assert_eq!(a + b, ScalarCost::new(42, 3));
+        assert_eq!(b + a, ScalarCost::new(42, 3));
     }
 
     #[test]
     fn add_infinity() {
-        let a = Cost::new(5, 2);
-        let b = Cost::infinity();
-        assert_eq!(a + b, Cost::infinity());
-        assert_eq!(b + a, Cost::infinity());
+        let a = ScalarCost::new(5, 2);
+        let b = ScalarCost::infinity();
+        assert_eq!(a + b, ScalarCost::infinity());
+        assert_eq!(b + a, ScalarCost::infinity());
     }
 
     #[test]
     fn op_cost_saturates_to_infinity() {
-        let a = Cost::new(Cost::MAX_OP_COST - 10, 2);
-        let b = Cost::new(11, 2);
-        assert_eq!(a + b, Cost::infinity());
-        assert_eq!(b + a, Cost::infinity());
+        let a = ScalarCost::new(ScalarCost::MAX_OP_COST - 10, 2);
+        let b = ScalarCost::new(11, 2);
+        assert_eq!(a + b, ScalarCost::infinity());
+        assert_eq!(b + a, ScalarCost::infinity());
     }
 
     #[test]
     fn depth_saturates_to_max_depth() {
-        let a = Cost::new(10, u8::MAX);
-        let b = Cost::new(10, 1);
+        let a = ScalarCost::new(10, u8::MAX);
+        let b = ScalarCost::new(10, 1);
         assert_eq!(
-            Cost::of_pure_op(Opcode::Iconst, [a, b]),
-            Cost::new(21, u8::MAX)
+            ScalarCost::of_opcode_and_operands(Opcode::Iconst, [a, b]),
+            ScalarCost::new(21, u8::MAX)
         );
         assert_eq!(
-            Cost::of_pure_op(Opcode::Iconst, [b, a]),
-            Cost::new(21, u8::MAX)
+            ScalarCost::of_opcode_and_operands(Opcode::Iconst, [b, a]),
+            ScalarCost::new(21, u8::MAX)
         );
     }
 }
diff --git a/cranelift/codegen/src/egraph/elaborate.rs b/cranelift/codegen/src/egraph/elaborate.rs
index d9d156eb41c8..4ff9ee997970 100644
--- a/cranelift/codegen/src/egraph/elaborate.rs
+++ b/cranelift/codegen/src/egraph/elaborate.rs
@@ -2,7 +2,7 @@
 //! in CFG nodes.
 
 use super::Stats;
-use super::cost::Cost;
+use super::cost::ExprCost;
 use crate::ctxhash::NullCtx;
 use crate::dominator_tree::DominatorTree;
 use crate::hash_map::Entry as HashEntry;
@@ -71,8 +71,8 @@ pub(crate) struct Elaborator<'a> {
     ctrl_plane: &'a mut ControlPlane,
 }
 
-#[derive(Clone, Copy, Debug, PartialEq, Eq)]
-struct BestEntry(Cost, Value);
+#[derive(Clone, Debug, PartialEq, Eq)]
+struct BestEntry(ExprCost, Value);
 
 impl PartialOrd for BestEntry {
     fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
@@ -148,7 +148,7 @@ impl<'a> Elaborator<'a> {
     ) -> Self {
         let num_values = func.dfg.num_values();
         let mut value_to_best_value =
-            SecondaryMap::with_default(BestEntry(Cost::infinity(), Value::reserved_value()));
+            SecondaryMap::with_default(BestEntry(ExprCost::zero(), Value::reserved_value()));
         value_to_best_value.resize(num_values);
         Self {
             func,
@@ -321,9 +321,9 @@ impl<'a> Elaborator<'a> {
                     debug_assert!(!best[x].1.is_reserved_value());
                     debug_assert!(!best[y].1.is_reserved_value());
                     best[value] = if use_worst {
-                        std::cmp::max(best[x], best[y])
+                        std::cmp::max(&best[x], &best[y]).clone()
                     } else {
-                        std::cmp::min(best[x], best[y])
+                        std::cmp::min(&best[x], &best[y]).clone()
                     };
                     trace!(
                         " -> best of union({:?}, {:?}) = {:?}",
@@ -332,7 +332,7 @@ impl<'a> Elaborator<'a> {
                 }
 
                 ValueDef::Param(_, _) => {
-                    best[value] = BestEntry(Cost::zero(), value);
+                    best[value] = BestEntry(ExprCost::zero(), value);
                 }
 
                 // If the Inst is inserted into the layout (which is,
@@ -341,21 +341,20 @@ impl<'a> Elaborator<'a> {
                 // cost.
                 ValueDef::Result(inst, _) => {
                     if let Some(_) = self.func.layout.inst_block(inst) {
-                        best[value] = BestEntry(Cost::zero(), value);
+                        best[value] = BestEntry(ExprCost::zero(), value);
                     } else {
-                        let inst_data = &self.func.dfg.insts[inst];
                         // N.B.: at this point we know that the opcode is
                         // pure, so `pure_op_cost`'s precondition is
                         // satisfied.
-                        let cost = Cost::of_pure_op(
-                            inst_data.opcode(),
-                            self.func.dfg.inst_values(inst).map(|value| {
-                                debug_assert!(!best[value].1.is_reserved_value());
-                                best[value].0
-                            }),
-                        );
-                        best[value] = BestEntry(cost, value);
+                        let mut cost = ExprCost::for_inst(&self.func.dfg, inst);
+                        for val in self.func.dfg.inst_values(inst) {
+                            // Every operand must already have its best entry.
+                            let BestEntry(val_cost, best_val) = &best[val];
+                            debug_assert!(!best_val.is_reserved_value());
+                            cost.add(&self.func.dfg, val_cost);
+                        }
+                        trace!(" -> cost of value {} = {:?}", value, cost);
+                        best[value] = BestEntry(cost, value);
                     }
                 }
             };
@@ -680,7 +679,7 @@ impl<'a> Elaborator<'a> {
                                 value: new_result,
                                 in_block: insert_block,
                             };
-                            let best_result = self.value_to_best_value[result];
+                            let best_result = &self.value_to_best_value[result];
                             self.value_to_elaborated_value.insert_if_absent_with_depth(
                                 &NullCtx,
                                 best_result.1,
@@ -688,7 +687,7 @@ impl<'a> Elaborator<'a> {
                                 scope_depth,
                             );
 
-                            self.value_to_best_value[new_result] = best_result;
+                            self.value_to_best_value[new_result] = best_result.clone();
 
                             trace!(
                                 " -> cloned inst has new result {} for orig {}",
@@ -706,7 +705,7 @@ impl<'a> Elaborator<'a> {
                                 value: result,
                                 in_block: insert_block,
                             };
-                            let best_result = self.value_to_best_value[result];
+                            let best_result = &self.value_to_best_value[result];
                             self.value_to_elaborated_value.insert_if_absent_with_depth(
                                 &NullCtx,
                                 best_result.1,
@@ -801,7 +800,7 @@ impl<'a> Elaborator<'a> {
             // map now.
             for &result in self.func.dfg.inst_results(inst) {
                 trace!(" -> result {}", result);
-                let best_result = self.value_to_best_value[result];
+                let best_result = &self.value_to_best_value[result];
                 self.value_to_elaborated_value.insert_if_absent(
                     &NullCtx,
                     best_result.1,
diff --git a/cranelift/entity/Cargo.toml b/cranelift/entity/Cargo.toml
index 31041d4377bd..e87022a8699a 100644
--- a/cranelift/entity/Cargo.toml
+++ b/cranelift/entity/Cargo.toml
@@ -17,6 +17,7 @@ workspace = true
 
 [dependencies]
 cranelift-bitset = { workspace=true }
+im-rc = "15.1.0"
 serde = { workspace = true, optional = true }
 serde_derive = { workspace = true, optional = true }
 
diff --git a/cranelift/entity/src/imm_set.rs b/cranelift/entity/src/imm_set.rs
new file mode 100644
index 000000000000..c450dcc5c086
--- /dev/null
+++ b/cranelift/entity/src/imm_set.rs
@@ -0,0 +1,255 @@
+//! Immutable entity sets.
+
+use super::EntityRef;
+use core::{fmt, marker::PhantomData, mem};
+use cranelift_bitset::ScalarBitSet;
+
+/// An immutable, persistent version of an [`EntitySet`][crate::EntitySet].
+#[derive(Clone)]
+pub struct ImmutableEntitySet<K> {
+    words: im_rc::OrdMap<u32, ScalarBitSet<usize>>,
+    len: u32,
+    _phantom: PhantomData<K>,
+}
+
+impl<K> Default for ImmutableEntitySet<K> {
+    fn default() -> Self {
+        Self {
+            words: Default::default(),
+            len: 0,
+            _phantom: Default::default(),
+        }
+    }
+}
+
+impl<K: fmt::Debug + EntityRef> fmt::Debug for ImmutableEntitySet<K> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_set().entries(self.iter()).finish()
+    }
+}
+
+impl<K> ImmutableEntitySet<K>
+where
+    K: EntityRef,
+{
+    const BITS_PER_WORD: usize = mem::size_of::<usize>() * 8;
+
+    #[inline]
+    fn word_and_bit(key: K) -> (u32, u8) {
+        let key_index = key.index();
+        let bit = key_index % Self::BITS_PER_WORD;
+        let word = key_index / Self::BITS_PER_WORD;
+        (u32::try_from(word).unwrap(), u8::try_from(bit).unwrap())
+    }
+
+    #[inline]
+    fn key_from_word_and_bit(word: u32, bit: u8) -> K {
+        let word = usize::try_from(word).unwrap();
+        let bit = usize::from(bit);
+        K::new(word * Self::BITS_PER_WORD + bit)
+    }
+
+    /// Create a set containing just the given key.
+    #[inline]
+    pub fn unit(key: K) -> Self {
+        let (word, bit) = Self::word_and_bit(key);
+        let mut bitset = ScalarBitSet::new();
+        bitset.insert(bit);
+        ImmutableEntitySet {
+            words: im_rc::OrdMap::unit(word, bitset),
+            len: 1,
+            _phantom: PhantomData,
+        }
+    }
+
+    /// Insert a new key into this set.
+    ///
+    /// Returns `true` if the set did not previously contain the key, `false`
+    /// otherwise.
+    #[inline]
+    pub fn insert(&mut self, key: K) -> bool {
+        let (word, bit) = Self::word_and_bit(key);
+        let bitset = self.words.entry(word).or_default();
+        let is_new = bitset.insert(bit);
+        self.len += u32::from(is_new);
+        is_new
+    }
+
+    /// Does this set contain the given key?
+    #[inline]
+    pub fn contains(&self, key: K) -> bool {
+        let (word, bit) = Self::word_and_bit(key);
+        self.words.get(&word).is_some_and(|bits| bits.contains(bit))
+    }
+
+    /// Get the number of elements in this set.
+    #[inline]
+    pub fn len(&self) -> usize {
+        usize::try_from(self.len).unwrap()
+    }
+
+    /// Iterate over the keys in this set, in order.
+    #[inline]
+    pub fn iter(&self) -> ImmutableEntitySetIter<'_, K> {
+        ImmutableEntitySetIter {
+            words: self.words.iter(),
+            word_and_bits: None,
+            _phantom: PhantomData,
+        }
+    }
+}
+
+/// An iterator over the entries in an [`ImmutableEntitySet`].
+pub struct ImmutableEntitySetIter<'a, K> {
+    words: im_rc::ordmap::Iter<'a, u32, ScalarBitSet<usize>>,
+    word_and_bits: Option<(u32, cranelift_bitset::scalar::Iter<usize>)>,
+    _phantom: PhantomData<K>,
+}
+
+impl<K> Iterator for ImmutableEntitySetIter<'_, K>
+where
+    K: EntityRef,
+{
+    type Item = K;
+
+    /// Yield the next key in the set.
+    ///
+    /// Drains the bits of the current word before moving on to the next word.
+    #[inline]
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            // Keep yielding from the word we are in the middle of, if any.
+            if let Some((word, bits)) = self.word_and_bits.as_mut() {
+                if let Some(bit) = bits.next() {
+                    return Some(ImmutableEntitySet::key_from_word_and_bit(*word, bit));
+                }
+                // This word is exhausted; forget it and fetch another below.
+                self.word_and_bits = None;
+            }
+
+            // Advance to the next word, finishing once the map runs out.
+            let (&word, bits) = self.words.next()?;
+            self.word_and_bits = Some((word, bits.iter()));
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use alloc::vec::Vec;
+
+    #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+    struct Key(u32);
+    crate::entity_impl!(Key);
+
+    #[test]
+    fn smoke_test() {
+        let mut set = ImmutableEntitySet::default();
+
+        for i in 0..100 {
+            let is_new = set.insert(Key::new(i));
+            assert!(is_new);
+        }
+        for i in 0..100 {
+            let is_new = set.insert(Key::new(i));
+            assert!(!is_new);
+        }
+
+        for i in 900..1000 {
+            let is_new = set.insert(Key::new(i));
+            assert!(is_new);
+        }
+        for i in 900..1000 {
+            let is_new = set.insert(Key::new(i));
+            assert!(!is_new);
+        }
+
+        for i in u32::MAX - 100..u32::MAX {
+            let i = usize::try_from(i).unwrap();
+            let is_new = set.insert(Key::new(i));
+            assert!(is_new);
+        }
+        for i in u32::MAX - 100..u32::MAX {
+            let i = usize::try_from(i).unwrap();
+            let is_new = set.insert(Key::new(i));
+            assert!(!is_new);
+        }
+
+        for i in 0..100 {
+            assert!(set.contains(Key::new(i)));
+        }
+        for i in 100..200 {
+            assert!(!set.contains(Key::new(i)));
+        }
+
+        for i in 800..900 {
+            assert!(!set.contains(Key::new(i)));
+        }
+        for i in 900..1000 {
+            assert!(set.contains(Key::new(i)));
+        }
+        for i in 1000..1100 {
+            assert!(!set.contains(Key::new(i)));
+        }
+
+        for i in u32::MAX - 200..u32::MAX - 100 {
+            let i = usize::try_from(i).unwrap();
+            assert!(!set.contains(Key::new(i)));
+        }
+        for i in u32::MAX - 100..u32::MAX {
+            let i = usize::try_from(i).unwrap();
+            assert!(set.contains(Key::new(i)));
+        }
+
+        assert_eq!(set.len(), 300);
+        assert_eq!(set.iter().count(), 300);
+        for k in set.iter() {
+            assert!(set.contains(k));
+        }
+    }
+
+    #[test]
+    fn unit() {
+        let set = ImmutableEntitySet::unit(Key::new(42));
+
+        assert!(set.contains(Key::new(42)));
+
+        assert!(!set.contains(Key::new(0)));
+        assert!(!set.contains(Key::new(41)));
+        assert!(!set.contains(Key::new(43)));
+
+        assert_eq!(set.iter().collect::<Vec<_>>(), [Key::new(42)]);
+    }
+
+    #[test]
+    fn iter() {
+        let mut set = ImmutableEntitySet::default();
+        set.insert(Key::new(0));
+        set.insert(Key::new(1));
+        set.insert(Key::new(2));
+        set.insert(Key::new(31));
+        set.insert(Key::new(32));
+        set.insert(Key::new(33));
+        set.insert(Key::new(63));
+        set.insert(Key::new(64));
+        set.insert(Key::new(65));
+        set.insert(Key::new(usize::try_from(u32::MAX - 1).unwrap()));
+
+        assert_eq!(
+            set.iter().collect::<Vec<_>>(),
+            [
+                Key::new(0),
+                Key::new(1),
+                Key::new(2),
+                Key::new(31),
+                Key::new(32),
+                Key::new(33),
+                Key::new(63),
+                Key::new(64),
+                Key::new(65),
+                Key::new(usize::try_from(u32::MAX - 1).unwrap()),
+            ]
+        );
+    }
+}
diff --git a/cranelift/entity/src/lib.rs b/cranelift/entity/src/lib.rs
index f412b277bea1..8ebce4a049d9 100644
--- a/cranelift/entity/src/lib.rs
+++ b/cranelift/entity/src/lib.rs
@@ -271,6 +271,7 @@ macro_rules! entity_impl {
 pub mod packed_option;
 
 mod boxed_slice;
+mod imm_set;
 mod iter;
 mod keys;
 mod list;
@@ -280,6 +281,7 @@ mod set;
 mod sparse;
 
 pub use self::boxed_slice::BoxedSlice;
+pub use self::imm_set::{ImmutableEntitySet, ImmutableEntitySetIter};
 pub use self::iter::{Iter, IterMut};
 pub use self::keys::Keys;
 pub use self::list::{EntityList, ListPool};
diff --git a/cranelift/filetests/filetests/egraph/cost-function.clif b/cranelift/filetests/filetests/egraph/cost-function.clif
new file mode 100644
index 000000000000..a8fc15bbc15f
--- /dev/null
+++ b/cranelift/filetests/filetests/egraph/cost-function.clif
@@ -0,0 +1,104 @@
+;; Egraph extraction and cost function edge cases.
+
+test optimize precise-output
+set opt_level=speed_and_size
+target x86_64
+
+function %f(i64) -> i64 {
+    block0(v0: i64):
+        v1 = iadd v0, v0
+        v2 = iadd v1, v1
+        v3 = iadd v2, v2
+        v4 = iadd v3, v3
+        v5 = iadd v4, v4
+        v6 = iadd v5, v5
+        v7 = iadd v6, v6
+        v8 = iadd v7, v7
+        v9 = iadd v8, v8
+        v10 = iadd v9, v9
+        v11 = iadd v10, v10
+        v12 = iadd v11, v11
+        v13 = iadd v12, v12
+        v14 = iadd v13, v13
+        v15 = iadd v14, v14
+        v16 = iadd v15, v15
+        v17 = iadd v16, v16
+        v18 = iadd v17, v17
+        v19 = iadd v18, v18
+        v20 = iadd v19, v19
+        v21 = iadd v20, v20
+        v22 = iadd v21, v21
+        v23 = iadd v22, v22
+        v24 = iadd v23, v23
+        v25 = iadd v24, v24
+        v26 = iadd v25, v25
+        v27 = iadd v26, v26
+        v28 = iadd v27, v27
+        v29 = iadd v28, v28
+        v30 = iadd v29, v29
+        v31 = iadd v30, v30
+        v32 = iadd v31, v31
+        v33 = iadd v32, v32
+
+        ;; When a cost function doesn't "understand" shared structure and
+        ;; that expressions are DAGs, not trees -- for example, it just does
+        ;; something like
+        ;;
+        ;;     cost(v) = opcode_cost(v) + sum(cost(u) for u in operands(inst(v)))
+        ;;
+        ;; like our old cost function did -- then `v33` should now have infinite
+        ;; cost at this point.
+        ;;
+        ;; Now we append a little identity function to the end, which the
+        ;; optimizer should see through via the rules `x * 2 => x + x` followed
+        ;; by `(x + y) - y => x` and ultimately recognize that `v36` is the same
+        ;; as `v33`. However, if `cost(x)` saturated to infinity, because of
+        ;; poor cost functions that don't account for shared structure, then it
+        ;; is not actually true that `cost(x) < cost(identity(x))`, and we can
+        ;; fail to boil away the identity function. This test checks that we
+        ;; don't do that.
+
+        v34 = iconst.i64 2
+        v35 = imul v33, v34
+        v36 = isub v35, v33
+        return v36
+}
+
+; function %f(i64) -> i64 fast {
+; block0(v0: i64):
+;     v1 = iadd v0, v0
+;     v2 = iadd v1, v1
+;     v3 = iadd v2, v2
+;     v4 = iadd v3, v3
+;     v5 = iadd v4, v4
+;     v6 = iadd v5, v5
+;     v7 = iadd v6, v6
+;     v8 = iadd v7, v7
+;     v9 = iadd v8, v8
+;     v10 = iadd v9, v9
+;     v11 = iadd v10, v10
+;     v12 = iadd v11, v11
+;     v13 = iadd v12, v12
+;     v14 = iadd v13, v13
+;     v15 = iadd v14, v14
+;     v16 = iadd v15, v15
+;     v17 = iadd v16, v16
+;     v18 = iadd v17, v17
+;     v19 = iadd v18, v18
+;     v20 = iadd v19, v19
+;     v21 = iadd v20, v20
+;     v22 = iadd v21, v21
+;     v23 = iadd v22, v22
+;     v24 = iadd v23, v23
+;     v25 = iadd v24, v24
+;     v26 = iadd v25, v25
+;     v27 = iadd v26, v26
+;     v28 = iadd v27, v27
+;     v29 = iadd v28, v28
+;     v30 = iadd v29, v29
+;     v31 = iadd v30, v30
+;     v32 = iadd v31, v31
+;     v33 = iadd v32, v32
+;     return v33
+; }
+