Commit 724010f

Merge pull request #214 from stanfordnlp/revert-191-main
Revert "feat: add intervenable_model to forward's function signature"
2 parents f6dbee1 + 5acd02f commit 724010f
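In practical terms, this revert removes the **kwargs plumbing that let callers pass extra keyword arguments (such as intervenable_model) through IntervenableModel.forward and generate down to each intervention's forward. A minimal before/after sketch of the intervention interface, reconstructed from the diffs below (signatures only, bodies elided):

    # before this revert: extra keyword arguments flowed through to interventions
    def forward(self, base, source, subspaces=None, **kwargs):
        ...

    # after this revert: interventions receive only base, source, and subspaces
    def forward(self, base, source, subspaces=None):
        ...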

5 files changed: +20 −68 lines changed

pyvene/models/intervenable_base.py

+1 −22
@@ -804,7 +804,6 @@ def _intervention_setter(
 keys,
 unit_locations_base,
 subspaces,
-**intervention_forward_kwargs
 ) -> HandlerList:
 """
 Create a list of setter tracer that will set activations
@@ -849,7 +848,6 @@ def _intervention_setter(
 None,
 intervention,
 subspaces[key_i] if subspaces is not None else None,
-**intervention_forward_kwargs
 )
 # fail if this is not a fresh collect
 assert key not in self.activations
@@ -864,7 +862,6 @@ def _intervention_setter(
 None,
 intervention,
 subspaces[key_i] if subspaces is not None else None,
-**intervention_forward_kwargs
 )
 else:
 intervened_representation = do_intervention(
@@ -876,7 +873,6 @@ def _intervention_setter(
 ),
 intervention,
 subspaces[key_i] if subspaces is not None else None,
-**intervention_forward_kwargs
 )
 else:
 # highly unlikely it's a primitive intervention type
@@ -889,7 +885,6 @@ def _intervention_setter(
 ),
 intervention,
 subspaces[key_i] if subspaces is not None else None,
-**intervention_forward_kwargs
 )
 if intervened_representation is None:
 return
@@ -975,7 +970,6 @@ def _sync_forward_with_parallel_intervention(
 ]
 if subspaces is not None
 else None,
-**kwargs
 )
 counterfactual_outputs = self.model.output.save()

@@ -1003,7 +997,6 @@ def forward(
 output_original_output: Optional[bool] = False,
 return_dict: Optional[bool] = None,
 use_cache: Optional[bool] = None,
-**kwargs
 ):
 activations_sources = source_representations
 if sources is not None and not isinstance(sources, list):
@@ -1043,7 +1036,7 @@ def forward(
 try:

 # run intervened forward
-model_kwargs = { **kwargs }
+model_kwargs = {}
 if labels is not None: # for training
 model_kwargs["labels"] = labels
 if use_cache is not None and 'use_cache' in self.model.config.to_dict(): # for transformer models
@@ -1533,7 +1526,6 @@ def _intervention_setter(
 keys,
 unit_locations_base,
 subspaces,
-**intervention_forward_kwargs
 ) -> HandlerList:
 """
 Create a list of setter handlers that will set activations
@@ -1581,7 +1573,6 @@ def hook_callback(model, args, kwargs, output=None):
 None,
 intervention,
 subspaces[key_i] if subspaces is not None else None,
-**intervention_forward_kwargs
 )
 # fail if this is not a fresh collect
 assert key not in self.activations
@@ -1597,7 +1588,6 @@ def hook_callback(model, args, kwargs, output=None):
 None,
 intervention,
 subspaces[key_i] if subspaces is not None else None,
-**intervention_forward_kwargs
 )
 if isinstance(raw_intervened_representation, InterventionOutput):
 self.full_intervention_outputs.append(raw_intervened_representation)
@@ -1614,7 +1604,6 @@ def hook_callback(model, args, kwargs, output=None):
 ),
 intervention,
 subspaces[key_i] if subspaces is not None else None,
-**intervention_forward_kwargs
 )
 else:
 # highly unlikely it's a primitive intervention type
@@ -1627,7 +1616,6 @@ def hook_callback(model, args, kwargs, output=None):
 ),
 intervention,
 subspaces[key_i] if subspaces is not None else None,
-**intervention_forward_kwargs
 )
 if intervened_representation is None:
 return
@@ -1695,7 +1683,6 @@ def _wait_for_forward_with_parallel_intervention(
 unit_locations,
 activations_sources: Optional[Dict] = None,
 subspaces: Optional[List] = None,
-**intervention_forward_kwargs
 ):
 # torch.autograd.set_detect_anomaly(True)
 all_set_handlers = HandlerList([])
@@ -1751,7 +1738,6 @@ def _wait_for_forward_with_parallel_intervention(
 ]
 if subspaces is not None
 else None,
-**intervention_forward_kwargs
 )
 # for setters, we don't remove them.
 all_set_handlers.extend(set_handlers)
@@ -1763,7 +1749,6 @@ def _wait_for_forward_with_serial_intervention(
 unit_locations,
 activations_sources: Optional[Dict] = None,
 subspaces: Optional[List] = None,
-**intervention_forward_kwargs
 ):
 all_set_handlers = HandlerList([])
 for group_id, keys in self._intervention_group.items():
@@ -1820,7 +1805,6 @@ def _wait_for_forward_with_serial_intervention(
 ]
 if subspaces is not None
 else None,
-**intervention_forward_kwargs
 )
 # for setters, we don't remove them.
 all_set_handlers.extend(set_handlers)
@@ -1837,7 +1821,6 @@ def forward(
 output_original_output: Optional[bool] = False,
 return_dict: Optional[bool] = None,
 use_cache: Optional[bool] = None,
-**intervention_forward_kwargs
 ):
 """
 Main forward function that serves a wrapper to
@@ -1946,7 +1929,6 @@ def forward(
 unit_locations,
 activations_sources,
 subspaces,
-**intervention_forward_kwargs
 )
 )
 elif self.mode == "serial":
@@ -1956,7 +1938,6 @@ def forward(
 unit_locations,
 activations_sources,
 subspaces,
-**intervention_forward_kwargs
 )
 )

@@ -2090,7 +2071,6 @@ def generate(
 unit_locations,
 activations_sources,
 subspaces,
-**kwargs
 )
 )
 elif self.mode == "serial":
@@ -2100,7 +2080,6 @@ def generate(
 unit_locations,
 activations_sources,
 subspaces,
-**kwargs
 )
 )

pyvene/models/interventions.py

+16 −16
@@ -75,7 +75,7 @@ def set_interchange_dim(self, interchange_dim):
 self.interchange_dim = interchange_dim

 @abstractmethod
-def forward(self, base, source, subspaces=None, **kwargs):
+def forward(self, base, source, subspaces=None):
 pass


@@ -153,7 +153,7 @@ class ZeroIntervention(ConstantSourceIntervention, LocalistRepresentationInterve
 def __init__(self, **kwargs):
 super().__init__(**kwargs)

-def forward(self, base, source=None, subspaces=None, **kwargs):
+def forward(self, base, source=None, subspaces=None):
 return _do_intervention_by_swap(
 base,
 torch.zeros_like(base),
@@ -175,7 +175,7 @@ class CollectIntervention(ConstantSourceIntervention):
 def __init__(self, **kwargs):
 super().__init__(**kwargs)

-def forward(self, base, source=None, subspaces=None, **kwargs):
+def forward(self, base, source=None, subspaces=None):
 return _do_intervention_by_swap(
 base,
 source,
@@ -197,7 +197,7 @@ class SkipIntervention(BasisAgnosticIntervention, LocalistRepresentationInterven
 def __init__(self, **kwargs):
 super().__init__(**kwargs)

-def forward(self, base, source, subspaces=None, **kwargs):
+def forward(self, base, source, subspaces=None):
 # source here is the base example input to the hook
 return _do_intervention_by_swap(
 base,
@@ -220,7 +220,7 @@ class VanillaIntervention(Intervention, LocalistRepresentationIntervention):
 def __init__(self, **kwargs):
 super().__init__(**kwargs)

-def forward(self, base, source, subspaces=None, **kwargs):
+def forward(self, base, source, subspaces=None):
 return _do_intervention_by_swap(
 base,
 source if self.source_representation is None else self.source_representation,
@@ -242,7 +242,7 @@ class AdditionIntervention(BasisAgnosticIntervention, LocalistRepresentationInte
 def __init__(self, **kwargs):
 super().__init__(**kwargs)

-def forward(self, base, source, subspaces=None, **kwargs):
+def forward(self, base, source, subspaces=None):
 return _do_intervention_by_swap(
 base,
 source if self.source_representation is None else self.source_representation,
@@ -264,7 +264,7 @@ class SubtractionIntervention(BasisAgnosticIntervention, LocalistRepresentationI
 def __init__(self, **kwargs):
 super().__init__(**kwargs)

-def forward(self, base, source, subspaces=None, **kwargs):
+def forward(self, base, source, subspaces=None):

 return _do_intervention_by_swap(
 base,
@@ -289,7 +289,7 @@ def __init__(self, **kwargs):
 rotate_layer = RotateLayer(self.embed_dim)
 self.rotate_layer = torch.nn.utils.parametrizations.orthogonal(rotate_layer)

-def forward(self, base, source, subspaces=None, **kwargs):
+def forward(self, base, source, subspaces=None):
 rotated_base = self.rotate_layer(base)
 rotated_source = self.rotate_layer(source)
 # interchange
@@ -340,7 +340,7 @@ def set_intervention_boundaries(self, intervention_boundaries):
 torch.tensor([intervention_boundaries]), requires_grad=True
 )

-def forward(self, base, source, subspaces=None, **kwargs):
+def forward(self, base, source, subspaces=None):
 batch_size = base.shape[0]
 rotated_base = self.rotate_layer(base)
 rotated_source = self.rotate_layer(source)
@@ -391,7 +391,7 @@ def get_temperature(self):
 def set_temperature(self, temp: torch.Tensor):
 self.temperature.data = temp

-def forward(self, base, source, subspaces=None, **kwargs):
+def forward(self, base, source, subspaces=None):
 batch_size = base.shape[0]
 rotated_base = self.rotate_layer(base)
 rotated_source = self.rotate_layer(source)
@@ -431,7 +431,7 @@ def get_temperature(self):
 def set_temperature(self, temp: torch.Tensor):
 self.temperature.data = temp

-def forward(self, base, source, subspaces=None, **kwargs):
+def forward(self, base, source, subspaces=None):
 batch_size = base.shape[0]
 # get boundary mask between 0 and 1 from sigmoid
 mask_sigmoid = torch.sigmoid(self.mask / torch.tensor(self.temperature))
@@ -456,7 +456,7 @@ def __init__(self, **kwargs):
 rotate_layer = LowRankRotateLayer(self.embed_dim, kwargs["low_rank_dimension"])
 self.rotate_layer = torch.nn.utils.parametrizations.orthogonal(rotate_layer)

-def forward(self, base, source, subspaces=None, **kwargs):
+def forward(self, base, source, subspaces=None):
 rotated_base = self.rotate_layer(base)
 rotated_source = self.rotate_layer(source)
 if subspaces is not None:
@@ -529,7 +529,7 @@ def __init__(self, **kwargs):
 )
 self.trainable = False

-def forward(self, base, source, subspaces=None, **kwargs):
+def forward(self, base, source, subspaces=None):
 base_norm = (base - self.pca_mean) / self.pca_std
 source_norm = (source - self.pca_mean) / self.pca_std

@@ -565,7 +565,7 @@ def __init__(self, **kwargs):
 prng(1, 4, self.embed_dim)))
 self.register_buffer('noise_level', torch.tensor(noise_level))

-def forward(self, base, source=None, subspaces=None, **kwargs):
+def forward(self, base, source=None, subspaces=None):
 base[..., : self.interchange_dim] += self.noise * self.noise_level
 return base

@@ -585,7 +585,7 @@ def __init__(self, **kwargs):
 self.autoencoder = AutoencoderLayer(
 self.embed_dim, kwargs["latent_dim"])

-def forward(self, base, source, subspaces=None, **kwargs):
+def forward(self, base, source, subspaces=None):
 base_dtype = base.dtype
 base = base.to(self.autoencoder.encoder[0].weight.dtype)
 base_latent = self.autoencoder.encode(base)
@@ -619,7 +619,7 @@ def encode(self, input_acts):
 def decode(self, acts):
 return acts @ self.W_dec + self.b_dec

-def forward(self, base, source=None, subspaces=None, **kwargs):
+def forward(self, base, source=None, subspaces=None):
 # generate latents for base and source runs.
 base_latent = self.encode(base)
 source_latent = self.encode(source)

pyvene/models/modeling_utils.py

+2 −3
@@ -446,7 +446,7 @@ def scatter_neurons(


 def do_intervention(
-base_representation, source_representation, intervention, subspaces, **intervention_forward_kwargs
+base_representation, source_representation, intervention, subspaces
 ):
 """Do the actual intervention."""

@@ -478,8 +478,7 @@ def do_intervention(
 assert False # what's going on?

 intervention_output = intervention(
-base_representation_f, source_representation_f, subspaces,
-**intervention_forward_kwargs
+base_representation_f, source_representation_f, subspaces
 )
 if isinstance(intervention_output, InterventionOutput):
 intervened_representation = intervention_output.output

tests/integration_tests/IntervenableBasicTestCase.py

+1 −1
@@ -232,7 +232,7 @@ class MultiplierIntervention(
 def __init__(self, embed_dim, **kwargs):
 super().__init__()
 def forward(
-self, base, source=None, subspaces=None, **kwargs):
+self, base, source=None, subspaces=None):
 return base * 99.0
 # run with new intervention type
 pv_gpt2 = pv.IntervenableModel({
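The updated test above shows the post-revert contract for user-defined interventions: forward takes only base, source, and subspaces. A hedged, self-contained usage sketch; the ConstantSourceIntervention base class, the "block_output" component key, the embed_dim value, and the gpt2 model handle are illustrative assumptions, not taken from this diff:

    import pyvene as pv

    class MultiplierIntervention(pv.ConstantSourceIntervention):  # base class assumed
        def __init__(self, embed_dim, **kwargs):
            super().__init__()
        def forward(self, base, source=None, subspaces=None):
            # no **kwargs in the signature after this revert
            return base * 99.0

    # component key and model handle are illustrative
    pv_gpt2 = pv.IntervenableModel(
        {"component": "block_output", "intervention": MultiplierIntervention(embed_dim=768)},
        model=gpt2,
    )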

tests/integration_tests/InterventionWithLlamaTestCase.py

−26
@@ -156,32 +156,6 @@ def test_with_multiple_heads_positions_vanilla_intervention_positive(self):
 heads=[4, 1],
 positions=[7, 2],
 )
-
-def test_with_llm_head(self):
-that = self
-_lm_head_collection = {}
-class AccessIntervenableModelIntervention:
-is_source_constant = True
-keep_last_dim = True
-intervention_types = 'access_intervenable_model_intervention'
-def __init__(self, layer_index, *args, **kwargs):
-super().__init__()
-self.layer_index = layer_index
-def __call__(self, base, source=None, subspaces=None, model=None, **kwargs):
-intervenable_model = kwargs.get('intervenable_model', None)
-assert intervenable_model is not None
-_lm_head_collection[self.layer_index] = intervenable_model.model.lm_head(base.to(that.device))
-return base
-# run with new intervention type
-pv_llama = IntervenableModel([{
-"intervention": AccessIntervenableModelIntervention(layer_index=layer),
-"component": f"model.layers.{layer}.input"
-} for layer in [1, 3]], model=self.llama)
-intervened_outputs = pv_llama(
-base=self.tokenizer("The capital of Spain is", return_tensors="pt").to(that.device),
-unit_locations={"base": 3},
-intervenable_model=pv_llama
-)


 def suite():
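The deleted test_with_llm_head relied on recovering the wrapping IntervenableModel from the intervention's forward kwargs (kwargs.get('intervenable_model', None)), which is exactly the pathway this commit removes. One hedged alternative, not part of this commit: hand the intervention whatever module it needs at construction time. The lm_head_module argument below is hypothetical:

    class AccessLMHeadIntervention:
        is_source_constant = True
        keep_last_dim = True
        def __init__(self, layer_index, lm_head_module=None, **kwargs):
            # lm_head_module is a hypothetical constructor argument; pass in
            # model.lm_head when building the intervention instead of reading
            # it from forward kwargs at call time
            self.layer_index = layer_index
            self.lm_head_module = lm_head_module
        def __call__(self, base, source=None, subspaces=None, model=None, **kwargs):
            if self.lm_head_module is not None:
                self.collected_logits = self.lm_head_module(base)
            return base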
