
Commit c24e97f

Authored by rahul-tuli and Sara Adkins
[Feature Branch] Quant modifier UX (#2263)
* Split WandaPruningModifier and SparseGPTModifier: make SparseGPT no longer inherit from the Wanda modifier, decouple SparseGPTModifierPyTorch from WandaPruningModifier, and fix docstrings
* Split SparseGPT and GPTQ modifiers (#2272): update OBCQ and extract a standalone GPTQ modifier
* [GPTQ Modifier UX] Update tests to use GPTQModifier for OBCQ-style quantization (#2294): update test recipes
* GPTQ UX config groups support (#2273): add config_groups support to GPTQModifier
* mask_structure preservation test (#2284): preserve weight sparsity if greater than a threshold, add an argument to preserve the sparsity mask in SparseGPT, fix the case when the mask is None, add a test that the initial mask structure is preserved between consecutive runs, and update tensor_follows_mask_structure to check for at least n zeros
* Address PR comments, fix the default case, update the test to use the new vLLMQuantizationModifier, and apply style fixes

Co-authored-by: Sara Adkins <[email protected]>
1 parent 53541f3 commit c24e97f

35 files changed: +1367 -233 lines
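One piece of the mask_structure preservation work described in the commit message is a helper that verifies a tensor still obeys an N:M mask between consecutive runs, relaxed to require at least n zeros per group. The sketch below is a hypothetical re-creation of that kind of check, not the repository's actual test utility; only the function name and the "at least n zeros" behavior are taken from the commit message, everything else is assumed:

```python
import torch


def tensor_follows_mask_structure(tensor: torch.Tensor, mask: str = "2:4") -> bool:
    """
    Hypothetical sketch: verify that every group of m consecutive weights
    along the flattened tensor contains at least n zeros, i.e. the tensor
    still satisfies an n:m sparsity mask structure.
    """
    n, m = (int(x) for x in mask.split(":"))
    if n == 0 and m == 0:
        # 0:0 denotes an unstructured mask, nothing to check
        return True

    # group the weights into chunks of m and count zeros per chunk
    groups = tensor.reshape(-1, m)
    zeros_per_group = (groups == 0).sum(dim=1)
    return bool((zeros_per_group >= n).all())


if __name__ == "__main__":
    weights = torch.tensor([[0.0, 0.5, 0.0, 1.2], [0.0, 0.0, 0.3, 0.7]])
    print(tensor_follows_mask_structure(weights, "2:4"))  # True
```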

Diff for: integrations/huggingface-transformers/tutorials/text-generation/example_alternating_recipe.yaml

-2 lines

```diff
@@ -5,7 +5,6 @@ initial_sparsity_stage:
       sparsity: 0.5
       block_size: 128
       sequential_update: False
-      quantize: False
       percdamp: 0.01
       mask_structure: "0:0"
       targets: [
@@ -24,7 +23,6 @@ next_sparsity_stage:
       sparsity: 0.7
       block_size: 128
       sequential_update: False
-      quantize: False
       percdamp: 0.01
       mask_structure: "0:0"
       targets: [
```
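With `quantize` dropped from both SparseGPT stages, quantization is now expressed through a separate modifier rather than a flag on the pruning entry. A small check like the one below (a hypothetical script, not part of the commit; the local recipe path is an assumption) can confirm that no stage in a recipe still carries the old flag:

```python
import yaml  # requires PyYAML


def find_keys(node, key):
    """Recursively yield every value stored under `key` in a nested YAML structure."""
    if isinstance(node, dict):
        for k, v in node.items():
            if k == key:
                yield v
            yield from find_keys(v, key)
    elif isinstance(node, list):
        for item in node:
            yield from find_keys(item, key)


if __name__ == "__main__":
    path = "example_alternating_recipe.yaml"  # assumed local copy of the recipe
    with open(path) as f:
        recipe = yaml.safe_load(f)

    leftover = list(find_keys(recipe, "quantize"))
    print("No stale 'quantize' flags" if not leftover else f"Found: {leftover}")
```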

Diff for: src/sparseml/modifiers/obcq/base.py

+79 -75 lines

```diff
@@ -12,20 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import logging
-from typing import Any, Dict, List, Optional, Union
+from typing import Dict, List, Optional, Union
 
-from sparseml.core.factory import ModifierFactory
+from sparseml.core import Modifier
+from sparseml.core.model.base import ModifiableModel
 from sparseml.core.state import State
-from sparseml.modifiers.pruning.wanda.base import WandaPruningModifier
 
 
 __all__ = ["SparseGPTModifier"]
 
-_LOGGER = logging.getLogger(__name__)
 
-
-class SparseGPTModifier(WandaPruningModifier):
+class SparseGPTModifier(Modifier):
     """
     Modifier for applying the one-shot OBCQ algorithm to a model
 
@@ -41,84 +38,91 @@ class SparseGPTModifier(WandaPruningModifier):
         - on_finalize
             - LayerCompressor.revert_layer_wrappers()
 
-    :param block_size: Used to determine number of columns to compress in one pass
-    :param quantize: Whether or not to quantize weights during SparseGPT. Set to
-        True to quantize using an existing quantization modifier, or pass in the
-        configuration for a quantization modifier if one does not already exist
-        in the recipe
     :param sparsity: Sparsity to compress model to
+    :param sparsity_profile: Can be set to 'owl' to use Outlier Weighed
+        Layerwise Sparsity (OWL), more information can be found
+        in the paper https://arxiv.org/pdf/2310.05175
+    :param owl_m: Number of outliers to use for OWL
+    :param owl_lmbda: Lambda value to use for OWL
+    :param mask_structure: String to define the structure of the mask to apply.
+        Must be of the form N:M where N, M are integers that define a custom block
+        shape. Defaults to 0:0 which represents an unstructured mask.
+    :param sequential_update: Whether or not to update weights sequentially by layer,
+        True saves on GPU memory
+    :param targets: list of layer names to compress during OBCQ, or '__ALL__'
+        to compress every layer in the model
+    :param block_size: Used to determine number of columns to compress in one pass
     :param dampening_frac: Amount of dampening to apply to H, as a fraction of the
         diagonal norm
+    :param preserve_sparsity_mask: Whether or not to preserve the sparsity mask
+        during when applying sparsegpt, this becomes useful when starting from a
+        previously pruned model, defaults to False.
     """
 
-    block_size: int = 128
-    quantize: Union[bool, Dict] = False
     sparsity: Union[float, List[float]] = 0.0
+    sparsity_profile: Optional[str] = None
+    owl_m: Optional[int] = None
+    owl_lmbda: Optional[float] = None
+    mask_structure: str = "0:0"
+    sequential_update: Optional[bool] = False
+    targets: Union[str, List[str], None] = None
+    block_size: int = 128
     dampening_frac: Optional[float] = 0.01
-    quantization_modifier_: Any = None
+    preserve_sparsity_mask: bool = False
+    prunen_: Optional[int] = None
+    prunem_: Optional[int] = None
+    compressible_layers_: Optional[List] = None
 
     def on_initialize_structure(self, state: State, **kwargs):
         """
-        Check the model's quantization state matches that expected by this modifier,
-        adding a default quantization scheme if needed
+        Initialize the structure of the model for compression.
+        This modifier does not modifiy the model structure, so this method
+        is a no-op.
+
+        :param state: session state storing input model and calibration data
+        """
+        return True
+
+    def compressible_layers(self) -> Dict:
+        """
+        Retrieves the modules corresponding to a list of
+        compressible layer names
+
+        :precondition: self.model is set and is a `ModifiableModel`
+        :precondition: The `ModifiableModel` implements a `get_layers`
+            method
+        :return: dictionary of modules to compress
+        """
+        if not isinstance(self.model, ModifiableModel):
+            raise ValueError(
+                "`self.model` must be a ModifiableModel to use "
+                f"the {self.__class__.__qualname__} modifier but got "
+                f"{type(self.model)} instead"
+            )
+
+        return self.model.get_layers(self.targets)
+
+    def _validate_layerwise_sparsity(self):
+        if isinstance(self.sparsity, float):
+            # single sparsity will be applied to all layers
+            return
+
+        target_layers = list(self.compressible_layers_.keys())
+
+        if len(target_layers) != len(self.sparsity):
+            raise ValueError(
+                "Number of layer targets must match the number of "
+                f"sparsities. Got {len(target_layers)} layers and "
+                f"{len(self.sparsity)} sparsities"
+            )
+
+    def on_finalize(self, state: State, **kwargs):
+        """
+        Nothing to do on finalize, on this level.
+        Quantization Modifier if any will be finalized in the subclass
 
         :param state: session state storing input model and calibration data
+        :param kwargs: additional arguments
+        :return: True
         """
-        quantization_already_active = state.model.qat_active()
-        if isinstance(self.quantize, bool):
-            if not self.quantize and quantization_already_active:
-                _LOGGER.warning(
-                    "SparseGPT quantization is set to False, but a "
-                    "quantization modifier is already active on the model "
-                    "resetting quantize to True"
-                )
-                self.quantize = True
-            elif self.quantize and not quantization_already_active:
-                _LOGGER.warning(
-                    "SparseGPT quantization is set to True without an "
-                    "active quantization modifier. Creating a default "
-                    "8-bit quantization modifier"
-                )
-                default_quant_config = {"QuantizationModifier": {}}
-                self._build_quant_modifier_from_dict(
-                    default_quant_config, state.framework
-                )
-            return  # use existing quantization modifier if there is one
-        else:
-            if not isinstance(self.quantize, Dict):
-                raise ValueError(
-                    "SparseGPTModifier.quantize accepts only a single "
-                    "quantization modifier or a boolean. Found "
-                    f"type {type(self.quantize)}"
-                )
-            if len(self.quantize) != 1:
-                raise ValueError(
-                    "SparseGPTModifier.quantize accepts only a single "
-                    "quantization modifier or a boolean. Found "
-                    f"{len(self.quantize)} modifiers"
-                )
-            if quantization_already_active:
-                _LOGGER.warning(
-                    "Attempting to initialize quantization for SparseGPT "
-                    "but a quantization modifier has already been applied. "
-                    "The quantization configuration defined under the "
-                    "SparseGPT modifier will be ignored."
-                )
-                self.quantize = True
-                return
-            self._build_quant_modifier_from_dict(self.quantize, state.framework)
-            self.quantize = True
-
-        if self.quantization_modifier_:
-            self.quantization_modifier_.on_initialize_structure(state, **kwargs)
-
-    def _build_quant_modifier_from_dict(self, quant_config, framework):
-        modifier_type = list(quant_config.keys())[0]
-        modifier_args = quant_config[modifier_type]
-        self.quantization_modifier_ = ModifierFactory.create(
-            modifier_type,
-            framework=framework,
-            allow_registered=True,
-            allow_experimental=True,
-            **modifier_args,
-        )
+        return True
```
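The new `_validate_layerwise_sparsity` helper above enforces that a per-layer sparsity list lines up with the targeted layers. As a standalone illustration of that check (a re-creation outside the class for clarity, not the modifier's actual API; the layer names are made up), the logic reduces to:

```python
from typing import Dict, List, Union


def validate_layerwise_sparsity(
    target_layers: Dict[str, object], sparsity: Union[float, List[float]]
) -> None:
    """Mirrors the check in SparseGPTModifier._validate_layerwise_sparsity."""
    if isinstance(sparsity, float):
        # a single sparsity value is broadcast to every targeted layer
        return

    if len(target_layers) != len(sparsity):
        raise ValueError(
            "Number of layer targets must match the number of sparsities. "
            f"Got {len(target_layers)} layers and {len(sparsity)} sparsities"
        )


if __name__ == "__main__":
    layers = {"model.layers.0": None, "model.layers.1": None}  # hypothetical layer map
    validate_layerwise_sparsity(layers, 0.5)         # ok: single value broadcast
    validate_layerwise_sparsity(layers, [0.5, 0.7])  # ok: one sparsity per layer
    try:
        validate_layerwise_sparsity(layers, [0.5])   # mismatch raises ValueError
    except ValueError as err:
        print(err)
```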
