Before applying quantization or optimization, we first need to export our model to the ONNX format:

```python
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

model_checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
save_directory = "tmp/onnx/"
# Load a model from transformers and export it to ONNX
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
ort_model = ORTModelForSequenceClassification.from_pretrained(model_checkpoint, from_transformers=True)
# Save the ONNX model and tokenizer
ort_model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)
```
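If you want to confirm the export worked before moving on, one option is to run the exported model through a regular `transformers` pipeline, just as we will do with the quantized model later. This is a minimal sketch that reuses the `ort_model` and `tokenizer` objects from the snippet above; the example sentence is arbitrary.

```python
from transformers import pipeline

# Sanity-check the exported (not yet quantized) ONNX model with a standard pipeline
onnx_classifier = pipeline("text-classification", model=ort_model, tokenizer=tokenizer)
print(onnx_classifier("ONNX Runtime makes my models faster!"))
```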
Now let's see how we can apply dynamic quantization with ONNX Runtime:

```python
from optimum.onnxruntime.configuration import AutoQuantizationConfig
from optimum.onnxruntime import ORTQuantizer

# Define the quantization methodology
qconfig = AutoQuantizationConfig.arm64(is_static=False, per_channel=False)
quantizer = ORTQuantizer.from_pretrained(ort_model)
# Apply dynamic quantization on the model
quantizer.quantize(save_dir=save_directory, quantization_config=qconfig)
```
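Because dynamic quantization stores the weights as 8-bit integers, the quantized model file should be noticeably smaller on disk than the exported one. As a rough check you can compare the two files in `save_directory`. This is only a sketch: it assumes the exported model was written as `model.onnx` next to `model_quantized.onnx`, which may vary across Optimum versions.

```python
from pathlib import Path

save_dir = Path(save_directory)
# File names are assumptions: the export step typically writes model.onnx,
# and quantize() produces model_quantized.onnx alongside it
for name in ("model.onnx", "model_quantized.onnx"):
    size_mb = (save_dir / name).stat().st_size / 1e6
    print(f"{name}: {size_mb:.1f} MB")
```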
In this example, we've quantized a model from the Hugging Face Hub, but it could also be a path to a local model directory. The result of applying the `quantize()` method is a `model_quantized.onnx` file that can be used to run inference. Here's an example of how to load an ONNX Runtime model and generate predictions with it:

```python
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import pipeline, AutoTokenizer

model = ORTModelForSequenceClassification.from_pretrained(save_directory, file_name="model_quantized.onnx")
tokenizer = AutoTokenizer.from_pretrained(save_directory)
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
results = classifier("I love burritos!")
```
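Since the whole point is faster inference, you may also want a rough latency comparison between the quantized ONNX pipeline and a plain PyTorch one. The sketch below is illustrative only: it reuses `model_checkpoint`, `tokenizer`, and `classifier` from the snippets above, and a real benchmark would need warm-up runs, many more samples, and controlled sequence lengths.

```python
import time

from transformers import AutoModelForSequenceClassification, pipeline

# Baseline: the original PyTorch model behind a standard pipeline
pt_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
pt_classifier = pipeline("text-classification", model=pt_model, tokenizer=tokenizer)

def mean_latency(pipe, text="I love burritos!", runs=20):
    # Very rough timing loop; not a substitute for a proper benchmark
    start = time.perf_counter()
    for _ in range(runs):
        pipe(text)
    return (time.perf_counter() - start) / runs

print(f"PyTorch:        {mean_latency(pt_classifier) * 1000:.1f} ms")
print(f"Quantized ONNX: {mean_latency(classifier) * 1000:.1f} ms")
```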
#### Optimum Intel