install torchao mps ops by default when running on Apple Silicon

manuelcandales · manuelcandales · commit ac897059c5c5 · 2025-03-19T15:27:45.000-04:00
diff --git a/install/install_torchao.sh b/install/install_torchao.sh
@@ -31,9 +31,16 @@ else
 fi
 echo "Using pip executable: $PIP_EXECUTABLE"
 
+if [[ $(uname -s) == "Darwin" && $(uname -m) == "arm64" ]]; then
+  echo "Building torchao experimental mps ops (Apple Silicon detected)"
+  APPLE_SILICON_DETECTED=1
+else
+  echo "NOT building torchao experimental mps ops (Apple Silicon NOT detected)"
+  APPLE_SILICON_DETECTED=0
+fi
 
 export TORCHAO_PIN=$(cat install/.pins/torchao-pin.txt)
 (
   set -x
-  USE_CPP=1 $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@${TORCHAO_PIN}
+  USE_CPP=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=${APPLE_SILICON_DETECTED} $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@${TORCHAO_PIN}
 )
diff --git a/torchchat/utils/quantize.py b/torchchat/utils/quantize.py
@@ -56,6 +56,7 @@
 from torchao.experimental.quant_api import (
     int8_dynamic_activation_intx_weight,
     IntxWeightEmbeddingQuantizer,
+    UIntxWeightOnlyLinearQuantizer,
 )
 from torchao.quantization.granularity import (
     PerGroup,
@@ -137,12 +138,12 @@ def quantize_model(
                 group_size = q_kwargs["groupsize"]
                 bit_width = q_kwargs["bitwidth"]
                 has_weight_zeros = q_kwargs["has_weight_zeros"]
-                granularity = PerRow() if group_size == -1 else PerGroup(group_size) 
+                granularity = PerRow() if group_size == -1 else PerGroup(group_size)
                 weight_dtype = getattr(torch, f"int{bit_width}")
 
                 try:
                     quantize_(
-                        model, 
+                        model,
                         int8_dynamic_activation_intx_weight(
                             weight_dtype=weight_dtype,
                             granularity=granularity,
@@ -154,7 +155,7 @@ def quantize_model(
                     print("Encountered error during quantization: {e}")
                     print("Trying with PlainLayout")
                     quantize_(
-                        model, 
+                        model,
                         int8_dynamic_activation_intx_weight(
                             weight_dtype=weight_dtype,
                             granularity=granularity,
@@ -946,38 +947,5 @@ def quantized_model(self) -> nn.Module:
     "linear:int4": Int4WeightOnlyQuantizer,
     "linear:a8wxdq": None, # uses quantize_ API
     "linear:a8w4dq": Int8DynActInt4WeightQuantizer,
+    "linear:afpwx": UIntxWeightOnlyLinearQuantizer,
 }
-
-try:
-    import importlib.util
-    import os
-    import sys
-
-    torchao_build_path = f"{os.getcwd()}/torchao-build"
-
-    # Try loading quantizer
-    torchao_experimental_quant_api_spec = importlib.util.spec_from_file_location(
-        "torchao_experimental_quant_api",
-        f"{torchao_build_path}/src/ao/torchao/experimental/quant_api.py",
-    )
-    torchao_experimental_quant_api = importlib.util.module_from_spec(
-        torchao_experimental_quant_api_spec
-    )
-    sys.modules["torchao_experimental_quant_api"] = torchao_experimental_quant_api
-    torchao_experimental_quant_api_spec.loader.exec_module(
-        torchao_experimental_quant_api
-    )
-    from torchao_experimental_quant_api import UIntxWeightOnlyLinearQuantizer
-    quantizer_class_dict["linear:afpwx"] = UIntxWeightOnlyLinearQuantizer
-
-    # Try loading custom op
-    try:
-        libname = "libtorchao_ops_mps_aten.dylib"
-        libpath = f"{torchao_build_path}/cmake-out/lib/{libname}"
-        torch.ops.load_library(libpath)
-        print("Loaded torchao mps ops.")
-    except Exception as e:
-        print("Unable to load torchao mps ops library.")
-
-except Exception as e:
-    print("Unable to import torchao experimental quant_api with error: ", e)