Commit ccd3be9

hotfix: revert torch.library register (#709)
We observe performance degradation for small operations in flashinfer v0.2 because of the overhead of `torch.library.custom_op` introduced in #554. This PR disables the torch custom operator registrations for now; we can add them back later with a lightweight registration scheme along the lines of https://github.com/vllm-project/vllm/blob/36e76700453924c8d421db99af70a88a1df835cd/vllm/utils.py#L1660-L1674 cc @zhyncs @abcdabcd987 @youkaichao
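For reference, the lightweight registration pattern linked above defines ops directly on a `torch.library.Library` fragment instead of going through `torch.library.custom_op`. Below is a minimal sketch of that approach, not part of this commit: the name `direct_register_custom_op` and the `"flashinfer"` namespace are illustrative, it assumes PyTorch >= 2.5 for `torch.library.infer_schema`, and `Library._register_fake` is a private API.

```python
# Sketch (not part of this commit): lightweight custom-op registration in the
# style of the linked vLLM utility. Assumes PyTorch >= 2.5 for
# torch.library.infer_schema; the namespace and function name are illustrative.
from typing import Callable, Optional, Sequence

import torch
from torch.library import Library

# A FRAGMENT library lets ops be added incrementally under one namespace.
flashinfer_lib = Library("flashinfer", "FRAGMENT")  # hypothetical namespace


def direct_register_custom_op(
    op_name: str,
    op_func: Callable,
    mutates_args: Sequence[str] = (),
    fake_impl: Optional[Callable] = None,
) -> None:
    # Infer the schema once at registration time instead of paying the
    # per-call overhead that torch.library.custom_op adds for small ops.
    schema = torch.library.infer_schema(op_func, mutates_args=mutates_args)
    flashinfer_lib.define(op_name + schema)
    flashinfer_lib.impl(op_name, op_func, "CUDA")
    if fake_impl is not None:
        # Private API; vLLM uses it to attach the fake (meta) implementation.
        flashinfer_lib._register_fake(op_name, fake_impl)
```

The registration cost is paid once per op at import time, and calls then go through the dispatcher table directly rather than the heavier `custom_op` wrapper.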
1 parent 4ba91c0 commit ccd3be9

File tree: 1 file changed (+13, -8)

flashinfer/utils.py

@@ -236,19 +236,24 @@ def register_custom_op(
     device_types: Optional[Union[str, Sequence[str]]] = None,
     schema: Optional[str] = None,
 ) -> Callable:
-    return torch.library.custom_op(
-        name,
-        fn,
-        mutates_args=mutates_args,
-        device_types=device_types,
-        schema=schema,
-    )
+    # NOTE(Zihao): torch.library.custom_op has significant overhead as mentioned in the following link
+    # https://github.com/vllm-project/vllm/blob/36e76700453924c8d421db99af70a88a1df835cd/vllm/utils.py#L1660-L1674
+
+    # return torch.library.custom_op(
+    #     name,
+    #     fn,
+    #     mutates_args=mutates_args,
+    #     device_types=device_types,
+    #     schema=schema,
+    # )
+    return lambda x: x

 def register_fake_op(
     name: str,
     fn: Optional[Callable] = None,
 ) -> Callable:
-    return torch.library.register_fake(name, fn)
+    # return torch.library.register_fake(name, fn)
+    return lambda x: x


 def get_cuda_stream(device: torch.device) -> int:
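After this patch, `register_custom_op` and `register_fake_op` return an identity decorator, so decorated functions remain plain Python callables and are invoked directly, with no dispatcher overhead. A usage sketch of a call site after the change; the op name and function bodies are illustrative, not taken from the flashinfer codebase:

```python
import torch

from flashinfer.utils import register_custom_op, register_fake_op


# With this commit, the decorator returns the function unchanged, so
# `fused_add` stays a plain Python function.
@register_custom_op("flashinfer::fused_add", mutates_args=())
def fused_add(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    return a + b


# The fake (meta) implementation is simply ignored while registration is off.
@register_fake_op("flashinfer::fused_add")
def _fused_add_fake(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    return torch.empty_like(a)


out = fused_add(torch.ones(4), torch.ones(4))  # direct Python call
```

Because call sites keep the same decorator syntax, re-enabling a lighter registration path later only requires changing these two helpers, not the callers.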
