From 1dbd618fc7b46955bd9017843c1f6f78c22a637a Mon Sep 17 00:00:00 2001 From: Darryl Abbate Date: Fri, 29 Aug 2025 16:17:55 -0700 Subject: [PATCH 01/17] prov/efa: Remove `efa_domain` references in HMEM utility code Signed-off-by: Darryl Abbate --- prov/efa/src/efa_hmem.c | 4 ++-- prov/efa/src/efa_hmem.h | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/prov/efa/src/efa_hmem.c b/prov/efa/src/efa_hmem.c index 5a9475d4b53..9fcd063d3bd 100644 --- a/prov/efa/src/efa_hmem.c +++ b/prov/efa/src/efa_hmem.c @@ -29,7 +29,7 @@ static size_t efa_max_eager_msg_size_with_largest_header() { * * @return 0 */ -static int efa_domain_hmem_info_init_protocol_thresholds(enum fi_hmem_iface iface) +static int efa_hmem_info_init_protocol_thresholds(enum fi_hmem_iface iface) { struct efa_hmem_info *info = &g_efa_hmem_info[iface]; size_t tmp_value; @@ -298,7 +298,7 @@ efa_hmem_info_init_iface(enum fi_hmem_iface iface) EFA_INFO(FI_LOG_CORE, "%s P2P support is not available.\n", fi_tostr(&iface, FI_TYPE_HMEM_IFACE)); } - efa_domain_hmem_info_init_protocol_thresholds(iface); + efa_hmem_info_init_protocol_thresholds(iface); } /** diff --git a/prov/efa/src/efa_hmem.h b/prov/efa/src/efa_hmem.h index 858b7035883..c04c2139063 100644 --- a/prov/efa/src/efa_hmem.h +++ b/prov/efa/src/efa_hmem.h @@ -33,8 +33,6 @@ struct efa_hmem_info { extern struct efa_hmem_info g_efa_hmem_info[OFI_HMEM_MAX]; -struct efa_domain; - int efa_hmem_validate_p2p_opt(enum fi_hmem_iface iface, int p2p_opt, uint32_t api_version); int efa_hmem_info_initialize(); From c894e191ecd598feb6c317e1e5464c082736ad5f Mon Sep 17 00:00:00 2001 From: Darryl Abbate Date: Mon, 8 Sep 2025 12:43:24 -0700 Subject: [PATCH 02/17] prov/efa: Fix typo in HMEM/Neuron logging Signed-off-by: Darryl Abbate --- prov/efa/src/efa_hmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/efa/src/efa_hmem.c b/prov/efa/src/efa_hmem.c index 9fcd063d3bd..447e9fd4174 100644 --- a/prov/efa/src/efa_hmem.c +++ 
b/prov/efa/src/efa_hmem.c @@ -78,7 +78,7 @@ static int efa_hmem_info_init_protocol_thresholds(enum fi_hmem_iface iface) EFA_WARN(FI_LOG_CORE, "The environment variable FI_EFA_INTER_MAX_MEDIUM_MESSAGE_SIZE was set, " "but EFA HMEM via Neuron API only supports eager and runting read protocols. " - "The variable will not modify CUDA memory run config.\n"); + "The variable will not modify Neuron memory run config.\n"); } break; case FI_HMEM_SYNAPSEAI: From a2d6229559e21305da7ec4c80ea6708df15c1c5b Mon Sep 17 00:00:00 2001 From: Darryl Abbate Date: Tue, 2 Sep 2025 15:33:55 -0700 Subject: [PATCH 03/17] prov/efa: Improve some `FI_HMEM` iface logging This simply utilizes `fi_tostr()` to log the interface identifier instead of the integer value. Signed-off-by: Darryl Abbate --- prov/efa/src/efa_mr.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/prov/efa/src/efa_mr.c b/prov/efa/src/efa_mr.c index fc62d21058d..128b54717c9 100644 --- a/prov/efa/src/efa_mr.c +++ b/prov/efa/src/efa_mr.c @@ -196,8 +196,8 @@ static int efa_mr_hmem_setup(struct efa_mr *efa_mr, efa_mr->peer.iface = attr->iface; } else { EFA_WARN(FI_LOG_MR, - "FI_HMEM is not initialized for device type %d\n", - attr->iface); + "%s is not initialized\n", + fi_tostr(&attr->iface, FI_TYPE_HMEM_IFACE)); return -FI_ENOSYS; } } else { @@ -208,8 +208,8 @@ static int efa_mr_hmem_setup(struct efa_mr *efa_mr, * whatever reason. 
*/ EFA_WARN_ONCE(FI_LOG_MR, - "FI_HMEM support is disabled, assuming FI_HMEM_SYSTEM not type: %d.\n", - attr->iface); + "FI_HMEM support is disabled, assuming FI_HMEM_SYSTEM instead of %s\n", + fi_tostr(&attr->iface, FI_TYPE_HMEM_IFACE)); efa_mr->peer.iface = FI_HMEM_SYSTEM; } @@ -327,7 +327,8 @@ static int efa_mr_cache_regattr(struct fid *fid, const struct fi_mr_attr *attr, if (!ofi_hmem_is_initialized(attr->iface)) { EFA_WARN(FI_LOG_MR, - "Cannot register memory for uninitialized iface\n"); + "Cannot register memory for uninitialized iface (%s)\n", + fi_tostr(&attr->iface, FI_TYPE_HMEM_IFACE)); return -FI_ENOSYS; } @@ -990,7 +991,8 @@ static int efa_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr, if (!ofi_hmem_is_initialized(attr->iface)) { EFA_WARN(FI_LOG_MR, - "Cannot register memory for uninitialized iface\n"); + "Cannot register memory for uninitialized iface (%s)\n", + fi_tostr(&attr->iface, FI_TYPE_HMEM_IFACE)); return -FI_ENOSYS; } From a24dd59caefd8abddf6a80332ad85bf6cb1977e1 Mon Sep 17 00:00:00 2001 From: Darryl Abbate Date: Fri, 29 Aug 2025 16:08:37 -0700 Subject: [PATCH 04/17] prov/efa: Improve `EFA_HMEM_IFACE_FOREACH*` macros This actually assigns the fi_hmem_iface enum value to the provided identifier, rather than providing an index into efa_hmem_ifaces Signed-off-by: Darryl Abbate --- prov/efa/src/efa_hmem.c | 7 ++++--- prov/efa/src/efa_hmem.h | 14 +++++++++----- prov/efa/src/efa_prov_info.c | 4 +--- prov/efa/src/efa_user_info.c | 7 +++---- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 12 ++++++------ 5 files changed, 23 insertions(+), 21 deletions(-) diff --git a/prov/efa/src/efa_hmem.c b/prov/efa/src/efa_hmem.c index 447e9fd4174..8a28bf2a664 100644 --- a/prov/efa/src/efa_hmem.c +++ b/prov/efa/src/efa_hmem.c @@ -365,7 +365,8 @@ int efa_hmem_validate_p2p_opt(enum fi_hmem_iface iface, int p2p_opt, uint32_t ap */ int efa_hmem_info_initialize() { - int ret = 0, i = 0; + int ret = 0; + enum fi_hmem_iface iface; if(g_efa_selected_device_cnt <= 0) {
return -FI_ENODEV; @@ -373,8 +374,8 @@ int efa_hmem_info_initialize() memset(g_efa_hmem_info, 0, OFI_HMEM_MAX * sizeof(struct efa_hmem_info)); - EFA_HMEM_IFACE_FOREACH(i) { - efa_hmem_info_init_iface(efa_hmem_ifaces[i]); + EFA_HMEM_IFACE_FOREACH(iface) { + efa_hmem_info_init_iface(iface); } return ret; diff --git a/prov/efa/src/efa_hmem.h b/prov/efa/src/efa_hmem.h index c04c2139063..8d95d7e59f6 100644 --- a/prov/efa/src/efa_hmem.h +++ b/prov/efa/src/efa_hmem.h @@ -7,11 +7,15 @@ #include "ofi_hmem.h" #include "efa_mr.h" -#define EFA_HMEM_IFACE_FOREACH(var) \ - for ((var) = 0; (var) < ((sizeof efa_hmem_ifaces) / (sizeof (enum fi_hmem_iface))); ++(var)) - -#define EFA_HMEM_IFACE_FOREACH_NON_SYSTEM(var) \ - for ((var) = 1; (var) < ((sizeof efa_hmem_ifaces) / (sizeof (enum fi_hmem_iface))); ++(var)) +#define EFA_HMEM_IFACE_FOREACH_FROM(var, start) \ + for ( \ + const enum fi_hmem_iface *_p = &efa_hmem_ifaces[start]; \ + _p < &efa_hmem_ifaces[sizeof efa_hmem_ifaces / sizeof (enum fi_hmem_iface)] && ((var) = *_p, 1); \ + _p++ \ + ) + +#define EFA_HMEM_IFACE_FOREACH(var) EFA_HMEM_IFACE_FOREACH_FROM(var, 0) +#define EFA_HMEM_IFACE_FOREACH_NON_SYSTEM(var) EFA_HMEM_IFACE_FOREACH_FROM(var, 1) /* Order matters */ static const enum fi_hmem_iface efa_hmem_ifaces[] = { diff --git a/prov/efa/src/efa_prov_info.c b/prov/efa/src/efa_prov_info.c index fe1b3300b42..1eebeb258c8 100644 --- a/prov/efa/src/efa_prov_info.c +++ b/prov/efa/src/efa_prov_info.c @@ -391,7 +391,6 @@ static int efa_prov_info_set_nic_attr(struct fi_info *prov_info, struct efa_devi #if HAVE_CUDA || HAVE_NEURON || HAVE_SYNAPSEAI void efa_prov_info_direct_set_hmem_flags(struct fi_info *prov_info) { - int i; enum fi_hmem_iface iface; struct efa_hmem_info *hmem_info; @@ -414,8 +413,7 @@ void efa_prov_info_direct_set_hmem_flags(struct fi_info *prov_info) } /* EFA direct only supports HMEM when p2p support is available */ - EFA_HMEM_IFACE_FOREACH_NON_SYSTEM(i) { - iface = efa_hmem_ifaces[i]; + 
EFA_HMEM_IFACE_FOREACH_NON_SYSTEM(iface) { hmem_info = &g_efa_hmem_info[iface]; if (hmem_info->initialized && !hmem_info->p2p_supported_by_device) { EFA_INFO(FI_LOG_CORE, diff --git a/prov/efa/src/efa_user_info.c b/prov/efa/src/efa_user_info.c index f983f3e40f6..52ec84e0a31 100644 --- a/prov/efa/src/efa_user_info.c +++ b/prov/efa/src/efa_user_info.c @@ -186,7 +186,7 @@ bool efa_user_info_should_support_hmem(int version) { bool any_hmem, rdma_allowed; char *extra_info = ""; - int i; + enum fi_hmem_iface iface; /* Note that the default behavior of EFA provider is different between * libfabric API version when CUDA is used as HMEM system. @@ -220,12 +220,11 @@ bool efa_user_info_should_support_hmem(int version) } any_hmem = false; - EFA_HMEM_IFACE_FOREACH_NON_SYSTEM(i) { - enum fi_hmem_iface hmem_iface = efa_hmem_ifaces[i]; + EFA_HMEM_IFACE_FOREACH_NON_SYSTEM(iface) { /* Note that .initialized doesn't necessarily indicate there are hardware devices available, only that the libraries are available. */ - if (hmem_ops[hmem_iface].initialized) { + if (hmem_ops[iface].initialized) { any_hmem = true; } } diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 303aaac11d6..ca82e337cc5 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -510,7 +510,7 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, { struct efa_domain *efa_domain = NULL; struct efa_rdm_ep *efa_rdm_ep = NULL; - int ret, retv, i; + int ret, retv; enum fi_hmem_iface iface; efa_rdm_ep = calloc(1, sizeof(*efa_rdm_ep)); @@ -615,8 +615,7 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, * time. 
Refactor to handle multiple initialized interfaces to impose * tighter requirements for the default p2p opt */ - EFA_HMEM_IFACE_FOREACH_NON_SYSTEM(i) { - iface = efa_hmem_ifaces[i]; + EFA_HMEM_IFACE_FOREACH_NON_SYSTEM(iface) { if (g_efa_hmem_info[iface].initialized && g_efa_hmem_info[iface].p2p_supported_by_device) { /* If user is using libfabric API 1.18 or later, by default EFA @@ -1458,7 +1457,8 @@ ssize_t efa_rdm_ep_cancel(fid_t fid_ep, void *context) */ static int efa_rdm_ep_set_fi_hmem_p2p_opt(struct efa_rdm_ep *efa_rdm_ep, int opt) { - int i, err; + int err; + enum fi_hmem_iface iface; /* * Check the opt's validity against the first initialized non-system FI_HMEM @@ -1469,9 +1469,9 @@ static int efa_rdm_ep_set_fi_hmem_p2p_opt(struct efa_rdm_ep *efa_rdm_ep, int opt * time. Refactor to handle multiple initialized interfaces to impose * tighter restrictions on valid p2p options. */ - EFA_HMEM_IFACE_FOREACH_NON_SYSTEM(i) { + EFA_HMEM_IFACE_FOREACH_NON_SYSTEM(iface) { err = efa_hmem_validate_p2p_opt( - efa_hmem_ifaces[i], opt, + iface, opt, efa_rdm_ep->base_ep.info->fabric_attr->api_version); if (err == -FI_ENODATA) continue; From 3d701de2a2590f54f2e17e45bb5ab1724dc0422a Mon Sep 17 00:00:00 2001 From: Darryl Abbate Date: Fri, 31 Oct 2025 14:27:32 -0700 Subject: [PATCH 05/17] prov/efa: Generalize HMEM init guards in prov info init Signed-off-by: Darryl Abbate --- prov/efa/src/efa_prov_info.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/prov/efa/src/efa_prov_info.c b/prov/efa/src/efa_prov_info.c index 1eebeb258c8..edc3f76a6df 100644 --- a/prov/efa/src/efa_prov_info.c +++ b/prov/efa/src/efa_prov_info.c @@ -393,6 +393,7 @@ void efa_prov_info_direct_set_hmem_flags(struct fi_info *prov_info) { enum fi_hmem_iface iface; struct efa_hmem_info *hmem_info; + bool any_hmem_initialized = false; assert(prov_info->ep_attr->type == FI_EP_RDM); @@ -406,12 +407,16 @@ void efa_prov_info_direct_set_hmem_flags(struct fi_info *prov_info) 
} /* Check if HMEM libraries are available at runtime */ - if (!ofi_hmem_is_initialized(FI_HMEM_CUDA) && - !ofi_hmem_is_initialized(FI_HMEM_NEURON) && - !ofi_hmem_is_initialized(FI_HMEM_SYNAPSEAI)) { - return; + EFA_HMEM_IFACE_FOREACH_NON_SYSTEM(iface) { + if (ofi_hmem_is_initialized(iface)) { + any_hmem_initialized = true; + break; + } } + if (!any_hmem_initialized) + return; + /* EFA direct only supports HMEM when p2p support is available */ EFA_HMEM_IFACE_FOREACH_NON_SYSTEM(iface) { hmem_info = &g_efa_hmem_info[iface]; @@ -557,6 +562,8 @@ int efa_prov_info_alloc_for_rdm(struct fi_info **prov_info_rdm_ptr, uint64_t efa_domain_caps = FI_LOCAL_COMM | FI_REMOTE_COMM; struct fi_info *prov_info_rdm; + enum fi_hmem_iface iface; + bool any_hmem_initialized = false; assert(device->rdm_info); @@ -675,10 +682,14 @@ int efa_prov_info_alloc_for_rdm(struct fi_info **prov_info_rdm_ptr, prov_info_rdm->rx_attr->size = efa_env.rx_size; } + EFA_HMEM_IFACE_FOREACH_NON_SYSTEM(iface) { + if (ofi_hmem_is_initialized(iface)) { + any_hmem_initialized = true; + break; + } + } /* EFA RDM can support HMEM even if p2p support is not available */ - if ((ofi_hmem_is_initialized(FI_HMEM_CUDA) || - ofi_hmem_is_initialized(FI_HMEM_NEURON) || - ofi_hmem_is_initialized(FI_HMEM_SYNAPSEAI))) { + if (any_hmem_initialized) { prov_info_rdm->caps |= FI_HMEM; prov_info_rdm->tx_attr->caps |= FI_HMEM; prov_info_rdm->rx_attr->caps |= FI_HMEM; From 3ef73aef02434ba91fd06df17fd9854840eb7917 Mon Sep 17 00:00:00 2001 From: Darryl Abbate Date: Fri, 31 Oct 2025 14:13:15 -0700 Subject: [PATCH 06/17] prov/efa: Add `EFA_HAVE_NON_SYSTEM_HMEM` macro Signed-off-by: Darryl Abbate --- prov/efa/src/efa_hmem.h | 6 ++++++ prov/efa/src/efa_prov_info.c | 2 +- prov/efa/src/efa_user_info.c | 2 +- prov/efa/test/efa_unit_test_info.c | 4 ++-- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/prov/efa/src/efa_hmem.h b/prov/efa/src/efa_hmem.h index 8d95d7e59f6..4bc72c305e3 100644 --- a/prov/efa/src/efa_hmem.h +++ 
b/prov/efa/src/efa_hmem.h @@ -7,6 +7,12 @@ #include "ofi_hmem.h" #include "efa_mr.h" +#if HAVE_CUDA || HAVE_NEURON || HAVE_SYNAPSEAI +# define EFA_HAVE_NON_SYSTEM_HMEM 1 +#else +# define EFA_HAVE_NON_SYSTEM_HMEM 0 +#endif + #define EFA_HMEM_IFACE_FOREACH_FROM(var, start) \ for ( \ const enum fi_hmem_iface *_p = &efa_hmem_ifaces[start]; \ diff --git a/prov/efa/src/efa_prov_info.c b/prov/efa/src/efa_prov_info.c index edc3f76a6df..a8171f1d2af 100644 --- a/prov/efa/src/efa_prov_info.c +++ b/prov/efa/src/efa_prov_info.c @@ -388,7 +388,7 @@ static int efa_prov_info_set_nic_attr(struct fi_info *prov_info, struct efa_devi return ret; } -#if HAVE_CUDA || HAVE_NEURON || HAVE_SYNAPSEAI +#if EFA_HAVE_NON_SYSTEM_HMEM void efa_prov_info_direct_set_hmem_flags(struct fi_info *prov_info) { enum fi_hmem_iface iface; diff --git a/prov/efa/src/efa_user_info.c b/prov/efa/src/efa_user_info.c index 52ec84e0a31..7f053b37e43 100644 --- a/prov/efa/src/efa_user_info.c +++ b/prov/efa/src/efa_user_info.c @@ -175,7 +175,7 @@ int efa_user_info_check_domain_object(const struct fi_info *hints, return 0; } -#if HAVE_CUDA || HAVE_NEURON || HAVE_SYNAPSEAI +#if EFA_HAVE_NON_SYSTEM_HMEM /** * @brief determine if EFA provider should claim support of FI_HMEM in info * @param[in] version libfabric API version used by user diff --git a/prov/efa/test/efa_unit_test_info.c b/prov/efa/test/efa_unit_test_info.c index 47f31ab7745..7b029319bdc 100644 --- a/prov/efa/test/efa_unit_test_info.c +++ b/prov/efa/test/efa_unit_test_info.c @@ -66,7 +66,7 @@ void test_info_rdm_attributes() assert_int_equal(info->ep_attr->max_msg_size, UINT64_MAX); assert_int_equal(info->domain_attr->progress, FI_PROGRESS_MANUAL); assert_int_equal(info->domain_attr->control_progress, FI_PROGRESS_MANUAL); -#if HAVE_CUDA || HAVE_NEURON || HAVE_SYNAPSEAI +#if EFA_HAVE_NON_SYSTEM_HMEM assert_true(info->caps | FI_HMEM); #endif } @@ -159,7 +159,7 @@ void test_info_direct_attributes_rma() /** * @brief Verify that efa direct only supports HMEM with 
p2p */ -#if HAVE_CUDA || HAVE_NEURON || HAVE_SYNAPSEAI +#if EFA_HAVE_NON_SYSTEM_HMEM void test_info_direct_hmem_support_p2p() { struct fi_info *info; From cbd7502668df0675ecc5dc88c6b43d3effffe714 Mon Sep 17 00:00:00 2001 From: Darryl Abbate Date: Mon, 8 Sep 2025 14:54:44 -0700 Subject: [PATCH 07/17] prov/efa: Cache default max eager msg size This is a constant value; no need to re-compute every time the function is called. Signed-off-by: Darryl Abbate --- prov/efa/src/efa_hmem.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/prov/efa/src/efa_hmem.c b/prov/efa/src/efa_hmem.c index 8a28bf2a664..8eb2825af51 100644 --- a/prov/efa/src/efa_hmem.c +++ b/prov/efa/src/efa_hmem.c @@ -9,11 +9,16 @@ struct efa_hmem_info g_efa_hmem_info[OFI_HMEM_MAX]; #if HAVE_CUDA || HAVE_NEURON static size_t efa_max_eager_msg_size_with_largest_header() { - int mtu_size; + static bool computed = false; + static size_t size = 0; - mtu_size = g_efa_selected_device_list[0].ibv_port_attr.max_msg_sz; + if (!computed) { + assert(g_efa_selected_device_list); + size = g_efa_selected_device_list[0].ibv_port_attr.max_msg_sz - efa_rdm_pkt_type_get_max_hdr_size(); + computed = true; + } - return mtu_size - efa_rdm_pkt_type_get_max_hdr_size(); + return size; } #else static size_t efa_max_eager_msg_size_with_largest_header() { From 15664d3c5f8dc6241d51fb9d7bba6427fd6dc9d9 Mon Sep 17 00:00:00 2001 From: Darryl Abbate Date: Tue, 2 Sep 2025 14:41:53 -0700 Subject: [PATCH 08/17] prov/efa: Flatten `efa_mr_peer.device` This was ostensibly designed to match fi_mr_attr.device, but should instead be treated more like a subset of ofi_mr_info, where device is simply a uint64_t. Aliasing fi_mr_attr.device is already silly since fi_mr_attr.iface should tell you the device type. 
Signed-off-by: Darryl Abbate --- prov/efa/src/efa_mr.c | 9 ++++----- prov/efa/src/efa_mr.h | 7 +------ prov/efa/src/rdm/efa_rdm_pke_utils.c | 2 +- prov/efa/src/rdm/efa_rdm_pke_utils.h | 2 +- 4 files changed, 7 insertions(+), 13 deletions(-) diff --git a/prov/efa/src/efa_mr.c b/prov/efa/src/efa_mr.c index 128b54717c9..1746f543bb5 100644 --- a/prov/efa/src/efa_mr.c +++ b/prov/efa/src/efa_mr.c @@ -213,13 +213,12 @@ static int efa_mr_hmem_setup(struct efa_mr *efa_mr, efa_mr->peer.iface = FI_HMEM_SYSTEM; } - /* efa_mr->peer.device is an union. Setting reserved to 0 cleared everything in it (cuda, neuron, synapseai etc) */ - efa_mr->peer.device.reserved = 0; + efa_mr->peer.device = 0; efa_mr->peer.flags &= ~OFI_HMEM_DATA_DEV_REG_HANDLE; efa_mr->peer.hmem_data = NULL; if (efa_mr->peer.iface == FI_HMEM_CUDA) { efa_mr->needs_sync = true; - efa_mr->peer.device.cuda = attr->device.cuda; + efa_mr->peer.device = attr->device.cuda; /* Only attempt GDRCopy registrations for efa rdm path */ if (efa_mr->domain->info_type == EFA_INFO_RDM && !(flags & FI_MR_DMABUF) && cuda_is_gdrcopy_enabled()) { @@ -237,9 +236,9 @@ static int efa_mr_hmem_setup(struct efa_mr *efa_mr, } } } else if (attr->iface == FI_HMEM_NEURON) { - efa_mr->peer.device.neuron = attr->device.neuron; + efa_mr->peer.device = attr->device.neuron; } else if (attr->iface == FI_HMEM_SYNAPSEAI) { - efa_mr->peer.device.synapseai = attr->device.synapseai; + efa_mr->peer.device = attr->device.synapseai; } return FI_SUCCESS; diff --git a/prov/efa/src/efa_mr.h b/prov/efa/src/efa_mr.h index 8ef6ec03b7e..558dc6c3306 100644 --- a/prov/efa/src/efa_mr.h +++ b/prov/efa/src/efa_mr.h @@ -12,12 +12,7 @@ */ struct efa_mr_peer { enum fi_hmem_iface iface; - union { - uint64_t reserved; - uint64_t cuda; - int neuron; - int synapseai; - } device; + uint64_t device; uint64_t flags; void *hmem_data; }; diff --git a/prov/efa/src/rdm/efa_rdm_pke_utils.c b/prov/efa/src/rdm/efa_rdm_pke_utils.c index a227e7c32ce..e3489677419 100644 --- 
a/prov/efa/src/rdm/efa_rdm_pke_utils.c +++ b/prov/efa/src/rdm/efa_rdm_pke_utils.c @@ -149,7 +149,7 @@ int efa_rdm_ep_flush_queued_blocking_copy_to_hmem(struct efa_rdm_ep *ep) data, pkt_entry->payload_size); } else { bytes_copied[i] = ofi_copy_to_hmem_iov(desc->peer.iface, - desc->peer.device.reserved, + desc->peer.device, rxe->iov, rxe->iov_count, segment_offset + ep->msg_prefix_size, data, pkt_entry->payload_size); diff --git a/prov/efa/src/rdm/efa_rdm_pke_utils.h b/prov/efa/src/rdm/efa_rdm_pke_utils.h index 8c6c13b1fc9..0d6960da843 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_utils.h +++ b/prov/efa/src/rdm/efa_rdm_pke_utils.h @@ -150,7 +150,7 @@ efa_rdm_pke_copy_from_hmem_iov(struct efa_mr *iov_mr, struct efa_rdm_pke *pke, copied = ofi_copy_from_hmem_iov(pke->wiredata + payload_offset, data_size, iov_mr ? iov_mr->peer.iface : FI_HMEM_SYSTEM, - iov_mr ? iov_mr->peer.device.reserved : 0, + iov_mr ? iov_mr->peer.device : 0, ope->iov, ope->iov_count, segment_offset); } From 6b9d71b9572bca7134783a44152fc7ed7e69bd54 Mon Sep 17 00:00:00 2001 From: Darryl Abbate Date: Tue, 2 Sep 2025 13:52:47 -0700 Subject: [PATCH 09/17] prov/efa: Improve efa_copy_(to|from)_hmem() Signed-off-by: Darryl Abbate --- prov/efa/src/efa_hmem.h | 69 ++++++++++++++++++++------------------- prov/efa/src/efa_tp_def.h | 61 ++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 34 deletions(-) diff --git a/prov/efa/src/efa_hmem.h b/prov/efa/src/efa_hmem.h index 4bc72c305e3..fbbf4151ea1 100644 --- a/prov/efa/src/efa_hmem.h +++ b/prov/efa/src/efa_hmem.h @@ -6,6 +6,7 @@ #include "ofi_hmem.h" #include "efa_mr.h" +#include "efa_tp.h" #if HAVE_CUDA || HAVE_NEURON || HAVE_SYNAPSEAI # define EFA_HAVE_NON_SYSTEM_HMEM 1 @@ -50,32 +51,32 @@ int efa_hmem_info_initialize(); * @brief Copy data from a hmem device to a system buffer * * @param[in] desc Pointer to a memory registration descriptor - * @param[out] buff Destination system memory buffer + * @param[out] dest Destination system memory buffer * 
@param[in] src Source hmem device memory * @param[in] size Data size in bytes to copy * @return FI_SUCCESS status code on success, or an error code. */ -static inline int efa_copy_from_hmem(void *desc, void *buff, const void *src, size_t size) +static inline int efa_copy_from_hmem(void *desc, void *dest, const void *src, size_t size) { - uint64_t device = 0, flags = 0; - enum fi_hmem_iface iface = FI_HMEM_SYSTEM; - void *hmem_data = NULL; - - if (desc) { - iface = ((struct efa_mr *)desc)->peer.iface; - device = ((struct efa_mr *)desc)->peer.device.reserved; - flags = ((struct efa_mr *)desc)->peer.flags; - hmem_data = ((struct efa_mr *)desc)->peer.hmem_data; - } + struct efa_mr_peer peer = { .iface = FI_HMEM_SYSTEM }; + + if (desc) + peer = ((struct efa_mr *) desc)->peer; - if (FI_HMEM_CUDA == iface && (flags & OFI_HMEM_DATA_DEV_REG_HANDLE)) { - assert(hmem_data); + if (peer.flags & OFI_HMEM_DATA_DEV_REG_HANDLE) { + assert(peer.hmem_data); + switch (peer.iface) { /* TODO: Fine tune the max data size to switch from gdrcopy to cudaMemcpy */ - cuda_gdrcopy_from_dev((uint64_t)hmem_data, buff, src, size); - return FI_SUCCESS; + case FI_HMEM_CUDA: + efa_tracepoint(dev_reg_copy_from_hmem, &peer, dest, src, size); + return ofi_hmem_dev_reg_copy_from_hmem(peer.iface, (uint64_t) peer.hmem_data, dest, src, size); + default: + break; + } } - return ofi_copy_from_hmem(iface, device, buff, src, size); + efa_tracepoint(copy_from_hmem, &peer, dest, src, size); + return ofi_copy_from_hmem(peer.iface, peer.device, dest, src, size); }; /** @@ -83,31 +84,31 @@ static inline int efa_copy_from_hmem(void *desc, void *buff, const void *src, si * * @param[in] desc Pointer to a memory registration descriptor * @param[out] dest Destination hmem device memory - * @param[in] buff Source system memory buffer + * @param[in] src Source system memory buffer * @param[in] size Data size in bytes to copy * @return FI_SUCCESS status code on success, or an error code. 
*/ -static inline int efa_copy_to_hmem(void *desc, void *dest, const void *buff, size_t size) +static inline int efa_copy_to_hmem(void *desc, void *dest, const void *src, size_t size) { - uint64_t device = 0, flags = 0; - enum fi_hmem_iface iface = FI_HMEM_SYSTEM; - void *hmem_data = NULL; - - if (desc) { - iface = ((struct efa_mr *)desc)->peer.iface; - device = ((struct efa_mr *)desc)->peer.device.reserved; - flags = ((struct efa_mr *)desc)->peer.flags; - hmem_data = ((struct efa_mr *)desc)->peer.hmem_data; - } + struct efa_mr_peer peer = { .iface = FI_HMEM_SYSTEM }; + + if (desc) + peer = ((struct efa_mr *) desc)->peer; - if (FI_HMEM_CUDA == iface && (flags & OFI_HMEM_DATA_DEV_REG_HANDLE)) { - assert(hmem_data); + if (peer.flags & OFI_HMEM_DATA_DEV_REG_HANDLE) { + assert(peer.hmem_data); + switch (peer.iface) { /* TODO: Fine tune the max data size to switch from gdrcopy to cudaMemcpy */ - cuda_gdrcopy_to_dev((uint64_t)hmem_data, dest, buff, size); - return FI_SUCCESS; + case FI_HMEM_CUDA: + efa_tracepoint(dev_reg_copy_to_hmem, &peer, dest, src, size); + return ofi_hmem_dev_reg_copy_to_hmem(peer.iface, (uint64_t) peer.hmem_data, dest, src, size); + default: + break; + } } - return ofi_copy_to_hmem(iface, device, dest, buff, size); + efa_tracepoint(copy_to_hmem, &peer, dest, src, size); + return ofi_copy_to_hmem(peer.iface, peer.device, dest, src, size); }; ssize_t efa_copy_from_hmem_iov(void **desc, char *buff, size_t buff_size, const struct iovec *hmem_iov, size_t iov_count); diff --git a/prov/efa/src/efa_tp_def.h b/prov/efa/src/efa_tp_def.h index 8598cd4d9fb..ded496d2d05 100644 --- a/prov/efa/src/efa_tp_def.h +++ b/prov/efa/src/efa_tp_def.h @@ -11,6 +11,8 @@ #define _EFA_TP_DEF_H #include +#include +#include "efa_mr.h" #define EFA_TP_PROV efa @@ -233,6 +235,65 @@ LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_TP_PROV, data_path_direct_process_completion, #endif /* HAVE_EFA_DATA_PATH_DIRECT */ +LTTNG_UST_TRACEPOINT_ENUM(efa, hmem_iface, + LTTNG_UST_TP_ENUM_VALUES( + 
lttng_ust_field_enum_value("FI_HMEM_SYSTEM", FI_HMEM_SYSTEM) + lttng_ust_field_enum_value("FI_HMEM_CUDA", FI_HMEM_CUDA) + lttng_ust_field_enum_value("FI_HMEM_ROCR", FI_HMEM_ROCR) + lttng_ust_field_enum_value("FI_HMEM_NEURON", FI_HMEM_NEURON) + lttng_ust_field_enum_value("FI_HMEM_SYNAPSEAI", FI_HMEM_SYNAPSEAI) + ) +) + +#define HMEM_COPY_ARGS \ + struct efa_mr_peer *, peer, \ + void *, dest, \ + const void *, src, \ + size_t, size + +#define HMEM_COMMON_FIELDS \ + lttng_ust_field_enum(efa, hmem_iface, int, iface, peer->iface) \ + lttng_ust_field_integer_hex(void *, dest, dest) \ + lttng_ust_field_integer_hex(void *, src, src) \ + lttng_ust_field_integer(size_t, size, size) + +#define HMEM_COPY_FIELDS \ + HMEM_COMMON_FIELDS \ + lttng_ust_field_integer(uint64_t, device, peer->device) + +#define HMEM_DEV_REG_COPY_FIELDS \ + HMEM_COMMON_FIELDS \ + lttng_ust_field_integer_hex(void *, handle, peer->hmem_data) + + +LTTNG_UST_TRACEPOINT_EVENT_CLASS(EFA_TP_PROV, hmem_copy, + LTTNG_UST_TP_ARGS(HMEM_COPY_ARGS), + LTTNG_UST_TP_FIELDS(HMEM_COPY_FIELDS)) + +LTTNG_UST_TRACEPOINT_EVENT_CLASS(EFA_TP_PROV, hmem_dev_reg_copy, + LTTNG_UST_TP_ARGS(HMEM_COPY_ARGS), + LTTNG_UST_TP_FIELDS(HMEM_DEV_REG_COPY_FIELDS)) + +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_TP_PROV, hmem_copy, EFA_TP_PROV, + copy_to_hmem, + LTTNG_UST_TP_ARGS(HMEM_COPY_ARGS)) +LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_TP_PROV, copy_to_hmem, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) + +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_TP_PROV, hmem_copy, EFA_TP_PROV, + copy_from_hmem, + LTTNG_UST_TP_ARGS(HMEM_COPY_ARGS)) +LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_TP_PROV, copy_from_hmem, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) + +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_TP_PROV, hmem_dev_reg_copy, EFA_TP_PROV, + dev_reg_copy_to_hmem, + LTTNG_UST_TP_ARGS(HMEM_COPY_ARGS)) +LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_TP_PROV, dev_reg_copy_to_hmem, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) + +LTTNG_UST_TRACEPOINT_EVENT_INSTANCE(EFA_TP_PROV, hmem_dev_reg_copy, EFA_TP_PROV, + 
dev_reg_copy_from_hmem, + LTTNG_UST_TP_ARGS(HMEM_COPY_ARGS)) +LTTNG_UST_TRACEPOINT_LOGLEVEL(EFA_TP_PROV, dev_reg_copy_from_hmem, LTTNG_UST_TRACEPOINT_LOGLEVEL_INFO) + #endif /* _EFA_TP_DEF_H */ #include From 180f8713d71669bdac79442db09803abfdc1f3b0 Mon Sep 17 00:00:00 2001 From: Darryl Abbate Date: Fri, 21 Nov 2025 14:32:16 -0800 Subject: [PATCH 10/17] core: Add `rocr` to `fi_mr_attr.device` Signed-off-by: Darryl Abbate --- include/rdma/fi_domain.h | 1 + man/fi_mr.3.md | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/include/rdma/fi_domain.h b/include/rdma/fi_domain.h index 0e6d0acb605..3c7a4ee4d06 100644 --- a/include/rdma/fi_domain.h +++ b/include/rdma/fi_domain.h @@ -179,6 +179,7 @@ struct fi_mr_attr { int ze; int neuron; int synapseai; + int rocr; } device; void *hmem_data; size_t page_size; diff --git a/man/fi_mr.3.md b/man/fi_mr.3.md index 7f5b472b229..bc2740cf0a4 100644 --- a/man/fi_mr.3.md +++ b/man/fi_mr.3.md @@ -608,6 +608,7 @@ struct fi_mr_attr { int ze int neuron; int synapseai; + int rocr; } device; void *hmem_data; size_t page_size; @@ -809,6 +810,9 @@ field is determined by the value specified through iface. *synapseai* : For FI_HMEM_SYNAPSEAI, the device identifier for Habana Gaudi hardware. +*rocr* +: For FI_HMEM_ROCR, the device index for an AMD GPU. + ## hmem_data The hmem_data field is reserved for future use and must be null. 
From e6e05245eb15e41ab404137467abff95c2619710 Mon Sep 17 00:00:00 2001 From: Darryl Abbate Date: Tue, 16 Sep 2025 16:05:40 -0700 Subject: [PATCH 11/17] hmem/rocr: Add HSA memory allocation utilities Signed-off-by: Darryl Abbate --- include/ofi_hmem.h | 2 + src/hmem_rocr.c | 144 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 146 insertions(+) diff --git a/include/ofi_hmem.h b/include/ofi_hmem.h index b405fd92978..ed2bc77ab0a 100644 --- a/include/ofi_hmem.h +++ b/include/ofi_hmem.h @@ -169,6 +169,8 @@ int rocr_dev_reg_copy_from_hmem(uint64_t handle, void *dest, const void *src, int rocr_hmem_get_dmabuf_fd(const void *addr, uint64_t size, int *dmabuf_fd, uint64_t *offset); int rocr_hmem_put_dmabuf_fd(int fd); +void *rocr_alloc(size_t size); +void rocr_free(void *ptr); int cuda_copy_to_dev(uint64_t device, void *dev, const void *host, size_t size); int cuda_copy_from_dev(uint64_t device, void *host, const void *dev, size_t size); diff --git a/src/hmem_rocr.c b/src/hmem_rocr.c index 59bc063813e..6ea9382f0a2 100644 --- a/src/hmem_rocr.c +++ b/src/hmem_rocr.c @@ -91,6 +91,15 @@ struct hsa_ops { hsa_status_t (*hsa_shut_down)(void); hsa_status_t (*hsa_status_string)(hsa_status_t status, const char **status_string); + hsa_status_t (*hsa_memory_allocate)(hsa_region_t region, size_t size, + void **ptr); + hsa_status_t (*hsa_memory_free)(void *ptr); + hsa_status_t (*hsa_agent_iterate_regions)(hsa_agent_t agent, + hsa_status_t (*callback)(hsa_region_t, void *), + void *data); + hsa_status_t (*hsa_region_get_info)(hsa_region_t region, + hsa_region_info_t attribute, + void *value); hsa_status_t (*hsa_amd_dereg_dealloc_cb)(void *ptr, hsa_amd_deallocation_callback_t cb); hsa_status_t (*hsa_amd_reg_dealloc_cb)(void *ptr, @@ -156,6 +165,11 @@ static struct hsa_ops hsa_ops = { .hsa_init = hsa_init, .hsa_shut_down = hsa_shut_down, .hsa_status_string = hsa_status_string, + /* memory allocation */ + .hsa_memory_allocate = hsa_memory_allocate, + .hsa_memory_free = 
hsa_memory_free, + .hsa_agent_iterate_regions = hsa_agent_iterate_regions, + .hsa_region_get_info = hsa_region_get_info, /* used for memory monitoring */ .hsa_amd_dereg_dealloc_cb = hsa_amd_deregister_deallocation_callback, @@ -255,6 +269,98 @@ hsa_status_t ofi_hsa_amd_memory_unlock(void *host_ptr) return hsa_ops.hsa_amd_memory_unlock(host_ptr); } +hsa_status_t ofi_hsa_memory_allocate(hsa_region_t region, size_t size, void **ptr) +{ + return hsa_ops.hsa_memory_allocate(region, size, ptr); +} + +hsa_status_t ofi_hsa_memory_free(void *ptr) +{ + return hsa_ops.hsa_memory_free(ptr); +} + +hsa_status_t ofi_hsa_agent_iterate_regions(hsa_agent_t agent, + hsa_status_t (*callback)(hsa_region_t, void *), + void *data) +{ + return hsa_ops.hsa_agent_iterate_regions(agent, callback, data); +} + +hsa_status_t ofi_hsa_region_get_info(hsa_region_t region, + hsa_region_info_t attribute, void *value) +{ + return hsa_ops.hsa_region_get_info(region, attribute, value); +} + +static hsa_status_t find_gpu_memory_region(hsa_region_t region, void *data) +{ + hsa_region_segment_t segment; + hsa_region_global_flag_t flags; + hsa_status_t hsa_ret; + hsa_region_t *gpu_region = (hsa_region_t *) data; + + hsa_ret = ofi_hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment); + if (hsa_ret != HSA_STATUS_SUCCESS || segment != HSA_REGION_SEGMENT_GLOBAL) + return hsa_ret; + + hsa_ret = ofi_hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags); + if (hsa_ret != HSA_STATUS_SUCCESS) + return hsa_ret; + + if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) { + *gpu_region = region; + return HSA_STATUS_INFO_BREAK; + } + + return hsa_ret; +} + +void *rocr_alloc(size_t size) +{ + hsa_region_t gpu_region = {0}; + hsa_status_t hsa_ret; + void *ptr; + + if (rocr_agents.num_gpu == 0) + return NULL; + + hsa_ret = ofi_hsa_agent_iterate_regions(rocr_agents.gpu_agents[0], + find_gpu_memory_region, &gpu_region); + if (hsa_ret != HSA_STATUS_SUCCESS && hsa_ret != HSA_STATUS_INFO_BREAK) { + 
FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find GPU memory region: %s\n", + ofi_hsa_status_to_string(hsa_ret)); + return NULL; + } + + if (gpu_region.handle == 0) { + FI_WARN(&core_prov, FI_LOG_CORE, "No suitable GPU memory region found\n"); + return NULL; + } + + hsa_ret = ofi_hsa_memory_allocate(gpu_region, size, &ptr); + if (hsa_ret != HSA_STATUS_SUCCESS) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to allocate GPU memory: %s\n", + ofi_hsa_status_to_string(hsa_ret)); + return NULL; + } + + return ptr; +} + +void rocr_free(void *ptr) +{ + hsa_status_t hsa_ret; + + hsa_ret = ofi_hsa_memory_free(ptr); + if (hsa_ret != HSA_STATUS_SUCCESS) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to free GPU memory: %s\n", + ofi_hsa_status_to_string(hsa_ret)); + } +} + hsa_status_t ofi_hsa_memory_copy(void *dst, const void *src, size_t size) { return hsa_ops.hsa_memory_copy(dst, src, size); @@ -882,6 +988,34 @@ static int rocr_hmem_dl_init(void) goto err; } + hsa_ops.hsa_memory_allocate = dlsym(hsa_handle, "hsa_memory_allocate"); + if (!hsa_ops.hsa_memory_allocate) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find hsa_memory_allocate\n"); + goto err; + } + + hsa_ops.hsa_memory_free = dlsym(hsa_handle, "hsa_memory_free"); + if (!hsa_ops.hsa_memory_free) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find hsa_memory_free\n"); + goto err; + } + + hsa_ops.hsa_agent_iterate_regions = dlsym(hsa_handle, "hsa_agent_iterate_regions"); + if (!hsa_ops.hsa_agent_iterate_regions) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find hsa_agent_iterate_regions\n"); + goto err; + } + + hsa_ops.hsa_region_get_info = dlsym(hsa_handle, "hsa_region_get_info"); + if (!hsa_ops.hsa_region_get_info) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find hsa_region_get_info\n"); + goto err; + } + #if HAVE_HSA_AMD_PORTABLE_EXPORT_DMABUF hsa_ops.hsa_amd_portable_export_dmabuf = dlsym(hsa_handle, "hsa_amd_portable_export_dmabuf"); if (!hsa_ops.hsa_amd_portable_export_dmabuf) { @@ -1367,4 
+1501,14 @@ int rocr_hmem_put_dmabuf_fd(int fd) return -FI_ENOSYS; } +void *rocr_alloc(size_t size) +{ + return NULL; +} + +void rocr_free(void *ptr) +{ + return; +} + #endif /* HAVE_ROCR */ From 31757139216611293d16d0966cb107a10f74b50e Mon Sep 17 00:00:00 2001 From: Darryl Abbate Date: Fri, 21 Nov 2025 14:39:02 -0800 Subject: [PATCH 12/17] fabtests: Add dmabuf ops for ROCr HMEM interface Signed-off-by: Darryl Abbate --- fabtests/common/hmem.c | 6 ++--- fabtests/common/hmem_rocr.c | 51 +++++++++++++++++++++++++++++++++++++ fabtests/include/hmem.h | 3 +++ 3 files changed, 57 insertions(+), 3 deletions(-) diff --git a/fabtests/common/hmem.c b/fabtests/common/hmem.c index 16d552c491a..8e8260a50d6 100644 --- a/fabtests/common/hmem.c +++ b/fabtests/common/hmem.c @@ -103,8 +103,8 @@ static struct ft_hmem_ops hmem_ops[] = { .mem_set = ft_rocr_memset, .copy_to_hmem = ft_rocr_memcpy, .copy_from_hmem = ft_rocr_memcpy, - .get_dmabuf_fd = ft_hmem_no_get_dmabuf_fd, - .put_dmabuf_fd = ft_hmem_no_put_dmabuf_fd, + .get_dmabuf_fd = ft_rocr_get_dmabuf_fd, + .put_dmabuf_fd = ft_rocr_put_dmabuf_fd, }, [FI_HMEM_ZE] = { .init = ft_ze_init, @@ -235,4 +235,4 @@ int ft_hmem_put_dmabuf_fd(enum fi_hmem_iface iface, int fd) int ft_hmem_no_put_dmabuf_fd(int fd) { return -FI_ENOSYS; -} \ No newline at end of file +} diff --git a/fabtests/common/hmem_rocr.c b/fabtests/common/hmem_rocr.c index 3fed6d77e93..163e70094b5 100644 --- a/fabtests/common/hmem_rocr.c +++ b/fabtests/common/hmem_rocr.c @@ -63,6 +63,9 @@ struct rocr_ops { hsa_status_t (*hsa_memory_free)(void *ptr); hsa_status_t (*hsa_amd_memory_fill)(void* ptr, uint32_t value, size_t count); + hsa_status_t (*hsa_amd_portable_export_dmabuf)(const void *ptr, size_t size, + int *dmabuf, uint64_t *offset); + hsa_status_t (*hsa_amd_portable_close_dmabuf)(int dmabuf); }; static struct rocr_ops rocr_ops; @@ -200,6 +203,11 @@ int ft_rocr_init(void) goto err_dlclose_rocr; } + rocr_ops.hsa_amd_portable_export_dmabuf = + dlsym(rocr_handle, 
"hsa_amd_portable_export_dmabuf"); + rocr_ops.hsa_amd_portable_close_dmabuf = + dlsym(rocr_handle, "hsa_amd_portable_close_dmabuf"); + hsa_ret = rocr_ops.hsa_init(); if (hsa_ret != HSA_STATUS_SUCCESS) { ROCR_ERR(hsa_ret, "hsa_init failed"); @@ -343,6 +351,38 @@ int ft_rocr_memcpy(uint64_t device, void *dst, const void *src, size_t size) return -FI_EIO; } +int ft_rocr_get_dmabuf_fd(void *buf, size_t len, + int *dmabuf_fd, uint64_t *dmabuf_offset) +{ + hsa_status_t hsa_ret; + + if (!rocr_ops.hsa_amd_portable_export_dmabuf) + return -FI_EOPNOTSUPP; + + hsa_ret = rocr_ops.hsa_amd_portable_export_dmabuf(buf, len, + dmabuf_fd, dmabuf_offset); + if (hsa_ret == HSA_STATUS_SUCCESS) + return FI_SUCCESS; + + ROCR_ERR(hsa_ret, "hsa_amd_portable_export_dmabuf failed"); + return -FI_EIO; +} + +int ft_rocr_put_dmabuf_fd(int fd) +{ + hsa_status_t hsa_ret; + + if (!rocr_ops.hsa_amd_portable_close_dmabuf) + return -FI_EOPNOTSUPP; + + hsa_ret = rocr_ops.hsa_amd_portable_close_dmabuf(fd); + if (hsa_ret == HSA_STATUS_SUCCESS) + return FI_SUCCESS; + + ROCR_ERR(hsa_ret, "hsa_amd_portable_close_dmabuf failed"); + return -FI_EIO; +} + #else int ft_rocr_init(void) @@ -375,4 +415,15 @@ int ft_rocr_memcpy(uint64_t device, void *dst, const void *src, size_t size) return -FI_ENOSYS; } +int ft_rocr_get_dmabuf_fd(void *buf, size_t len, + int *dmabuf_fd, uint64_t *dmabuf_offset) +{ + return -FI_ENOSYS; +} + +int ft_rocr_put_dmabuf_fd(int fd) +{ + return -FI_ENOSYS; +} + #endif /* HAVE_ROCR_RUNTIME_H */ diff --git a/fabtests/include/hmem.h b/fabtests/include/hmem.h index db837e1d5ad..9233d4b3821 100644 --- a/fabtests/include/hmem.h +++ b/fabtests/include/hmem.h @@ -201,6 +201,9 @@ int ft_rocr_alloc(uint64_t device, void **buf, size_t size); int ft_rocr_free(void *buf); int ft_rocr_memset(uint64_t device, void *buf, int value, size_t size); int ft_rocr_memcpy(uint64_t device, void *dst, const void *src, size_t size); +int ft_rocr_get_dmabuf_fd(void *buf, size_t len, + int *fd, uint64_t *offset); 
+int ft_rocr_put_dmabuf_fd(int fd); int ft_neuron_init(void); int ft_neuron_cleanup(void); From 1e23e87b577fe9fd1acd2bc7138ebc4e00c796c8 Mon Sep 17 00:00:00 2001 From: Darryl Abbate Date: Mon, 29 Sep 2025 14:14:31 -0700 Subject: [PATCH 13/17] fabtests/pytest: Add common ROCr utilities Signed-off-by: Darryl Abbate --- fabtests/pytest/common.py | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/fabtests/pytest/common.py b/fabtests/pytest/common.py index 8f1bd3b2f57..ef3585e287b 100644 --- a/fabtests/pytest/common.py +++ b/fabtests/pytest/common.py @@ -121,10 +121,32 @@ def wait_until_neuron_device_available(ip, device_id): raise RuntimeError("Error: neuron device {} is not available after {} tries".format(device_id, maxtry)) +@functools.lru_cache(10) +@retry(retry_on_exception=is_ssh_connection_error, stop_max_attempt_number=3, wait_fixed=5000) +def num_rocr_devices(ip): + proc = run("ssh {} rocm-smi --alldevices".format(ip), shell=True, + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + timeout=60, encoding="utf-8") + + if has_ssh_connection_err_msg(proc.stderr): + raise SshConnectionError() + + # Count lines that start with a digit (device number) + result = 0 + lines = proc.stdout.split("\n") + for line in lines: + line = line.strip() + if line and line[0].isdigit(): + result += 1 + + return result + + def num_hmem_devices(ip, hmem_type): function_table = { "cuda" : num_cuda_devices, - "neuron" : num_neuron_devices + "neuron" : num_neuron_devices, + "rocr": num_rocr_devices, } if hmem_type not in function_table: @@ -141,6 +163,10 @@ def has_neuron(ip): return num_neuron_devices(ip) > 0 +def has_rocr(ip): + return num_rocr_devices(ip) > 0 + + @retry(retry_on_exception=is_ssh_connection_error, stop_max_attempt_number=3, wait_fixed=5000) def has_hmem_support(cmdline_args, ip): binpath = cmdline_args.binpath or "" @@ -444,7 +470,7 @@ def prepare_base_command(self, command_type, executable, if host_memory_type == 
"host": return command, additional_env # default addtional environment variable - assert host_memory_type == "cuda" or host_memory_type == "neuron" + assert host_memory_type in ["cuda", "neuron", "rocr"] if not has_hmem_support(self._cmdline_args, host_ip): pytest.skip("no hmem support") @@ -464,7 +490,7 @@ def prepare_base_command(self, command_type, executable, else: hmem_device_id = 0 - if host_memory_type == "cuda": + if host_memory_type in ["cuda", "rocr"]: command += " -i {}".format(hmem_device_id) else: assert host_memory_type == "neuron" From 1ee243e657f451d00817f45b6b13545811206fcb Mon Sep 17 00:00:00 2001 From: Darryl Abbate Date: Mon, 8 Sep 2025 15:34:07 -0700 Subject: [PATCH 14/17] prov/efa: Enable `FI_HMEM_ROCR` support Signed-off-by: Darryl Abbate --- prov/efa/src/efa_hmem.c | 96 ++++++++++++++++++++++++++++++++++++++--- prov/efa/src/efa_hmem.h | 5 ++- prov/efa/src/efa_mr.c | 32 ++++++++------ prov/efa/src/efa_mr.h | 13 ++++-- 4 files changed, 122 insertions(+), 24 deletions(-) diff --git a/prov/efa/src/efa_hmem.c b/prov/efa/src/efa_hmem.c index 8eb2825af51..c2a475302d5 100644 --- a/prov/efa/src/efa_hmem.c +++ b/prov/efa/src/efa_hmem.c @@ -5,9 +5,10 @@ #include "efa_hmem.h" #include "rdm/efa_rdm_pkt_type.h" -struct efa_hmem_info g_efa_hmem_info[OFI_HMEM_MAX]; +struct efa_hmem_info g_efa_hmem_info[]; -#if HAVE_CUDA || HAVE_NEURON +// TODO double-check for ROCr +#if HAVE_CUDA || HAVE_NEURON || HAVE_ROCR static size_t efa_max_eager_msg_size_with_largest_header() { static bool computed = false; static size_t size = 0; @@ -57,6 +58,7 @@ static int efa_hmem_info_init_protocol_thresholds(enum fi_hmem_iface iface) fi_param_get_size_t(&efa_prov, "inter_min_read_write_size", &info->min_read_write_size); break; case FI_HMEM_CUDA: + case FI_HMEM_ROCR: info->runt_size = EFA_DEFAULT_RUNT_SIZE; info->max_medium_msg_size = 0; info->min_read_msg_size = efa_max_eager_msg_size_with_largest_header() + 1; @@ -67,8 +69,8 @@ static int 
efa_hmem_info_init_protocol_thresholds(enum fi_hmem_iface iface) if (-FI_ENODATA != fi_param_get(&efa_prov, "inter_max_medium_message_size", &tmp_value)) { EFA_WARN(FI_LOG_CORE, "The environment variable FI_EFA_INTER_MAX_MEDIUM_MESSAGE_SIZE was set, " - "but EFA HMEM via Cuda API only supports eager and runting read protocols. " - "The variable will not modify CUDA memory run config.\n"); + "but only eager and runting read protocols are supported for %s over EFA.\n", + fi_tostr(&iface, FI_TYPE_HMEM_IFACE)); } break; case FI_HMEM_NEURON: @@ -186,6 +188,79 @@ static inline void efa_hmem_info_check_p2p_support_cuda(struct efa_hmem_info *in return; } +static inline void efa_hmem_info_check_p2p_support_rocr(struct efa_hmem_info *info) { +#if HAVE_ROCR + void *ptr = NULL; + struct ibv_mr *ibv_mr; + struct ibv_pd *ibv_pd; + int ibv_access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ; + size_t len = ofi_get_page_size() * 2; + int ret; + int dmabuf_fd; + uint64_t dmabuf_offset; + + ptr = rocr_alloc(len); + if (!ptr) { + info->initialized = false; + EFA_WARN(FI_LOG_CORE, "Failed to allocate ROCr buffer\n"); + return; + } + + ibv_pd = ibv_alloc_pd(g_efa_selected_device_list[0].ibv_ctx); + if (!ibv_pd) { + EFA_WARN(FI_LOG_CORE, "Failed to allocate ibv_pd: %d\n", errno); + rocr_free(ptr); + return; + } + +#if HAVE_EFA_DMABUF_MR + ret = rocr_hmem_get_dmabuf_fd(ptr, len, &dmabuf_fd, &dmabuf_offset); + if (ret == FI_SUCCESS) { + ibv_mr = ibv_reg_dmabuf_mr(ibv_pd, dmabuf_offset, + len, (uint64_t) ptr, dmabuf_fd, ibv_access); + (void) rocr_hmem_put_dmabuf_fd(dmabuf_fd); + if (!ibv_mr) { + EFA_INFO(FI_LOG_CORE, + "Unable to register ROCr device buffer via dmabuf: %s. " + "Fall back to ibv_reg_mr\n", fi_strerror(-errno)); + ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access); + } + } else { + EFA_INFO(FI_LOG_CORE, + "Unable to retrieve dmabuf fd of ROCr device buffer: %d. 
" + "Fall back to ibv_reg_mr\n", ret); + ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access); + } +#else + ibv_mr = ibv_reg_mr(ibv_pd, ptr, len, ibv_access); +#endif + + if (!ibv_mr) { + info->p2p_supported_by_device = false; + EFA_WARN(FI_LOG_CORE, + "Failed to register ROCr buffer with the EFA device, FI_HMEM transfers that require peer to peer support will fail.\n"); + rocr_free(ptr); + (void) ibv_dealloc_pd(ibv_pd); + return; + } + + ret = ibv_dereg_mr(ibv_mr); + rocr_free(ptr); + (void) ibv_dealloc_pd(ibv_pd); + if (ret) { + EFA_WARN(FI_LOG_CORE, + "Failed to deregister ROCr buffer: %s\n", + fi_strerror(-ret)); + return; + } + + info->p2p_supported_by_device = true; + return; + +#endif + return; +} + static inline void efa_hmem_info_check_p2p_support_neuron(struct efa_hmem_info *info) { #if HAVE_NEURON struct ibv_mr *ibv_mr = NULL; @@ -295,10 +370,19 @@ efa_hmem_info_init_iface(enum fi_hmem_iface iface) } else if (ofi_hmem_p2p_disabled()) { info->p2p_supported_by_device = false; } else { - if (iface == FI_HMEM_CUDA) + switch (iface) { + case FI_HMEM_CUDA: efa_hmem_info_check_p2p_support_cuda(info); - if (iface == FI_HMEM_NEURON) + break; + case FI_HMEM_ROCR: + efa_hmem_info_check_p2p_support_rocr(info); + break; + case FI_HMEM_NEURON: efa_hmem_info_check_p2p_support_neuron(info); + break; + default: + break; + } if (!info->p2p_supported_by_device) EFA_INFO(FI_LOG_CORE, "%s P2P support is not available.\n", fi_tostr(&iface, FI_TYPE_HMEM_IFACE)); } diff --git a/prov/efa/src/efa_hmem.h b/prov/efa/src/efa_hmem.h index fbbf4151ea1..1f4d1d327a2 100644 --- a/prov/efa/src/efa_hmem.h +++ b/prov/efa/src/efa_hmem.h @@ -8,7 +8,7 @@ #include "efa_mr.h" #include "efa_tp.h" -#if HAVE_CUDA || HAVE_NEURON || HAVE_SYNAPSEAI +#if HAVE_CUDA || HAVE_NEURON || HAVE_ROCR || HAVE_SYNAPSEAI # define EFA_HAVE_NON_SYSTEM_HMEM 1 #else # define EFA_HAVE_NON_SYSTEM_HMEM 0 @@ -28,6 +28,7 @@ static const enum fi_hmem_iface efa_hmem_ifaces[] = { FI_HMEM_SYSTEM, /* Must be first here */ 
FI_HMEM_CUDA, + FI_HMEM_ROCR, FI_HMEM_NEURON, FI_HMEM_SYNAPSEAI }; @@ -68,6 +69,7 @@ static inline int efa_copy_from_hmem(void *desc, void *dest, const void *src, si switch (peer.iface) { /* TODO: Fine tune the max data size to switch from gdrcopy to cudaMemcpy */ case FI_HMEM_CUDA: + case FI_HMEM_ROCR: efa_tracepoint(dev_reg_copy_from_hmem, &peer, dest, src, size); return ofi_hmem_dev_reg_copy_from_hmem(peer.iface, (uint64_t) peer.hmem_data, dest, src, size); default: @@ -100,6 +102,7 @@ static inline int efa_copy_to_hmem(void *desc, void *dest, const void *src, size switch (peer.iface) { /* TODO: Fine tune the max data size to switch from gdrcopy to cudaMemcpy */ case FI_HMEM_CUDA: + case FI_HMEM_ROCR: efa_tracepoint(dev_reg_copy_to_hmem, &peer, dest, src, size); return ofi_hmem_dev_reg_copy_to_hmem(peer.iface, (uint64_t) peer.hmem_data, dest, src, size); default: diff --git a/prov/efa/src/efa_mr.c b/prov/efa/src/efa_mr.c index 1746f543bb5..dd81e8e12e2 100644 --- a/prov/efa/src/efa_mr.c +++ b/prov/efa/src/efa_mr.c @@ -31,7 +31,7 @@ size_t efa_mr_max_cached_size; * and should be sufficiently spaced apart s.t. they don't collide with each * other. */ -#define CUDA_NON_P2P_MR_KEYGEN_INIT (0x100000000ull) +#define NON_P2P_MR_KEYGEN_INIT BIT_ULL(32) /* @brief Setup the MR cache. 
* @@ -46,6 +46,7 @@ int efa_mr_cache_open(struct ofi_mr_cache **cache, struct efa_domain *domain) struct ofi_mem_monitor *memory_monitors[OFI_HMEM_MAX] = { [FI_HMEM_SYSTEM] = default_monitor, [FI_HMEM_CUDA] = cuda_monitor, + [FI_HMEM_ROCR] = rocr_monitor, }; int err; @@ -235,6 +236,8 @@ static int efa_mr_hmem_setup(struct efa_mr *efa_mr, efa_mr->peer.flags &= ~OFI_HMEM_DATA_DEV_REG_HANDLE; } } + } else if (attr->iface == FI_HMEM_ROCR) { + efa_mr->peer.device = attr->device.rocr; } else if (attr->iface == FI_HMEM_NEURON) { efa_mr->peer.device = attr->device.neuron; } else if (attr->iface == FI_HMEM_SYNAPSEAI) { @@ -544,7 +547,7 @@ static struct ibv_mr *efa_mr_reg_ibv_mr(struct efa_mr *efa_mr, struct fi_mr_attr * TODO: need such fallback for cuda as well when * FI_CUDA_API_PERMITTED is true */ - if (efa_mr_is_neuron(efa_mr)) { + if (efa_mr_is_neuron(efa_mr) || efa_mr_is_rocr(efa_mr)) { ret = ofi_hmem_get_dmabuf_fd( efa_mr->peer.iface, mr_attr->mr_iov->iov_base, @@ -564,8 +567,9 @@ static struct ibv_mr *efa_mr_reg_ibv_mr(struct efa_mr *efa_mr, struct fi_mr_attr } else if (ret == -FI_EOPNOTSUPP) { /* Protocol not available => fallback */ EFA_INFO(FI_LOG_MR, - "Unable to get dmabuf fd for Neuron device buffer, " - "Fall back to ibv_reg_mr\n"); + "Unable to get dmabuf fd for %s device buffer, " + "Fall back to ibv_reg_mr\n", + fi_tostr(&efa_mr->peer.iface, FI_TYPE_HMEM_IFACE)); return ibv_reg_mr( efa_mr->domain->ibv_pd, (void *)mr_attr->mr_iov->iov_base, @@ -768,15 +772,14 @@ int efa_mr_update_domain_mr_map(struct efa_mr *efa_mr, struct fi_mr_attr *mr_att #endif /* HAVE_CUDA */ /* - * Since ibv_reg_mr() will fail for CUDA buffers when p2p is unavailable (and - * thus isn't called), generate a proprietary internal key for - * efa_mr->mr_fid.key. The key must be larger than UINT32_MAX to avoid - * potential collisions with keys returned by ibv_reg_mr() for standard MR - * registrations. 
+ * Since ibv_reg_mr() will fail for accelerator buffers when p2p is unavailable + * (and thus isn't called), generate a proprietary internal key for + * efa_mr->mr_fid.key. The key must be larger than UINT32_MAX to avoid potential + * collisions with keys returned by ibv_reg_mr() for standard MR registrations. */ -static uint64_t efa_mr_cuda_non_p2p_keygen(void) { - static uint64_t CUDA_NON_P2P_MR_KEYGEN = CUDA_NON_P2P_MR_KEYGEN_INIT; - return CUDA_NON_P2P_MR_KEYGEN++; +static uint64_t efa_mr_non_p2p_keygen(void) { + static uint64_t NON_P2P_MR_KEYGEN = NON_P2P_MR_KEYGEN_INIT; + return NON_P2P_MR_KEYGEN++; } /* @@ -841,8 +844,9 @@ static int efa_mr_reg_impl(struct efa_mr *efa_mr, uint64_t flags, const void *at * For FI_HMEM_CUDA iface when p2p is unavailable, skip ibv_reg_mr() and * generate proprietary mr_fid key. */ - if (mr_attr.iface == FI_HMEM_CUDA && !g_efa_hmem_info[FI_HMEM_CUDA].p2p_supported_by_device) { - efa_mr->mr_fid.key = efa_mr_cuda_non_p2p_keygen(); + if ((mr_attr.iface == FI_HMEM_CUDA || mr_attr.iface == FI_HMEM_ROCR) + && !g_efa_hmem_info[mr_attr.iface].p2p_supported_by_device) { + efa_mr->mr_fid.key = efa_mr_non_p2p_keygen(); } else { efa_mr->ibv_mr = efa_mr_reg_ibv_mr(efa_mr, &mr_attr, fi_ibv_access, flags); if (!efa_mr->ibv_mr) { diff --git a/prov/efa/src/efa_mr.h b/prov/efa/src/efa_mr.h index 558dc6c3306..1812aab32be 100644 --- a/prov/efa/src/efa_mr.h +++ b/prov/efa/src/efa_mr.h @@ -49,9 +49,11 @@ void efa_mr_cache_entry_dereg(struct ofi_mr_cache *cache, static inline bool efa_mr_is_hmem(struct efa_mr *efa_mr) { - return efa_mr ? 
(efa_mr->peer.iface == FI_HMEM_CUDA || - efa_mr->peer.iface == FI_HMEM_NEURON || - efa_mr->peer.iface == FI_HMEM_SYNAPSEAI): false; + return efa_mr && ( + efa_mr->peer.iface == FI_HMEM_CUDA || + efa_mr->peer.iface == FI_HMEM_ROCR || + efa_mr->peer.iface == FI_HMEM_NEURON || + efa_mr->peer.iface == FI_HMEM_SYNAPSEAI); } int efa_mr_cache_regv(struct fid_domain *domain_fid, const struct iovec *iov, @@ -74,6 +76,11 @@ static inline bool efa_mr_is_synapseai(struct efa_mr *efa_mr) return efa_mr ? (efa_mr->peer.iface == FI_HMEM_SYNAPSEAI) : false; } +static inline bool efa_mr_is_rocr(struct efa_mr *efa_mr) +{ + return efa_mr && efa_mr->peer.iface == FI_HMEM_ROCR; +} + static inline void *efa_mr_get_shm_desc(struct efa_mr *efa_mr) { if (!efa_mr) From 2c96d8a73551e2e2c3d10f791a880cb11f314430 Mon Sep 17 00:00:00 2001 From: alexander-sannikov Date: Fri, 26 Sep 2025 18:51:24 +0100 Subject: [PATCH 15/17] fabtests/common: enable rocr device interface for fabtests Signed-off-by: alexander-sannikov Signed-off-by: Darryl Abbate (cherry picked from commit 79bfea240b01b7acd981d96b7d716a4cc24cde6e) --- fabtests/common/shared.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fabtests/common/shared.c b/fabtests/common/shared.c index a8ecf1633a5..11ce5967dd9 100644 --- a/fabtests/common/shared.c +++ b/fabtests/common/shared.c @@ -3432,7 +3432,7 @@ void ft_usage(char *name, char *desc) void ft_hmem_usage() { FT_PRINT_OPTS_USAGE("-D ", "Specify device interface: " - "e.g. cuda, ze, neuron, synapseai (default: None). " + "e.g. cuda, ze, neuron, synapseai, rocr (default: None). 
" "Automatically enables FI_HMEM (-H)"); FT_PRINT_OPTS_USAGE("-i ", "Specify which device to use (default: 0)"); FT_PRINT_OPTS_USAGE("-H", "Enable provider FI_HMEM support"); @@ -3594,6 +3594,8 @@ void ft_parse_hmem_opts(int op, char *optarg, struct ft_opts *opts) opts->iface = FI_HMEM_CUDA; else if (!strncasecmp("neuron", optarg, 6)) opts->iface = FI_HMEM_NEURON; + else if (!strncasecmp("rocr", optarg, 4)) + opts->iface = FI_HMEM_ROCR; else if (!strncasecmp("synapseai", optarg, 9)) { opts->iface = FI_HMEM_SYNAPSEAI; opts->options |= FT_OPT_REG_DMABUF_MR; From 91e6f32987a6a460ea1123cff5e90663fbafaa12 Mon Sep 17 00:00:00 2001 From: Darryl Abbate Date: Mon, 29 Sep 2025 14:28:26 -0700 Subject: [PATCH 16/17] fabtests/pytest: Add `rocr_memory` mark Signed-off-by: Darryl Abbate --- fabtests/pytest/pytest.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/fabtests/pytest/pytest.ini b/fabtests/pytest/pytest.ini index 08f26b199f2..d830b5688da 100644 --- a/fabtests/pytest/pytest.ini +++ b/fabtests/pytest/pytest.ini @@ -10,6 +10,7 @@ markers = ubertest_verify: ubertest tests run with verify config cuda_memory: testing with cuda device memory direct neuron_memory: testing with neuron device memory + rocr_memory: testing with ROCr device memory serial: test must be run in seria mode unstable: test is unstable and only run when the marker is specified. 
junit_suite_name = fabtests From c5eebdf74c66684d21c3eaade7acc67681349ce2 Mon Sep 17 00:00:00 2001 From: Darryl Abbate Date: Mon, 29 Sep 2025 14:28:30 -0700 Subject: [PATCH 17/17] fabtests/pytest/efa: Add ROCr memory type parameters to test fixtures Signed-off-by: Darryl Abbate --- fabtests/pytest/efa/conftest.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fabtests/pytest/efa/conftest.py b/fabtests/pytest/efa/conftest.py index 9c12d5e812c..ae8d373382a 100644 --- a/fabtests/pytest/efa/conftest.py +++ b/fabtests/pytest/efa/conftest.py @@ -14,12 +14,15 @@ pytest.param("cuda_to_cuda", marks=pytest.mark.cuda_memory), pytest.param("host_to_neuron", marks=pytest.mark.neuron_memory), pytest.param("neuron_to_neuron", marks=pytest.mark.neuron_memory), + pytest.param("host_to_rocr", marks=pytest.mark.rocr_memory), + pytest.param("rocr_to_rocr", marks=pytest.mark.rocr_memory), ] # Add more memory types that are useful for uni-directional tests. memory_type_list_all = memory_type_list_bi_dir + [ pytest.param("cuda_to_host", marks=pytest.mark.cuda_memory), pytest.param("neuron_to_host", marks=pytest.mark.neuron_memory), + pytest.param("rocr_to_host", marks=pytest.mark.rocr_memory), ] @pytest.fixture(scope="module", params=memory_type_list_all)