diff --git a/doc/mpich/tuning_parameters.md b/doc/mpich/tuning_parameters.md index 8d1e0aff474..b581a6297a8 100644 --- a/doc/mpich/tuning_parameters.md +++ b/doc/mpich/tuning_parameters.md @@ -1096,27 +1096,22 @@ GPU pipeline uses host buffer and pipelining technique to send internode messages instead of GPU RDMA. To enable this mode, use the following two CVARs: -* `MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE`: This CVAR enables GPU pipeline -for inter-node pt2pt messages -* `MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD`: The threshold to start using -GPU pipelining. Default is 1MB. +* `MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD`: This CVAR enables the RNDV +(rendezvous) path for large messages above the threshold. The recommended value +is 1MB. -* `MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ`: Specifies the chunk size -(in bytes) for GPU pipeline data transfer. +* `MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=pipeline`: Forces the RNDV algorithm to +use pipelining. The default is "auto", which selects the best algorithm +based on message attributes. Other protocols include "read", which uses RDMA +read, and "direct", which relies on the underlying network library implementation. -* `MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK`: Specifies the +* `MPIR_CVAR_CH4_OFI_PIPELINE_CHUNK_SZ`: Specifies the chunk size +(in bytes) for pipeline data transfer. + +* `MPIR_CVAR_CH4_OFI_PIPELINE_NUM_CHUNKS`: Specifies the number of buffers for GPU pipeline data transfer in each block/chunk of the pool. -* `MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS`: Specifies the maximum -total number of buffers MPICH buffer pool can allocate. - -* `MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE`: Specify engine type -for copying from device to host (sender side), default 0 - -* `MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE`: Specify engine type -for copying from host to device (receiver side), default 0 - To enable GPU Direct RDMA support for pt2pt communication, use the following CVARs: * `MPIR_CVAR_CH4_OFI_ENABLE_HMEM`: This CVAR with a value of `1` enables diff --git a/src/include/mpiimpl.h b/src/include/mpiimpl.h index ebe8b779464..09f9079d934 100644 --- a/src/include/mpiimpl.h +++ b/src/include/mpiimpl.h @@ -155,7 +155,6 @@ typedef struct MPIR_Stream MPIR_Stream; /******************* PART 3: DEVICE INDEPENDENT HEADERS **********************/ /*****************************************************************************/ -#include "mpir_misc.h" #include "mpir_dbg.h" #include "mpir_objects.h" #include "mpir_strerror.h" @@ -166,6 +165,7 @@ typedef struct MPIR_Stream MPIR_Stream; #include "mpir_mem.h" #include "mpir_info.h" #include "mpir_errcodes.h" +#include "mpir_misc.h" #include "mpir_errhandler.h" #include "mpir_attr_generic.h" #include "mpir_contextid.h" diff --git a/src/include/mpir_misc.h b/src/include/mpir_misc.h index 4937cac5342..6743412eff4 100644 --- a/src/include/mpir_misc.h +++ b/src/include/mpir_misc.h @@ -74,6 +74,26 @@ typedef struct { MPIR_request_type_t type; } MPIR_gpu_req; +MPL_STATIC_INLINE_PREFIX void MPIR_async_test(MPIR_gpu_req * areq, int *is_done) +{ + int err; + switch (areq->type) { + case MPIR_NULL_REQUEST: + /* a dummy, immediately complete */ + *is_done = 1; + break; + case MPIR_TYPEREP_REQUEST: + MPIR_Typerep_test(areq->u.y_req, is_done); + break; + case MPIR_GPU_REQUEST: + err = MPL_gpu_test(&areq->u.gpu_req, is_done); + MPIR_Assertp(err == MPL_SUCCESS); + break; + default: + MPIR_Assert(0); + } +} + int MPIR_Localcopy(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype sendtype, void *recvbuf, 
MPI_Aint recvcount, MPI_Datatype recvtype); int MPIR_Ilocalcopy(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype sendtype, diff --git a/src/include/mpir_typerep.h b/src/include/mpir_typerep.h index e6758be3aa5..7f544fd44e8 100644 --- a/src/include/mpir_typerep.h +++ b/src/include/mpir_typerep.h @@ -78,8 +78,6 @@ int MPIR_Typerep_ipack(const void *inbuf, MPI_Aint incount, MPI_Datatype datatyp int MPIR_Typerep_iunpack(const void *inbuf, MPI_Aint insize, void *outbuf, MPI_Aint outcount, MPI_Datatype datatype, MPI_Aint outoffset, MPI_Aint * actual_unpack_bytes, MPIR_Typerep_req * typerep_req, uint32_t flags); -int MPIR_Typerep_wait(MPIR_Typerep_req typerep_req); -int MPIR_Typerep_test(MPIR_Typerep_req typerep_req, int *completed); int MPIR_Typerep_size_external32(MPI_Datatype type); int MPIR_Typerep_pack_external(const void *inbuf, MPI_Aint incount, MPI_Datatype datatype, diff --git a/src/mpi/datatype/typerep/src/typerep_pre.h b/src/mpi/datatype/typerep/src/typerep_pre.h index 022510fbe2c..347bed20a41 100644 --- a/src/mpi/datatype/typerep/src/typerep_pre.h +++ b/src/mpi/datatype/typerep/src/typerep_pre.h @@ -28,4 +28,7 @@ typedef struct { #define MPIR_TYPEREP_HANDLE_NULL NULL #endif +int MPIR_Typerep_wait(MPIR_Typerep_req typerep_req); +int MPIR_Typerep_test(MPIR_Typerep_req typerep_req, int *completed); + #endif /* TYPEREP_PRE_H_INCLUDED */ diff --git a/src/mpi/misc/utils.c b/src/mpi/misc/utils.c index bff9a3f9b7b..5b2334f11a2 100644 --- a/src/mpi/misc/utils.c +++ b/src/mpi/misc/utils.c @@ -500,7 +500,12 @@ int MPIR_Ilocalcopy_gpu(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype se do_localcopy(sendbuf, sendcount, sendtype, sendoffset, recvbuf, recvcount, recvtype, recvoffset, LOCALCOPY_NONBLOCKING, &req->u.y_req); MPIR_ERR_CHECK(mpi_errno); - req->type = MPIR_TYPEREP_REQUEST; + + if (req->u.y_req.req == MPIR_TYPEREP_REQ_NULL) { + req->type = MPIR_NULL_REQUEST; + } else { + req->type = MPIR_TYPEREP_REQUEST; + } #endif fn_exit: diff --git a/src/mpid/ch4/ch4_api.txt b/src/mpid/ch4/ch4_api.txt index a7c8bdaeb99..631144bd573 100644 --- a/src/mpid/ch4/ch4_api.txt +++ b/src/mpid/ch4/ch4_api.txt @@ -71,8 +71,8 @@ Non Native API: NM*: am_hdr_sz, data_sz, data, count, datatype, sreq SHM*: am_hdr_sz, data_sz, data, count, datatype, sreq am_can_do_tag: bool - NM*: void - SHM*: void + NM*: rreq + SHM*: rreq am_tag_send : int NM*: rank, comm, handler_id, tag, buf, count, datatype, src_vci, dst_vci, sreq SHM*: rank, comm, handler_id, tag, buf, count, datatype, src_vci, dst_vci, sreq diff --git a/src/mpid/ch4/netmod/ofi/Makefile.mk b/src/mpid/ch4/netmod/ofi/Makefile.mk index 0ee8c26e384..6bf7e2ca3fb 100644 --- a/src/mpid/ch4/netmod/ofi/Makefile.mk +++ b/src/mpid/ch4/netmod/ofi/Makefile.mk @@ -18,7 +18,9 @@ mpi_core_sources += src/mpid/ch4/netmod/ofi/func_table.c \ src/mpid/ch4/netmod/ofi/ofi_part.c \ src/mpid/ch4/netmod/ofi/ofi_events.c \ src/mpid/ch4/netmod/ofi/ofi_rndv.c \ - src/mpid/ch4/netmod/ofi/ofi_huge.c \ + src/mpid/ch4/netmod/ofi/ofi_rndv_read.c \ + src/mpid/ch4/netmod/ofi/ofi_rndv_write.c \ + src/mpid/ch4/netmod/ofi/ofi_pipeline.c \ src/mpid/ch4/netmod/ofi/ofi_progress.c \ src/mpid/ch4/netmod/ofi/ofi_am_events.c \ src/mpid/ch4/netmod/ofi/ofi_nic.c \ diff --git a/src/mpid/ch4/netmod/ofi/ofi_am.h b/src/mpid/ch4/netmod/ofi/ofi_am.h index 9abef986825..9fa5607e552 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_am.h +++ b/src/mpid/ch4/netmod/ofi/ofi_am.h @@ -215,9 +215,17 @@ MPL_STATIC_INLINE_PREFIX bool MPIDI_NM_am_check_eager(MPI_Aint am_hdr_sz, MPI_Ai } } -MPL_STATIC_INLINE_PREFIX bool 
MPIDI_NM_am_can_do_tag(void) +MPL_STATIC_INLINE_PREFIX bool MPIDI_NM_am_can_do_tag(MPIR_Request * rreq) { - return MPIDI_OFI_ENABLE_TAGGED; + if (MPIDI_OFI_ENABLE_TAGGED) { + MPI_Aint data_sz; + MPIR_Datatype_get_size_macro(MPIDIG_REQUEST(rreq, datatype), data_sz); + data_sz *= MPIDIG_REQUEST(rreq, count); + if (data_sz <= MPIDI_OFI_global.max_msg_size) { + return true; + } + } + return false; } MPL_STATIC_INLINE_PREFIX MPIDIG_recv_data_copy_cb MPIDI_NM_am_get_data_copy_cb(uint32_t attr) diff --git a/src/mpid/ch4/netmod/ofi/ofi_events.c b/src/mpid/ch4/netmod/ofi/ofi_events.c index 1945f15811a..7dc2023cf38 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_events.c +++ b/src/mpid/ch4/netmod/ofi/ofi_events.c @@ -14,7 +14,6 @@ static int peek_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq); static int peek_empty_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq); -static int send_huge_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * sreq); static int ssend_ack_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * sreq); static int chunk_done_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * req); static int inject_emu_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * req); @@ -38,9 +37,6 @@ static int peek_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * rre if (MPIDI_OFI_is_tag_rndv(wc->tag)) { mpi_errno = MPIDI_OFI_peek_rndv_event(vci, wc, rreq); goto fn_exit; - } else if (MPIDI_OFI_is_tag_huge(wc->tag)) { - mpi_errno = MPIDI_OFI_peek_huge_event(vci, wc, rreq); - goto fn_exit; } MPIR_STATUS_SET_COUNT(rreq->status, wc->len); @@ -77,211 +73,6 @@ static int peek_empty_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request return MPI_SUCCESS; } -/* GPU pipeline events */ -static int pipeline_send_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r) -{ - int mpi_errno = MPI_SUCCESS; - int c; - MPIDI_OFI_gpu_pipeline_request *req; - MPIR_Request *sreq; - void *wc_buf = NULL; - MPIR_FUNC_ENTER; - - req = (MPIDI_OFI_gpu_pipeline_request *) r; - /* get original mpi request */ - sreq = req->parent; - wc_buf = req->buf; - MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.gpu_pipeline_send_pool, wc_buf); - - MPIR_cc_decr(sreq->cc_ptr, &c); - if (c == 0) { - MPIR_Request_free(sreq); - } - MPL_free(r); - - MPIR_FUNC_EXIT; - return mpi_errno; -} - -static int pipeline_recv_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r, int event_id) -{ - int mpi_errno = MPI_SUCCESS; - int vci_local, i; - MPIDI_OFI_gpu_pipeline_request *req; - MPIR_Request *rreq; - void *wc_buf = NULL; - int in_use MPL_UNUSED; - MPIDI_OFI_gpu_task_t *task = NULL; - MPL_gpu_engine_type_t engine_type = - (MPL_gpu_engine_type_t) MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE; - - MPIR_FUNC_ENTER; - - req = (MPIDI_OFI_gpu_pipeline_request *) r; - rreq = req->parent; - wc_buf = req->buf; - MPL_free(r); - - void *recv_buf = MPIDI_OFI_REQUEST(rreq, buf); - size_t recv_count = MPIDI_OFI_REQUEST(rreq, count); - MPI_Datatype datatype = MPIDI_OFI_REQUEST(rreq, datatype); - - fi_addr_t remote_addr = MPIDI_OFI_REQUEST(rreq, pipeline_info.remote_addr); - vci_local = MPIDI_OFI_REQUEST(rreq, pipeline_info.vci_local); - - if (event_id == MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT) { - rreq->status.MPI_SOURCE = MPIDI_OFI_cqe_get_source(wc, true); - rreq->status.MPI_ERROR = MPIDI_OFI_idata_get_error_bits(wc->data); - rreq->status.MPI_TAG = MPIDI_OFI_init_get_tag(wc->tag); - - if (unlikely(MPIDI_OFI_is_tag_sync(wc->tag))) { - MPIDI_OFI_REQUEST(rreq, pipeline_info.is_sync) 
= true; - } - - uint32_t packed = MPIDI_OFI_idata_get_gpu_packed_bit(wc->data); - uint32_t n_chunks = MPIDI_OFI_idata_get_gpuchunk_bits(wc->data); - if (likely(packed == 0)) { - if (wc->len > 0) { - MPIR_Assert(n_chunks == 0); - /* First chunk arrives. */ - MPI_Aint actual_unpack_bytes; - MPIR_gpu_req yreq; - mpi_errno = - MPIR_Ilocalcopy_gpu(wc_buf, wc->len, MPIR_BYTE_INTERNAL, 0, NULL, recv_buf, - recv_count, datatype, 0, NULL, MPL_GPU_COPY_H2D, - engine_type, 1, &yreq); - MPIR_ERR_CHECK(mpi_errno); - actual_unpack_bytes = wc->len; - task = - MPIDI_OFI_create_gpu_task(MPIDI_OFI_PIPELINE_RECV, wc_buf, - actual_unpack_bytes, rreq, yreq); - DL_APPEND(MPIDI_OFI_global.gpu_recv_task_queue[vci_local], task); - MPIDI_OFI_REQUEST(rreq, pipeline_info.offset) += (size_t) actual_unpack_bytes; - } else { - /* free this chunk */ - MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, wc_buf); - MPIR_Assert(n_chunks > 0); - /* Post recv for remaining chunks. */ - MPIR_cc_dec(rreq->cc_ptr); - for (i = 0; i < n_chunks; i++) { - MPIR_cc_inc(rreq->cc_ptr); - - size_t chunk_sz = MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ; - - char *host_buf = NULL; - MPIDU_genq_private_pool_alloc_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, - (void **) &host_buf); - - MPIDI_OFI_REQUEST(rreq, event_id) = MPIDI_OFI_EVENT_RECV_GPU_PIPELINE; - - MPIDI_OFI_gpu_pipeline_request *chunk_req = NULL; - chunk_req = (MPIDI_OFI_gpu_pipeline_request *) - MPL_malloc(sizeof(MPIDI_OFI_gpu_pipeline_request), MPL_MEM_BUFFER); - if (chunk_req == NULL) { - mpi_errno = MPIR_ERR_OTHER; - goto fn_fail; - } - chunk_req->event_id = MPIDI_OFI_EVENT_RECV_GPU_PIPELINE; - chunk_req->parent = rreq; - chunk_req->buf = host_buf; - chunk_req->offset = chunk_sz * i; - int ret = 0; - if (!MPIDI_OFI_global.gpu_recv_queue && host_buf) { - ret = fi_trecv - (MPIDI_OFI_global.ctx - [MPIDI_OFI_REQUEST(rreq, pipeline_info.ctx_idx)].rx, - host_buf, chunk_sz, NULL, remote_addr, - MPIDI_OFI_REQUEST(rreq, - pipeline_info.match_bits) | - MPIDI_OFI_GPU_PIPELINE_SEND, MPIDI_OFI_REQUEST(rreq, - pipeline_info. - mask_bits), - (void *) &chunk_req->context); - } - if (MPIDI_OFI_global.gpu_recv_queue || !host_buf || ret != 0) { - MPIDI_OFI_gpu_pending_recv_t *recv_task = - MPIDI_OFI_create_recv_task(chunk_req, i, n_chunks); - DL_APPEND(MPIDI_OFI_global.gpu_recv_queue, recv_task); - } - } - } - } else { - MPIR_ERR_CHKANDJUMP(true, mpi_errno, MPI_ERR_OTHER, "**gpu_pipeline_packed"); - } - } else { - if (likely(event_id == MPIDI_OFI_EVENT_RECV_GPU_PIPELINE)) { - /* FIXME: current design unpacks all bytes from host buffer, overflow check is missing. 
*/ - MPI_Aint actual_unpack_bytes; - MPIR_gpu_req yreq; - mpi_errno = - MPIR_Ilocalcopy_gpu(wc_buf, (MPI_Aint) wc->len, MPIR_BYTE_INTERNAL, 0, NULL, - (char *) recv_buf, (MPI_Aint) recv_count, datatype, - req->offset, NULL, MPL_GPU_COPY_H2D, engine_type, 1, &yreq); - MPIR_ERR_CHECK(mpi_errno); - actual_unpack_bytes = wc->len; - MPIDI_OFI_REQUEST(rreq, pipeline_info.offset) += (size_t) actual_unpack_bytes; - task = - MPIDI_OFI_create_gpu_task(MPIDI_OFI_PIPELINE_RECV, wc_buf, actual_unpack_bytes, - rreq, yreq); - DL_APPEND(MPIDI_OFI_global.gpu_recv_task_queue[vci_local], task); - } else { - MPIR_ERR_CHKANDJUMP(true, mpi_errno, MPI_ERR_OTHER, "**gpu_pipeline_packed"); - } - } - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - rreq->status.MPI_ERROR = mpi_errno; - goto fn_exit; -} - -static int send_huge_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * sreq) -{ - int mpi_errno = MPI_SUCCESS; - int c, num_nics; - MPIR_FUNC_ENTER; - - MPIR_cc_decr(sreq->cc_ptr, &c); - - if (c == 0) { - MPIR_Comm *comm; - struct fid_mr **huge_send_mrs; - - comm = sreq->comm; - num_nics = MPIDI_OFI_COMM(comm).enable_striping ? MPIDI_OFI_global.num_nics : 1; - huge_send_mrs = MPIDI_OFI_REQUEST(sreq, huge.send_mrs); - - /* Clean up the memory region */ - for (int i = 0; i < num_nics; i++) { - uint64_t key = fi_mr_key(huge_send_mrs[i]); - MPIDI_OFI_CALL(fi_close(&huge_send_mrs[i]->fid), mr_unreg); - if (!MPIDI_OFI_ENABLE_MR_PROV_KEY) { - MPIDI_OFI_mr_key_free(MPIDI_OFI_LOCAL_MR_KEY, key); - } - } - MPL_free(huge_send_mrs); - - if (MPIDI_OFI_REQUEST(sreq, noncontig.pack.pack_buffer)) { - MPL_free(MPIDI_OFI_REQUEST(sreq, noncontig.pack.pack_buffer)); - } - - if (MPIDI_OFI_REQUEST(sreq, am_req)) { - MPIR_Request *am_sreq = MPIDI_OFI_REQUEST(sreq, am_req); - int handler_id = MPIDI_OFI_REQUEST(sreq, am_handler_id); - mpi_errno = MPIDIG_global.origin_cbs[handler_id] (am_sreq); - } - - MPIDI_CH4_REQUEST_FREE(sreq); - } - /* c != 0, ssend */ - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - static int ssend_ack_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * sreq) { MPIDI_OFI_ack_request_t *req = (MPIDI_OFI_ack_request_t *) sreq; @@ -622,25 +413,12 @@ int MPIDI_OFI_dispatch_function(int vci, struct fi_cq_tagged_entry *wc, MPIR_Req } else if (MPIDI_OFI_REQUEST(req, event_id) == MPIDI_OFI_EVENT_AM_READ) { mpi_errno = am_read_event(vci, wc, req); goto fn_exit; - } else if (MPIDI_OFI_REQUEST(req, event_id) == MPIDI_OFI_EVENT_SEND_GPU_PIPELINE) { - mpi_errno = pipeline_send_event(wc, req); - goto fn_exit; - } else if (MPIDI_OFI_REQUEST(req, event_id) == MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT) { - mpi_errno = pipeline_recv_event(wc, req, MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT); - goto fn_exit; - } else if (MPIDI_OFI_REQUEST(req, event_id) == MPIDI_OFI_EVENT_RECV_GPU_PIPELINE) { - mpi_errno = pipeline_recv_event(wc, req, MPIDI_OFI_EVENT_RECV_GPU_PIPELINE); - goto fn_exit; } else if (unlikely(1)) { switch (MPIDI_OFI_REQUEST(req, event_id)) { case MPIDI_OFI_EVENT_PEEK: mpi_errno = peek_event(vci, wc, req); break; - case MPIDI_OFI_EVENT_RECV_HUGE: - mpi_errno = MPIDI_OFI_recv_event(vci, wc, req, MPIDI_OFI_EVENT_RECV_HUGE); - break; - case MPIDI_OFI_EVENT_RECV_PACK: mpi_errno = MPIDI_OFI_recv_event(vci, wc, req, MPIDI_OFI_EVENT_RECV_PACK); break; @@ -649,10 +427,6 @@ int MPIDI_OFI_dispatch_function(int vci, struct fi_cq_tagged_entry *wc, MPIR_Req mpi_errno = MPIDI_OFI_recv_event(vci, wc, req, MPIDI_OFI_EVENT_RECV_NOPACK); break; - case MPIDI_OFI_EVENT_SEND_HUGE: - 
mpi_errno = send_huge_event(vci, wc, req); - break; - case MPIDI_OFI_EVENT_SEND_PACK: mpi_errno = MPIDI_OFI_send_event(vci, wc, req, MPIDI_OFI_EVENT_SEND_PACK); break; @@ -669,12 +443,44 @@ int MPIDI_OFI_dispatch_function(int vci, struct fi_cq_tagged_entry *wc, MPIR_Req mpi_errno = MPIDI_OFI_rndv_cts_event(vci, wc, req); break; - case MPIDI_OFI_EVENT_CHUNK_DONE: - mpi_errno = chunk_done_event(vci, wc, req); + case MPIDI_OFI_EVENT_PIPELINE_RECV_DATASIZE: + mpi_errno = MPIDI_OFI_pipeline_recv_datasize_event(wc, req); + break; + + case MPIDI_OFI_EVENT_PIPELINE_SEND_CHUNK: + mpi_errno = MPIDI_OFI_pipeline_send_chunk_event(wc, req); + break; + + case MPIDI_OFI_EVENT_PIPELINE_RECV_CHUNK: + mpi_errno = MPIDI_OFI_pipeline_recv_chunk_event(wc, req); break; - case MPIDI_OFI_EVENT_HUGE_CHUNK_DONE: - mpi_errno = MPIDI_OFI_huge_chunk_done_event(vci, wc, req); + case MPIDI_OFI_EVENT_RNDVREAD_RECV_MRS: + mpi_errno = MPIDI_OFI_rndvread_recv_mrs_event(wc, req); + break; + + case MPIDI_OFI_EVENT_RNDVREAD_READ_CHUNK: + mpi_errno = MPIDI_OFI_rndvread_read_chunk_event(wc, req); + break; + + case MPIDI_OFI_EVENT_RNDVREAD_ACK: + mpi_errno = MPIDI_OFI_rndvread_ack_event(wc, req); + break; + + case MPIDI_OFI_EVENT_RNDVWRITE_RECV_MRS: + mpi_errno = MPIDI_OFI_rndvwrite_recv_mrs_event(wc, req); + break; + + case MPIDI_OFI_EVENT_RNDVWRITE_WRITE_CHUNK: + mpi_errno = MPIDI_OFI_rndvwrite_write_chunk_event(wc, req); + break; + + case MPIDI_OFI_EVENT_RNDVWRITE_ACK: + mpi_errno = MPIDI_OFI_rndvwrite_ack_event(wc, req); + break; + + case MPIDI_OFI_EVENT_CHUNK_DONE: + mpi_errno = chunk_done_event(vci, wc, req); break; case MPIDI_OFI_EVENT_INJECT_EMU: @@ -759,12 +565,10 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret) case MPIDI_OFI_EVENT_RECV: case MPIDI_OFI_EVENT_RECV_PACK: case MPIDI_OFI_EVENT_RECV_NOPACK: - case MPIDI_OFI_EVENT_RECV_HUGE: MPIR_STATUS_SET_CANCEL_BIT(req->status, TRUE); MPIR_STATUS_SET_COUNT(req->status, 0); MPIR_Datatype_release_if_not_builtin(MPIDI_OFI_REQUEST(req, datatype)); - if ((event_id == MPIDI_OFI_EVENT_RECV_PACK || - event_id == MPIDI_OFI_EVENT_GET_HUGE) && + if ((event_id == MPIDI_OFI_EVENT_RECV_PACK) && MPIDI_OFI_REQUEST(req, noncontig.pack.pack_buffer)) { MPL_free(MPIDI_OFI_REQUEST(req, noncontig.pack.pack_buffer)); } else if (event_id == MPIDI_OFI_EVENT_RECV_NOPACK) { diff --git a/src/mpid/ch4/netmod/ofi/ofi_events.h b/src/mpid/ch4/netmod/ofi/ofi_events.h index 2c6721d7f99..7ca8d300b0d 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_events.h +++ b/src/mpid/ch4/netmod/ofi/ofi_events.h @@ -11,47 +11,20 @@ #include "ofi_am_events.h" #include "utlist.h" -/* -=== BEGIN_MPI_T_CVAR_INFO_BLOCK === -cvars: - - name : MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE - category : CH4_OFI - type : enum - default : copy_low_latency - class : none - verbosity : MPI_T_VERBOSITY_USER_BASIC - scope : MPI_T_SCOPE_LOCAL - description : |- - Specifies GPU engine type for GPU pt2pt on the receiver side. 
- compute - use a compute engine - copy_high_bandwidth - use a high-bandwidth copy engine - copy_low_latency - use a low-latency copy engine - yaksa - use Yaksa - -=== END_MPI_T_CVAR_INFO_BLOCK === -*/ - int MPIDI_OFI_rma_done_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * in_req); int MPIDI_OFI_dispatch_function(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * req); int MPIDI_OFI_recv_rndv_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq); int MPIDI_OFI_peek_rndv_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq); int MPIDI_OFI_rndv_cts_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * req); - -MPL_STATIC_INLINE_PREFIX MPL_gpu_engine_type_t MPIDI_OFI_gpu_get_recv_engine_type(void) -{ - if (MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE == - MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE_compute) { - return MPL_GPU_ENGINE_TYPE_COMPUTE; - } else if (MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE == - MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE_copy_high_bandwidth) { - return MPL_GPU_ENGINE_TYPE_COPY_HIGH_BANDWIDTH; - } else if (MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE == - MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE_copy_low_latency) { - return MPL_GPU_ENGINE_TYPE_COPY_LOW_LATENCY; - } else { - return MPL_GPU_ENGINE_TYPE_LAST; - } -} +int MPIDI_OFI_pipeline_recv_datasize_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r); +int MPIDI_OFI_pipeline_send_chunk_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r); +int MPIDI_OFI_pipeline_recv_chunk_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r); +int MPIDI_OFI_rndvread_recv_mrs_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r); +int MPIDI_OFI_rndvread_read_chunk_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r); +int MPIDI_OFI_rndvread_ack_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r); +int MPIDI_OFI_rndvwrite_recv_mrs_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r); +int MPIDI_OFI_rndvwrite_write_chunk_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r); +int MPIDI_OFI_rndvwrite_ack_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r); MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_cqe_get_source(struct fi_cq_tagged_entry *wc, bool has_err) { @@ -83,12 +56,24 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_event(int vci, if (MPIDI_OFI_REQUEST(sreq, am_req)) { MPIR_Request *am_sreq = MPIDI_OFI_REQUEST(sreq, am_req); int handler_id = MPIDI_OFI_REQUEST(sreq, am_handler_id); - mpi_errno = MPIDIG_global.origin_cbs[handler_id] (am_sreq); + if (handler_id == -1) { + /* native rndv direct */ + MPIDI_OFI_rndv_common_t *p = &MPIDI_OFI_AMREQ_COMMON(am_sreq); + MPIR_Datatype_release_if_not_builtin(p->datatype); + MPIDI_Request_complete_fast(am_sreq); + } else { + mpi_errno = MPIDIG_global.origin_cbs[handler_id] (am_sreq); + MPIR_ERR_CHECK(mpi_errno); + } } MPIDI_Request_complete_fast(sreq); + + fn_exit: MPIR_FUNC_EXIT; return mpi_errno; + fn_fail: + goto fn_exit; } MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_recv_complete(MPIR_Request * rreq, int event_id) @@ -99,7 +84,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_recv_complete(MPIR_Request * rreq, int ev #ifndef MPIDI_CH4_DIRECT_NETMOD MPIDI_anysrc_free_partner(rreq); #endif - if ((event_id == MPIDI_OFI_EVENT_RECV_PACK || event_id == MPIDI_OFI_EVENT_GET_HUGE) && + if ((event_id == MPIDI_OFI_EVENT_RECV_PACK) && (MPIDI_OFI_REQUEST(rreq, noncontig.pack.pack_buffer))) { MPI_Aint count = MPIR_STATUS_GET_COUNT(rreq->status); mpi_errno = MPIR_Localcopy_gpu(MPIDI_OFI_REQUEST(rreq, noncontig.pack.pack_buffer), count, @@ -130,8 +115,25 @@ 
MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_recv_complete(MPIR_Request * rreq, int ev MPI_Status *status = &rreq->status; MPIR_Request *am_req = MPIDI_OFI_REQUEST(rreq, am_req); int am_recv_id = MPIDI_OFI_REQUEST(rreq, am_handler_id); - mpi_errno = MPIDIG_global.tag_recv_cbs[am_recv_id] (am_req, status); - MPIR_ERR_CHECK(mpi_errno); + if (am_recv_id == -1) { + /* native rndv direct */ + + /* copy COUNT and MPI_ERROR, but skip MPI_SOURCE and MPI_RANK */ + am_req->status.count_lo = rreq->status.count_lo; + am_req->status.count_hi_and_cancelled = rreq->status.count_hi_and_cancelled; + am_req->status.MPI_ERROR = rreq->status.MPI_ERROR; + +#ifndef MPIDI_CH4_DIRECT_NETMOD + MPIDI_anysrc_free_partner(am_req); +#endif + + MPIDI_OFI_rndv_common_t *p = &MPIDI_OFI_AMREQ_COMMON(am_req); + MPIR_Datatype_release_if_not_builtin(p->datatype); + MPIDI_Request_complete_fast(am_req); + } else { + mpi_errno = MPIDIG_global.tag_recv_cbs[am_recv_id] (am_req, status); + MPIR_ERR_CHECK(mpi_errno); + } } MPIR_Datatype_release_if_not_builtin(MPIDI_OFI_REQUEST(rreq, datatype)); MPIDI_Request_complete_fast(rreq); @@ -168,9 +170,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_recv_event(int vci, struct fi_cq_tagged_e if (MPIDI_OFI_is_tag_rndv(wc->tag)) { mpi_errno = MPIDI_OFI_recv_rndv_event(vci, wc, rreq); goto fn_exit; - } else if (MPIDI_OFI_is_tag_huge(wc->tag)) { - mpi_errno = MPIDI_OFI_recv_huge_event(vci, wc, rreq); - goto fn_exit; } /* If synchronous, send ack */ @@ -180,12 +179,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_recv_event(int vci, struct fi_cq_tagged_e MPIR_ERR_CHECK(mpi_errno); } - /* If striping is enabled, this data will be counted elsewhere. */ - if (MPIDI_OFI_REQUEST(rreq, event_id) != MPIDI_OFI_EVENT_RECV_HUGE || - !MPIDI_OFI_COMM(rreq->comm).enable_striping) { - MPIR_T_PVAR_COUNTER_INC(MULTINIC, nic_recvd_bytes_count[MPIDI_OFI_REQUEST(rreq, nic_num)], - wc->len); - } mpi_errno = MPIDI_OFI_recv_complete(rreq, event_id); fn_exit: diff --git a/src/mpid/ch4/netmod/ofi/ofi_huge.c b/src/mpid/ch4/netmod/ofi/ofi_huge.c deleted file mode 100644 index 160ea7b29ab..00000000000 --- a/src/mpid/ch4/netmod/ofi/ofi_huge.c +++ /dev/null @@ -1,390 +0,0 @@ -/* - * Copyright (C) by Argonne National Laboratory - * See COPYRIGHT in top-level directory - */ - -#include -#include "ofi_impl.h" -#include "ofi_events.h" - -static int get_huge(MPIR_Request * rreq); -static int get_huge_issue_read(MPIR_Request * rreq); -static int get_huge_complete(MPIR_Request * rreq); - -static int get_huge(MPIR_Request * rreq) -{ - int mpi_errno = MPI_SUCCESS; - MPIDI_OFI_huge_remote_info_t *info = MPIDI_OFI_REQUEST(rreq, huge.remote_info); - - MPI_Aint cur_offset; - if (MPIDI_OFI_COMM(rreq->comm).enable_striping) { - cur_offset = MPIDI_OFI_STRIPE_CHUNK_SIZE; - } else { - cur_offset = MPIDI_OFI_global.max_msg_size; - } - - MPI_Aint data_sz = MPIDI_OFI_REQUEST(rreq, util.iov.iov_len); - - if (data_sz < info->msgsize) { - rreq->status.MPI_ERROR = MPI_ERR_TRUNCATE; - info->msgsize = data_sz; - } - - if (info->msgsize <= cur_offset) { - /* huge message sent to small recv buffer */ - mpi_errno = get_huge_complete(rreq); - MPIR_ERR_CHECK(mpi_errno); - goto fn_exit; - } - - get_huge_issue_read(rreq); - - fn_exit: - return mpi_errno; - fn_fail: - goto fn_exit; -} - -static uintptr_t recv_rbase(MPIDI_OFI_huge_remote_info_t * remote_info) -{ - if (!MPIDI_OFI_ENABLE_MR_VIRT_ADDRESS) { - return 0; - } else { - return (uintptr_t) remote_info->send_buf; - } -} - -static int get_huge_issue_read(MPIR_Request * rreq) -{ - int mpi_errno = MPI_SUCCESS; - 
MPIDI_OFI_huge_remote_info_t *info = MPIDI_OFI_REQUEST(rreq, huge.remote_info); - MPIR_Comm *comm = rreq->comm; - MPIR_FUNC_ENTER; - - MPI_Aint cur_offset, bytesLeft; - if (MPIDI_OFI_COMM(rreq->comm).enable_striping) { - cur_offset = MPIDI_OFI_STRIPE_CHUNK_SIZE; - } else { - cur_offset = MPIDI_OFI_global.max_msg_size; - } - bytesLeft = info->msgsize - cur_offset; - - void *recv_buf = MPIDI_OFI_REQUEST(rreq, util.iov.iov_base); - - MPI_Aint chunk_size; - if (MPIDI_OFI_COMM(comm).enable_striping) { - chunk_size = (info->msgsize - MPIDI_OFI_STRIPE_CHUNK_SIZE) / MPIDI_OFI_global.num_nics; - chunk_size = MPL_MIN(chunk_size, MPIDI_OFI_global.max_msg_size); - } else { - chunk_size = MPIDI_OFI_global.max_msg_size; - } - - int num_chunks = MPL_DIV_ROUNDUP(bytesLeft, chunk_size); - - /* note: this is receiver read from sender */ - int vci_remote = info->vci_src; - int vci_local = info->vci_dst; - - /* We'll issue multiple fi_read for every chunks. All the chunks will be tracked by a - * chunks_outstanding counter. */ - /* NOTE: there is a possibility completion happens in between issuing fi_read (due to - * MPIDI_OFI_CALL_RETRY). Thus we need initialize chunks_outstanding before issuing any - * chunk */ - /* allocate and initialize cc_ptr. It will be freed by event completion when it reaches 0 */ - MPIR_cc_t *cc_ptr; - cc_ptr = MPL_malloc(sizeof(MPIR_cc_t), MPL_MEM_OTHER); - MPIR_cc_set(cc_ptr, num_chunks); - - int issued_chunks = 0; - - int nic = 0; - while (bytesLeft > 0) { - int ctx_idx = MPIDI_OFI_get_ctx_index(vci_local, nic); - MPIDI_av_entry_t *av = MPIDIU_comm_rank_to_av(comm, info->origin_rank); - fi_addr_t addr = MPIDI_OFI_av_to_phys(av, vci_local, nic, vci_remote, nic); - uint64_t remote_key = info->rma_keys[nic]; - - MPI_Aint bytesToGet = MPL_MIN(chunk_size, bytesLeft); - - MPIDI_OFI_read_chunk_t *chunk = MPL_malloc(sizeof(MPIDI_OFI_read_chunk_t), MPL_MEM_OTHER); - chunk->event_id = MPIDI_OFI_EVENT_HUGE_CHUNK_DONE; - chunk->localreq = rreq; - chunk->chunks_outstanding = cc_ptr; - - MPIDI_OFI_cntr_incr(vci_local, nic); - MPIDI_OFI_CALL_RETRY(fi_read(MPIDI_OFI_global.ctx[ctx_idx].tx, - (void *) ((char *) recv_buf + cur_offset), - bytesToGet, NULL, addr, recv_rbase(info) + cur_offset, - remote_key, (void *) &chunk->context), - vci_local, rdma_readfrom); - MPIR_T_PVAR_COUNTER_INC(MULTINIC, nic_recvd_bytes_count[nic], bytesToGet); - if (MPIDI_OFI_COMM(comm).enable_striping) { - MPIR_T_PVAR_COUNTER_INC(MULTINIC, striped_nic_recvd_bytes_count[nic], bytesToGet); - /* round-robin to next nic */ - nic = (nic + 1) % MPIDI_OFI_global.num_nics; - } - - issued_chunks++; - cur_offset += bytesToGet; - bytesLeft -= bytesToGet; - } - - MPIR_Assert(issued_chunks == num_chunks); - - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - -static int get_huge_complete(MPIR_Request * rreq) -{ - int mpi_errno = MPI_SUCCESS; - MPIR_FUNC_ENTER; - - MPIDI_OFI_huge_remote_info_t *info = MPIDI_OFI_REQUEST(rreq, huge.remote_info); - - /* note: it's receiver ack sender */ - int vci_remote = info->vci_src; - int vci_local = info->vci_dst; - - /* important: save comm_ptr because MPIDI_OFI_recv_complete may free the request. 
*/ - MPIR_Comm *comm_ptr = rreq->comm; - MPIR_STATUS_SET_COUNT(rreq->status, info->msgsize); - - mpi_errno = MPIDI_OFI_recv_complete(rreq, MPIDI_OFI_EVENT_GET_HUGE); - - MPIDI_OFI_send_control_t ctrl; - ctrl.type = MPIDI_OFI_CTRL_HUGEACK; - ctrl.u.huge_ack.ackreq = info->ackreq; - mpi_errno = MPIDI_NM_am_send_hdr(info->origin_rank, comm_ptr, - MPIDI_OFI_INTERNAL_HANDLER_CONTROL, - &ctrl, sizeof(ctrl), vci_local, vci_remote); - MPIR_ERR_CHECK(mpi_errno); - - MPL_free(info); - - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - -/* this function called by recv event of a huge message */ -int MPIDI_OFI_recv_huge_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq) -{ - int mpi_errno = MPI_SUCCESS; - MPIR_Comm *comm_ptr; - MPIR_FUNC_ENTER; - - bool ready_to_get = false; - if (MPIDI_OFI_REQUEST(rreq, event_id) != MPIDI_OFI_EVENT_RECV_HUGE) { - /* huge send recved by a small buffer */ - } else if (MPIDI_OFI_COMM(rreq->comm).enable_striping) { - MPIR_Assert(wc->len == MPIDI_OFI_STRIPE_CHUNK_SIZE); - } else { - MPIR_Assert(wc->len == MPIDI_OFI_global.max_msg_size); - } - - comm_ptr = rreq->comm; - MPIR_T_PVAR_COUNTER_INC(MULTINIC, nic_recvd_bytes_count[MPIDI_OFI_REQUEST(rreq, nic_num)], - wc->len); - if (MPIDI_OFI_REQUEST(rreq, huge.remote_info)) { - /* this is mrecv, we already got remote info */ - ready_to_get = true; - } else { - /* Check for remote control info */ - MPIDI_OFI_huge_recv_list_t *list_ptr; - int comm_id = comm_ptr->recvcontext_id; - int rank = MPIDI_OFI_cqe_get_source(wc, false); - int tag = (MPIDI_OFI_TAG_MASK & wc->tag); - - LL_FOREACH(MPIDI_OFI_global.per_vci[vci].huge_ctrl_head, list_ptr) { - if (list_ptr->comm_id == comm_id && list_ptr->rank == rank && list_ptr->tag == tag) { - MPIDI_OFI_REQUEST(rreq, huge.remote_info) = list_ptr->u.info; - LL_DELETE(MPIDI_OFI_global.per_vci[vci].huge_ctrl_head, - MPIDI_OFI_global.per_vci[vci].huge_ctrl_tail, list_ptr); - MPL_free(list_ptr); - ready_to_get = true; - break; - } - } - } - - if (!ready_to_get) { - MPIDI_OFI_huge_recv_list_t *list_ptr; - - list_ptr = (MPIDI_OFI_huge_recv_list_t *) MPL_calloc(sizeof(*list_ptr), 1, MPL_MEM_BUFFER); - if (!list_ptr) - MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem"); - - list_ptr->comm_id = comm_ptr->recvcontext_id; - list_ptr->rank = MPIDI_OFI_cqe_get_source(wc, false); - list_ptr->tag = (MPIDI_OFI_TAG_MASK & wc->tag); - list_ptr->u.rreq = rreq; - - LL_APPEND(MPIDI_OFI_global.per_vci[vci].huge_recv_head, - MPIDI_OFI_global.per_vci[vci].huge_recv_tail, list_ptr); - /* control handler will finish the recv */ - } else { - /* proceed to get the huge message */ - mpi_errno = get_huge(rreq); - MPIR_ERR_CHECK(mpi_errno); - } - - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - -/* This function is called when we receive a huge control message */ -int MPIDI_OFI_recv_huge_control(int vci, int comm_id, int rank, int tag, - MPIDI_OFI_huge_remote_info_t * info_ptr) -{ - int mpi_errno = MPI_SUCCESS; - MPIR_FUNC_ENTER; - - MPIDI_OFI_huge_recv_list_t *list_ptr; - MPIR_Request *rreq = NULL; - MPIDI_OFI_huge_remote_info_t *info; - - /* need persist the info. It will eventually get freed at recv completion */ - info = MPL_malloc(sizeof(MPIDI_OFI_huge_remote_info_t), MPL_MEM_OTHER); - MPIR_Assert(info); - memcpy(info, info_ptr, sizeof(*info)); - - /* If there has been a posted receive, search through the list of unmatched - * receives to find the one that goes with the incoming message. 
*/ - LL_FOREACH(MPIDI_OFI_global.per_vci[vci].huge_recv_head, list_ptr) { - if (list_ptr->comm_id == comm_id && list_ptr->rank == rank && list_ptr->tag == tag) { - rreq = list_ptr->u.rreq; - LL_DELETE(MPIDI_OFI_global.per_vci[vci].huge_recv_head, - MPIDI_OFI_global.per_vci[vci].huge_recv_tail, list_ptr); - MPL_free(list_ptr); - break; - } - } - - if (!rreq) { - list_ptr = (MPIDI_OFI_huge_recv_list_t *) MPL_calloc(sizeof(MPIDI_OFI_huge_recv_list_t), - 1, MPL_MEM_OTHER); - if (!list_ptr) { - MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem"); - } - list_ptr->comm_id = comm_id; - list_ptr->rank = rank; - list_ptr->tag = tag; - list_ptr->u.info = info; - - LL_APPEND(MPIDI_OFI_global.per_vci[vci].huge_ctrl_head, - MPIDI_OFI_global.per_vci[vci].huge_ctrl_tail, list_ptr); - /* let MPIDI_OFI_recv_huge_event finish the recv */ - } else if (MPIDI_OFI_REQUEST(rreq, kind) == MPIDI_OFI_req_kind__mprobe) { - /* attach info and finish the mprobe */ - MPIDI_OFI_REQUEST(rreq, huge.remote_info) = info; - MPIR_STATUS_SET_COUNT(rreq->status, info->msgsize); - MPL_atomic_release_store_int(&(MPIDI_OFI_REQUEST(rreq, peek_status)), MPIDI_OFI_PEEK_FOUND); - } else { - /* attach info and finish recv */ - MPIDI_OFI_REQUEST(rreq, huge.remote_info) = info; - mpi_errno = get_huge(rreq); - MPIR_ERR_CHECK(mpi_errno); - } - - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - -int MPIDI_OFI_peek_huge_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq) -{ - int mpi_errno = MPI_SUCCESS; - MPIR_FUNC_ENTER; - - MPI_Aint count = 0; - MPIDI_OFI_huge_recv_list_t *list_ptr; - bool found_msg = false; - - /* If this is a huge message, find the control message on the unexpected list that matches - * with this and return the size in that. */ - LL_FOREACH(MPIDI_OFI_global.per_vci[vci].huge_ctrl_head, list_ptr) { - /* FIXME: fix the type of comm_id */ - int comm_id = rreq->comm->recvcontext_id; - int rank = MPIDI_OFI_cqe_get_source(wc, false); - int tag = (int) (MPIDI_OFI_TAG_MASK & wc->tag); - if (list_ptr->comm_id == comm_id && list_ptr->rank == rank && list_ptr->tag == tag) { - count = list_ptr->u.info->msgsize; - found_msg = true; - break; - } - } - if (found_msg) { - if (MPIDI_OFI_REQUEST(rreq, kind) == MPIDI_OFI_req_kind__mprobe) { - MPIDI_OFI_REQUEST(rreq, huge.remote_info) = list_ptr->u.info; - LL_DELETE(MPIDI_OFI_global.per_vci[vci].huge_ctrl_head, - MPIDI_OFI_global.per_vci[vci].huge_ctrl_tail, list_ptr); - MPL_free(list_ptr); - } - MPIR_STATUS_SET_COUNT(rreq->status, count); - /* peek_status should be the last thing to change in rreq. Reason is - * we use peek_status to indicate peek_event has completed and all the - * relevant values have been copied to rreq. */ - MPL_atomic_release_store_int(&(MPIDI_OFI_REQUEST(rreq, peek_status)), MPIDI_OFI_PEEK_FOUND); - } else if (MPIDI_OFI_REQUEST(rreq, kind) == MPIDI_OFI_req_kind__probe) { - /* return not found for this probe. User can probe again. */ - MPL_atomic_release_store_int(&(MPIDI_OFI_REQUEST(rreq, peek_status)), - MPIDI_OFI_PEEK_NOT_FOUND); - } else if (MPIDI_OFI_REQUEST(rreq, kind) == MPIDI_OFI_req_kind__mprobe) { - /* fill the status with wc info. 
Count is still missing */ - - /* post the rreq to list and let control handler handle it */ - MPIDI_OFI_huge_recv_list_t *huge_list_ptr; - - huge_list_ptr = - (MPIDI_OFI_huge_recv_list_t *) MPL_calloc(sizeof(*huge_list_ptr), 1, MPL_MEM_COMM); - MPIR_ERR_CHKANDJUMP(huge_list_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem"); - - huge_list_ptr->comm_id = rreq->comm->recvcontext_id; - huge_list_ptr->rank = MPIDI_OFI_cqe_get_source(wc, false); - huge_list_ptr->tag = MPIDI_OFI_TAG_MASK & wc->tag; - huge_list_ptr->u.rreq = rreq; - - LL_APPEND(MPIDI_OFI_global.per_vci[vci].huge_recv_head, - MPIDI_OFI_global.per_vci[vci].huge_recv_tail, huge_list_ptr); - } - - - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - -int MPIDI_OFI_huge_chunk_done_event(int vci, struct fi_cq_tagged_entry *wc, void *req) -{ - int mpi_errno = MPI_SUCCESS; - MPIDI_OFI_read_chunk_t *chunk_req = (MPIDI_OFI_read_chunk_t *) req; - - int c; - MPIR_cc_decr(chunk_req->chunks_outstanding, &c); - - if (c == 0) { - MPL_free(chunk_req->chunks_outstanding); - mpi_errno = get_huge_complete(chunk_req->localreq); - MPIR_ERR_CHECK(mpi_errno); - } - - MPL_free(chunk_req); - - fn_exit: - return mpi_errno; - fn_fail: - goto fn_exit; -} diff --git a/src/mpid/ch4/netmod/ofi/ofi_impl.h b/src/mpid/ch4/netmod/ofi/ofi_impl.h index 891cc98e187..8f441c16ca6 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_impl.h +++ b/src/mpid/ch4/netmod/ofi/ofi_impl.h @@ -56,6 +56,8 @@ ATTRIBUTE((unused)); MPIDI_OFI_global.prov_use[nic]->domain_attr->name : "(n/a)") #define MPIDI_OFI_DEFAULT_NIC_NAME (MPIDI_OFI_NIC_NAME(0)) +#define MPIDI_OFI_EAGER_THRESH (MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD == -1 ? MPIDI_OFI_global.max_msg_size : MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD) + int MPIDI_OFI_progress_uninlined(int vci); int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret); int MPIDI_OFI_flush_send_queue(void); @@ -293,13 +295,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_mr_bind(struct fi_info *prov, struct fid_ #define MPIDI_OFI_COLL_MR_KEY 1 #define MPIDI_OFI_INVALID_MR_KEY 0xFFFFFFFFFFFFFFFFULL int MPIDI_OFI_retry_progress(int vci, int retry); -int MPIDI_OFI_recv_huge_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq); -int MPIDI_OFI_recv_huge_control(int vci, int comm_id, int rank, int tag, - MPIDI_OFI_huge_remote_info_t * info); -int MPIDI_OFI_peek_huge_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq); -int MPIDI_OFI_huge_chunk_done_event(int vci, struct fi_cq_tagged_entry *wc, void *req); -int MPIDI_OFI_control_handler(void *am_hdr, void *data, MPI_Aint data_sz, - uint32_t attr, MPIR_Request ** req); int MPIDI_OFI_am_rdma_read_ack_handler(void *am_hdr, void *data, MPI_Aint in_data_sz, uint32_t attr, MPIR_Request ** req); int MPIDI_OFI_rndv_info_handler(void *am_hdr, void *data, MPI_Aint data_sz, @@ -486,14 +481,20 @@ MPL_STATIC_INLINE_PREFIX bool MPIDI_OFI_is_tag_sync(uint64_t match_bits) return ((match_bits & MPIDI_OFI_PROTOCOL_MASK) == MPIDI_OFI_SYNC_SEND); } -MPL_STATIC_INLINE_PREFIX bool MPIDI_OFI_is_tag_huge(uint64_t match_bits) +MPL_STATIC_INLINE_PREFIX bool MPIDI_OFI_is_tag_rndv(uint64_t match_bits) { - return ((match_bits & MPIDI_OFI_PROTOCOL_MASK) == MPIDI_OFI_HUGE_SEND); + return (bool) (match_bits & MPIDI_OFI_RNDV_SEND); } -MPL_STATIC_INLINE_PREFIX bool MPIDI_OFI_is_tag_rndv(uint64_t match_bits) +MPL_STATIC_INLINE_PREFIX bool MPIDI_OFI_is_tag_rndv_pack(uint64_t match_bits) +{ + return ((match_bits & MPIDI_OFI_PROTOCOL_MASK) == MPIDI_OFI_RNDV_PACK); +} + +MPL_STATIC_INLINE_PREFIX bool 
MPIDI_OFI_rndv_need_pack(int dt_contig, MPL_pointer_attr_t * attr) { - return ((match_bits & MPIDI_OFI_PROTOCOL_MASK) == MPIDI_OFI_RNDV_SEND); + /* assume noncontig data or device data can benefit from pipelined packing/unpacking */ + return (!dt_contig || MPL_gpu_attr_is_dev(attr)); } MPL_STATIC_INLINE_PREFIX uint64_t MPIDI_OFI_init_sendtag(int contextid, int source, int tag) @@ -856,304 +857,35 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_gpu_free_pack_buffer(void *ptr) } } -MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_gpu_pipeline_chunk_size(size_t data_sz) -{ - int chunk_size = MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ; - if (data_sz <= MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ) { - chunk_size = data_sz; - } - return chunk_size; -} - -MPL_STATIC_INLINE_PREFIX MPIDI_OFI_gpu_task_t *MPIDI_OFI_create_gpu_task(MPIDI_OFI_pipeline_type_t - type, void *buf, - size_t len, - MPIR_Request * request, - MPIR_gpu_req yreq) -{ - MPIDI_OFI_gpu_task_t *task = - (MPIDI_OFI_gpu_task_t *) MPL_malloc(sizeof(MPIDI_OFI_gpu_task_t), MPL_MEM_OTHER); - MPIR_Assert(task != NULL); - task->type = type; - task->status = MPIDI_OFI_PIPELINE_READY; - task->buf = buf; - task->len = len; - task->request = request; - task->yreq = yreq; - task->prev = NULL; - task->next = NULL; - return task; -} - -MPL_STATIC_INLINE_PREFIX MPIDI_OFI_gpu_pending_recv_t - * MPIDI_OFI_create_recv_task(MPIDI_OFI_gpu_pipeline_request * req, int idx, int n_chunks) -{ - MPIDI_OFI_gpu_pending_recv_t *task = - (MPIDI_OFI_gpu_pending_recv_t *) MPL_malloc(sizeof(MPIDI_OFI_gpu_pending_recv_t), - MPL_MEM_OTHER); - MPIR_Assert(task); - task->req = req; - task->idx = idx; - task->n_chunks = n_chunks; - task->prev = NULL; - task->next = NULL; - return task; -} - -MPL_STATIC_INLINE_PREFIX MPIDI_OFI_gpu_pending_send_t *MPIDI_OFI_create_send_task(MPIR_Request * - req, - void *send_buf, - MPI_Aint count, - MPI_Datatype - datatype, - MPL_pointer_attr_t - attr, - MPI_Aint left_sz, - int dt_contig) -{ - MPIDI_OFI_gpu_pending_send_t *task = - (MPIDI_OFI_gpu_pending_send_t *) MPL_malloc(sizeof(MPIDI_OFI_gpu_pending_send_t), - MPL_MEM_OTHER); - MPIR_Assert(task); - task->sreq = req; - task->attr = attr; - task->send_buf = send_buf; - task->datatype = datatype; - MPIR_Datatype_add_ref_if_not_builtin(datatype); - task->offset = 0; - task->n_chunks = 0; - task->left_sz = left_sz; - task->count = count; - task->dt_contig = dt_contig; - task->prev = NULL; - task->next = NULL; - return task; -} - -static int MPIDI_OFI_gpu_progress_task(MPIDI_OFI_gpu_task_t * gpu_queue[], int vni); - -static int MPIDI_OFI_gpu_progress_send(void) -{ - int mpi_errno = MPI_SUCCESS; - MPL_gpu_engine_type_t engine_type = - (MPL_gpu_engine_type_t) MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE; - - while (MPIDI_OFI_global.gpu_send_queue) { - char *host_buf = NULL; - MPI_Aint chunk_sz; - int vci_local = -1; - - MPIDI_OFI_gpu_pending_send_t *send_task = MPIDI_OFI_global.gpu_send_queue; - int block_sz = MPIDI_OFI_REQUEST(send_task->sreq, pipeline_info.chunk_sz); - while (send_task->left_sz > 0) { - MPIDI_OFI_gpu_task_t *task = NULL; - chunk_sz = send_task->left_sz > block_sz ? block_sz : send_task->left_sz; - host_buf = NULL; - MPIDU_genq_private_pool_alloc_cell(MPIDI_OFI_global.gpu_pipeline_send_pool, - (void **) &host_buf); - if (host_buf == NULL) { - goto fn_exit; - } - MPI_Aint actual_pack_bytes; - MPIR_gpu_req yreq; - int commit = send_task->left_sz <= chunk_sz ? 
1 : 0; - if (!commit && - !MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST && - send_task->n_chunks % MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK == - MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK - 1) - commit = 1; - mpi_errno = - MPIR_Ilocalcopy_gpu((char *) send_task->send_buf, send_task->count, - send_task->datatype, send_task->offset, &send_task->attr, - host_buf, chunk_sz, MPIR_BYTE_INTERNAL, 0, NULL, - MPL_GPU_COPY_D2H, engine_type, commit, &yreq); - MPIR_ERR_CHECK(mpi_errno); - actual_pack_bytes = chunk_sz; - task = - MPIDI_OFI_create_gpu_task(MPIDI_OFI_PIPELINE_SEND, host_buf, actual_pack_bytes, - send_task->sreq, yreq); - send_task->offset += (size_t) actual_pack_bytes; - send_task->left_sz -= (size_t) actual_pack_bytes; - vci_local = MPIDI_OFI_REQUEST(send_task->sreq, pipeline_info.vci_local); - MPIR_Assert(vci_local < MPIDI_CH4_MAX_VCIS); - DL_APPEND(MPIDI_OFI_global.gpu_send_task_queue[vci_local], task); - send_task->n_chunks++; - /* Increase request completion cnt, cc is 1 more than necessary - * to prevent parent request being freed prematurally. */ - MPIR_cc_inc(send_task->sreq->cc_ptr); - } - /* all done, decrease cc by 1 to allow parent request to be freed - * when complete */ - MPIR_cc_dec(send_task->sreq->cc_ptr); - /* Update correct number of chunks in immediate data. */ - MPIDI_OFI_idata_set_gpuchunk_bits(&MPIDI_OFI_REQUEST - (send_task->sreq, pipeline_info.cq_data), - send_task->n_chunks); - DL_DELETE(MPIDI_OFI_global.gpu_send_queue, send_task); - MPIR_Datatype_release_if_not_builtin(send_task->datatype); - MPL_free(send_task); - - if (vci_local != -1) - MPIDI_OFI_gpu_progress_task(MPIDI_OFI_global.gpu_send_task_queue, vci_local); - - } - - fn_exit: - return mpi_errno; - fn_fail: - mpi_errno = MPI_ERR_OTHER; - goto fn_exit; -} - -MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_gpu_progress_recv(void) +MPL_STATIC_INLINE_PREFIX MPL_gpu_engine_type_t MPIDI_OFI_gpu_get_send_engine_type(void) { - int mpi_errno = MPI_SUCCESS; - - while (MPIDI_OFI_global.gpu_recv_queue) { - MPIDI_OFI_gpu_pending_recv_t *recv_task = MPIDI_OFI_global.gpu_recv_queue; - MPIDI_OFI_gpu_pipeline_request *chunk_req = recv_task->req; - MPIR_Request *rreq = chunk_req->parent; - void *host_buf = chunk_req->buf; - if (!host_buf) { - MPIDU_genq_private_pool_alloc_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, - (void **) &host_buf); - if (!host_buf) { - break; - } - chunk_req->buf = host_buf; - } - fi_addr_t remote_addr = MPIDI_OFI_REQUEST(rreq, pipeline_info.remote_addr); - - int ret = fi_trecv(MPIDI_OFI_global.ctx[MPIDI_OFI_REQUEST(rreq, pipeline_info.ctx_idx)].rx, - (void *) host_buf, - MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ, NULL, remote_addr, - MPIDI_OFI_REQUEST(rreq, - pipeline_info.match_bits) | - MPIDI_OFI_GPU_PIPELINE_SEND, - MPIDI_OFI_REQUEST(rreq, pipeline_info.mask_bits), - (void *) &chunk_req->context); - if (ret == 0) { - DL_DELETE(MPIDI_OFI_global.gpu_recv_queue, recv_task); - MPL_free(recv_task); - } else if (ret == -FI_EAGAIN || ret == -FI_ENOMEM) { - break; - } else { - goto fn_fail; - } + if (MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE == MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE_compute) { + return MPL_GPU_ENGINE_TYPE_COMPUTE; + } else if (MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE == + MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE_copy_high_bandwidth) { + return MPL_GPU_ENGINE_TYPE_COPY_HIGH_BANDWIDTH; + } else if (MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE == + MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE_copy_low_latency) { + return MPL_GPU_ENGINE_TYPE_COPY_LOW_LATENCY; + } else { + return 
MPL_GPU_ENGINE_TYPE_LAST; } - - fn_exit: - return mpi_errno; - fn_fail: - mpi_errno = MPI_ERR_OTHER; - goto fn_exit; } -static int MPIDI_OFI_gpu_progress_task(MPIDI_OFI_gpu_task_t * gpu_queue[], int vni) +MPL_STATIC_INLINE_PREFIX MPL_gpu_engine_type_t MPIDI_OFI_gpu_get_recv_engine_type(void) { - int mpi_errno = MPI_SUCCESS; - MPIDI_OFI_gpu_task_t *task = NULL; - MPIDI_OFI_gpu_task_t *tmp; - - DL_FOREACH_SAFE(gpu_queue[vni], task, tmp) { - if (task->status == MPIDI_OFI_PIPELINE_EXEC) { - /* Avoid the deadlock of re-launching an executing OFI task. */ - goto fn_exit; - } - - MPIR_gpu_req *yreq = &task->yreq; - int completed = 0; - if (yreq->type == MPIR_GPU_REQUEST) { - mpi_errno = MPL_gpu_test(&yreq->u.gpu_req, &completed); - MPIR_ERR_CHECK(mpi_errno); - } else if (yreq->type == MPIR_TYPEREP_REQUEST) { - MPIR_Typerep_test(yreq->u.y_req, &completed); - } else { - completed = 1; - } - if (completed == 1) { - /* GPU transfer completes. */ - task->status = MPIDI_OFI_PIPELINE_EXEC; - MPIR_Request *request = task->request; - - if (task->type == MPIDI_OFI_PIPELINE_SEND) { - MPIDI_OFI_gpu_pipeline_request *chunk_req = (MPIDI_OFI_gpu_pipeline_request *) - MPL_malloc(sizeof(MPIDI_OFI_gpu_pipeline_request), MPL_MEM_BUFFER); - MPIR_ERR_CHKANDJUMP1(chunk_req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", - "**nomem %s", "GPU pipelining chunk_req alloc"); - chunk_req->parent = request; - chunk_req->event_id = MPIDI_OFI_EVENT_SEND_GPU_PIPELINE; - chunk_req->buf = task->buf; - MPIDI_OFI_CALL(fi_tsenddata - (MPIDI_OFI_global.ctx - [MPIDI_OFI_REQUEST(request, pipeline_info.ctx_idx)].tx, - task->buf, task->len, NULL /* desc */ , - MPIDI_OFI_REQUEST(request, pipeline_info.cq_data), - MPIDI_OFI_REQUEST(request, pipeline_info.remote_addr), - MPIDI_OFI_REQUEST(request, - pipeline_info.match_bits) | - MPIDI_OFI_GPU_PIPELINE_SEND, (void *) &chunk_req->context), - tsenddata); - DL_DELETE(gpu_queue[vni], task); - MPL_free(task); - } else { - MPIR_Assert(task->type == MPIDI_OFI_PIPELINE_RECV); - int c; - MPIR_cc_decr(request->cc_ptr, &c); - if (c == 0) { - /* If synchronous, send ack */ - if (unlikely(MPIDI_OFI_REQUEST(request, pipeline_info.is_sync))) { - int context_id = MPIDI_OFI_REQUEST(request, context_id); - mpi_errno = MPIDI_OFI_send_ack(request, context_id, NULL, 0); - MPIR_ERR_CHECK(mpi_errno); - } - /* Set number of bytes in status. */ - MPIR_STATUS_SET_COUNT(request->status, - MPIDI_OFI_REQUEST(request, pipeline_info.offset)); - - MPIR_Datatype_release_if_not_builtin(MPIDI_OFI_REQUEST(request, datatype)); - MPIR_Request_free(request); - } - - /* For recv, now task can be deleted from DL. */ - DL_DELETE(gpu_queue[vni], task); - /* Free host buffer, yaksa request and task. 
*/ - if (task->type == MPIDI_OFI_PIPELINE_RECV) - MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, - task->buf); - else - MPIDI_OFI_gpu_free_pack_buffer(task->buf); - MPL_free(task); - } - } else { - goto fn_exit; - } + if (MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE == + MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE_compute) { + return MPL_GPU_ENGINE_TYPE_COMPUTE; + } else if (MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE == + MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE_copy_high_bandwidth) { + return MPL_GPU_ENGINE_TYPE_COPY_HIGH_BANDWIDTH; + } else if (MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE == + MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE_copy_low_latency) { + return MPL_GPU_ENGINE_TYPE_COPY_LOW_LATENCY; + } else { + return MPL_GPU_ENGINE_TYPE_LAST; } - - fn_exit: - return mpi_errno; - fn_fail: - mpi_errno = MPI_ERR_OTHER; - goto fn_exit; -} - -MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_gpu_progress(int vni) -{ - int mpi_errno = MPI_SUCCESS; - - mpi_errno = MPIDI_OFI_gpu_progress_task(MPIDI_OFI_global.gpu_recv_task_queue, vni); - MPIR_ERR_CHECK(mpi_errno); - mpi_errno = MPIDI_OFI_gpu_progress_task(MPIDI_OFI_global.gpu_send_task_queue, vni); - MPIR_ERR_CHECK(mpi_errno); - mpi_errno = MPIDI_OFI_gpu_progress_send(); - MPIR_ERR_CHECK(mpi_errno); - mpi_errno = MPIDI_OFI_gpu_progress_recv(); - MPIR_ERR_CHECK(mpi_errno); - - fn_exit: - return mpi_errno; - fn_fail: - goto fn_exit; } #endif /* OFI_IMPL_H_INCLUDED */ diff --git a/src/mpid/ch4/netmod/ofi/ofi_init.c b/src/mpid/ch4/netmod/ofi/ofi_init.c index f6480e3c57c..3aab17f80a0 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_init.c +++ b/src/mpid/ch4/netmod/ofi/ofi_init.c @@ -375,21 +375,6 @@ categories : Specifies the maximum number of iovecs to allocate for RMA operations to/from noncontiguous buffers. - - name : MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE - category : CH4_OFI - type : int - default : -1 - class : none - verbosity : MPI_T_VERBOSITY_USER_BASIC - scope : MPI_T_SCOPE_LOCAL - description : >- - This cvar controls the message size at which OFI native path switches from eager to - rendezvous mode. It does not affect the AM path eager limit. Having this gives a way to - reliably test native non-path. - If the number is positive, OFI will init the MPIDI_OFI_global.max_msg_size to the value of - cvar. If the number is negative, OFI will init the MPIDI_OFI_globa.max_msg_size using - whatever provider gives (which might be unlimited for socket provider). - - name : MPIR_CVAR_CH4_OFI_MAX_NICS category : CH4 type : int @@ -470,30 +455,7 @@ categories : description : >- If true, enable OFI triggered ops for MPI collectives. - - name : MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE - category : CH4_OFI - type : boolean - default : false - class : none - verbosity : MPI_T_VERBOSITY_USER_BASIC - scope : MPI_T_SCOPE_LOCAL - description : >- - If true, enable pipeline for GPU data transfer. - GPU pipeline does not support non-contiguous datatypes or mixed buffer types - (i.e. GPU send buffer, host recv buffer). If GPU pipeline is enabled, the unsupported - scenarios will cause undefined behavior if encountered. - - - name : MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD - category : CH4_OFI - type : int - default : 131072 - class : none - verbosity : MPI_T_VERBOSITY_USER_BASIC - scope : MPI_T_SCOPE_LOCAL - description : >- - This is the threshold to start using GPU pipeline. 
- - - name : MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ + - name : MPIR_CVAR_CH4_OFI_PIPELINE_CHUNK_SZ category : CH4_OFI type : int default : 1048576 @@ -501,9 +463,9 @@ categories : verbosity : MPI_T_VERBOSITY_USER_BASIC scope : MPI_T_SCOPE_LOCAL description : >- - Specifies the buffer size (in bytes) for GPU pipeline data transfer. + Specifies the chunk size (in bytes) for pipeline data transfer. - - name : MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK + - name : MPIR_CVAR_CH4_OFI_PIPELINE_NUM_CHUNKS category : CH4_OFI type : int default : 32 @@ -511,40 +473,35 @@ categories : verbosity : MPI_T_VERBOSITY_USER_BASIC scope : MPI_T_SCOPE_LOCAL description : >- - Specifies the number of buffers for GPU pipeline data transfer in - each block/chunk of the pool. + Specifies the number of chunk buffers for pipeline data transfer. - - name : MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS + - name : MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE category : CH4_OFI - type : int - default : 32 + type : enum + default : copy_low_latency class : none verbosity : MPI_T_VERBOSITY_USER_BASIC scope : MPI_T_SCOPE_LOCAL - description : >- - Specifies the total number of buffers for GPU pipeline data transfer - - - name : MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE - category : CH4_OFI - type : int - default : 0 - class : none - verbosity : MPI_T_VERBOSITY_USER_BASIC - scope : MPI_T_SCOPE_LOCAL - description : >- - Specifies the GPU engine type for GPU pipeline on the sender side, - default is MPL_GPU_ENGINE_TYPE_COMPUTE - - - name : MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE + description : |- + Specifies GPU engine type for GPU pt2pt on the sender side. + compute - use a compute engine + copy_high_bandwidth - use a high-bandwidth copy engine + copy_low_latency - use a low-latency copy engine + yaksa - use Yaksa + + - name : MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE category : CH4_OFI - type : int - default : 0 + type : enum + default : copy_low_latency class : none verbosity : MPI_T_VERBOSITY_USER_BASIC scope : MPI_T_SCOPE_LOCAL - description : >- - Specifies the GPU engine type for GPU pipeline on the receiver side, - default is MPL_GPU_ENGINE_TYPE_COMPUTE + description : |- + Specifies GPU engine type for GPU pt2pt on the receiver side. 
+ compute - use a compute engine + copy_high_bandwidth - use a high-bandwidth copy engine + copy_low_latency - use a low-latency copy engine + yaksa - use Yaksa === END_MPI_T_CVAR_INFO_BLOCK === */ @@ -646,8 +603,6 @@ int MPIDI_OFI_init_local(int *tag_bits) MPL_COMPILE_TIME_ASSERT(offsetof(struct MPIR_Request, dev.ch4.netmod) == offsetof(MPIDI_OFI_chunk_request, context)); - MPL_COMPILE_TIME_ASSERT(offsetof(struct MPIR_Request, dev.ch4.netmod) == - offsetof(MPIDI_OFI_read_chunk_t, context)); MPL_COMPILE_TIME_ASSERT(offsetof(struct MPIR_Request, dev.ch4.netmod) == offsetof(MPIDI_OFI_am_repost_request_t, context)); MPL_COMPILE_TIME_ASSERT(offsetof(struct MPIR_Request, dev.ch4.netmod) == @@ -655,7 +610,7 @@ int MPIDI_OFI_init_local(int *tag_bits) MPL_COMPILE_TIME_ASSERT(offsetof(struct MPIR_Request, dev.ch4.netmod) == offsetof(MPIDI_OFI_dynamic_process_request_t, context)); MPL_COMPILE_TIME_ASSERT(offsetof(struct MPIR_Request, dev.ch4.am.netmod_am.ofi.context) == - offsetof(struct MPIR_Request, dev.ch4.netmod.ofi.context)); + offsetof(struct MPIR_Request, dev.ch4.netmod.ofi.direct.context)); MPL_COMPILE_TIME_ASSERT(sizeof(MPIDI_Devreq_t) >= sizeof(MPIDI_OFI_request_t)); int err; @@ -676,28 +631,6 @@ int MPIDI_OFI_init_local(int *tag_bits) /* -------------------------------- */ MPIDIU_map_create(&MPIDI_OFI_global.win_map, MPL_MEM_RMA); - /* Create pack buffer pool for GPU pipeline */ - if (MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE) { - mpi_errno = - MPIDU_genq_private_pool_create(MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ, - MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK, - MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS, - host_alloc_registered, - host_free_registered, - &MPIDI_OFI_global.gpu_pipeline_send_pool); - MPIR_ERR_CHECK(mpi_errno); - mpi_errno = - MPIDU_genq_private_pool_create(MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ, - MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK, - MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS, - host_alloc_registered, - host_free_registered, - &MPIDI_OFI_global.gpu_pipeline_recv_pool); - MPIR_ERR_CHECK(mpi_errno); - MPIDI_OFI_global.gpu_send_queue = NULL; - MPIDI_OFI_global.gpu_recv_queue = NULL; - } - /* Initialize RMA keys allocator */ MPIDI_OFI_mr_key_allocator_init(); @@ -794,8 +727,7 @@ int MPIDI_OFI_init_local(int *tag_bits) MPIR_Assert(MPIDI_OFI_DEFAULT_SHORT_SEND_SIZE <= MPIR_CVAR_CH4_PACK_BUFFER_SIZE); MPIDI_OFI_global.num_vcis = 1; - MPIDI_OFI_am_init(0); - MPIDI_OFI_am_post_recv(0, 0); + MPIDI_OFI_init_per_vci(0); fn_exit: *tag_bits = MPIDI_OFI_TAG_BITS; @@ -984,6 +916,10 @@ int MPIDI_OFI_mpi_finalize_hook(void) MPIDIU_map_destroy(MPIDI_OFI_global.win_map); + for (int vci = 0; vci < MPIDI_OFI_global.num_vcis; vci++) { + MPIDU_genq_private_pool_destroy(MPIDI_OFI_global.per_vci[vci].pipeline_pool); + } + if (MPIDI_OFI_ENABLE_AM) { for (int vci = 0; vci < MPIDI_OFI_global.num_vcis; vci++) { while (MPIDI_OFI_global.per_vci[vci].am_unordered_msgs) { @@ -1007,11 +943,6 @@ int MPIDI_OFI_mpi_finalize_hook(void) } } - if (MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE) { - MPIDU_genq_private_pool_destroy(MPIDI_OFI_global.gpu_pipeline_send_pool); - MPIDU_genq_private_pool_destroy(MPIDI_OFI_global.gpu_pipeline_recv_pool); - } - int err; MPID_Thread_mutex_destroy(&MPIDI_OFI_THREAD_UTIL_MUTEX, &err); MPIR_Assert(err == 0); @@ -1458,13 +1389,7 @@ static int update_global_limits(struct fi_info *prov) MPIDI_OFI_global.max_buffered_send = prov->tx_attr->inject_size; MPIDI_OFI_global.max_buffered_write = prov->tx_attr->inject_size; - if 
(MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE > 0 &&
- MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE <= prov->ep_attr->max_msg_size) {
- /* Truncate max_msg_size to a user-selected value */
- MPIDI_OFI_global.max_msg_size = MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE;
- } else {
- MPIDI_OFI_global.max_msg_size = MPL_MIN(prov->ep_attr->max_msg_size, MPIR_AINT_MAX);
- }
+ MPIDI_OFI_global.max_msg_size = MPL_MIN(prov->ep_attr->max_msg_size, MPIR_AINT_MAX);
 MPIDI_OFI_global.cq_data_size = prov->domain_attr->cq_data_size;
 MPIDI_OFI_global.stripe_threshold = MPIR_CVAR_CH4_OFI_MULTI_NIC_STRIPING_THRESHOLD;
 if (prov->ep_attr->max_order_raw_size > MPIR_AINT_MAX) {
@@ -1499,11 +1424,6 @@ static int update_global_limits(struct fi_info *prov)
 MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch4|too_many_ranks");
 }
- if (MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE && (prov->domain_attr->cq_data_size < 8 ||
- MPIDI_OFI_GPU_PIPELINE_SEND == 0)) {
- MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch4|too_small_cqdata");
- }
-
 fn_exit:
 return mpi_errno;
 fn_fail:
@@ -1584,7 +1504,35 @@ static void dump_global_settings(void)
 /* static functions for AM */
-int MPIDI_OFI_am_init(int vci)
+static int am_init(int vci);
+static int am_post_recv(int vci, int nic);
+
+int MPIDI_OFI_init_per_vci(int vci)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    /* Create chunk buffer pool (for pipeline etc.) */
+    mpi_errno = MPIDU_genq_private_pool_create(MPIR_CVAR_CH4_OFI_PIPELINE_CHUNK_SZ,
+                                               MPIR_CVAR_CH4_OFI_PIPELINE_NUM_CHUNKS,
+                                               MPIR_CVAR_CH4_OFI_PIPELINE_NUM_CHUNKS,
+                                               host_alloc_registered,
+                                               host_free_registered,
+                                               &MPIDI_OFI_global.per_vci[vci].pipeline_pool);
+    MPIR_ERR_CHECK(mpi_errno);
+
+    mpi_errno = am_init(vci);
+    MPIR_ERR_CHECK(mpi_errno);
+
+    mpi_errno = am_post_recv(vci, 0);
+    MPIR_ERR_CHECK(mpi_errno);
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+static int am_init(int vci)
 {
 int mpi_errno = MPI_SUCCESS;
@@ -1621,7 +1569,6 @@ int MPIDI_OFI_am_init(int vci)
 MPIDI_OFI_global.per_vci[vci].am_inflight_rma_send_mrs = 0;
 if (vci == 0) {
- MPIDIG_am_reg_cb(MPIDI_OFI_INTERNAL_HANDLER_CONTROL, NULL, &MPIDI_OFI_control_handler);
 MPIDIG_am_reg_cb(MPIDI_OFI_AM_RDMA_READ_ACK, NULL, &MPIDI_OFI_am_rdma_read_ack_handler);
 MPIDIG_am_reg_cb(MPIDI_OFI_RNDV_INFO, NULL, &MPIDI_OFI_rndv_info_handler);
 }
@@ -1633,7 +1580,7 @@ int MPIDI_OFI_am_init(int vci)
 goto fn_exit;
 }
-int MPIDI_OFI_am_post_recv(int vci, int nic)
+static int am_post_recv(int vci, int nic)
 {
 int mpi_errno = MPI_SUCCESS;
diff --git a/src/mpid/ch4/netmod/ofi/ofi_init.h b/src/mpid/ch4/netmod/ofi/ofi_init.h
index f59e0befc1e..ef8845afa37 100644
--- a/src/mpid/ch4/netmod/ofi/ofi_init.h
+++ b/src/mpid/ch4/netmod/ofi/ofi_init.h
@@ -34,8 +34,7 @@ void MPIDI_OFI_update_global_settings(struct fi_info *prov);
 bool MPIDI_OFI_nic_already_used(const struct fi_info *prov, struct fi_info **others, int nic_count);
 int MPIDI_OFI_create_vci_context(int vci, int nic);
-int MPIDI_OFI_am_init(int vci);
-int MPIDI_OFI_am_post_recv(int vci, int nic);
+int MPIDI_OFI_init_per_vci(int vci);
 bool MPIDI_OFI_nic_is_up(struct fi_info *prov);
diff --git a/src/mpid/ch4/netmod/ofi/ofi_pipeline.c b/src/mpid/ch4/netmod/ofi/ofi_pipeline.c
new file mode 100644
index 00000000000..f591bcd01cd
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_pipeline.c
@@ -0,0 +1,462 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ *     See COPYRIGHT in top-level directory
+ */
+
+#include "mpidimpl.h"
+#include "ofi_impl.h"
+#include "ofi_rndv.h"
+
+#define MPIDI_OFI_PIPELINE_INFLY_CHUNKS 10
+
+struct send_chunk_req {
+    char
pad[MPIDI_REQUEST_HDR_SIZE];
+    struct fi_context context[MPIDI_OFI_CONTEXT_STRUCTS];
+    int event_id;
+    MPI_Aint chunk_sz;
+    void *chunk_buf;
+    struct iovec iov;
+    MPIR_Request *sreq;
+};
+
+struct recv_chunk_req {
+    char pad[MPIDI_REQUEST_HDR_SIZE];
+    struct fi_context context[MPIDI_OFI_CONTEXT_STRUCTS];
+    int event_id;
+    MPI_Aint chunk_sz;
+    void *chunk_buf;
+    MPIR_Request *rreq;
+    MPI_Aint recv_offset;       /* remember where to copy this chunk */
+};
+
+static int pipeline_send_poll(MPIX_Async_thing thing);
+static void spawn_send_copy(MPIX_Async_thing thing, MPIR_Request * sreq, MPIR_gpu_req * areq,
+                            int chunk_index, void *chunk_buf, MPI_Aint chunk_sz);
+static int send_copy_poll(MPIX_Async_thing thing);
+static bool send_copy_complete(MPIR_Request * sreq, int chunk_index,
+                               void *chunk_buf, MPI_Aint chunk_sz);
+static void send_chunk_complete(MPIR_Request * sreq, void *chunk_buf, MPI_Aint chunk_sz);
+
+static int pipeline_recv_poll(MPIX_Async_thing thing);
+static void recv_chunk_copy(MPIR_Request * rreq, void *chunk_buf, MPI_Aint chunk_sz,
+                            MPI_Aint offset);
+static void add_recv_copy(MPIR_Request * rreq, MPIR_gpu_req * areq, void *chunk_buf,
+                          MPI_Aint chunk_sz);
+static int recv_copy_poll(MPIX_Async_thing thing);
+static void recv_copy_complete(MPIR_Request * rreq, void *chunk_buf, MPI_Aint chunk_sz);
+
+/* NOTE: fields cached or duplicated in MPIDI_OFI_pipeline_t for clarity.
+ * We can optimize if we are concerned with the overhead or request header size.
+ */
+int MPIDI_OFI_pipeline_send(MPIR_Request * sreq, int tag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_pipeline_t *p = &MPIDI_OFI_AMREQ_PIPELINE(sreq);
+
+    p->remote_data_sz = MPL_MIN(p->remote_data_sz, p->data_sz);
+    p->remain_sz = p->remote_data_sz;
+    p->chunk_index = 0;
+    p->u.send.copy_offset = 0;
+    p->u.send.copy_infly = 0;   /* control to avoid overwhelming async progress */
+    p->u.send.send_infly = 0;   /* control to avoid overwhelming unexpected recv */
+
+    if (!MPIDI_OFI_CAN_SEND_CQ_DATASIZE(p->data_sz)) {
+        mpi_errno = MPIDI_OFI_RNDV_send_hdr(&p->data_sz, sizeof(MPI_Aint),
+                                            p->av, p->vci_local, p->vci_remote, p->match_bits);
+        MPIR_ERR_CHECK(mpi_errno);
+    }
+
+    mpi_errno = MPIR_Async_things_add(pipeline_send_poll, sreq, NULL);
+    /* poke progress? */
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+int MPIDI_OFI_pipeline_recv(MPIR_Request * rreq, int tag, int vci_src, int vci_dst)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_pipeline_t *p = &MPIDI_OFI_AMREQ_PIPELINE(rreq);
+
+    p->chunk_index = 0;
+    p->u.recv.recv_offset = 0;
+    p->u.recv.recv_infly = 0;   /* just need enough to match infly send */
+
+    if (p->remote_data_sz != -1) {
+        p->remote_data_sz = MPL_MIN(p->remote_data_sz, p->data_sz);
+        p->remain_sz = p->remote_data_sz;
+    } else {
+        mpi_errno = MPIDI_OFI_RNDV_recv_hdr(rreq, MPIDI_OFI_EVENT_PIPELINE_RECV_DATASIZE,
+                                            sizeof(MPI_Aint), p->av, p->vci_local, p->vci_remote,
+                                            p->match_bits);
+        MPIR_ERR_CHECK(mpi_errno);
+    }
+
+    mpi_errno = MPIR_Async_things_add(pipeline_recv_poll, rreq, NULL);
+    /* poke progress?
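+     * An explicit poke could run pipeline_recv_poll sooner than the next natural
+     * progress call and post the first chunk earlier; left here as a possible
+     * optimization, so currently we just return and let async progress drive it.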
*/
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/* callback from MPIDI_OFI_dispatch_function in ofi_events.c */
+
+int MPIDI_OFI_pipeline_recv_datasize_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Request *rreq = MPIDI_OFI_RNDV_GET_CONTROL_REQ(r);
+    MPIDI_OFI_pipeline_t *p = &MPIDI_OFI_AMREQ_PIPELINE(rreq);
+
+    MPI_Aint *hdr_data_sz = MPIDI_OFI_RNDV_GET_CONTROL_HDR(r);
+    MPIDI_OFI_RNDV_update_count(rreq, *hdr_data_sz);
+
+    p->remote_data_sz = MPL_MIN(*hdr_data_sz, p->data_sz);
+    p->remain_sz = p->remote_data_sz;
+
+    MPL_free(r);
+    return mpi_errno;
+}
+
+int MPIDI_OFI_pipeline_send_chunk_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    struct send_chunk_req *chunk_req = (void *) r;
+    send_chunk_complete(chunk_req->sreq, chunk_req->chunk_buf, chunk_req->chunk_sz);
+    MPL_free(chunk_req);
+
+    return mpi_errno;
+}
+
+int MPIDI_OFI_pipeline_recv_chunk_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    struct recv_chunk_req *chunk_req = (void *) r;
+    recv_chunk_copy(chunk_req->rreq, chunk_req->chunk_buf, chunk_req->chunk_sz,
+                    chunk_req->recv_offset);
+    MPL_free(chunk_req);
+
+    return mpi_errno;
+}
+
+/* ---- static routines for send side ---- */
+
+/* async send chunks until done */
+static int pipeline_send_poll(MPIX_Async_thing thing)
+{
+    int ret = MPIX_ASYNC_NOPROGRESS;
+    MPIR_Request *sreq = MPIR_Async_thing_get_state(thing);
+    MPIDI_OFI_pipeline_t *p = &MPIDI_OFI_AMREQ_PIPELINE(sreq);
+
+    /* CS required for genq pool and gpu imemcpy */
+    MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI_LOCK(p->vci_local));
+
+    while (p->u.send.copy_offset < p->remote_data_sz) {
+        /* limit copy_infly so it doesn't overwhelm async progress */
+        if (p->u.send.copy_infly >= MPIDI_OFI_PIPELINE_INFLY_CHUNKS) {
+            goto fn_exit;
+        }
+
+        void *chunk_buf;
+        MPI_Aint chunk_sz = MPIR_CVAR_CH4_OFI_PIPELINE_CHUNK_SZ;
+        if (chunk_sz > p->remote_data_sz - p->u.send.copy_offset) {
+            chunk_sz = p->remote_data_sz - p->u.send.copy_offset;
+        }
+
+        /* alloc a chunk */
+        MPIDU_genq_private_pool_alloc_cell(MPIDI_OFI_global.per_vci[p->vci_local].pipeline_pool,
+                                           &chunk_buf);
+        if (!chunk_buf) {
+            goto fn_exit;
+        }
+
+        /* async copy */
+        MPIR_gpu_req async_req;
+        int mpi_errno;
+        int engine_type = MPIDI_OFI_gpu_get_send_engine_type();
+        int copy_dir = MPL_GPU_COPY_DIRECTION_NONE;
+        mpi_errno = MPIR_Ilocalcopy_gpu(p->buf, p->count, p->datatype,
+                                        p->u.send.copy_offset, &p->attr, chunk_buf, chunk_sz,
+                                        MPIR_BYTE_INTERNAL, 0, NULL, copy_dir, engine_type,
+                                        1, &async_req);
+        MPIR_Assertp(mpi_errno == MPI_SUCCESS);
+        spawn_send_copy(thing, sreq, &async_req, p->chunk_index, chunk_buf, chunk_sz);
+        p->chunk_index++;
+        p->u.send.copy_offset += chunk_sz;
+        p->u.send.copy_infly++;
+    }
+
+    ret = MPIX_ASYNC_DONE;
+  fn_exit:
+    MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI_LOCK(p->vci_local));
+    return ret;
+}
+
+/* ---- send_copy ---- */
+struct send_copy {
+    MPIR_Request *sreq;
+    /* async handle */
+    bool copy_done;
+    MPIR_gpu_req async_req;
+    /* for sending data */
+    int chunk_index;
+    void *chunk_buf;
+    MPI_Aint chunk_sz;
+};
+
+static void spawn_send_copy(MPIX_Async_thing thing, MPIR_Request * sreq, MPIR_gpu_req * areq,
+                            int chunk_index, void *chunk_buf, MPI_Aint chunk_sz)
+{
+    struct send_copy *p;
+    p = MPL_malloc(sizeof(*p), MPL_MEM_OTHER);
+    MPIR_Assert(p);
+
+    p->sreq = sreq;
+    p->copy_done = false;
+    p->async_req = *areq;
+    p->chunk_index = chunk_index;
+    p->chunk_buf = chunk_buf;
+    p->chunk_sz =
chunk_sz;
+
+    MPIR_Async_thing_spawn(thing, send_copy_poll, p, NULL);
+}
+
+static int send_copy_poll(MPIX_Async_thing thing)
+{
+    struct send_copy *p = MPIR_Async_thing_get_state(thing);
+
+    /* this async task consists of two parts: copy and send. The copy_done field allows each part to be pending separately */
+    int is_done = p->copy_done;
+    if (!is_done) {
+        MPIR_async_test(&(p->async_req), &is_done);
+        if (is_done) {
+            MPIDI_OFI_pipeline_t *p_req = &MPIDI_OFI_AMREQ_PIPELINE(p->sreq);
+            p_req->u.send.copy_infly--;
+            p->copy_done = true;
+        }
+    }
+
+    if (!is_done) {
+        return MPIX_ASYNC_NOPROGRESS;
+    } else {
+        if (send_copy_complete(p->sreq, p->chunk_index, p->chunk_buf, p->chunk_sz)) {
+            MPL_free(p);
+            return MPIX_ASYNC_DONE;
+        } else {
+            /* We can't send the chunk for some reason. We'll try again */
+            return MPIX_ASYNC_NOPROGRESS;
+        }
+    }
+}
+
+static bool send_copy_complete(MPIR_Request * sreq, int chunk_index,
+                               void *chunk_buf, MPI_Aint chunk_sz)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_pipeline_t *p = &MPIDI_OFI_AMREQ_PIPELINE(sreq);
+
+    if (p->u.send.send_infly >= MPIDI_OFI_PIPELINE_INFLY_CHUNKS) {
+        return false;
+    }
+
+    struct send_chunk_req *chunk_req = MPL_malloc(sizeof(struct send_chunk_req), MPL_MEM_BUFFER);
+    MPIR_Assertp(chunk_req);
+
+    chunk_req->event_id = MPIDI_OFI_EVENT_PIPELINE_SEND_CHUNK;
+    chunk_req->sreq = sreq;
+    chunk_req->chunk_buf = (void *) chunk_buf;
+    chunk_req->chunk_sz = chunk_sz;
+    chunk_req->iov.iov_base = chunk_buf;
+    chunk_req->iov.iov_len = chunk_sz;
+
+    /* send */
+    int nic = chunk_index % MPIDI_OFI_global.num_nics;
+    int ctx_idx = MPIDI_OFI_get_ctx_index(p->vci_local, nic);
+    fi_addr_t addr = MPIDI_OFI_av_to_phys(p->av, p->vci_local, nic, p->vci_remote, nic);
+
+    struct fi_msg_tagged msg;
+    msg.msg_iov = &chunk_req->iov;
+    msg.desc = NULL;
+    msg.iov_count = 1;
+    msg.addr = addr;
+    msg.tag = p->match_bits;
+    msg.context = (void *) &(chunk_req->context);
+    msg.data = 0;
+
+    uint64_t flags = FI_COMPLETION | FI_MATCH_COMPLETE | FI_DELIVERY_COMPLETE;
+
+    MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI_LOCK(p->vci_local));
+    MPIDI_OFI_CALL_RETRY(fi_tsendmsg(MPIDI_OFI_global.ctx[ctx_idx].tx, &msg, flags),
+                         p->vci_local, tsendv);
+    MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI_LOCK(p->vci_local));
+
+    p->u.send.send_infly++;
+    /* both the chunk buffer and chunk_req will be freed in MPIDI_OFI_pipeline_send_chunk_event */
+
+    return true;
+  fn_fail:
+    MPIR_Assert(0);
+    return false;
+}
+
+static void send_chunk_complete(MPIR_Request * sreq, void *chunk_buf, MPI_Aint chunk_sz)
+{
+    MPIDI_OFI_pipeline_t *p = &MPIDI_OFI_AMREQ_PIPELINE(sreq);
+
+    p->u.send.send_infly--;
+    MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.per_vci[p->vci_local].pipeline_pool,
+                                      chunk_buf);
+
+    p->remain_sz -= chunk_sz;
+    if (p->remain_sz == 0) {
+        MPIR_Datatype_release_if_not_builtin(p->datatype);
+        MPIDI_Request_complete_fast(sreq);
+    }
+}
+
+/* ---- static routines for recv side ---- */
+
+/* async post recv chunks until done */
+static int pipeline_recv_poll(MPIX_Async_thing thing)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Request *rreq = MPIR_Async_thing_get_state(thing);
+    MPIDI_OFI_pipeline_t *p = &MPIDI_OFI_AMREQ_PIPELINE(rreq);
+
+    if (p->remote_data_sz == -1) {
+        /* Maybe we can issue 1 chunk anyway?
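+         * remote_data_sz is still unknown (the data-size header has not arrived),
+         * so nothing is posted yet. Speculatively posting one chunk-sized recv here
+         * would overlap the header exchange with the first chunk, at the cost of
+         * reconciling a possibly shorter transfer; not attempted for now.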
*/
+        return MPIX_ASYNC_NOPROGRESS;
+    }
+
+    int ret = MPIX_ASYNC_NOPROGRESS;
+    /* CS required for genq pool and gpu imemcpy */
+    MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI_LOCK(p->vci_local));
+
+    while (p->u.recv.recv_offset < p->remote_data_sz) {
+        /* only need to issue enough recv_infly to match send_infly */
+        if (p->u.recv.recv_infly >= MPIDI_OFI_PIPELINE_INFLY_CHUNKS) {
+            goto fn_exit;
+        }
+
+        void *chunk_buf;
+        MPI_Aint chunk_sz = MPIR_CVAR_CH4_OFI_PIPELINE_CHUNK_SZ;
+        if (chunk_sz > p->remote_data_sz - p->u.recv.recv_offset) {
+            chunk_sz = p->remote_data_sz - p->u.recv.recv_offset;
+        }
+
+        /* alloc a chunk */
+        MPIDU_genq_private_pool_alloc_cell(MPIDI_OFI_global.per_vci[p->vci_local].pipeline_pool,
+                                           &chunk_buf);
+        if (!chunk_buf) {
+            goto fn_exit;
+        }
+
+        struct recv_chunk_req *chunk_req =
+            MPL_malloc(sizeof(struct recv_chunk_req), MPL_MEM_BUFFER);
+        MPIR_Assertp(chunk_req);
+
+        chunk_req->event_id = MPIDI_OFI_EVENT_PIPELINE_RECV_CHUNK;
+        chunk_req->rreq = rreq;
+        chunk_req->chunk_buf = (void *) chunk_buf;
+        chunk_req->chunk_sz = chunk_sz;
+        chunk_req->recv_offset = p->u.recv.recv_offset;
+
+        /* post recv */
+        int nic = p->chunk_index % MPIDI_OFI_global.num_nics;
+        int ctx_idx = MPIDI_OFI_get_ctx_index(p->vci_local, nic);
+        fi_addr_t addr = MPIDI_OFI_av_to_phys(p->av, p->vci_local, nic, p->vci_remote, nic);
+
+        MPIDI_OFI_CALL_RETRY(fi_trecv(MPIDI_OFI_global.ctx[ctx_idx].rx,
+                                      chunk_buf, chunk_sz, NULL, addr, p->match_bits, 0,
+                                      (void *) &chunk_req->context), p->vci_local, trecv);
+        p->chunk_index++;
+        p->u.recv.recv_offset += chunk_sz;
+        p->u.recv.recv_infly++;
+    }
+
+    ret = MPIX_ASYNC_DONE;
+  fn_exit:
+    MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI_LOCK(p->vci_local));
+    return ret;
+  fn_fail:
+    MPIR_Assert(0);
+    goto fn_exit;
+}
+
+static void recv_chunk_copy(MPIR_Request * rreq, void *chunk_buf, MPI_Aint chunk_sz,
+                            MPI_Aint offset)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_pipeline_t *p = &MPIDI_OFI_AMREQ_PIPELINE(rreq);
+    p->u.recv.recv_infly--;
+
+    MPIR_gpu_req async_req;
+    int engine_type = MPIDI_OFI_gpu_get_recv_engine_type();
+    int copy_dir = MPL_GPU_COPY_DIRECTION_NONE;
+    mpi_errno = MPIR_Ilocalcopy_gpu(chunk_buf, chunk_sz, MPIR_BYTE_INTERNAL, 0, NULL,
+                                    (void *) p->buf, p->count, p->datatype, offset, &p->attr,
+                                    copy_dir, engine_type, 1, &async_req);
+    MPIR_Assertp(mpi_errno == MPI_SUCCESS);
+
+    add_recv_copy(rreq, &async_req, chunk_buf, chunk_sz);
+}
+
+/* async recv copy */
+struct recv_copy {
+    MPIR_Request *rreq;
+    MPIR_gpu_req async_req;
+    void *chunk_buf;
+    MPI_Aint chunk_sz;
+};
+
+static void add_recv_copy(MPIR_Request * rreq, MPIR_gpu_req * areq,
+                          void *chunk_buf, MPI_Aint chunk_sz)
+{
+    struct recv_copy *p;
+    p = MPL_malloc(sizeof(*p), MPL_MEM_OTHER);
+    MPIR_Assert(p);
+
+    p->rreq = rreq;
+    p->async_req = *areq;
+    p->chunk_buf = chunk_buf;
+    p->chunk_sz = chunk_sz;
+
+    MPIR_Async_things_add(recv_copy_poll, p, NULL);
+}
+
+static int recv_copy_poll(MPIX_Async_thing thing)
+{
+    struct recv_copy *p = MPIR_Async_thing_get_state(thing);
+
+    int is_done = 0;
+    MPIR_async_test(&(p->async_req), &is_done);
+
+    if (!is_done) {
+        return MPIX_ASYNC_NOPROGRESS;
+    } else {
+        recv_copy_complete(p->rreq, p->chunk_buf, p->chunk_sz);
+        MPL_free(p);
+        return MPIX_ASYNC_DONE;
+    }
+}
+
+static void recv_copy_complete(MPIR_Request * rreq, void *chunk_buf, MPI_Aint chunk_sz)
+{
+    MPIDI_OFI_pipeline_t *p = &MPIDI_OFI_AMREQ_PIPELINE(rreq);
+    MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI_LOCK(p->vci_local));
+
+    MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.per_vci[p->vci_local].pipeline_pool,
+
chunk_buf);
+
+    p->remain_sz -= chunk_sz;
+    if (p->remain_sz == 0) {
+        MPIR_Datatype_release_if_not_builtin(p->datatype);
+        MPIDI_Request_complete_fast(rreq);
+    }
+    MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI_LOCK(p->vci_local));
+}
diff --git a/src/mpid/ch4/netmod/ofi/ofi_pre.h b/src/mpid/ch4/netmod/ofi/ofi_pre.h
index 98d41802bc2..795c8e5c6f5 100644
--- a/src/mpid/ch4/netmod/ofi/ofi_pre.h
+++ b/src/mpid/ch4/netmod/ofi/ofi_pre.h
@@ -185,6 +185,96 @@ typedef struct {
 MPI_Aint data_sz; /* save data_sz to avoid double checking */
 } MPIDI_OFI_am_request_t;
+
+/* define common fields for the next 3 structs and a common macro used to declare these fields.
+ * The macro is "fragile": it must not be used anywhere without understanding the layout it assumes.
+ * note: the missing semicolon on the last line is intentional.
+ */
+
+#define MPIDI_OFI_RNDV_COMMON_FIELDS \
+    const void *buf; \
+    MPI_Aint count; \
+    MPI_Datatype datatype; \
+    /* cached fields */ \
+    bool need_pack; \
+    MPL_pointer_attr_t attr; \
+    MPI_Aint data_sz; \
+    MPI_Aint remote_data_sz; \
+    /* send/recv fields */ \
+    int vci_local; \
+    int vci_remote; \
+    struct MPIDI_av_entry *av; \
+    uint64_t match_bits; \
+    /* only needed by the sender for am_tag_send or for replying to a probe */ \
+    int remote_rank
+
+typedef struct {
+    MPIDI_OFI_RNDV_COMMON_FIELDS;
+} MPIDI_OFI_rndv_common_t;
+
+typedef struct {
+    MPIDI_OFI_RNDV_COMMON_FIELDS;
+    union {
+        struct {
+            MPI_Aint copy_offset;
+            int copy_infly;
+            int send_infly;
+        } send;
+        struct {
+            MPI_Aint recv_offset;
+            int recv_infly;
+        } recv;
+    } u;
+    int chunk_index;
+    MPI_Aint remain_sz;
+} MPIDI_OFI_pipeline_t;
+
+typedef struct {
+    MPIDI_OFI_RNDV_COMMON_FIELDS;
+    MPI_Aint sz_per_nic;
+    union {
+        struct {
+            const void *data;
+            struct fid_mr **mrs;
+        } send;
+        struct {
+            union {
+                void *data;     /* !need_pack */
+                int copy_infly; /* need_pack */
+            } u;
+            uint64_t remote_base;
+            uint64_t *rkeys;
+            MPI_Aint chunks_per_nic;
+            MPI_Aint cur_chunk_index;
+            int num_infly;
+            bool all_issued;
+        } recv;
+    } u;
+} MPIDI_OFI_rndvread_t;
+
+typedef struct {
+    MPIDI_OFI_RNDV_COMMON_FIELDS;
+    MPI_Aint sz_per_nic;
+    union {
+        struct {
+            union {
+                void *data;     /* !need_pack */
+                int copy_infly; /* need_pack */
+            } u;
+            uint64_t remote_base;
+            uint64_t *rkeys;
+            MPI_Aint chunks_per_nic;
+            MPI_Aint cur_chunk_index;
+            int write_infly;
+            MPI_Aint chunks_remain;
+        } send;
+        struct {
+            const void *data;
+            struct fid_mr **mrs;
+        } recv;
+    } u;
+} MPIDI_OFI_rndvwrite_t;
+
 enum MPIDI_OFI_req_kind {
 MPIDI_OFI_req_kind__any,
 MPIDI_OFI_req_kind__probe,
@@ -210,10 +300,6 @@ typedef struct {
 int context_id;
 enum MPIDI_OFI_req_kind kind;
- union {
- struct fid_mr **send_mrs;
- void *remote_info;
- } huge;
 union {
 struct {
 char *pack_buffer;
@@ -226,23 +312,21 @@ typedef struct {
 struct iovec iov;
 void *inject_buf; /* Internal buffer for inject emulation */
 } util;
- struct {
- fi_addr_t remote_addr;
- int ctx_idx;
- int vci_local;
- int chunk_sz;
- bool is_sync;
- uint64_t cq_data;
- uint64_t match_bits;
- uint64_t mask_bits;
- size_t offset;
- size_t data_sz;
- char *pack_recv_buf;
- void *usm_host_buf; /* recv */
- MPIR_Request *req;
- } pipeline_info; /* GPU pipeline */
+} MPIDI_OFI_direct_t;
+
+typedef union {
+    MPIDI_OFI_direct_t direct;
+    MPIDI_OFI_rndv_common_t common;
+    MPIDI_OFI_pipeline_t pipeline;
+    MPIDI_OFI_rndvread_t read;
+    MPIDI_OFI_rndvwrite_t write;
 } MPIDI_OFI_request_t;
+
+#define MPIDI_OFI_AMREQ_COMMON(req) ((req)->dev.ch4.netmod.ofi.common)
+#define MPIDI_OFI_AMREQ_PIPELINE(req) ((req)->dev.ch4.netmod.ofi.pipeline)
+#define
MPIDI_OFI_AMREQ_READ(req) ((req)->dev.ch4.netmod.ofi.read) +#define MPIDI_OFI_AMREQ_WRITE(req) ((req)->dev.ch4.netmod.ofi.write) + typedef struct { int index; } MPIDI_OFI_dt_t; diff --git a/src/mpid/ch4/netmod/ofi/ofi_probe.h b/src/mpid/ch4/netmod/ofi/ofi_probe.h index 8a0d7f98da9..14ad8638740 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_probe.h +++ b/src/mpid/ch4/netmod/ofi/ofi_probe.h @@ -49,7 +49,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_iprobe(int source, } else { MPIDI_OFI_REQUEST(rreq, kind) = MPIDI_OFI_req_kind__probe; } - MPIDI_OFI_REQUEST(rreq, huge.remote_info) = NULL; MPIDI_OFI_REQUEST(rreq, context_id) = comm->recvcontext_id + context_offset; rreq->comm = comm; MPIR_Comm_add_ref(comm); diff --git a/src/mpid/ch4/netmod/ofi/ofi_progress.h b/src/mpid/ch4/netmod/ofi/ofi_progress.h index 1413e34417b..94c3b0a2709 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_progress.h +++ b/src/mpid/ch4/netmod/ofi/ofi_progress.h @@ -82,12 +82,9 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_progress(int vci, int *made_progress) * to do, so simply return. * NOTE: it is not an error since global progress will poll every vci. */ - return MPI_SUCCESS; + goto fn_exit; } - mpi_errno = MPIDI_OFI_gpu_progress(vci); - MPIR_ERR_CHECK(mpi_errno); - if (unlikely(MPIDI_OFI_has_cq_buffered(vci))) { int num = MPIDI_OFI_get_buffered(vci, wc); mpi_errno = MPIDI_OFI_handle_cq_entries(vci, wc, num); @@ -107,10 +104,13 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_progress(int vci, int *made_progress) mpi_errno = MPI_SUCCESS; else mpi_errno = MPIDI_OFI_handle_cq_error(vci, nic, ret); + + MPIR_ERR_CHECK(mpi_errno); } if (unlikely(mpi_errno == MPI_SUCCESS && MPIDI_OFI_global.per_vci[vci].deferred_am_isend_q)) { mpi_errno = MPIDI_OFI_handle_deferred_ops(vci); + MPIR_ERR_CHECK(mpi_errno); } } diff --git a/src/mpid/ch4/netmod/ofi/ofi_recv.h b/src/mpid/ch4/netmod/ofi/ofi_recv.h index de2637bdc62..9a78284e0f2 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_recv.h +++ b/src/mpid/ch4/netmod/ofi/ofi_recv.h @@ -165,10 +165,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_irecv(void *buf, MPIR_Comm_add_ref(comm); } - if (!flags) { - MPIDI_OFI_REQUEST(rreq, huge.remote_info) = NULL; /* for huge recv remote info */ - } - /* Calculate the correct NICs. */ receiver_nic = MPIDI_OFI_multx_receiver_nic_index(comm, comm->recvcontext_id, rank, comm->rank, tag); @@ -226,58 +222,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_irecv(void *buf, mpi_errno = MPI_SUCCESS; /* Reset error code */ } - if (force_gpu_pack && MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE && - data_sz >= MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD) { - /* Pipeline path */ - MPIDI_OFI_REQUEST(rreq, event_id) = MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT; - /* Only post first recv with pipeline chunk size. */ - char *host_buf = NULL; - MPIDU_genq_private_pool_force_alloc_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, - (void **) &host_buf); - MPIR_ERR_CHKANDJUMP1(host_buf == NULL, mpi_errno, - MPI_ERR_OTHER, "**nomem", "**nomem %s", - "Pipeline Init recv alloc"); - - fi_addr_t remote_addr; - if (MPI_ANY_SOURCE == rank) - remote_addr = FI_ADDR_UNSPEC; - else { - int sender_nic = - MPIDI_OFI_multx_sender_nic_index(comm, comm->recvcontext_id, rank, comm->rank, - MPIDI_OFI_init_get_tag(match_bits)); - remote_addr = - MPIDI_OFI_av_to_phys(addr, vci_local, receiver_nic, vci_remote, sender_nic); - } - - /* Save pipeline information. 
*/
- MPIDI_OFI_REQUEST(rreq, pipeline_info.offset) = 0;
- MPIDI_OFI_REQUEST(rreq, pipeline_info.is_sync) = false;
- MPIDI_OFI_REQUEST(rreq, pipeline_info.remote_addr) = remote_addr;
- MPIDI_OFI_REQUEST(rreq, pipeline_info.vci_local) = vci_local;
- MPIDI_OFI_REQUEST(rreq, pipeline_info.match_bits) = match_bits;
- MPIDI_OFI_REQUEST(rreq, pipeline_info.mask_bits) = mask_bits;
- MPIDI_OFI_REQUEST(rreq, pipeline_info.data_sz) = data_sz;
- MPIDI_OFI_REQUEST(rreq, pipeline_info.ctx_idx) = ctx_idx;
-
- /* Save original buf, datatype and count */
- MPIDI_OFI_REQUEST(rreq, noncontig.pack.pack_buffer) = host_buf;
-
- MPIDI_OFI_gpu_pipeline_request *chunk_req;
- chunk_req = (MPIDI_OFI_gpu_pipeline_request *)
- MPL_malloc(sizeof(MPIDI_OFI_gpu_pipeline_request), MPL_MEM_BUFFER);
- MPIR_ERR_CHKANDJUMP1(chunk_req == NULL, mpi_errno,
- MPI_ERR_OTHER, "**nomem", "**nomem %s", "Recv chunk_req alloc");
- chunk_req->event_id = MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT;
- chunk_req->parent = rreq;
- chunk_req->buf = host_buf;
- MPIDI_OFI_CALL_RETRY(fi_trecv(MPIDI_OFI_global.ctx[ctx_idx].rx,
- host_buf,
- MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ,
- NULL, remote_addr, match_bits, mask_bits,
- (void *) &chunk_req->context), vci_local, trecv);
- goto fn_exit;
- }
-
 /* Unpack */
 MPIDI_OFI_REQUEST(rreq, event_id) = MPIDI_OFI_EVENT_RECV_PACK;
 MPIDI_OFI_REQUEST(rreq, noncontig.pack.pack_buffer) =
@@ -289,21 +233,18 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_irecv(void *buf,
 MPIDI_OFI_REQUEST(rreq, noncontig.pack.pack_buffer) = NULL;
 }
- /* Read ordering unnecessary for context_id, so use relaxed load */
+    if (MPIDI_OFI_REQUEST(rreq, event_id) != MPIDI_OFI_EVENT_RECV_PACK)
+        MPIDI_OFI_REQUEST(rreq, event_id) = MPIDI_OFI_EVENT_RECV;
+
+    /* Simply posting a large recv buffer can incur an overhead that defeats the benefit of e.g. pipeline.
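+     * (For illustration: with the recommended 1MB eager threshold, a 1GB contiguous
+     * irecv would otherwise be posted to libfabric in full even though the
+     * rendezvous path takes over beyond the eager size.)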
+     * Sender will never directly send a message larger than EAGER_THRESH, so we post at most EAGER_THRESH. */
+    if (data_sz > MPIDI_OFI_EAGER_THRESH && mode != MPIDI_OFI_AM_TAG_RECV) {
+        data_sz = MPIDI_OFI_EAGER_THRESH;
+    }
+
 MPIDI_OFI_REQUEST(rreq, util.iov.iov_base) = recv_buf;
 MPIDI_OFI_REQUEST(rreq, util.iov.iov_len) = data_sz;
- if (unlikely(data_sz >= MPIDI_OFI_global.max_msg_size) && !MPIDI_OFI_COMM(comm).enable_striping) {
- MPIDI_OFI_REQUEST(rreq, event_id) = MPIDI_OFI_EVENT_RECV_HUGE;
- data_sz = MPIDI_OFI_global.max_msg_size;
- } else if (MPIDI_OFI_COMM(comm).enable_striping &&
- (data_sz >= MPIDI_OFI_global.stripe_threshold)) {
- MPIDI_OFI_REQUEST(rreq, event_id) = MPIDI_OFI_EVENT_RECV_HUGE;
- /* Receive has to be posted with size MPIDI_OFI_global.stripe_threshold to handle underflow */
- data_sz = MPIDI_OFI_global.stripe_threshold;
- } else if (MPIDI_OFI_REQUEST(rreq, event_id) != MPIDI_OFI_EVENT_RECV_PACK)
- MPIDI_OFI_REQUEST(rreq, event_id) = MPIDI_OFI_EVENT_RECV;
-
 if (!flags) {
 fi_addr_t sender_addr;
 if (MPI_ANY_SOURCE == rank) {
diff --git a/src/mpid/ch4/netmod/ofi/ofi_rndv.c b/src/mpid/ch4/netmod/ofi/ofi_rndv.c
index a3700be5b21..b6fa3b57bd4 100644
--- a/src/mpid/ch4/netmod/ofi/ofi_rndv.c
+++ b/src/mpid/ch4/netmod/ofi/ofi_rndv.c
@@ -6,15 +6,64 @@
 #include "mpidimpl.h"
 #include "ofi_am_events.h"
 #include "ofi_events.h"
+#include "ofi_rndv.h"
 #define MPIDI_OFI_CTS_FLAG__NONE 0
 #define MPIDI_OFI_CTS_FLAG__PROBE 1
+#define MPIDI_OFI_CTS_FLAG__NEED_PACK 2
+
+static bool cts_is_probe(int flag)
+{
+    return (flag & MPIDI_OFI_CTS_FLAG__PROBE);
+}
+
+static int get_rndv_protocol(bool send_need_pack, bool recv_need_pack, MPI_Aint recv_data_sz)
+{
+    /* NOTE: some protocols may not work, fall back to auto */
+    switch (MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL) {
+        case MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_pipeline:
+            return MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_pipeline;
+        case MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_read:
+            if (!send_need_pack) {
+                return MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_read;
+            }
+            break;
+        case MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_write:
+            if (!recv_need_pack) {
+                return MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_write;
+            }
+            break;
+        case MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_direct:
+            /* libfabric can't directly send more than max_msg_size. Only the sender knows
+             * both sizes (it receives the CTS), so we use recv_data_sz, which both sides can agree on.
+             * NOTE: psm3 has max_msg_size at 4294963200.
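+             * That is 0x100000000 - 0x1000 (4 GiB minus one 4 KiB page), so e.g. a
+             * 4 GiB message already exceeds it and falls through to the auto selection.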
+             */
+            if (recv_data_sz <= MPIDI_OFI_global.max_msg_size) {
+                return MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_direct;
+            }
+            break;
+    }
+
+    /* auto */
+    if (send_need_pack && recv_need_pack) {
+        return MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_pipeline;
+    } else if (send_need_pack) {
+        return MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_write;
+    } else if (recv_need_pack) {
+        return MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_read;
+    } else if (recv_data_sz < MPIDI_OFI_global.max_msg_size) {
+        return MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_direct;
+    } else {
+        return MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_read;
+    }
+}
 /* receiver -> sender */
 struct rndv_cts {
 MPIR_Request *rreq;
 int am_tag;
 int flag;
+    MPI_Aint data_sz;
 };
 /* sender -> receiver */
@@ -26,14 +75,22 @@ struct rndv_info_hdr {
 /* ---- receiver side ---- */
-static int rndv_event_common(int vci, MPIR_Request * rreq, int *vci_src_out, int *vci_dst_out)
+int MPIDI_OFI_recv_rndv_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq)
 {
 int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_ENTER;
+
+    /* save context_offset for MPIDI_OFI_send_ack */
+    int context_id = MPIDI_OFI_REQUEST(rreq, context_id);
+
+    /* save and free up the OFI request */
+    void *buf = MPIDI_OFI_REQUEST(rreq, buf);
+    MPI_Aint count = MPIDI_OFI_REQUEST(rreq, count);
+    MPI_Datatype datatype = MPIDI_OFI_REQUEST(rreq, datatype);
 /* if we were expecting an eager send, free the unneeded pack_buffer or iovs array */
 switch (MPIDI_OFI_REQUEST(rreq, event_id)) {
 case MPIDI_OFI_EVENT_RECV_PACK:
- case MPIDI_OFI_EVENT_RECV_HUGE:
 MPL_free(MPIDI_OFI_REQUEST(rreq, noncontig.pack.pack_buffer));
 break;
 case MPIDI_OFI_EVENT_RECV_NOPACK:
@@ -41,61 +98,76 @@
 break;
 }
- /* save and free up the OFI request */
- void *buf = MPIDI_OFI_REQUEST(rreq, buf);
- MPI_Aint count = MPIDI_OFI_REQUEST(rreq, count);
- MPI_Datatype datatype = MPIDI_OFI_REQUEST(rreq, datatype);;
+    int dt_contig;
+    MPIR_Datatype_is_contig(datatype, &dt_contig);
+
+    MPL_pointer_attr_t attr;
+    MPIR_GPU_query_pointer_attr(buf, &attr);
+
+    MPI_Aint data_sz;
+    MPIR_Datatype_get_size_macro(datatype, data_sz);
+    data_sz *= count;
 MPIR_Comm *comm = rreq->comm;
 int src_rank = rreq->status.MPI_SOURCE;
 int tag = rreq->status.MPI_TAG;
 int vci_src = MPIDI_get_vci(SRC_VCI_FROM_RECVER, comm, src_rank, comm->rank, tag);
 int vci_dst = MPIDI_get_vci(DST_VCI_FROM_RECVER, comm, src_rank, comm->rank, tag);
- MPIR_Assert(vci == vci_dst);
-
- /* TODO: optimize - since we are going to use am_tag_recv, we won't need most of the MPIDIG request fields */
- mpi_errno = MPIDIG_request_init_internal(rreq, vci_dst /* local */ , vci_src /* remote */);
- MPIR_ERR_CHECK(mpi_errno);
- MPIDIG_REQUEST(rreq, buffer) = buf;
- MPIDIG_REQUEST(rreq, count) = count;
- MPIDIG_REQUEST(rreq, datatype) = datatype;
-
- fn_exit:
- *vci_src_out = vci_src;
- *vci_dst_out = vci_dst;
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-int MPIDI_OFI_recv_rndv_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq)
-{
- int mpi_errno = MPI_SUCCESS;
- MPIR_FUNC_ENTER;
+    MPIDI_OFI_rndv_common_t *p = &MPIDI_OFI_AMREQ_COMMON(rreq);
+    p->buf = buf;
+    p->count = count;
+    p->datatype = datatype;
+    p->need_pack = MPIDI_OFI_rndv_need_pack(dt_contig, &attr);
+    p->attr = attr;
+    p->data_sz = data_sz;
+    p->vci_local = vci_dst;
+    p->vci_remote = vci_src;
+    p->av = MPIDIU_comm_rank_to_av(comm, src_rank);
+
+    MPI_Aint
remote_data_sz = MPIDI_OFI_idata_get_size(wc->data); + if (remote_data_sz > 0) { + p->remote_data_sz = remote_data_sz; + MPIDI_OFI_RNDV_update_count(rreq, remote_data_sz); + } else { + /* mark remote_data_sz as unknown */ + p->remote_data_sz = -1; + } - /* save context_offset for MPIDI_OFI_send_ack */ - int context_id = MPIDI_OFI_REQUEST(rreq, context_id); + int am_tag = MPIDIG_get_next_am_tag(comm); + p->match_bits = MPIDI_OFI_init_sendtag(comm->recvcontext_id, 0, am_tag) | MPIDI_OFI_AM_SEND; - /* Convert rreq to an MPIDIG request */ - int vci_src, vci_dst; - mpi_errno = rndv_event_common(vci, rreq, &vci_src, &vci_dst); - MPIR_ERR_CHECK(mpi_errno); + bool send_need_pack = MPIDI_OFI_is_tag_rndv_pack(wc->tag); + bool recv_need_pack = p->need_pack; /* prepare rndv_cts */ struct rndv_cts hdr; hdr.rreq = rreq; - hdr.am_tag = MPIDIG_get_next_am_tag(rreq->comm); + hdr.am_tag = am_tag; hdr.flag = MPIDI_OFI_CTS_FLAG__NONE; + hdr.data_sz = data_sz; + if (recv_need_pack) { + hdr.flag |= MPIDI_OFI_CTS_FLAG__NEED_PACK; + } - /* am_tag_recv */ - mpi_errno = MPIDI_NM_am_tag_recv(rreq->status.MPI_SOURCE, rreq->comm, - MPIDIG_TAG_RECV_COMPLETE, hdr.am_tag, - MPIDIG_REQUEST(rreq, buffer), - MPIDIG_REQUEST(rreq, count), MPIDIG_REQUEST(rreq, datatype), - vci_src, vci_dst, rreq); + switch (get_rndv_protocol(send_need_pack, recv_need_pack, p->data_sz)) { + case MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_pipeline: + mpi_errno = MPIDI_OFI_pipeline_recv(rreq, hdr.am_tag, vci_src, vci_dst); + break; + case MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_read: + mpi_errno = MPIDI_OFI_rndvread_recv(rreq, hdr.am_tag, vci_src, vci_dst); + break; + case MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_write: + mpi_errno = MPIDI_OFI_rndvwrite_recv(rreq, hdr.am_tag, vci_src, vci_dst); + break; + case MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_direct: + /* fall through */ + default: + mpi_errno = MPIDI_NM_am_tag_recv(rreq->status.MPI_SOURCE, rreq->comm, + -1, hdr.am_tag, + (void *) p->buf, p->count, p->datatype, + vci_src, vci_dst, rreq); + } MPIR_ERR_CHECK(mpi_errno); /* send cts */ @@ -118,11 +190,6 @@ int MPIDI_OFI_peek_rndv_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Reque /* save context_offset for MPIDI_OFI_send_ack */ int context_id = MPIDI_OFI_REQUEST(rreq, context_id); - /* Convert rreq to an MPIDIG request */ - int vci_src, vci_dst; - mpi_errno = rndv_event_common(vci, rreq, &vci_src, &vci_dst); - MPIR_ERR_CHECK(mpi_errno); - MPI_Aint data_sz; data_sz = MPIDI_OFI_idata_get_size(wc->data); @@ -136,6 +203,7 @@ int MPIDI_OFI_peek_rndv_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Reque hdr.rreq = rreq; hdr.am_tag = -1; /* don't issue am_tag_recv yet */ hdr.flag = MPIDI_OFI_CTS_FLAG__PROBE; + hdr.data_sz = 0; mpi_errno = MPIDI_OFI_send_ack(rreq, context_id, &hdr, sizeof(hdr)); MPIR_ERR_CHECK(mpi_errno); @@ -172,31 +240,57 @@ int MPIDI_OFI_rndv_info_handler(void *am_hdr, void *data, MPI_Aint in_data_sz, u /* ---- sender side ---- */ -int MPIDI_OFI_rndv_cts_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * req) +int MPIDI_OFI_rndv_cts_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * r) { int mpi_errno = MPI_SUCCESS; MPIR_FUNC_ENTER; - MPIDI_OFI_ack_request_t *ack_req = (MPIDI_OFI_ack_request_t *) req; + MPIDI_OFI_ack_request_t *ack_req = (MPIDI_OFI_ack_request_t *) r; MPIR_Request *sreq = ack_req->signal_req; struct rndv_cts *hdr = ack_req->ack_hdr; - - /* sreq is already an MPIDIG request (ref. ofi_send.h) */ - if (hdr->flag == MPIDI_OFI_CTS_FLAG__NONE) { - /* issue MPIDI_NM_am_tag_send, ref. 
MPIDIG_send_cts_target_msg_cb */
- mpi_errno = MPIDI_NM_am_tag_send(MPIDIG_REQUEST(sreq, u.send.dest), sreq->comm,
- MPIDIG_SEND_DATA, hdr->am_tag,
- MPIDIG_REQUEST(sreq, buffer),
- MPIDIG_REQUEST(sreq, count),
- MPIDIG_REQUEST(sreq, datatype),
- MPIDIG_REQUEST(sreq, req->local_vci),
- MPIDIG_REQUEST(sreq, req->remote_vci), sreq);
-
+    MPIDI_OFI_rndv_common_t *p = &MPIDI_OFI_AMREQ_COMMON(sreq);
+
+    p->remote_data_sz = hdr->data_sz;
+    p->match_bits =
+        MPIDI_OFI_init_sendtag(sreq->comm->context_id, 0, hdr->am_tag) | MPIDI_OFI_AM_SEND;
+
+    if (!cts_is_probe(hdr->flag)) {
+        bool send_need_pack = p->need_pack;
+        bool recv_need_pack = hdr->flag & MPIDI_OFI_CTS_FLAG__NEED_PACK;
+
+        switch (get_rndv_protocol(send_need_pack, recv_need_pack, hdr->data_sz)) {
+            case MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_pipeline:
+                mpi_errno = MPIDI_OFI_pipeline_send(sreq, hdr->am_tag);
+                break;
+            case MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_read:
+                mpi_errno = MPIDI_OFI_rndvread_send(sreq, hdr->am_tag);
+                break;
+            case MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_write:
+                mpi_errno = MPIDI_OFI_rndvwrite_send(sreq, hdr->am_tag);
+                break;
+            case MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_direct:
+                /* fall through */
+            default:
+                if (p->data_sz < MPIDI_OFI_global.max_msg_size) {
+                    mpi_errno = MPIDI_NM_am_tag_send(p->remote_rank, sreq->comm, -1, hdr->am_tag,
+                                                     p->buf, p->count, p->datatype,
+                                                     p->vci_local, p->vci_remote, sreq);
+                } else {
+                    /* Only contig data here (if this ever changes, FIXME) -
+                     * Send max_msg_size and receiver will get the truncation error. */
+                    MPI_Aint true_extent, true_lb;
+                    MPIR_Type_get_true_extent_impl(p->datatype, &true_lb, &true_extent);
+                    void *data = MPIR_get_contig_ptr(p->buf, true_lb);
+                    mpi_errno = MPIDI_NM_am_tag_send(p->remote_rank, sreq->comm, -1, hdr->am_tag,
+                                                     data, MPIDI_OFI_global.max_msg_size,
+                                                     MPIR_BYTE_INTERNAL,
+                                                     p->vci_local, p->vci_remote, sreq);
+                }
+        }
 MPL_free(ack_req->ack_hdr);
 MPL_free(ack_req);
 } else {
- MPIR_Assert(hdr->flag == MPIDI_OFI_CTS_FLAG__PROBE);
 /* re-issue the ack recv */
 MPIDI_OFI_CALL_RETRY(fi_trecv(MPIDI_OFI_global.ctx[ack_req->ctx_idx].rx,
 ack_req->ack_hdr, ack_req->ack_hdr_sz, NULL,
@@ -207,12 +301,10 @@ int MPIDI_OFI_rndv_cts_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Reques
 struct rndv_info_hdr rndv_info;
 rndv_info.sreq = sreq;
 rndv_info.rreq = hdr->rreq;
- MPIDI_Datatype_check_size(MPIDIG_REQUEST(sreq, datatype), MPIDIG_REQUEST(sreq, count),
- rndv_info.data_sz);
- mpi_errno = MPIDI_NM_am_send_hdr_reply(sreq->comm, MPIDIG_REQUEST(sreq, u.send.dest),
+        rndv_info.data_sz = p->data_sz;
+        mpi_errno = MPIDI_NM_am_send_hdr_reply(sreq->comm, p->remote_rank,
 MPIDI_OFI_RNDV_INFO, &rndv_info, sizeof(rndv_info),
- MPIDIG_REQUEST(sreq, req->local_vci),
- MPIDIG_REQUEST(sreq, req->remote_vci));
+                                               p->vci_local, p->vci_remote);
 }
 fn_exit:
diff --git a/src/mpid/ch4/netmod/ofi/ofi_rndv.h b/src/mpid/ch4/netmod/ofi/ofi_rndv.h
new file mode 100644
index 00000000000..e36f9eef273
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_rndv.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ *     See COPYRIGHT in top-level directory
+ */
+
+#ifndef OFI_RNDV_H_INCLUDED
+#define OFI_RNDV_H_INCLUDED
+
+/*
+=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===
+
+cvars:
+    - name : MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL
+      category : CH4_OFI
+      type : enum
+      default : auto
+      class : none
+      verbosity : MPI_T_VERBOSITY_USER_BASIC
+      scope : MPI_T_SCOPE_LOCAL
+      description : |-
+        When the message size is greater than MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD,
+        this CVAR specifies the large message protocol.
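+        Like any CVAR it can be set through the environment at launch, e.g.
+        MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=read mpiexec -n 2 ./app (launcher and
+        application names are illustrative). The available protocols: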
+        auto - decide protocols based on buffer attributes and datatypes.
+        pipeline - use pipeline protocol (forcing pack and unpack).
+        read - RDMA read.
+        write - RDMA write.
+        direct - send the data directly using libfabric after the RNDV handshake.
+=== END_MPI_T_CVAR_INFO_BLOCK ===
+*/
+
+int MPIDI_OFI_pipeline_send(MPIR_Request * sreq, int tag);
+int MPIDI_OFI_pipeline_recv(MPIR_Request * rreq, int tag, int vci_src, int vci_dst);
+int MPIDI_OFI_rndvread_send(MPIR_Request * sreq, int tag);
+int MPIDI_OFI_rndvread_recv(MPIR_Request * rreq, int tag, int vci_src, int vci_dst);
+int MPIDI_OFI_rndvwrite_send(MPIR_Request * sreq, int tag);
+int MPIDI_OFI_rndvwrite_recv(MPIR_Request * rreq, int tag, int vci_src, int vci_dst);
+
+typedef struct {
+    char pad[MPIDI_REQUEST_HDR_SIZE];
+    struct fi_context context[MPIDI_OFI_CONTEXT_STRUCTS];
+    int event_id;
+    MPIR_Request *req;  /* sreq or rreq */
+    char hdr[];
+} MPIDI_OFI_RNDV_control_req_t;
+
+MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_RNDV_send_hdr(void *hdr, int hdr_sz, MPIDI_av_entry_t * av,
+                                                     int vci_local, int vci_remote,
+                                                     uint64_t match_bits)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    /* control messages always use nic 0 */
+    int ctx_idx = MPIDI_OFI_get_ctx_index(vci_local, 0);
+    fi_addr_t addr = MPIDI_OFI_av_to_phys(av, vci_local, 0, vci_remote, 0);
+    MPIDI_OFI_CALL_RETRY(fi_tinject(MPIDI_OFI_global.ctx[ctx_idx].tx,
+                                    hdr, hdr_sz, addr, match_bits), vci_local, tinject);
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_RNDV_recv_hdr(MPIR_Request * parent_request, int event_id,
+                                                     int hdr_sz, MPIDI_av_entry_t * av,
+                                                     int vci_local, int vci_remote,
+                                                     uint64_t match_bits)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIDI_OFI_RNDV_control_req_t *control;
+    control = MPL_malloc(sizeof(MPIDI_OFI_RNDV_control_req_t) + hdr_sz, MPL_MEM_OTHER);
+    MPIR_Assertp(control);
+
+    control->event_id = event_id;
+    control->req = parent_request;
+
+    /* control messages always use nic 0 */
+    int ctx_idx = MPIDI_OFI_get_ctx_index(vci_local, 0);
+    fi_addr_t addr = MPIDI_OFI_av_to_phys(av, vci_local, 0, vci_remote, 0);
+
+    MPIDI_OFI_CALL_RETRY(fi_trecv(MPIDI_OFI_global.ctx[ctx_idx].rx,
+                                  control->hdr, hdr_sz, NULL,
+                                  addr, match_bits, 0ULL, (void *) &control->context),
+                         vci_local, trecv);
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+MPL_STATIC_INLINE_PREFIX void MPIDI_OFI_RNDV_update_count(MPIR_Request * rreq, MPI_Aint data_sz)
+{
+    MPIDI_OFI_rndv_common_t *p = &MPIDI_OFI_AMREQ_COMMON(rreq);
+
+    MPIR_STATUS_SET_COUNT(rreq->status, data_sz);
+    if (data_sz > p->data_sz) {
+        rreq->status.MPI_ERROR = MPI_ERR_TRUNCATE;
+    }
+}
+
+#define MPIDI_OFI_RNDV_GET_CONTROL_HDR(r) ((void *)((MPIDI_OFI_RNDV_control_req_t *)(r))->hdr)
+#define MPIDI_OFI_RNDV_GET_CONTROL_REQ(r) ((MPIDI_OFI_RNDV_control_req_t *)(r))->req
+
+#endif /* OFI_RNDV_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_rndv_rdma_common.inc b/src/mpid/ch4/netmod/ofi/ofi_rndv_rdma_common.inc
new file mode 100644
index 00000000000..77d4fdfdb6b
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_rndv_rdma_common.inc
@@ -0,0 +1,116 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ *     See COPYRIGHT in top-level directory
+ */
+
+struct rdma_info {
+    MPI_Aint data_sz;
+    uint64_t base;      /* 0 unless MPIDI_OFI_ENABLE_MR_VIRT_ADDRESS is true */
+    int num_nics;       /* redundant since we assume sender/receiver agree on num_nics */
+    uint64_t rkeys[];
+};
+
+static MPI_Aint get_chunks_per_nic(MPI_Aint data_sz, int num_nics);
+
+static int prepare_rdma_info(const void *buf,
MPI_Datatype datatype, MPI_Aint data_sz, int vci,
+                             int access, struct fid_mr **mrs, struct rdma_info *hdr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    const void *data;
+    MPI_Aint true_extent, true_lb;
+    MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent);
+    data = MPIR_get_contig_ptr(buf, true_lb);
+
+    int num_nics = MPIDI_OFI_global.num_nics;
+    uint64_t rkeys[MPIDI_OFI_MAX_NICS];
+
+    /* prepare mr and rkey */
+    if (!MPIDI_OFI_ENABLE_MR_PROV_KEY) {
+        /* Set up a memory region for the lmt data transfer */
+        for (int i = 0; i < num_nics; i++) {
+            rkeys[i] = MPIDI_OFI_mr_key_alloc(MPIDI_OFI_LOCAL_MR_KEY, MPIDI_OFI_INVALID_MR_KEY);
+            MPIR_ERR_CHKANDJUMP(rkeys[i] == MPIDI_OFI_INVALID_MR_KEY, mpi_errno,
+                                MPI_ERR_OTHER, "**ofid_mr_key");
+        }
+    } else {
+        /* zero them to avoid warnings */
+        for (int i = 0; i < num_nics; i++) {
+            rkeys[i] = 0;
+        }
+    }
+    MPI_Aint chunks_per_nic = get_chunks_per_nic(data_sz, num_nics);
+    MPI_Aint chunk_sz = MPIR_CVAR_CH4_OFI_PIPELINE_CHUNK_SZ;
+    MPI_Aint sz_per_nic = chunks_per_nic * chunk_sz;
+    for (int i = 0; i < num_nics; i++) {
+        MPIDI_OFI_context_t *ctx = &MPIDI_OFI_global.ctx[MPIDI_OFI_get_ctx_index(vci, i)];
+        /* note: fi_mr_reg is expensive; register each nic's slice rather than the whole buffer */
+        void *nic_data = (char *) data + i * sz_per_nic;
+        MPI_Aint sz = (i != num_nics - 1) ? sz_per_nic : (data_sz - i * sz_per_nic);
+        MPIDI_OFI_CALL(fi_mr_reg(ctx->domain, nic_data, sz, access, 0ULL,
+                                 rkeys[i], 0ULL, &mrs[i], NULL), mr_reg);
+        mpi_errno = MPIDI_OFI_mr_bind(MPIDI_OFI_global.prov_use[0], mrs[i], ctx->ep,
+                                      NULL);
+        MPIR_ERR_CHECK(mpi_errno);
+    }
+    if (MPIDI_OFI_ENABLE_MR_PROV_KEY) {
+        for (int i = 0; i < num_nics; i++) {
+            rkeys[i] = fi_mr_key(mrs[i]);
+        }
+    }
+
+    /* prepare rdma_info */
+    hdr->data_sz = data_sz;
+    hdr->base = (uintptr_t) data;
+    for (int i = 0; i < num_nics; i++) {
+        hdr->rkeys[i] = rkeys[i];
+    }
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/* utility routine for calculating chunks */
+/* Each nic is assigned chunks_per_nic chunks.
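+ * (Worked example with illustrative numbers: 10 chunks over 4 nics gives
+ * chunks_per_nic = ceil(10/4) = 3; nics 0-2 carry 3 chunks each, while nic 3
+ * carries only the one remaining chunk -- its other two slots fall past data_sz
+ * and get_chunk_offsets reports them as zero-size chunks, which callers skip.)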
The last nic may have fewer chunks */
+
+static MPI_Aint get_chunks_per_nic(MPI_Aint data_sz, int num_nics)
+{
+    MPI_Aint chunk_sz = MPIR_CVAR_CH4_OFI_PIPELINE_CHUNK_SZ;
+    MPI_Aint num_chunks = data_sz / chunk_sz;
+    if (chunk_sz * num_chunks < data_sz) {
+        num_chunks++;
+    }
+
+    if (num_nics == 1) {
+        return num_chunks;
+    } else {
+        MPI_Aint chunks_per_nic = num_chunks / num_nics;
+        if (chunks_per_nic * num_nics < num_chunks) {
+            chunks_per_nic++;
+        }
+        return chunks_per_nic;
+    }
+}
+
+static void get_chunk_offsets(MPI_Aint chunk_index, int num_nics, MPI_Aint chunks_per_nic,
+                              MPI_Aint data_sz, MPI_Aint * total_offset_out, int *nic_out,
+                              MPI_Aint * nic_offset_out, MPI_Aint * chunk_sz_out)
+{
+    MPI_Aint chunk_sz = MPIR_CVAR_CH4_OFI_PIPELINE_CHUNK_SZ;
+    if (num_nics == 1) {
+        *nic_out = 0;
+        *nic_offset_out = *total_offset_out = chunk_index * chunk_sz;
+    } else {
+        int nic = chunk_index % num_nics;
+        MPI_Aint nic_chunk_index = chunk_index / num_nics;
+        *total_offset_out = (nic * chunks_per_nic + nic_chunk_index) * chunk_sz;
+        *nic_offset_out = nic_chunk_index * chunk_sz;
+    }
+    if (*total_offset_out + chunk_sz < data_sz) {
+        *chunk_sz_out = chunk_sz;
+    } else {
+        /* incomplete chunks */
+        *chunk_sz_out = MPL_MAX(0, data_sz - *total_offset_out);
+    }
+}
diff --git a/src/mpid/ch4/netmod/ofi/ofi_rndv_read.c b/src/mpid/ch4/netmod/ofi/ofi_rndv_read.c
new file mode 100644
index 00000000000..b2d0cb5653f
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_rndv_read.c
@@ -0,0 +1,361 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ *     See COPYRIGHT in top-level directory
+ */
+
+#include "mpidimpl.h"
+#include "ofi_impl.h"
+#include "ofi_events.h"
+#include "ofi_rndv.h"
+
+#include "ofi_rndv_rdma_common.inc"
+
+#define MPIDI_OFI_RNDVREAD_INFLY_CHUNKS 10
+
+static int rndvread_read_poll(MPIX_Async_thing thing);
+static int recv_issue_read(MPIR_Request * parent_request, int event_id,
+                           void *buf, MPI_Aint data_sz, MPI_Aint offset,
+                           MPIDI_av_entry_t * av, int vci_local, int vci_remote, int nic,
+                           MPI_Aint remote_disp, uint64_t rkey);
+static int async_recv_copy(MPIR_Request * rreq, void *chunk_buf, MPI_Aint chunk_sz,
+                           void *buf, MPI_Aint count, MPI_Datatype datatype,
+                           MPI_Aint offset, MPL_pointer_attr_t * attr);
+static int recv_copy_poll(MPIX_Async_thing thing);
+static void recv_copy_complete(MPIR_Request * rreq, void *chunk_buf, MPI_Aint chunk_sz);
+static int check_recv_complete(MPIR_Request * rreq);
+
+/* -- sender side -- */
+
+int MPIDI_OFI_rndvread_send(MPIR_Request * sreq, int tag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_rndvread_t *p = &MPIDI_OFI_AMREQ_READ(sreq);
+    MPIR_FUNC_ENTER;
+
+    MPIR_Assert(!p->need_pack);
+
+    MPI_Aint true_extent, true_lb;
+    MPIR_Type_get_true_extent_impl(p->datatype, &true_lb, &true_extent);
+    p->u.send.data = MPIR_get_contig_ptr(p->buf, true_lb);
+
+    int num_nics = MPIDI_OFI_global.num_nics;
+    p->u.send.mrs = MPL_malloc((num_nics * sizeof(struct fid_mr *)), MPL_MEM_OTHER);
+
+    int hdr_sz = sizeof(struct rdma_info) + num_nics * sizeof(uint64_t);
+    struct rdma_info *hdr = MPL_malloc(hdr_sz, MPL_MEM_OTHER);
+    MPIR_Assertp(hdr);
+
+    mpi_errno = prepare_rdma_info((void *) p->buf, p->datatype, p->data_sz, p->vci_local,
+                                  FI_REMOTE_READ, p->u.send.mrs, hdr);
+    MPIR_ERR_CHECK(mpi_errno);
+
+    /* send rdma_info */
+    mpi_errno = MPIDI_OFI_RNDV_send_hdr(hdr, hdr_sz,
+                                        p->av, p->vci_local, p->vci_remote, p->match_bits);
+    MPIR_ERR_CHECK(mpi_errno);
+
+    MPL_free(hdr);
+
+    /* issue recv for ack */
+    mpi_errno = MPIDI_OFI_RNDV_recv_hdr(sreq, MPIDI_OFI_EVENT_RNDVREAD_ACK, 0,
+
p->av, p->vci_local, p->vci_remote, p->match_bits); + MPIR_ERR_CHECK(mpi_errno); + + fn_exit: + MPIR_FUNC_EXIT; + return mpi_errno; + fn_fail: + goto fn_exit; +} + +int MPIDI_OFI_rndvread_ack_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r) +{ + int mpi_errno = MPI_SUCCESS; + MPIR_Request *sreq = MPIDI_OFI_RNDV_GET_CONTROL_REQ(r); + MPIDI_OFI_rndvread_t *p = &MPIDI_OFI_AMREQ_READ(sreq); + + int num_nics = MPIDI_OFI_global.num_nics; + for (int i = 0; i < num_nics; i++) { + uint64_t key = fi_mr_key(p->u.send.mrs[i]); + MPIDI_OFI_CALL(fi_close(&p->u.send.mrs[i]->fid), mr_unreg); + if (!MPIDI_OFI_ENABLE_MR_PROV_KEY) { + MPIDI_OFI_mr_key_free(MPIDI_OFI_LOCAL_MR_KEY, key); + } + } + MPL_free(p->u.send.mrs); + MPL_free(r); + + /* complete sreq */ + MPIR_Datatype_release_if_not_builtin(p->datatype); + MPIDI_Request_complete_fast(sreq); + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} + +/* -- receiver side -- */ + +int MPIDI_OFI_rndvread_recv(MPIR_Request * rreq, int tag, int vci_src, int vci_dst) +{ + int mpi_errno = MPI_SUCCESS; + MPIDI_OFI_rndvread_t *p = &MPIDI_OFI_AMREQ_READ(rreq); + MPIR_FUNC_ENTER; + + if (!p->need_pack) { + MPI_Aint true_extent, true_lb; + MPIR_Type_get_true_extent_impl(p->datatype, &true_lb, &true_extent); + p->u.recv.u.data = MPIR_get_contig_ptr(p->buf, true_lb); + } else { + p->u.recv.u.copy_infly = 0; + } + + /* recv the mrs */ + int num_nics = MPIDI_OFI_global.num_nics; + MPI_Aint hdr_sz = sizeof(struct rdma_info) + num_nics * sizeof(uint64_t); + mpi_errno = MPIDI_OFI_RNDV_recv_hdr(rreq, MPIDI_OFI_EVENT_RNDVREAD_RECV_MRS, hdr_sz, + p->av, p->vci_local, p->vci_remote, p->match_bits); + MPIR_ERR_CHECK(mpi_errno); + + fn_exit: + MPIR_FUNC_EXIT; + return mpi_errno; + fn_fail: + goto fn_exit; +} + +int MPIDI_OFI_rndvread_recv_mrs_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r) +{ + int mpi_errno = MPI_SUCCESS; + struct rdma_info *hdr = MPIDI_OFI_RNDV_GET_CONTROL_HDR(r); + MPIR_Request *rreq = MPIDI_OFI_RNDV_GET_CONTROL_REQ(r); + MPIDI_OFI_rndvread_t *p = &MPIDI_OFI_AMREQ_READ(rreq); + + MPIDI_OFI_RNDV_update_count(rreq, hdr->data_sz); + + int num_nics = MPIDI_OFI_global.num_nics; + p->remote_data_sz = MPL_MIN(hdr->data_sz, p->data_sz); + p->u.recv.remote_base = hdr->base; + p->u.recv.rkeys = MPL_malloc(num_nics * sizeof(uint64_t), MPL_MEM_OTHER); + for (int i = 0; i < num_nics; i++) { + p->u.recv.rkeys[i] = hdr->rkeys[i]; + } + + MPL_free(r); + + /* setup chunks */ + p->u.recv.chunks_per_nic = get_chunks_per_nic(p->remote_data_sz, num_nics); + + p->u.recv.cur_chunk_index = 0; + p->u.recv.num_infly = 0; + + /* issue fi_read */ + mpi_errno = MPIR_Async_things_add(rndvread_read_poll, rreq, NULL); + + return mpi_errno; +} + +static int rndvread_read_poll(MPIX_Async_thing thing) +{ + int ret = MPIX_ASYNC_NOPROGRESS; + int mpi_errno = MPI_SUCCESS; + MPIR_Request *rreq = MPIR_Async_thing_get_state(thing); + MPIDI_OFI_rndvread_t *p = &MPIDI_OFI_AMREQ_READ(rreq); + + /* CS required for genq pool and gpu imemcpy */ + MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI_LOCK(p->vci_local)); + + int num_nics = MPIDI_OFI_global.num_nics; + while (p->u.recv.cur_chunk_index < p->u.recv.chunks_per_nic * num_nics) { + if (p->u.recv.num_infly >= MPIDI_OFI_RNDVREAD_INFLY_CHUNKS) { + goto fn_exit; + } + int nic; + MPI_Aint total_offset, nic_offset, chunk_sz; + get_chunk_offsets(p->u.recv.cur_chunk_index, num_nics, + p->u.recv.chunks_per_nic, p->remote_data_sz, + &total_offset, &nic, &nic_offset, &chunk_sz); + + if (chunk_sz > 0) { + void *read_buf; + if (p->need_pack) { + /* alloc a 
chunk */
+                MPIDU_genq_private_pool_alloc_cell(MPIDI_OFI_global.
+                                                   per_vci[p->vci_local].pipeline_pool, &read_buf);
+                if (!read_buf) {
+                    goto fn_exit;
+                }
+            } else {
+                read_buf = (char *) p->u.recv.u.data + total_offset;
+            }
+            uint64_t disp;
+            if (MPIDI_OFI_ENABLE_MR_VIRT_ADDRESS) {
+                disp = p->u.recv.remote_base + total_offset;
+            } else {
+                disp = nic_offset;
+            }
+            mpi_errno = recv_issue_read(rreq, MPIDI_OFI_EVENT_RNDVREAD_READ_CHUNK,
+                                        read_buf, chunk_sz, total_offset,
+                                        p->av, p->vci_local, p->vci_remote, nic, disp,
+                                        p->u.recv.rkeys[nic]);
+            MPIR_ERR_CHECK(mpi_errno);
+            p->u.recv.num_infly++;
+        }
+        p->u.recv.cur_chunk_index++;
+    }
+
+    p->u.recv.all_issued = true;
+    ret = MPIX_ASYNC_DONE;
+
+  fn_exit:
+    MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI_LOCK(p->vci_local));
+    return ret;
+  fn_fail:
+    ret = MPIX_ASYNC_NOPROGRESS;
+    goto fn_exit;
+}
+
+struct read_req {
+    char pad[MPIDI_REQUEST_HDR_SIZE];
+    struct fi_context context[MPIDI_OFI_CONTEXT_STRUCTS];
+    int event_id;
+    MPIR_Request *rreq;
+    /* In case we need to unpack after the read */
+    void *chunk_buf;
+    MPI_Aint chunk_sz;
+    MPI_Aint offset;
+};
+
+static int recv_issue_read(MPIR_Request * parent_request, int event_id,
+                           void *buf, MPI_Aint data_sz, MPI_Aint offset,
+                           MPIDI_av_entry_t * av, int vci_local, int vci_remote, int nic,
+                           MPI_Aint remote_disp, uint64_t rkey)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    struct read_req *r = MPL_malloc(sizeof(struct read_req), MPL_MEM_OTHER);
+    MPIR_Assertp(r);
+
+    r->event_id = event_id;
+    r->rreq = parent_request;
+    r->chunk_buf = buf;
+    r->chunk_sz = data_sz;
+    r->offset = offset;
+
+    /* issue the read on the nic chosen for this chunk */
+    int ctx_idx = MPIDI_OFI_get_ctx_index(vci_local, nic);
+    fi_addr_t addr = MPIDI_OFI_av_to_phys(av, vci_local, nic, vci_remote, nic);
+
+    MPIDI_OFI_CALL_RETRY(fi_read(MPIDI_OFI_global.ctx[ctx_idx].tx,
+                                 buf, data_sz, NULL,
+                                 addr, remote_disp, rkey, (void *) &r->context),
+                         vci_local, rdma_readfrom);
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+int MPIDI_OFI_rndvread_read_chunk_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r)
+{
+    int mpi_errno = MPI_SUCCESS;
+    struct read_req *t = (void *) r;
+    MPIR_Request *rreq = t->rreq;
+    MPIDI_OFI_rndvread_t *p = &MPIDI_OFI_AMREQ_READ(rreq);
+
+    p->u.recv.num_infly--;
+    if (!p->need_pack) {
+        check_recv_complete(rreq);
+    } else {
+        /* async copy */
+        mpi_errno = async_recv_copy(rreq, t->chunk_buf, t->chunk_sz,
+                                    (void *) p->buf, p->count, p->datatype, t->offset, &p->attr);
+        p->u.recv.u.copy_infly++;
+
+    }
+    MPL_free(r);
+    return mpi_errno;
+}
+
+struct recv_copy {
+    MPIR_Request *rreq;
+    MPIR_gpu_req async_req;
+    void *chunk_buf;
+    MPI_Aint chunk_sz;
+};
+
+static int async_recv_copy(MPIR_Request * rreq, void *chunk_buf, MPI_Aint chunk_sz,
+                           void *buf, MPI_Aint count, MPI_Datatype datatype,
+                           MPI_Aint offset, MPL_pointer_attr_t * attr)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_gpu_req async_req;
+    int engine_type = MPIDI_OFI_gpu_get_recv_engine_type();
+    int copy_dir = MPL_GPU_COPY_DIRECTION_NONE;
+    mpi_errno = MPIR_Ilocalcopy_gpu(chunk_buf, chunk_sz, MPIR_BYTE_INTERNAL, 0, NULL,
+                                    buf, count, datatype, offset, attr,
+                                    copy_dir, engine_type, 1, &async_req);
+    MPIR_ERR_CHECK(mpi_errno);
+
+    struct recv_copy *p = MPL_malloc(sizeof(struct recv_copy), MPL_MEM_OTHER);
+    p->rreq = rreq;
+    p->async_req = async_req;
+    p->chunk_buf = chunk_buf;
+    p->chunk_sz = chunk_sz;
+
+    mpi_errno = MPIR_Async_things_add(recv_copy_poll, p, NULL);
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+static int
recv_copy_poll(MPIX_Async_thing thing)
+{
+    struct recv_copy *p = MPIR_Async_thing_get_state(thing);
+
+    int is_done = 0;
+    MPIR_async_test(&(p->async_req), &is_done);
+
+    if (!is_done) {
+        return MPIX_ASYNC_NOPROGRESS;
+    } else {
+        recv_copy_complete(p->rreq, p->chunk_buf, p->chunk_sz);
+        MPL_free(p);
+        return MPIX_ASYNC_DONE;
+    }
+}
+
+static void recv_copy_complete(MPIR_Request * rreq, void *chunk_buf, MPI_Aint chunk_sz)
+{
+    MPIDI_OFI_rndvread_t *p = &MPIDI_OFI_AMREQ_READ(rreq);
+
+    MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI_LOCK(p->vci_local));
+    MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.per_vci[p->vci_local].pipeline_pool,
+                                      chunk_buf);
+
+    p->u.recv.u.copy_infly--;
+    check_recv_complete(rreq);
+    MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI_LOCK(p->vci_local));
+}
+
+static int check_recv_complete(MPIR_Request * rreq)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_rndvread_t *p = &MPIDI_OFI_AMREQ_READ(rreq);
+    if (p->u.recv.all_issued && p->u.recv.num_infly == 0 &&
+        (!p->need_pack || p->u.recv.u.copy_infly == 0)) {
+        /* done. send ack */
+        mpi_errno = MPIDI_OFI_RNDV_send_hdr(NULL, 0, p->av, p->vci_local, p->vci_remote,
+                                            p->match_bits);
+        /* complete request */
+        MPL_free(p->u.recv.rkeys);
+        MPIR_Datatype_release_if_not_builtin(p->datatype);
+        MPIDI_Request_complete_fast(rreq);
+    }
+    return mpi_errno;
+}
diff --git a/src/mpid/ch4/netmod/ofi/ofi_rndv_write.c b/src/mpid/ch4/netmod/ofi/ofi_rndv_write.c
new file mode 100644
index 00000000000..f7251daa840
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_rndv_write.c
@@ -0,0 +1,369 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ *     See COPYRIGHT in top-level directory
+ */
+
+#include "mpidimpl.h"
+#include "ofi_impl.h"
+#include "ofi_events.h"
+#include "ofi_rndv.h"
+
+#include "ofi_rndv_rdma_common.inc"
+
+#define MPIDI_OFI_RNDVWRITE_INFLY_CHUNKS 10
+
+static int rndvwrite_write_poll(MPIX_Async_thing thing);
+static int async_send_copy(MPIX_Async_thing thing, MPIR_Request * sreq, int nic, uint64_t disp,
+                           void *chunk_buf, MPI_Aint chunk_sz,
+                           const void *buf, MPI_Aint count, MPI_Datatype datatype,
+                           MPI_Aint offset, MPL_pointer_attr_t * attr);
+static int send_copy_poll(MPIX_Async_thing thing);
+static int send_issue_write(MPIR_Request * sreq, void *buf, MPI_Aint data_sz,
+                            int nic, MPI_Aint disp);
+
+/* export functions:
+ *     int MPIDI_OFI_rndvwrite_send(MPIR_Request * sreq, int tag);
+ *     int MPIDI_OFI_rndvwrite_write_chunk_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r);
+ *     int MPIDI_OFI_rndvwrite_ack_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r);
+ *
+ *     int MPIDI_OFI_rndvwrite_recv(MPIR_Request * rreq, int tag, int vci_src, int vci_dst);
+ *     int MPIDI_OFI_rndvwrite_recv_mrs_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r);
+ */
+/* -- sender side -- */
+
+int MPIDI_OFI_rndvwrite_send(MPIR_Request * sreq, int tag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_rndvwrite_t *p = &MPIDI_OFI_AMREQ_WRITE(sreq);
+    MPIR_FUNC_ENTER;
+
+    if (!p->need_pack) {
+        MPI_Aint true_extent, true_lb;
+        MPIR_Type_get_true_extent_impl(p->datatype, &true_lb, &true_extent);
+        p->u.send.u.data = MPIR_get_contig_ptr(p->buf, true_lb);
+    } else {
+        p->u.send.u.copy_infly = 0;
+    }
+
+    /* recv the mrs */
+    int num_nics = MPIDI_OFI_global.num_nics;
+    MPI_Aint hdr_sz = sizeof(struct rdma_info) + num_nics * sizeof(uint64_t);
+    mpi_errno = MPIDI_OFI_RNDV_recv_hdr(sreq, MPIDI_OFI_EVENT_RNDVWRITE_RECV_MRS, hdr_sz,
+                                        p->av, p->vci_local, p->vci_remote, p->match_bits);
+    MPIR_ERR_CHECK(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_EXIT;
+    return mpi_errno;
+  fn_fail:
+
goto fn_exit; +} + +int MPIDI_OFI_rndvwrite_recv_mrs_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r) +{ + int mpi_errno = MPI_SUCCESS; + struct rdma_info *hdr = MPIDI_OFI_RNDV_GET_CONTROL_HDR(r); + MPIR_Request *sreq = MPIDI_OFI_RNDV_GET_CONTROL_REQ(r); + MPIDI_OFI_rndvwrite_t *p = &MPIDI_OFI_AMREQ_WRITE(sreq); + + int num_nics = MPIDI_OFI_global.num_nics; + p->remote_data_sz = MPL_MIN(hdr->data_sz, p->data_sz); + p->u.send.remote_base = hdr->base; + p->u.send.rkeys = MPL_malloc(num_nics * sizeof(uint64_t), MPL_MEM_OTHER); + for (int i = 0; i < num_nics; i++) { + p->u.send.rkeys[i] = hdr->rkeys[i]; + } + + MPL_free(r); + + /* setup chunks */ + p->u.send.chunks_per_nic = get_chunks_per_nic(p->remote_data_sz, num_nics); + p->u.send.chunks_remain = p->u.send.chunks_per_nic * num_nics; + + p->u.send.cur_chunk_index = 0; + p->u.send.write_infly = 0; + + /* issue fi_write */ + mpi_errno = MPIR_Async_things_add(rndvwrite_write_poll, sreq, NULL); + + return mpi_errno; +} + +static int rndvwrite_write_poll(MPIX_Async_thing thing) +{ + int ret = MPIX_ASYNC_NOPROGRESS; + int mpi_errno = MPI_SUCCESS; + MPIR_Request *sreq = MPIR_Async_thing_get_state(thing); + MPIDI_OFI_rndvwrite_t *p = &MPIDI_OFI_AMREQ_WRITE(sreq); + + /* CS required for genq pool and gpu imemcpy */ + MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI_LOCK(p->vci_local)); + + int num_nics = MPIDI_OFI_global.num_nics; + while (p->u.send.cur_chunk_index < p->u.send.chunks_per_nic * num_nics) { + int nic; + MPI_Aint total_offset, nic_offset, chunk_sz; + get_chunk_offsets(p->u.send.cur_chunk_index, num_nics, + p->u.send.chunks_per_nic, p->remote_data_sz, + &total_offset, &nic, &nic_offset, &chunk_sz); + + if (chunk_sz <= 0) { + p->u.send.cur_chunk_index++; + p->u.send.chunks_remain--; + continue; + } + + uint64_t disp; + if (MPIDI_OFI_ENABLE_MR_VIRT_ADDRESS) { + disp = p->u.send.remote_base + total_offset; + } else { + disp = nic_offset; + } + + if (p->need_pack) { + if (p->u.send.u.copy_infly >= MPIDI_OFI_RNDVWRITE_INFLY_CHUNKS) { + goto fn_exit; + } + + /* alloc a chunk */ + void *chunk_buf; + MPIDU_genq_private_pool_alloc_cell(MPIDI_OFI_global.per_vci[p->vci_local].pipeline_pool, + &chunk_buf); + if (!chunk_buf) { + goto fn_exit; + } + /* issue async copy */ + mpi_errno = async_send_copy(thing, sreq, nic, disp, chunk_buf, chunk_sz, + p->buf, p->count, p->datatype, total_offset, &p->attr); + MPIR_ERR_CHECK(mpi_errno); + } else { + if (p->u.send.write_infly >= MPIDI_OFI_RNDVWRITE_INFLY_CHUNKS) { + goto fn_exit; + } + void *write_buf = (char *) p->u.send.u.data + total_offset; + /* issue rdma write */ + mpi_errno = send_issue_write(sreq, write_buf, chunk_sz, nic, disp); + MPIR_ERR_CHECK(mpi_errno); + } + p->u.send.cur_chunk_index++; + } + + ret = MPIX_ASYNC_DONE; + + fn_exit: + MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI_LOCK(p->vci_local)); + return ret; + fn_fail: + goto fn_exit; +} + +struct send_copy { + MPIR_Request *sreq; + MPIR_gpu_req async_req; + void *chunk_buf; + MPI_Aint chunk_sz; + MPI_Aint offset; + int nic; + uint64_t disp; + bool is_done; +}; + +static int async_send_copy(MPIX_Async_thing thing, MPIR_Request * sreq, int nic, uint64_t disp, + void *chunk_buf, MPI_Aint chunk_sz, + const void *buf, MPI_Aint count, MPI_Datatype datatype, + MPI_Aint offset, MPL_pointer_attr_t * attr) +{ + int mpi_errno = MPI_SUCCESS; + + MPIR_gpu_req async_req; + int engine_type = MPIDI_OFI_gpu_get_send_engine_type(); + int copy_dir = MPL_GPU_COPY_DIRECTION_NONE; + mpi_errno = MPIR_Ilocalcopy_gpu((void *) buf, count, datatype, offset, attr, + chunk_buf, 
chunk_sz, MPIR_BYTE_INTERNAL, 0, NULL, + copy_dir, engine_type, 1, &async_req); + MPIR_ERR_CHECK(mpi_errno); + + struct send_copy *p = MPL_malloc(sizeof(struct send_copy), MPL_MEM_OTHER); + p->sreq = sreq; + p->async_req = async_req; + p->chunk_buf = chunk_buf; + p->chunk_sz = chunk_sz; + p->offset = offset; + p->nic = nic; + p->disp = disp; + p->is_done = false; /* flipped by send_copy_poll once the copy completes */ + + /* count this inflight copy; send_copy_poll decrements it on completion */ + MPIDI_OFI_AMREQ_WRITE(sreq).u.send.u.copy_infly++; + + MPIR_Async_thing_spawn(thing, send_copy_poll, p, NULL); + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} + +static int send_copy_poll(MPIX_Async_thing thing) +{ + struct send_copy *p = MPIR_Async_thing_get_state(thing); + MPIDI_OFI_rndvwrite_t *q = &MPIDI_OFI_AMREQ_WRITE(p->sreq); + + if (!p->is_done) { + int is_done = 0; + MPIR_async_test(&(p->async_req), &is_done); + if (is_done) { + p->is_done = true; + q->u.send.u.copy_infly--; + } + } + + if (!p->is_done) { + return MPIX_ASYNC_NOPROGRESS; + } else if (q->u.send.write_infly >= MPIDI_OFI_RNDVWRITE_INFLY_CHUNKS) { + return MPIX_ASYNC_NOPROGRESS; + } else { + MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI_LOCK(q->vci_local)); + int mpi_errno = send_issue_write(p->sreq, p->chunk_buf, p->chunk_sz, p->nic, p->disp); + MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI_LOCK(q->vci_local)); + MPIR_Assertp(mpi_errno == MPI_SUCCESS); + + MPL_free(p); + return MPIX_ASYNC_DONE; + } +} + +struct write_req { + char pad[MPIDI_REQUEST_HDR_SIZE]; + struct fi_context context[MPIDI_OFI_CONTEXT_STRUCTS]; + int event_id; + MPIR_Request *sreq; + /* In case we need to free the chunk after the write */ + void *chunk_buf; +}; + +static int send_issue_write(MPIR_Request * sreq, void *buf, MPI_Aint data_sz, + int nic, MPI_Aint disp) +{ + int mpi_errno = MPI_SUCCESS; + MPIDI_OFI_rndvwrite_t *p = &MPIDI_OFI_AMREQ_WRITE(sreq); + + struct write_req *t = MPL_malloc(sizeof(struct write_req), MPL_MEM_OTHER); + MPIR_Assertp(t); + + t->event_id = MPIDI_OFI_EVENT_RNDVWRITE_WRITE_CHUNK; + t->sreq = sreq; + if (p->need_pack) { + t->chunk_buf = buf; + } + + /* issue the write on this chunk's nic */ + int ctx_idx = MPIDI_OFI_get_ctx_index(p->vci_local, nic); + fi_addr_t addr = MPIDI_OFI_av_to_phys(p->av, p->vci_local, nic, p->vci_remote, nic); + uint64_t rkey = p->u.send.rkeys[nic]; + + MPIDI_OFI_CALL_RETRY(fi_write(MPIDI_OFI_global.ctx[ctx_idx].tx, + buf, data_sz, NULL, addr, disp, rkey, (void *) &t->context), + p->vci_local, rdma_write); + p->u.send.write_infly++; + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} + +int MPIDI_OFI_rndvwrite_write_chunk_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r) +{ + int mpi_errno = MPI_SUCCESS; + struct write_req *t = (void *) r; + MPIR_Request *sreq = t->sreq; + MPIDI_OFI_rndvwrite_t *p = &MPIDI_OFI_AMREQ_WRITE(sreq); + + if (p->need_pack) { + MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.per_vci[p->vci_local].pipeline_pool, + t->chunk_buf); + } + + p->u.send.write_infly--; + p->u.send.chunks_remain--; + if (p->u.send.chunks_remain == 0) { + /* done. send ack. 
Also inform receiver our data_sz */ + mpi_errno = MPIDI_OFI_RNDV_send_hdr(&p->data_sz, sizeof(MPI_Aint), + p->av, p->vci_local, p->vci_remote, p->match_bits); + /* complete request */ + MPL_free(p->u.send.rkeys); + MPIR_Datatype_release_if_not_builtin(p->datatype); + MPIDI_Request_complete_fast(sreq); + } + + MPL_free(r); + return mpi_errno; +} + +/* -- receiver side -- */ + +int MPIDI_OFI_rndvwrite_recv(MPIR_Request * rreq, int tag, int vci_src, int vci_dst) +{ + int mpi_errno = MPI_SUCCESS; + MPIDI_OFI_rndvwrite_t *p = &MPIDI_OFI_AMREQ_WRITE(rreq); + MPIR_FUNC_ENTER; + + MPIR_Assert(!p->need_pack); + + int num_nics = MPIDI_OFI_global.num_nics; + p->u.recv.mrs = MPL_malloc((num_nics * sizeof(struct fid_mr *)), MPL_MEM_OTHER); + + int hdr_sz = sizeof(struct rdma_info) + num_nics * sizeof(uint64_t); + struct rdma_info *hdr = MPL_malloc(hdr_sz, MPL_MEM_OTHER); + MPIR_Assertp(hdr); + + mpi_errno = prepare_rdma_info(p->buf, p->datatype, p->data_sz, p->vci_local, + FI_REMOTE_WRITE, p->u.recv.mrs, hdr); + MPIR_ERR_CHECK(mpi_errno); + + /* send rdma_info */ + mpi_errno = MPIDI_OFI_RNDV_send_hdr(hdr, hdr_sz, + p->av, p->vci_local, p->vci_remote, p->match_bits); + MPIR_ERR_CHECK(mpi_errno); + MPL_free(hdr); + + /* issue recv for ack */ + mpi_errno = MPIDI_OFI_RNDV_recv_hdr(rreq, MPIDI_OFI_EVENT_RNDVWRITE_ACK, + sizeof(MPI_Aint) /* remote data_sz */ , + p->av, p->vci_local, p->vci_remote, p->match_bits); + MPIR_ERR_CHECK(mpi_errno); + + fn_exit: + MPIR_FUNC_EXIT; + return mpi_errno; + fn_fail: + goto fn_exit; +} + +int MPIDI_OFI_rndvwrite_ack_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r) +{ + int mpi_errno = MPI_SUCCESS; + MPIR_Request *rreq = MPIDI_OFI_RNDV_GET_CONTROL_REQ(r); + MPIDI_OFI_rndvwrite_t *p = &MPIDI_OFI_AMREQ_WRITE(rreq); + + /* check sender data_sz */ + MPI_Aint *hdr_data_sz = MPIDI_OFI_RNDV_GET_CONTROL_HDR(r); + MPIDI_OFI_RNDV_update_count(rreq, *hdr_data_sz); + + int num_nics = MPIDI_OFI_global.num_nics; + for (int i = 0; i < num_nics; i++) { + uint64_t key = fi_mr_key(p->u.recv.mrs[i]); + MPIDI_OFI_CALL(fi_close(&p->u.recv.mrs[i]->fid), mr_unreg); + if (!MPIDI_OFI_ENABLE_MR_PROV_KEY) { + MPIDI_OFI_mr_key_free(MPIDI_OFI_LOCAL_MR_KEY, key); + } + } + MPL_free(p->u.recv.mrs); + MPL_free(r); + + /* complete rreq */ + MPIR_Datatype_release_if_not_builtin(p->datatype); + MPIDI_Request_complete_fast(rreq); + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} diff --git a/src/mpid/ch4/netmod/ofi/ofi_send.h b/src/mpid/ch4/netmod/ofi/ofi_send.h index 56c3a8f5185..ae636e7a784 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_send.h +++ b/src/mpid/ch4/netmod/ofi/ofi_send.h @@ -23,20 +23,6 @@ Set MPIR_CVAR_CH4_OFI_ENABLE_INJECT=0 to disable buffered send for small messages. This may help avoid hang due to lack of global progress. - - name : MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE - category : CH4_OFI - type : enum - default : copy_low_latency - class : none - verbosity : MPI_T_VERBOSITY_USER_BASIC - scope : MPI_T_SCOPE_LOCAL - description : |- - Specifies GPU engine type for GPU pt2pt on the sender side. 
- compute - use a compute engine - copy_high_bandwidth - use a high-bandwidth copy engine - copy_low_latency - use a low-latency copy engine - yaksa - use Yaksa - - name : MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD category : CH4_OFI type : int @@ -55,21 +41,6 @@ === END_MPI_T_CVAR_INFO_BLOCK === */ -MPL_STATIC_INLINE_PREFIX MPL_gpu_engine_type_t MPIDI_OFI_gpu_get_send_engine_type(void) -{ - if (MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE == MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE_compute) { - return MPL_GPU_ENGINE_TYPE_COMPUTE; - } else if (MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE == - MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE_copy_high_bandwidth) { - return MPL_GPU_ENGINE_TYPE_COPY_HIGH_BANDWIDTH; - } else if (MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE == - MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE_copy_low_latency) { - return MPL_GPU_ENGINE_TYPE_COPY_LOW_LATENCY; - } else { - return MPL_GPU_ENGINE_TYPE_LAST; - } -} - MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_issue_ack_recv(MPIR_Request * sreq, MPIR_Comm * comm, int context_offset, int dst_rank, int tag, MPIDI_av_entry_t * addr, @@ -255,173 +226,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_normal(const void *data, MPI_Aint da goto fn_exit; } -MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_huge(const void *data, MPI_Aint data_sz, - uint64_t cq_data, int dst_rank, int tag, - MPIR_Comm * comm, uint64_t match_bits, - MPIDI_av_entry_t * addr, - int vci_local, int vci_remote, - int sender_nic, int receiver_nic, - MPIR_Request * sreq, - MPL_pointer_attr_t attr, bool do_striping) -{ - int mpi_errno = MPI_SUCCESS; - MPIR_FUNC_ENTER; - - int ctx_idx = MPIDI_OFI_get_ctx_index(vci_local, sender_nic); - - int num_nics; - uint64_t msg_size; - if (do_striping) { - num_nics = MPIDI_OFI_global.num_nics; - msg_size = MPIDI_OFI_STRIPE_CHUNK_SIZE; - } else { - num_nics = 1; - msg_size = MPIDI_OFI_global.max_msg_size; - } - - uint64_t rma_keys[MPIDI_OFI_MAX_NICS]; - struct fid_mr **huge_send_mrs; - huge_send_mrs = - (struct fid_mr **) MPL_malloc((num_nics * sizeof(struct fid_mr *)), MPL_MEM_BUFFER); - if (!MPIDI_OFI_ENABLE_MR_PROV_KEY) { - /* Set up a memory region for the lmt data transfer */ - for (int i = 0; i < num_nics; i++) { - rma_keys[i] = MPIDI_OFI_mr_key_alloc(MPIDI_OFI_LOCAL_MR_KEY, MPIDI_OFI_INVALID_MR_KEY); - MPIR_ERR_CHKANDJUMP(rma_keys[i] == MPIDI_OFI_INVALID_MR_KEY, mpi_errno, - MPI_ERR_OTHER, "**ofid_mr_key"); - } - } else { - /* zero them to avoid warnings */ - for (int i = 0; i < num_nics; i++) { - rma_keys[i] = 0; - } - } - - for (int i = 0; i < num_nics; i++) { - MPIDI_OFI_context_t *ctx = &MPIDI_OFI_global.ctx[MPIDI_OFI_get_ctx_index(vci_local, i)]; - MPIDI_OFI_CALL(fi_mr_reg(ctx->domain, data, data_sz, FI_REMOTE_READ, 0ULL, rma_keys[i], - 0ULL, &huge_send_mrs[i], NULL), mr_reg); - mpi_errno = MPIDI_OFI_mr_bind(MPIDI_OFI_global.prov_use[0], huge_send_mrs[i], ctx->ep, - NULL); - MPIR_ERR_CHECK(mpi_errno); - } - MPIDI_OFI_REQUEST(sreq, huge.send_mrs) = huge_send_mrs; - if (MPIDI_OFI_ENABLE_MR_PROV_KEY) { - /* MR_BASIC */ - for (int i = 0; i < num_nics; i++) { - rma_keys[i] = fi_mr_key(huge_send_mrs[i]); - } - } - - /* Send the maximum amount of data that we can here to get things - * started, then do the rest using the MR below. 
This can be confirmed - * in the MPIDI_OFI_get_huge code where we start the offset at - * MPIDI_OFI_global.max_msg_size */ - sreq->comm = comm; - MPIR_Comm_add_ref(comm); - - /* send ctrl message first */ - MPIDI_OFI_send_control_t ctrl; - ctrl.type = MPIDI_OFI_CTRL_HUGE; - for (int i = 0; i < num_nics; i++) { - ctrl.u.huge.info.rma_keys[i] = rma_keys[i]; - } - ctrl.u.huge.info.comm_id = comm->context_id; - ctrl.u.huge.info.tag = tag; - ctrl.u.huge.info.origin_rank = comm->rank; - ctrl.u.huge.info.vci_src = vci_local; - ctrl.u.huge.info.vci_dst = vci_remote; - ctrl.u.huge.info.send_buf = (void *) data; - ctrl.u.huge.info.msgsize = data_sz; - ctrl.u.huge.info.ackreq = sreq; - - mpi_errno = MPIDI_NM_am_send_hdr(dst_rank, comm, MPIDI_OFI_INTERNAL_HANDLER_CONTROL, - &ctrl, sizeof(ctrl), vci_local, vci_remote); - MPIR_ERR_CHECK(mpi_errno); - - /* send main native message next */ - MPIR_cc_inc(sreq->cc_ptr); - MPIDI_OFI_REQUEST(sreq, event_id) = MPIDI_OFI_EVENT_SEND_HUGE; - - fi_addr_t dest = MPIDI_OFI_av_to_phys(addr, vci_local, sender_nic, vci_remote, receiver_nic); - match_bits |= MPIDI_OFI_HUGE_SEND; /* Add the bit for a huge message */ - MPIDI_OFI_CALL_RETRY(fi_tsenddata(MPIDI_OFI_global.ctx[ctx_idx].tx, - data, msg_size, NULL /* desc */ , - cq_data, dest, match_bits, - (void *) &(MPIDI_OFI_REQUEST(sreq, context))), - vci_local, tsenddata); - /* FIXME: sender_nic may not be the actual nic */ - MPIR_T_PVAR_COUNTER_INC(MULTINIC, nic_sent_bytes_count[sender_nic], msg_size); - MPIR_T_PVAR_COUNTER_INC(MULTINIC, striped_nic_sent_bytes_count[sender_nic], msg_size); - - fn_exit: - return mpi_errno; - fn_fail: - goto fn_exit; -} - -MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_pipeline(const void *buf, MPI_Aint count, - MPI_Datatype datatype, - uint64_t cq_data, int dst_rank, int tag, - MPIR_Comm * comm, uint64_t match_bits, - MPIDI_av_entry_t * addr, - int vci_local, int vci_remote, - int sender_nic, int receiver_nic, - MPIR_Request * sreq, - int dt_contig, size_t data_sz, - MPL_pointer_attr_t attr) -{ - int mpi_errno = MPI_SUCCESS; - MPIR_FUNC_ENTER; - - int ctx_idx = MPIDI_OFI_get_ctx_index(vci_local, sender_nic); - - uint32_t n_chunks = 0; - int chunk_size = MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ; - /* Update correct number of chunks in immediate data. */ - chunk_size = MPIDI_OFI_gpu_pipeline_chunk_size(data_sz); - n_chunks = data_sz / chunk_size; - if (data_sz % chunk_size) - n_chunks++; - MPIDI_OFI_idata_set_gpuchunk_bits(&cq_data, n_chunks); - - /* Update sender packed bit if necessary. */ - uint64_t is_packed = datatype == MPI_PACKED ? 1 : 0; - MPIDI_OFI_idata_set_gpu_packed_bit(&cq_data, is_packed); - MPIR_ERR_CHKANDJUMP(is_packed, mpi_errno, MPI_ERR_OTHER, "**gpu_pipeline_packed"); - - MPIDI_OFI_REQUEST(sreq, event_id) = MPIDI_OFI_EVENT_SEND; - - /* Save pipeline information. 
*/ - MPIDI_OFI_REQUEST(sreq, pipeline_info.chunk_sz) = chunk_size; - MPIDI_OFI_REQUEST(sreq, pipeline_info.cq_data) = cq_data; - MPIDI_OFI_REQUEST(sreq, pipeline_info.remote_addr) = - MPIDI_OFI_av_to_phys(addr, vci_local, sender_nic, vci_remote, receiver_nic); - MPIDI_OFI_REQUEST(sreq, pipeline_info.vci_local) = vci_local; - MPIDI_OFI_REQUEST(sreq, pipeline_info.ctx_idx) = ctx_idx; - MPIDI_OFI_REQUEST(sreq, pipeline_info.match_bits) = match_bits; - MPIDI_OFI_REQUEST(sreq, pipeline_info.data_sz) = data_sz; - - /* send an empty message for tag matching */ - MPIDI_OFI_CALL_RETRY(fi_tinjectdata(MPIDI_OFI_global.ctx[ctx_idx].tx, - NULL, - 0, - cq_data, - MPIDI_OFI_REQUEST(sreq, pipeline_info.remote_addr), - match_bits), vci_local, tinjectdata); - MPIR_T_PVAR_COUNTER_INC(MULTINIC, nic_sent_bytes_count[sender_nic], data_sz); - - MPIDI_OFI_gpu_pending_send_t *send_task = - MPIDI_OFI_create_send_task(sreq, (void *) buf, count, datatype, attr, data_sz, dt_contig); - DL_APPEND(MPIDI_OFI_global.gpu_send_queue, send_task); - MPIDI_OFI_gpu_progress_send(); - - fn_exit: - return mpi_errno; - fn_fail: - goto fn_exit; -} - MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_fallback(const void *buf, MPI_Aint count, MPI_Datatype datatype, int dst_rank, int tag, @@ -434,6 +238,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_fallback(const void *buf, MPI_Aint c MPIR_FUNC_ENTER; MPIDI_OFI_REQUEST_CREATE(*request, MPIR_REQUEST_KIND__SEND, vci_src); + MPIDI_OFI_REQUEST(*request, am_req) = NULL; MPIR_Request *sreq = *request; @@ -491,8 +296,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_fallback(const void *buf, MPI_Aint c goto fn_exit; } -#define EAGER_THRESH (MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD == -1 ? MPIDI_OFI_global.max_msg_size : MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD) - MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI_Datatype datatype, int dst_rank, int tag, MPIR_Comm * comm, int context_offset, MPIDI_av_entry_t * addr, @@ -516,9 +319,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI bool need_mr = false; bool do_inject = false; bool do_iov = false; - bool do_gpu_pipelining = false; - bool do_striping = false; - bool do_huge = false; /* check gpu */ MPL_pointer_attr_t attr; @@ -549,12 +349,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI } } } - - if (need_pack && MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE && - data_sz >= MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD) { - do_gpu_pipelining = true; - need_pack = false; - } } if (MPIR_CVAR_CH4_OFI_ENABLE_INJECT && !syncflag && !is_init && @@ -562,22 +356,9 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI do_inject = true; } - /* check striping path */ - if (MPIDI_OFI_COMM(comm).enable_striping && data_sz >= MPIDI_OFI_global.stripe_threshold) { - syncflag = false; - do_striping = true; - } - - /* check striping path */ - if (!do_striping && data_sz >= MPIDI_OFI_global.max_msg_size) { - syncflag = false; - do_huge = true; - } - /* noncontig? 
try iov or need pack */ if (!need_pack && !dt_contig) { - if (MPIDI_OFI_ENABLE_PT2PT_NOPACK && !do_inject && !do_striping && !do_huge && - !do_gpu_pipelining) { + if (MPIDI_OFI_ENABLE_PT2PT_NOPACK && !do_inject) { MPI_Aint num_contig; MPIR_Typerep_get_iov_len(count, datatype, &num_contig); if (num_contig <= MPIDI_OFI_global.tx_iov_limit) { @@ -626,19 +407,27 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI *request = MPIR_Request_create_complete(MPIR_REQUEST_KIND__SEND); MPIR_ERR_CHECK(mpi_errno); - } else if (!is_am && data_sz > EAGER_THRESH) { - MPIR_Request *sreq = MPIDIG_request_create(MPIR_REQUEST_KIND__SEND, 2, - vci_src /* local */ , vci_dst /* remote */); - MPIR_ERR_CHKANDJUMP(!sreq, mpi_errno, MPI_ERR_OTHER, "**nomemreq"); + } else if (!is_am && data_sz > MPIDI_OFI_EAGER_THRESH) { + /* new pipeline send */ + MPIR_Request *sreq; + MPIDI_OFI_REQUEST_CREATE(sreq, MPIR_REQUEST_KIND__SEND, vci_src); *request = sreq; sreq->comm = comm; MPIR_Comm_add_ref(comm); - MPIDIG_REQUEST(sreq, req->local_vci) = vci_src; - MPIDIG_REQUEST(sreq, req->remote_vci) = vci_dst; - MPIDIG_REQUEST(sreq, buffer) = (void *) buf; - MPIDIG_REQUEST(sreq, count) = count; - MPIDIG_REQUEST(sreq, datatype) = datatype; - MPIDIG_REQUEST(sreq, u.send.dest) = dst_rank; + + MPIDI_OFI_rndv_common_t *p = &MPIDI_OFI_AMREQ_COMMON(sreq); + p->buf = buf; + p->count = count; + p->datatype = datatype; + p->need_pack = MPIDI_OFI_rndv_need_pack(dt_contig, &attr); + p->attr = attr; + p->data_sz = data_sz; + p->vci_local = vci_src; + p->vci_remote = vci_dst; + p->av = addr; + p->remote_rank = dst_rank; + /* match_bits will be set at receiving CTS */ + MPIR_Datatype_add_ref_if_not_builtin(datatype); mpi_errno = MPIDI_OFI_issue_ack_recv(sreq, comm, context_offset, dst_rank, tag, addr, @@ -646,7 +435,11 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI sizeof(MPIDI_OFI_ack_request_t)); MPIR_ERR_CHECK(mpi_errno); /* inject a zero-size message with MPIDI_OFI_RNDV_SEND in match_bits */ - match_bits |= MPIDI_OFI_RNDV_SEND; + if (p->need_pack) { + match_bits |= MPIDI_OFI_RNDV_PACK; + } else { + match_bits |= MPIDI_OFI_RNDV_SEND; + } MPIDI_OFI_idata_set_size(&cq_data, data_sz); /* optionally use cq_data to carry data_sz */ mpi_errno = MPIDI_OFI_send_lightweight(NULL, 0, cq_data, dst_rank, tag, comm, match_bits, addr, @@ -655,6 +448,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI } else { /* normal path */ MPIDI_OFI_REQUEST_CREATE(*request, MPIR_REQUEST_KIND__SEND, vci_src); + MPIDI_OFI_REQUEST(*request, am_req) = NULL; MPIR_Request *sreq = *request; if (syncflag) { @@ -687,31 +481,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI MPIDI_OFI_REQUEST(sreq, noncontig.pack.pack_buffer) = NULL; } - if (do_gpu_pipelining) { - mpi_errno = MPIDI_OFI_send_pipeline(buf, count, datatype, cq_data, dst_rank, tag, comm, - match_bits, addr, vci_src, vci_dst, - sender_nic, receiver_nic, - *request, dt_contig, data_sz, attr); - MPIR_ERR_CHECK(mpi_errno); - goto fn_exit; - } - - if (do_huge) { - mpi_errno = MPIDI_OFI_send_huge(data, data_sz, cq_data, dst_rank, tag, comm, - match_bits, addr, vci_src, vci_dst, - sender_nic, receiver_nic, *request, attr, false); - MPIR_ERR_CHECK(mpi_errno); - goto fn_exit; - } - - if (do_striping) { - mpi_errno = MPIDI_OFI_send_huge(data, data_sz, cq_data, dst_rank, tag, comm, - match_bits, addr, vci_src, vci_dst, - sender_nic, receiver_nic, *request, attr, true); - 
MPIR_ERR_CHECK(mpi_errno); - goto fn_exit; - } - - /* NOTE: all previous send modes contain sync semantics already */ if (do_iov) { @@ -782,7 +551,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_mpi_isend(const void *buf, MPI_Aint count, mpi_errno = MPIDI_OFI_send_fallback(buf, count, datatype, rank, tag, comm, context_offset, addr, vci_src, vci_dst, request); MPIR_ERR_CHECK(mpi_errno); - MPIDI_OFI_REQUEST(*request, am_req) = NULL; } else { bool syncflag = (bool) MPIR_PT2PT_ATTR_GET_SYNCFLAG(attr); bool is_init = (bool) MPIR_PT2PT_ATTR_GET_INITFLAG(attr); @@ -790,7 +558,6 @@ context_offset, addr, vci_src, vci_dst, request, false /* is_am */ , syncflag, is_init); MPIR_ERR_CHECK(mpi_errno); - MPIDI_OFI_REQUEST(*request, am_req) = NULL; } fn_exit: diff --git a/src/mpid/ch4/netmod/ofi/ofi_types.h b/src/mpid/ch4/netmod/ofi/ofi_types.h index e3c3d464ae5..2fe1bdf4cf4 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_types.h +++ b/src/mpid/ch4/netmod/ofi/ofi_types.h @@ -98,18 +98,18 @@ static inline uint32_t MPIDI_OFI_idata_get_gpuchunk_bits(uint64_t idata) return (idata >> MPIDI_OFI_IDATA_GPUCHUNK_OFFSET); } -#define MPIDI_OFI_PROTOCOL_BITS (6) +#define MPIDI_OFI_PROTOCOL_BITS (5) /* define protocol bits without MPIDI_OFI_PROTOCOL_SHIFT */ +/* The first 3 bits define separate tag spaces */ #define MPIDI_OFI_ACK_SEND_0 1ULL #define MPIDI_OFI_DYNPROC_SEND_0 2ULL -#define MPIDI_OFI_GPU_PIPELINE_SEND_0 4ULL -#define MPIDI_OFI_AM_SEND_0 32ULL -/* the above defines separate tag spaces */ +#define MPIDI_OFI_AM_SEND_0 4ULL +/* the next 2 bits define 3 meta values */ #define MPIDI_OFI_SYNC_SEND_0 8ULL -#define MPIDI_OFI_HUGE_SEND_0 16ULL -#define MPIDI_OFI_RNDV_SEND_0 24ULL +#define MPIDI_OFI_RNDV_SEND_0 16ULL +#define MPIDI_OFI_RNDV_PACK_0 24ULL /* sender requires packing, thus only supports pipeline or rndv write */ /* these two are really tag-carried metadata, thus must be masked on receive */ -#define MPIDI_OFI_PROTOCOL_MASK_0 (MPIDI_OFI_SYNC_SEND_0 | MPIDI_OFI_HUGE_SEND_0) +#define MPIDI_OFI_PROTOCOL_MASK_0 (MPIDI_OFI_SYNC_SEND_0 | MPIDI_OFI_RNDV_SEND_0) /* Define constants for default bits allocation. The actual bits are defined in * ofi_capability_sets.h, which may use these defaults or define its own. 
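(Illustrative sketch, not part of the patch: the hunk above packs three mutually exclusive tag spaces into the low 3 protocol bits and three meta values into the top 2 bits. The snippet below checks how the shifted constants compose and why one mask suffices on receive; the shift value 52 is a stand-in, since the real MPIDI_OFI_PROTOCOL_SHIFT depends on the active context/source/tag bit allocation.)

#include <stdint.h>
#include <assert.h>

#define SHIFT 52ULL                    /* stand-in for MPIDI_OFI_PROTOCOL_SHIFT */
#define SYNC_SEND (8ULL << SHIFT)      /* meta: sync send */
#define RNDV_SEND (16ULL << SHIFT)     /* meta: rendezvous */
#define RNDV_PACK (24ULL << SHIFT)     /* meta: rendezvous + sender packs */
#define PROTOCOL_MASK ((8ULL | 16ULL) << SHIFT)

int main(void)
{
    uint64_t tag_bits = 0x1234;                 /* context/source/tag payload */
    uint64_t match_bits = tag_bits | RNDV_PACK; /* a packed rendezvous send */

    /* RNDV_PACK (24) sets both meta bits, so masking SYNC|RNDV strips all
     * three meta values and the receiver matches on the tag payload alone */
    assert(RNDV_PACK == (SYNC_SEND | RNDV_SEND));
    assert((match_bits & ~PROTOCOL_MASK) == tag_bits);
    return 0;
}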
@@ -129,11 +129,10 @@ static inline uint32_t MPIDI_OFI_idata_get_gpuchunk_bits(uint64_t idata) #define MPIDI_OFI_PROTOCOL_SHIFT (MPIDI_OFI_CONTEXT_BITS + MPIDI_OFI_SOURCE_BITS + MPIDI_OFI_TAG_BITS) #define MPIDI_OFI_ACK_SEND (MPIDI_OFI_ACK_SEND_0 << MPIDI_OFI_PROTOCOL_SHIFT) #define MPIDI_OFI_DYNPROC_SEND (MPIDI_OFI_DYNPROC_SEND_0 << MPIDI_OFI_PROTOCOL_SHIFT) -#define MPIDI_OFI_GPU_PIPELINE_SEND (MPIDI_OFI_GPU_PIPELINE_SEND_0 << MPIDI_OFI_PROTOCOL_SHIFT) -#define MPIDI_OFI_SYNC_SEND (MPIDI_OFI_SYNC_SEND_0 << MPIDI_OFI_PROTOCOL_SHIFT) -#define MPIDI_OFI_HUGE_SEND (MPIDI_OFI_HUGE_SEND_0 << MPIDI_OFI_PROTOCOL_SHIFT) #define MPIDI_OFI_AM_SEND (MPIDI_OFI_AM_SEND_0 << MPIDI_OFI_PROTOCOL_SHIFT) +#define MPIDI_OFI_SYNC_SEND (MPIDI_OFI_SYNC_SEND_0 << MPIDI_OFI_PROTOCOL_SHIFT) #define MPIDI_OFI_RNDV_SEND (MPIDI_OFI_RNDV_SEND_0 << MPIDI_OFI_PROTOCOL_SHIFT) +#define MPIDI_OFI_RNDV_PACK (MPIDI_OFI_RNDV_PACK_0 << MPIDI_OFI_PROTOCOL_SHIFT) #define MPIDI_OFI_PROTOCOL_MASK (MPIDI_OFI_PROTOCOL_MASK_0 << MPIDI_OFI_PROTOCOL_SHIFT) #define MPIDI_OFI_CONTEXT_MASK (((1ULL << MPIDI_OFI_CONTEXT_BITS) - 1) << (MPIDI_OFI_SOURCE_BITS + MPIDI_OFI_TAG_BITS)) @@ -168,7 +167,7 @@ static inline uint32_t MPIDI_OFI_idata_get_gpuchunk_bits(uint64_t idata) #define MPIDI_OFI_AMREQUEST(req,field) ((req)->dev.ch4.am.netmod_am.ofi.field) #define MPIDI_OFI_AM_SREQ_HDR(req,field) ((req)->dev.ch4.am.netmod_am.ofi.sreq_hdr->field) #define MPIDI_OFI_AM_RREQ_HDR(req,field) ((req)->dev.ch4.am.netmod_am.ofi.rreq_hdr->field) -#define MPIDI_OFI_REQUEST(req,field) ((req)->dev.ch4.netmod.ofi.field) +#define MPIDI_OFI_REQUEST(req,field) ((req)->dev.ch4.netmod.ofi.direct.field) #define MPIDI_OFI_AV(av) ((av)->netmod.ofi) #define MPIDI_OFI_COMM(comm) ((comm)->dev.ch4.netmod.ofi) @@ -188,26 +187,28 @@ enum { MPIDI_OFI_EVENT_ABORT, MPIDI_OFI_EVENT_SEND, MPIDI_OFI_EVENT_RECV, - MPIDI_OFI_EVENT_SEND_GPU_PIPELINE, - MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT, - MPIDI_OFI_EVENT_RECV_GPU_PIPELINE, MPIDI_OFI_EVENT_AM_SEND, MPIDI_OFI_EVENT_AM_SEND_RDMA, MPIDI_OFI_EVENT_AM_SEND_PIPELINE, MPIDI_OFI_EVENT_AM_RECV, MPIDI_OFI_EVENT_AM_READ, MPIDI_OFI_EVENT_PEEK, - MPIDI_OFI_EVENT_RECV_HUGE, MPIDI_OFI_EVENT_RECV_PACK, MPIDI_OFI_EVENT_RECV_NOPACK, - MPIDI_OFI_EVENT_SEND_HUGE, MPIDI_OFI_EVENT_SEND_PACK, MPIDI_OFI_EVENT_SEND_NOPACK, MPIDI_OFI_EVENT_SSEND_ACK, MPIDI_OFI_EVENT_RNDV_CTS, - MPIDI_OFI_EVENT_GET_HUGE, + MPIDI_OFI_EVENT_PIPELINE_SEND_CHUNK, + MPIDI_OFI_EVENT_PIPELINE_RECV_CHUNK, + MPIDI_OFI_EVENT_PIPELINE_RECV_DATASIZE, + MPIDI_OFI_EVENT_RNDVREAD_RECV_MRS, + MPIDI_OFI_EVENT_RNDVREAD_READ_CHUNK, + MPIDI_OFI_EVENT_RNDVREAD_ACK, + MPIDI_OFI_EVENT_RNDVWRITE_RECV_MRS, + MPIDI_OFI_EVENT_RNDVWRITE_WRITE_CHUNK, + MPIDI_OFI_EVENT_RNDVWRITE_ACK, MPIDI_OFI_EVENT_CHUNK_DONE, - MPIDI_OFI_EVENT_HUGE_CHUNK_DONE, MPIDI_OFI_EVENT_INJECT_EMU, MPIDI_OFI_EVENT_DYNPROC_DONE, }; @@ -301,6 +302,7 @@ typedef struct { MPIDI_OFI_am_repost_request_t am_reqs[MPIDI_OFI_MAX_NUM_AM_BUFFERS]; MPIDU_genq_private_pool_t am_hdr_buf_pool; + MPIDU_genq_private_pool_t pipeline_pool; /* Queue to store deferred am sends */ MPIDI_OFI_deferred_am_isend_req_t *deferred_am_isend_q; @@ -325,12 +327,6 @@ typedef struct { int cq_buffered_static_tail; MPIDI_OFI_cq_list_t *cq_buffered_dynamic_head, *cq_buffered_dynamic_tail; - /* queues to matching huge recv and control message */ - struct MPIDI_OFI_huge_recv_list *huge_ctrl_head; - struct MPIDI_OFI_huge_recv_list *huge_ctrl_tail; - struct MPIDI_OFI_huge_recv_list *huge_recv_head; - struct MPIDI_OFI_huge_recv_list *huge_recv_tail; - char pad 
MPL_ATTR_ALIGNED(MPL_CACHELINE_SIZE); } MPIDI_OFI_per_vci_t; @@ -511,14 +507,6 @@ typedef struct { /* stores the maximum of the most recently used regular memory region key */ uint64_t global_max_regular_mr_key; - /* GPU pipeline */ - MPIDU_genq_private_pool_t gpu_pipeline_send_pool; - MPIDU_genq_private_pool_t gpu_pipeline_recv_pool; - MPIDI_OFI_gpu_task_t *gpu_send_task_queue[MPIDI_CH4_MAX_VCIS]; - MPIDI_OFI_gpu_task_t *gpu_recv_task_queue[MPIDI_CH4_MAX_VCIS]; - MPIDI_OFI_gpu_pending_recv_t *gpu_recv_queue; - MPIDI_OFI_gpu_pending_send_t *gpu_send_queue; - int addrnamelen; /* OFI uses the same name length within a provider. */ /* To support dynamic av tables, we need a way to tell which entries are empty. * ch4 av tables are initialized to 0s. Thus we need to know which "0" is valid. */ @@ -530,30 +518,6 @@ typedef struct { #endif } MPIDI_OFI_global_t; -typedef struct { - int comm_id; - int origin_rank; - int tag; - MPIR_Request *ackreq; - void *send_buf; - size_t msgsize; - uint64_t rma_keys[MPIDI_OFI_MAX_NICS]; - int vci_src; - int vci_dst; -} MPIDI_OFI_huge_remote_info_t; - -typedef struct { - int16_t type; - union { - struct { - MPIDI_OFI_huge_remote_info_t info; - } huge; - struct { - MPIR_Request *ackreq; - } huge_ack; - } u; -} MPIDI_OFI_send_control_t; - typedef struct MPIDI_OFI_win_acc_hint { uint64_t dtypes_max_count[MPIDI_OFI_DT_MAX]; /* translate CH4 which_accumulate_ops hints to * atomicity support of all OFI datatypes. A datatype @@ -646,38 +610,17 @@ typedef struct MPIDI_OFI_target_mr { uint64_t mr_key; } MPIDI_OFI_target_mr_t; -typedef struct MPIDI_OFI_read_chunk { - char pad[MPIDI_REQUEST_HDR_SIZE]; - struct fi_context context[MPIDI_OFI_CONTEXT_STRUCTS]; /* fixed field, do not move */ - int event_id; /* fixed field, do not move */ - MPIR_Request *localreq; - MPIR_cc_t *chunks_outstanding; -} MPIDI_OFI_read_chunk_t; - -/* The list of posted huge receives that haven't been matched yet. These need - * to get matched up when handling the control message that starts transferring - * data from the remote memory region and we need a way of matching up the - * control messages with the "real" requests. 
*/ -typedef struct MPIDI_OFI_huge_recv_list { - int comm_id; - int rank; - int tag; - union { - MPIDI_OFI_huge_remote_info_t *info; /* ctrl list */ - MPIR_Request *rreq; /* recv list */ - } u; - struct MPIDI_OFI_huge_recv_list *next; -} MPIDI_OFI_huge_recv_list_t; - /* Externs */ extern MPIDI_OFI_global_t MPIDI_OFI_global; extern MPIDI_OFI_capabilities_t MPIDI_OFI_caps_list[MPIDI_OFI_NUM_SETS]; +#define MPIDI_OFI_CAN_SEND_CQ_DATASIZE(data_sz) (MPIDI_OFI_global.cq_data_size == 8 && (data_sz) <= INT32_MAX) + static inline void MPIDI_OFI_idata_set_size(uint64_t * data_field, MPI_Aint data_sz) { *data_field &= 0xffffffff; - if (MPIDI_OFI_global.cq_data_size == 8 && data_sz <= INT32_MAX) { + if (MPIDI_OFI_CAN_SEND_CQ_DATASIZE(data_sz)) { *data_field |= ((uint64_t) data_sz << 32); } } diff --git a/src/mpid/ch4/netmod/ofi/ofi_vci.c b/src/mpid/ch4/netmod/ofi/ofi_vci.c index 8cc176fbb77..7000054e93f 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_vci.c +++ b/src/mpid/ch4/netmod/ofi/ofi_vci.c @@ -65,8 +65,7 @@ int MPIDI_OFI_comm_set_vcis(MPIR_Comm * comm, int num_implicit, int num_reserved MPIR_ERR_CHECK(mpi_errno); for (int vci = 1; vci < MPIDI_OFI_global.num_vcis; vci++) { - MPIDI_OFI_am_init(vci); - MPIDI_OFI_am_post_recv(vci, 0); + MPIDI_OFI_init_per_vci(vci); } if (MPIR_CVAR_DEBUG_SUMMARY && comm->rank == 0) { diff --git a/src/mpid/ch4/netmod/ofi/util.c b/src/mpid/ch4/netmod/ofi/util.c index 115cb63a3b8..5253cc9b9b9 100644 --- a/src/mpid/ch4/netmod/ofi/util.c +++ b/src/mpid/ch4/netmod/ofi/util.c @@ -174,41 +174,6 @@ void MPIDI_OFI_mr_key_allocator_destroy(void) MPID_THREAD_CS_EXIT(VCI, mr_key_allocator_lock); } -int MPIDI_OFI_control_handler(void *am_hdr, void *data, MPI_Aint data_sz, - uint32_t attr, MPIR_Request ** req) -{ - int mpi_errno = MPI_SUCCESS; - MPIDI_OFI_send_control_t *ctrlsend = (MPIDI_OFI_send_control_t *) am_hdr; - - if (attr & MPIDIG_AM_ATTR__IS_ASYNC) { - *req = NULL; - } - - int local_vci = MPIDIG_AM_ATTR_DST_VCI(attr); - MPIR_AssertDeclValue(int remote_vci, MPIDIG_AM_ATTR_SRC_VCI(attr)); - switch (ctrlsend->type) { - case MPIDI_OFI_CTRL_HUGEACK: - mpi_errno = MPIDI_OFI_dispatch_function(local_vci, NULL, ctrlsend->u.huge_ack.ackreq); - break; - - case MPIDI_OFI_CTRL_HUGE: - MPIR_Assert(local_vci == ctrlsend->u.huge.info.vci_dst); - MPIR_Assert(remote_vci == ctrlsend->u.huge.info.vci_src); - mpi_errno = MPIDI_OFI_recv_huge_control(local_vci, - ctrlsend->u.huge.info.comm_id, - ctrlsend->u.huge.info.origin_rank, - ctrlsend->u.huge.info.tag, - &(ctrlsend->u.huge.info)); - break; - - default: - fprintf(stderr, "Bad control type: 0x%08x %d\n", ctrlsend->type, ctrlsend->type); - MPIR_Assert(0); - } - - return mpi_errno; -} - static bool check_mpi_acc_valid(MPI_Datatype dtype, MPI_Op op) { bool valid_flag = false; diff --git a/src/mpid/ch4/netmod/ucx/ucx_am.h b/src/mpid/ch4/netmod/ucx/ucx_am.h index 57d2404088e..ecfcb295401 100644 --- a/src/mpid/ch4/netmod/ucx/ucx_am.h +++ b/src/mpid/ch4/netmod/ucx/ucx_am.h @@ -272,7 +272,7 @@ MPL_STATIC_INLINE_PREFIX bool MPIDI_NM_am_check_eager(MPI_Aint am_hdr_sz, MPI_Ai #endif } -MPL_STATIC_INLINE_PREFIX bool MPIDI_NM_am_can_do_tag(void) +MPL_STATIC_INLINE_PREFIX bool MPIDI_NM_am_can_do_tag(MPIR_Request * rreq) { return true; } diff --git a/src/mpid/ch4/shm/ipc/gpu/gpu_post.c b/src/mpid/ch4/shm/ipc/gpu/gpu_post.c index 1f23ae06d32..813659a49c7 100644 --- a/src/mpid/ch4/shm/ipc/gpu/gpu_post.c +++ b/src/mpid/ch4/shm/ipc/gpu/gpu_post.c @@ -603,21 +603,7 @@ static int gpu_ipc_async_poll(MPIX_Async_thing thing) int is_done = 0; struct gpu_ipc_async *p = 
MPIR_Async_thing_get_state(thing); - switch (p->yreq.type) { - case MPIR_NULL_REQUEST: - /* a dummy, immediately complete */ - is_done = 1; - break; - case MPIR_TYPEREP_REQUEST: - MPIR_Typerep_test(p->yreq.u.y_req, &is_done); - break; - case MPIR_GPU_REQUEST: - err = MPL_gpu_test(&p->yreq.u.gpu_req, &is_done); - MPIR_Assertp(err == MPL_SUCCESS); - break; - default: - MPIR_Assert(0); - } + MPIR_async_test(&(p->yreq), &is_done); if (is_done) { int vci = MPIDIG_REQUEST(p->req, req->local_vci); diff --git a/src/mpid/ch4/shm/src/shm_am.h b/src/mpid/ch4/shm/src/shm_am.h index 6faf22d6a07..c313f3daafd 100644 --- a/src/mpid/ch4/shm/src/shm_am.h +++ b/src/mpid/ch4/shm/src/shm_am.h @@ -117,7 +117,7 @@ MPL_STATIC_INLINE_PREFIX bool MPIDI_SHM_am_check_eager(MPI_Aint am_hdr_sz, MPI_A return (am_hdr_sz + data_sz) <= MPIDI_POSIX_am_eager_limit(); } -MPL_STATIC_INLINE_PREFIX bool MPIDI_SHM_am_can_do_tag(void) +MPL_STATIC_INLINE_PREFIX bool MPIDI_SHM_am_can_do_tag(MPIR_Request * rreq) { return false; } diff --git a/src/mpid/ch4/src/mpidig.h b/src/mpid/ch4/src/mpidig.h index e889502e9ca..850c8c19b04 100644 --- a/src/mpid/ch4/src/mpidig.h +++ b/src/mpid/ch4/src/mpidig.h @@ -62,7 +62,6 @@ enum { MPIDI_IPC_ACK, MPIDI_IPC_WRITE, - MPIDI_OFI_INTERNAL_HANDLER_CONTROL, MPIDI_OFI_AM_RDMA_READ_ACK, MPIDI_OFI_RNDV_INFO, diff --git a/src/mpid/ch4/src/mpidig_pt2pt_callbacks.c b/src/mpid/ch4/src/mpidig_pt2pt_callbacks.c index 3b524639d65..5407c33ac0a 100644 --- a/src/mpid/ch4/src/mpidig_pt2pt_callbacks.c +++ b/src/mpid/ch4/src/mpidig_pt2pt_callbacks.c @@ -13,9 +13,10 @@ static int recv_target_cmpl_cb(MPIR_Request * rreq); static int can_do_tag(MPIR_Request * rreq) { #ifdef MPIDI_CH4_DIRECT_NETMOD - return MPIDI_NM_am_can_do_tag(); + return MPIDI_NM_am_can_do_tag(rreq); #else - return MPIDI_REQUEST(rreq, is_local) ? MPIDI_SHM_am_can_do_tag() : MPIDI_NM_am_can_do_tag(); + return MPIDI_REQUEST(rreq, is_local) ? + MPIDI_SHM_am_can_do_tag(rreq) : MPIDI_NM_am_can_do_tag(rreq); #endif } diff --git a/test/mpi/bench/Makefile.am b/test/mpi/bench/Makefile.am index e2e8d81e567..471acd0ee32 100644 --- a/test/mpi/bench/Makefile.am +++ b/test/mpi/bench/Makefile.am @@ -15,6 +15,7 @@ noinst_PROGRAMS = \ get_bw \ put_bw \ p2p_one \ + p2p_self \ barrier \ bcast diff --git a/test/mpi/bench/macros/bench_frame.def b/test/mpi/bench/macros/bench_frame.def index a0567d0f6db..78b2ebd20c5 100644 --- a/test/mpi/bench/macros/bench_frame.def +++ b/test/mpi/bench/macros/bench_frame.def @@ -23,10 +23,8 @@ subcode: bench_frame $global grank, gsize: int MPI_Comm_rank(MPI_COMM_WORLD, &grank); MPI_Comm_size(MPI_COMM_WORLD, &gsize); - $(if:MIN_PROCS) - $if gsize < $(MIN_PROCS) - printf("! Test $(_pagename) requires $(MIN_PROCS) processes !\n"); - return 1 + + $call check_launch MPI_Comm comm = MPI_COMM_WORLD; @@ -60,6 +58,22 @@ subcode: bench_frame MTest_Finalize(0); $(else) MPI_Finalize(); + # ----- + subcode: check_launch + $(if:MIN_PROCS) + $if gsize < $(MIN_PROCS) + printf("! Test $(_pagename) requires $(MIN_PROCS) processes !\n"); + return 1 + $(if:MEM_TYPES=sendrecv) + $if gsize % 2 == 1 + printf("! 
Test $(_pagename) requires an even number of processes to form even/odd pairs !\n"); + return 1 + $global gsrc, gdst: int + $if grank % 2 == 0 + gsrc = grank + gdst = grank + 1 + $else + gsrc = grank - 1 + gdst = grank macros: use_double: 1 diff --git a/test/mpi/bench/macros/bench_p2p.def b/test/mpi/bench/macros/bench_p2p.def index 08c302cf847..524c7a4eae5 100644 --- a/test/mpi/bench/macros/bench_p2p.def +++ b/test/mpi/bench/macros/bench_p2p.def @@ -15,7 +15,6 @@ */ macros: - MIN_PROCS: 2 MEM_TYPES: sendrecv subcode: _autoload diff --git a/test/mpi/bench/macros/mtest.def b/test/mpi/bench/macros/mtest.def index 6eebb85d09b..757ca7cb84c 100644 --- a/test/mpi/bench/macros/mtest.def +++ b/test/mpi/bench/macros/mtest.def @@ -4,10 +4,10 @@ macros: subcode: mtest_malloc(size) MTestArgList *head = MTestArgListCreate(argc, argv) $(if:MEM_TYPES=sendrecv) - int send_rank = 0, recv_rank = 1; - $(for:a in send,recv) - $if grank == $(a)_rank - $call alloc_mem_dev, $(a)mem, $(a)dev + $if grank == gsrc + $call alloc_mem_dev, sendmem, senddev + $elif grank == gdst + $call alloc_mem_dev, recvmem, recvdev $(else) # all procs allocating the same memory types $call alloc_mem_dev, memtype, device diff --git a/test/mpi/bench/p2p_one.def b/test/mpi/bench/p2p_one.def index af0fbf93cf4..6cf215da888 100644 --- a/test/mpi/bench/p2p_one.def +++ b/test/mpi/bench/p2p_one.def @@ -6,7 +6,6 @@ include: macros/mtest.def */ macros: - MIN_PROCS: 2 MEM_TYPES: sendrecv subcode: _autoload @@ -27,27 +26,28 @@ page: p2p_one, bench_frame MAX_BUFSIZE: 1000000000 data: buf, size, MPI_CHAR - int rank; - MPI_Comm_rank(comm, &rank) - - src = 0 - dst = 1 size = MAX_BUFSIZE /* Repeat a few times to see variations or the need to warm up */ $for iter=0:5 - $if rank == src + $if grank == gsrc tf_start = MPI_Wtime() $call @send_side - MPI_Recv(NULL, 0, MPI_DATATYPE_NULL, dst, TAG, comm, MPI_STATUS_IGNORE) + $call recv_sync, gdst tf_latency = MPI_Wtime() - tf_start $call report_one - $elif rank == dst + $elif grank == gdst $call @recv_side - MPI_Send(NULL, 0, MPI_DATATYPE_NULL, src, TAG, comm) + $call send_sync, gsrc subcode: send_side - MPI_Send($(data), dst, TAG, comm) + MPI_Send($(data), gdst, TAG, comm) subcode: recv_side - MPI_Recv($(data), src, TAG, comm, MPI_STATUS_IGNORE) + MPI_Recv($(data), gsrc, TAG, comm, MPI_STATUS_IGNORE) + + subcode: send_sync(rank) + MPI_Send(NULL, 0, MPI_DATATYPE_NULL, $(rank), TAG, comm) + + subcode: recv_sync(rank) + MPI_Recv(NULL, 0, MPI_DATATYPE_NULL, $(rank), TAG, comm, MPI_STATUS_IGNORE) diff --git a/test/mpi/bench/p2p_self.def b/test/mpi/bench/p2p_self.def new file mode 100644 index 00000000000..03824222ad1 --- /dev/null +++ b/test/mpi/bench/p2p_self.def @@ -0,0 +1,74 @@ +include: macros/bench_frame.def +include: macros/mtest.def + +/* Similar to p2p_one, but measures the bandwidth of self messaging. + * The results should be comparable to a memcpy bandwidth. 
+ */ + +subcode: report_header + printf("%12s %10s %12s\n", "msgsize", "latency(sec)", "bandwidth(GB/s)"); + +subcode: report_one + tf_bw = size / tf_latency / 1e9 + printf("%12d %10.3f %12.3f\n", size, tf_latency, tf_bw); + +page: p2p_self, self_frame + MAX_BUFSIZE: 1000000000 + + $call allocate_buffers + + comm = MPI_COMM_SELF + int tag = 0; + int size = $(MAX_BUFSIZE); + + MPI_Request reqs[2]; + $for iter=0:5 + tf_start = MPI_Wtime() + MPI_Isend(send_buf, size, MPI_CHAR, 0, tag, comm, &reqs[0]) + MPI_Irecv(recv_buf, size, MPI_CHAR, 0, tag, comm, &reqs[1]) + MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE) + tf_latency = MPI_Wtime() - tf_start + $call report_one + +subcode: self_frame + $include stdio + $include stdlib + $(if:HAS_MTEST) + $include mpitest.h + $(else) + $include mpi + + $function main + $(if:HAS_MTEST) + MTest_Init(NULL, NULL); + $(else) + MPI_Init(NULL, NULL); + + printf("TEST $(_pagename):\n") + $call @report_header + $call main + printf("\n") + + $(if:HAS_MTEST) + MTest_Finalize(0); + $(else) + MPI_Finalize(); + +subcode: allocate_buffers + $my send_buf, recv_buf: void * + $(set:size=MAX_BUFSIZE) + $(if:HAS_MTEST) + MTestArgList *head = MTestArgListCreate(argc, argv) + $my mtest_mem_type_e memtype, int device + $(for:send, recv) + memtype = MTestArgListGetMemType(head, "$1mem") + device = MTestArgListGetInt_with_default(head, "$1dev", 0) + MTestMalloc($(size), memtype, NULL, &$1_buf, device) + MTestArgListDestroy(head) + $(else) + send_buf = malloc($(size)) + recv_buf = malloc($(size)) + $if !send_buf || !recv_buf + printf("! Failed to allocate buffers (size=%d)\n", MAX_BUFSIZE) + return 1 + diff --git a/test/mpi/errors/pt2pt/testlist b/test/mpi/errors/pt2pt/testlist index eca0402fe21..89aa8841e16 100644 --- a/test/mpi/errors/pt2pt/testlist +++ b/test/mpi/errors/pt2pt/testlist @@ -1,6 +1,9 @@ proberank 1 truncmsg1 2 -truncmsg1 2 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 +truncmsg1 2 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 +truncmsg1 2 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=pipeline +truncmsg1 2 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=read +truncmsg1 2 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=write truncmsg2 2 truncmsg_mrecv 2 # multiple completion tests diff --git a/test/mpi/maint/jenkins/xfail.conf b/test/mpi/maint/jenkins/xfail.conf index da0bafb9ede..5184dbd2be0 100644 --- a/test/mpi/maint/jenkins/xfail.conf +++ b/test/mpi/maint/jenkins/xfail.conf @@ -116,8 +116,5 @@ mpich-.*-arm.* * * * * /^reduce 10/ xfail=ticket0 coll/testlist. 
* * * * * /^allgather_gpu/ xfail=ticket6657 coll/testlist.gpu * * * * * /^allgatherv_gpu/ xfail=ticket6657 coll/testlist.gpu -# GPU pipelining requires provider cq_data_sz > 8 which isn't met by psm3 -* * * ch4:ofi * /^.*MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1.*/ xfail=ticket0 pt2pt/testlist.gpu - # F08 profiling test only works with dynamic linking * * noshared * * /^profile1f90/ xfail=ticket0 f08/profile/testlist diff --git a/test/mpi/pt2pt/testlist.gpu b/test/mpi/pt2pt/testlist.gpu index 57640705ac3..103030fd7c7 100644 --- a/test/mpi/pt2pt/testlist.gpu +++ b/test/mpi/pt2pt/testlist.gpu @@ -18,5 +18,5 @@ sendrecv1 2 arg=-typelist=MPI_INT,MPI_INT:4+MPI_DOUBLE:8 arg=-counts=1,17,50,100 sendrecv1 2 arg=-typelist=MPI_INT,MPI_INT:4+MPI_DOUBLE:8 arg=-counts=1,17,50,100,512,65530 arg=-seed=200 arg=-testsizes=8,100 arg=-sendmem=shared arg=-recvmem=host sendrecv1 2 arg=-typelist=MPI_INT,MPI_INT:4+MPI_DOUBLE:8 arg=-counts=1,17,50,100,512,65530 arg=-seed=200 arg=-testsizes=8,100 arg=-sendmem=shared arg=-recvmem=reg_host sendrecv1 2 arg=-typelist=MPI_INT,MPI_INT:4+MPI_DOUBLE:8 arg=-counts=1,17,50,100,512,65530 arg=-seed=200 arg=-testsizes=8,100 arg=-sendmem=shared arg=-recvmem=shared -sendrecv1 2 arg=-typelist=MPI_INT,MPI_INT:4+MPI_DOUBLE:8 arg=-counts=1,17,50,100,512,65530 arg=-seed=200 arg=-testsizes=8 arg=-sendmem=device arg=-recvmem=device env=MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 timeLimit=600 -pipeline 2 env=MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 env=MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=4 env=MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=4 env=MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 env=MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +sendrecv1 2 arg=-typelist=MPI_INT,MPI_INT:4+MPI_DOUBLE:8 arg=-counts=1,17,50,100,512,65530 arg=-seed=200 arg=-testsizes=8 arg=-sendmem=device arg=-recvmem=device env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=100000 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=pipeline timeLimit=600 +pipeline 2 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=131072 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=pipeline env=MPIR_CVAR_CH4_OFI_PIPELINE_NUM_CHUNKS=4 env=MPIR_CVAR_CH4_OFI_PIPELINE_CHUNK_SZ=1048576 diff --git a/test/mpi/pt2pt/testlist.in b/test/mpi/pt2pt/testlist.in index fa98c1ba95d..e4043924c06 100644 --- a/test/mpi/pt2pt/testlist.in +++ b/test/mpi/pt2pt/testlist.in @@ -35,10 +35,13 @@ rqstatus 2 rqfree 4 resultTest=TestAllowWarnings env=MPIR_CVAR_FINALIZE_WAIT=1 greq1 1 probe_unexp 4 -probe_unexp 4 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 +probe_unexp 4 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 +probe_unexp 4 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=pipeline +probe_unexp 4 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=read +probe_unexp 4 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=write probe_unexp 4 env=MPIR_CVAR_CH4_OFI_AM_LONG_FORCE_PIPELINE=true probenull 1 -probenull 1 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 +probenull 1 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 probenull 1 env=MPIR_CVAR_CH4_OFI_AM_LONG_FORCE_PIPELINE=true recvnull 1 # For testing, scancel will run with 1 process as well @@ -57,13 +60,19 @@ waitany_null 1 # perhaps disable in the release tarball large_message 3 mem=6.5 mprobe 2 -mprobe 2 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 +mprobe 2 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 +mprobe 2 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=pipeline +mprobe 2 
env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=read +mprobe 2 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=write mprobe 2 env=MPIR_CVAR_CH4_OFI_AM_LONG_FORCE_PIPELINE=true big_count_status 1 many_isend 3 manylmt 2 huge_underflow 2 -huge_underflow 2 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 +huge_underflow 2 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 +huge_underflow 2 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=pipeline +huge_underflow 2 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=read +huge_underflow 2 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=write huge_underflow 2 env=MPIR_CVAR_CH4_OFI_AM_LONG_FORCE_PIPELINE=true huge_anysrc 2 huge_dupcomm 2 diff --git a/test/mpi/threads/pt2pt/testlist b/test/mpi/threads/pt2pt/testlist index 8bd6ced6531..ad45659662c 100644 --- a/test/mpi/threads/pt2pt/testlist +++ b/test/mpi/threads/pt2pt/testlist @@ -41,15 +41,15 @@ mt_mprobe_sendirecv 2 arg=-iter=1024 arg=-count=1 mt_improbe_isendrecv 2 arg=-iter=1024 arg=-count=1 mt_improbe_sendrecv 2 arg=-iter=1024 arg=-count=1 -mt_probe_sendrecv_huge 2 arg=-iter=64 arg=-count=4096 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 -mt_iprobe_sendrecv_huge 2 arg=-iter=64 arg=-count=4096 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 -mt_mprobe_sendrecv_huge 2 arg=-iter=64 arg=-count=4096 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 -mt_improbe_sendrecv_huge 2 arg=-iter=64 arg=-count=4096 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 - -mt_probe_sendrecv_huge 2 arg=-iter=64 arg=-count=65530 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 -mt_iprobe_sendrecv_huge 2 arg=-iter=64 arg=-count=65530 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 -mt_mprobe_sendrecv_huge 2 arg=-iter=64 arg=-count=65530 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 -mt_improbe_sendrecv_huge 2 arg=-iter=64 arg=-count=65530 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 +mt_probe_sendrecv_huge 2 arg=-iter=64 arg=-count=4096 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 +mt_iprobe_sendrecv_huge 2 arg=-iter=64 arg=-count=4096 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 +mt_mprobe_sendrecv_huge 2 arg=-iter=64 arg=-count=4096 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 +mt_improbe_sendrecv_huge 2 arg=-iter=64 arg=-count=4096 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 + +mt_probe_sendrecv_huge 2 arg=-iter=64 arg=-count=65530 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 +mt_iprobe_sendrecv_huge 2 arg=-iter=64 arg=-count=65530 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 +mt_mprobe_sendrecv_huge 2 arg=-iter=64 arg=-count=65530 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 +mt_improbe_sendrecv_huge 2 arg=-iter=64 arg=-count=65530 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 mt_sendrecv_pers_huge 2 arg=-iter=64 arg=-count=4096 mt_bsendrecv_pers_huge 2 arg=-iter=8 arg=-count=4096
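(Illustrative usage sketch, not part of the patch: the testlist entries above drive ordinary point-to-point traffic through the new rendezvous paths. Any two-rank program sending more than the configured eager threshold exercises them; for example, running the program below with MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 and MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL set to pipeline, read, or write in the environment selects the corresponding algorithm.)

#include <mpi.h>
#include <stdlib.h>

/* needs 2 ranks; the 65530-byte payload exceeds a 16384-byte eager
 * threshold, so the message takes the RNDV path selected via the CVARs */
int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int count = 65530;
    char *buf = malloc(count);
    if (rank == 0) {
        MPI_Send(buf, count, MPI_CHAR, 1, 0, MPI_COMM_WORLD);
    } else if (rank == 1) {
        MPI_Recv(buf, count, MPI_CHAR, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
    free(buf);
    MPI_Finalize();
    return 0;
}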