Skip to content

Commit 331b425

Browse files
zachdworkinj-xiong
authored andcommitted
prov/shm: Add unmap_region function
This function is mainly for the niche case where on progress_connreq a peer is added to the map with its region needing to be mapped, and then after mapping it, it's discovered that the newly mapped peer's process died. In this case we need to unmap them and free any resources that were opened for communicating with them. Remove lock from map_to_region and unmap_region functions and require lock acquirement before calling those functions. This is necessary because on av removal path, map will be double locked if the functions also process locking the map. The map_to_region function is updated to mirror this policy. Signed-off-by: Zach Dworkin <[email protected]>
1 parent 07254b6 commit 331b425

File tree

5 files changed

+80
-41
lines changed

5 files changed

+80
-41
lines changed

prov/shm/src/smr_av.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,9 +69,12 @@ static void smr_map_cleanup(struct smr_map *map)
6969
{
7070
int64_t i;
7171

72-
for (i = 0; i < SMR_MAX_PEERS; i++)
73-
smr_map_del(map, i);
72+
for (i = 0; i < SMR_MAX_PEERS; i++) {
73+
if (map->peers[i].peer.id < 0)
74+
continue;
7475

76+
smr_map_del(map, i);
77+
}
7578
ofi_rbmap_cleanup(&map->rbmap);
7679
}
7780

@@ -210,7 +213,6 @@ static int smr_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, size_t count
210213
dlist_foreach(&util_av->ep_list, av_entry) {
211214
util_ep = container_of(av_entry, struct util_ep, av_entry);
212215
smr_ep = container_of(util_ep, struct smr_ep, util_ep);
213-
smr_unmap_from_endpoint(smr_ep->region, id);
214216
if (smr_av->smr_map.num_peers > 0)
215217
smr_ep->region->max_sar_buf_per_peer =
216218
SMR_MAX_PEERS /

prov/shm/src/smr_ep.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,9 @@ int64_t smr_verify_peer(struct smr_ep *ep, fi_addr_t fi_addr)
223223
return id;
224224

225225
if (!ep->region->map->peers[id].region) {
226+
ofi_spin_lock(&ep->region->map->lock);
226227
ret = smr_map_to_region(&smr_prov, ep->region->map, id);
228+
ofi_spin_unlock(&ep->region->map->lock);
227229
if (ret)
228230
return -1;
229231
}

prov/shm/src/smr_progress.c

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -878,7 +878,9 @@ static void smr_progress_connreq(struct smr_ep *ep, struct smr_cmd *cmd)
878878

879879
peer_smr = smr_peer_region(ep->region, idx);
880880
if (!peer_smr) {
881+
ofi_spin_lock(&ep->region->map->lock);
881882
ret = smr_map_to_region(&smr_prov, ep->region->map, idx);
883+
ofi_spin_unlock(&ep->region->map->lock);
882884
if (ret) {
883885
FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
884886
"Could not map peer region\n");
@@ -891,14 +893,11 @@ static void smr_progress_connreq(struct smr_ep *ep, struct smr_cmd *cmd)
891893
if (peer_smr->pid != (int) cmd->msg.hdr.data) {
892894
/* TODO track and update/complete in error any transfers
893895
* to or from old mapping
894-
*
895-
* TODO create smr_unmap_region
896-
* this needs to close peer_smr->map->peers[idx].pid_fd
897-
* This case will also return an unmapped region because the idx
898-
* is valid but the region was unmapped
899896
*/
900-
munmap(peer_smr, peer_smr->total_size);
897+
ofi_spin_lock(&ep->region->map->lock);
898+
smr_unmap_region(&smr_prov, ep->region->map, idx, false);
901899
smr_map_to_region(&smr_prov, ep->region->map, idx);
900+
ofi_spin_unlock(&ep->region->map->lock);
902901
peer_smr = smr_peer_region(ep->region, idx);
903902
}
904903

prov/shm/src/smr_util.c

Lines changed: 66 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -367,16 +367,15 @@ int smr_map_to_region(const struct fi_provider *prov, struct smr_map *map,
367367
}
368368
pthread_mutex_unlock(&ep_list_lock);
369369

370-
ofi_spin_lock(&map->lock);
371370
if (peer_buf->region)
372-
goto unlock;
371+
return FI_SUCCESS;
373372

373+
assert(ofi_spin_held(&map->lock));
374374
fd = shm_open(name, O_RDWR, S_IRUSR | S_IWUSR);
375375
if (fd < 0) {
376-
ret = -errno;
377376
FI_WARN_ONCE(prov, FI_LOG_AV,
378377
"shm_open error: name %s errno %d\n", name, errno);
379-
goto unlock;
378+
return -errno;
380379
}
381380

382381
memset(tmp, 0, sizeof(tmp));
@@ -437,8 +436,6 @@ int smr_map_to_region(const struct fi_provider *prov, struct smr_map *map,
437436

438437
out:
439438
close(fd);
440-
unlock:
441-
ofi_spin_unlock(&map->lock);
442439
return ret;
443440
}
444441

@@ -448,6 +445,7 @@ void smr_map_to_endpoint(struct smr_region *region, int64_t id)
448445
struct smr_region *peer_smr;
449446
struct smr_peer_data *local_peers;
450447

448+
assert(ofi_spin_held(&region->map->lock));
451449
peer_smr = smr_peer_region(region, id);
452450
if (region->map->peers[id].peer.id < 0 || !peer_smr)
453451
return;
@@ -479,32 +477,81 @@ void smr_map_to_endpoint(struct smr_region *region, int64_t id)
479477
return;
480478
}
481479

480+
void smr_unmap_region(const struct fi_provider *prov, struct smr_map *map,
481+
int64_t peer_id, bool local)
482+
{
483+
struct smr_region *peer_region;
484+
struct smr_peer *peer;
485+
struct util_ep *util_ep;
486+
struct smr_ep *smr_ep;
487+
struct smr_av *av;
488+
int ret = 0;
489+
490+
assert(ofi_spin_held(&map->lock));
491+
peer_region = map->peers[peer_id].region;
492+
if (!peer_region)
493+
return;
494+
495+
peer = &map->peers[peer_id];
496+
av = container_of(map, struct smr_av, smr_map);
497+
dlist_foreach_container(&av->util_av.ep_list, struct util_ep, util_ep,
498+
av_entry) {
499+
smr_ep = container_of(util_ep, struct smr_ep, util_ep);
500+
smr_unmap_from_endpoint(smr_ep->region, peer_id);
501+
}
502+
503+
/* Don't unmap memory owned by this pid because the endpoint it belongs
504+
* to might still be active.
505+
*/
506+
if (local)
507+
return;
508+
509+
if (map->flags & SMR_FLAG_HMEM_ENABLED) {
510+
ret = ofi_hmem_host_unregister(peer_region);
511+
if (ret)
512+
FI_WARN(prov, FI_LOG_EP_CTRL,
513+
"unable to unregister shm with iface\n");
514+
515+
if (peer->pid_fd != -1) {
516+
close(peer->pid_fd);
517+
peer->pid_fd = -1;
518+
}
519+
}
520+
521+
munmap(peer_region, peer_region->total_size);
522+
peer->region = NULL;
523+
}
524+
482525
void smr_unmap_from_endpoint(struct smr_region *region, int64_t id)
483526
{
484527
struct smr_region *peer_smr;
485528
struct smr_peer_data *local_peers, *peer_peers;
486529
int64_t peer_id;
487530

488-
local_peers = smr_peer_data(region);
489531
if (region->map->peers[id].peer.id < 0)
490532
return;
491533

492534
peer_smr = smr_peer_region(region, id);
493-
peer_id = smr_peer_data(region)[id].addr.id;
494-
535+
assert(peer_smr);
495536
peer_peers = smr_peer_data(peer_smr);
537+
peer_id = smr_peer_data(region)[id].addr.id;
496538

497539
peer_peers[peer_id].addr.id = -1;
498540
peer_peers[peer_id].name_sent = 0;
499541

542+
local_peers = smr_peer_data(region);
500543
ofi_xpmem_release(&local_peers[peer_id].xpmem);
501544
}
502545

503546
void smr_exchange_all_peers(struct smr_region *region)
504547
{
505548
int64_t i;
549+
550+
ofi_spin_lock(&region->map->lock);
506551
for (i = 0; i < SMR_MAX_PEERS; i++)
507552
smr_map_to_endpoint(region, i);
553+
554+
ofi_spin_unlock(&region->map->lock);
508555
}
509556

510557
int smr_map_add(const struct fi_provider *prov, struct smr_map *map,
@@ -546,37 +593,24 @@ int smr_map_add(const struct fi_provider *prov, struct smr_map *map,
546593

547594
void smr_map_del(struct smr_map *map, int64_t id)
548595
{
549-
struct dlist_entry *entry;
596+
struct smr_ep_name *name;
597+
bool local = false;
550598

551599
assert(id >= 0 && id < SMR_MAX_PEERS);
552-
553600
pthread_mutex_lock(&ep_list_lock);
554-
entry = dlist_find_first_match(&ep_name_list, smr_match_name,
555-
smr_no_prefix(map->peers[id].peer.name));
601+
dlist_foreach_container(&ep_name_list, struct smr_ep_name, name, entry) {
602+
if (strcmp(name->name, map->peers[id].peer.name)) {
603+
local = true;
604+
break;
605+
}
606+
}
556607
pthread_mutex_unlock(&ep_list_lock);
557-
558608
ofi_spin_lock(&map->lock);
559-
(void) ofi_rbmap_find_delete(&map->rbmap,
560-
(void *) map->peers[id].peer.name);
561-
609+
smr_unmap_region(&smr_prov, map, id, local);
562610
map->peers[id].fiaddr = FI_ADDR_NOTAVAIL;
563611
map->peers[id].peer.id = -1;
564612
map->num_peers--;
565-
566-
if (!map->peers[id].region)
567-
goto unlock;
568-
569-
if (!entry) {
570-
if (map->flags & SMR_FLAG_HMEM_ENABLED) {
571-
if (map->peers[id].pid_fd != -1)
572-
close(map->peers[id].pid_fd);
573-
574-
(void) ofi_hmem_host_unregister(map->peers[id].region);
575-
}
576-
munmap(map->peers[id].region, map->peers[id].region->total_size);
577-
map->peers[id].region = NULL;
578-
}
579-
unlock:
613+
ofi_rbmap_find_delete(&map->rbmap, map->peers[id].peer.name);
580614
ofi_spin_unlock(&map->lock);
581615
}
582616

prov/shm/src/smr_util.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,8 @@ void smr_cleanup(void);
356356
int smr_map_to_region(const struct fi_provider *prov, struct smr_map *map,
357357
int64_t id);
358358
void smr_map_to_endpoint(struct smr_region *region, int64_t id);
359+
void smr_unmap_region(const struct fi_provider *prov, struct smr_map *map,
360+
int64_t id, bool found);
359361
void smr_unmap_from_endpoint(struct smr_region *region, int64_t id);
360362
void smr_exchange_all_peers(struct smr_region *region);
361363
int smr_map_add(const struct fi_provider *prov, struct smr_map *map,

0 commit comments

Comments
 (0)