From b32b52f29ac93bb80c4d83b0c91e75b523e5c760 Mon Sep 17 00:00:00 2001 From: jianyinglang Date: Wed, 26 Dec 2018 14:08:38 -0800 Subject: [PATCH 1/5] Add Horovod distributed broadcast op to the bridge --- src/ngraph_builder.cc | 11 +++++++++++ src/ngraph_mark_for_clustering.cc | 3 +++ 2 files changed, 14 insertions(+) diff --git a/src/ngraph_builder.cc b/src/ngraph_builder.cc index 406ac8f9..e7099610 100644 --- a/src/ngraph_builder.cc +++ b/src/ngraph_builder.cc @@ -1599,6 +1599,16 @@ static Status TranslateDepthwiseConv2dNativeOp( return Status::OK(); } +static Status TranslateDistBroadcastOp( + const Node* op, const std::vector& static_input_map, + Builder::OpMap& ng_op_map) { + shared_ptr ng_input; + TF_RETURN_IF_ERROR(GetInputNodes(ng_op_map, op, &ng_input)); + + SaveNgOp(ng_op_map, op->name(), make_shared(ng_input)); + return Status::OK(); +} + static Status TranslateExpandDimsOp( const Node* op, const std::vector& static_input_map, Builder::OpMap& ng_op_map) { @@ -4070,6 +4080,7 @@ const static std::map< {"Greater", TranslateBinaryOp}, {"GreaterEqual", TranslateBinaryOp}, {"HorovodAllreduce", TranslateAllreduceOp}, + {"HorovodBroadcast", TranslateDistBroadcastOp}, {"Identity", TranslateIdentityOp}, {"L2Loss", TranslateL2LossOp}, {"Less", TranslateBinaryOp}, diff --git a/src/ngraph_mark_for_clustering.cc b/src/ngraph_mark_for_clustering.cc index df1f11f3..aa67e120 100644 --- a/src/ngraph_mark_for_clustering.cc +++ b/src/ngraph_mark_for_clustering.cc @@ -256,6 +256,8 @@ Status MarkForClustering(Graph* graph) { #ifdef NGRAPH_DISTRIBUTED confirmation_function_map["HorovodAllreduce"] = SimpleConfirmationFunction(); + confirmation_function_map["HorovodBroadcast"] = + SimpleConfirmationFunction(); #endif confirmation_function_map["Identity"] = SimpleConfirmationFunction(); confirmation_function_map["L2Loss"] = SimpleConfirmationFunction(); @@ -388,6 +390,7 @@ Status MarkForClustering(Graph* graph) { type_constraint_map["GreaterEqual"]["T"] = NGraphDTypes(); #ifdef NGRAPH_DISTRIBUTED type_constraint_map["HorovodAllreduce"]["T"] = NGraphNumericDTypes(); + type_constraint_map["HorovodBroadcast"]["T"] = NGraphNumericDTypes(); #endif type_constraint_map["Identity"]["T"] = NGraphDTypes(); type_constraint_map["L2Loss"]["T"] = NGraphNumericDTypes(); From d107bfb5aed62be62d9c060f26e26571cbb2d49d Mon Sep 17 00:00:00 2001 From: jianyinglang Date: Mon, 25 Mar 2019 17:27:21 -0700 Subject: [PATCH 2/5] Update --- src/ngraph_builder.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ngraph_builder.cc b/src/ngraph_builder.cc index 3116bb4d..c3356a57 100644 --- a/src/ngraph_builder.cc +++ b/src/ngraph_builder.cc @@ -34,7 +34,7 @@ #include "tensorflow/core/lib/core/errors.h" #if defined(NGRAPH_DISTRIBUTED) -#include +#include "ngraph/distributed.hpp" #endif using namespace std; @@ -1572,13 +1572,13 @@ static Status TranslateDepthwiseConv2dNativeOp( return Status::OK(); } -static Status TranslateDistBroadcastOp( +static Status TranslateBroadcastDistributedOp( const Node* op, const std::vector& static_input_map, Builder::OpMap& ng_op_map) { shared_ptr ng_input; TF_RETURN_IF_ERROR(GetInputNodes(ng_op_map, op, &ng_input)); - SaveNgOp(ng_op_map, op->name(), make_shared(ng_input)); + SaveNgOp(ng_op_map, op->name(), make_shared(ng_input)); return Status::OK(); } @@ -3829,7 +3829,7 @@ const static std::map< {"Greater", TranslateBinaryOp}, {"GreaterEqual", TranslateBinaryOp}, {"HorovodAllreduce", TranslateAllreduceOp}, - {"HorovodBroadcast", TranslateDistBroadcastOp}, + {"HorovodBroadcast", TranslateBroadcastDistributedOp}, {"Identity", TranslateIdentityOp}, {"L2Loss", TranslateL2LossOp}, {"Less", TranslateBinaryOp}, From 3ed3b1692f37e00a6f5e162494686dda89dc1b59 Mon Sep 17 00:00:00 2001 From: jianyinglang Date: Wed, 27 Mar 2019 17:15:34 -0700 Subject: [PATCH 3/5] debugging --- src/ngraph_builder.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ngraph_builder.cc b/src/ngraph_builder.cc index a810ea20..483af6e7 100644 --- a/src/ngraph_builder.cc +++ b/src/ngraph_builder.cc @@ -1652,7 +1652,8 @@ static Status TranslateBroadcastDistributedOp( shared_ptr ng_input; TF_RETURN_IF_ERROR(GetInputNodes(ng_op_map, op, &ng_input)); - SaveNgOp(ng_op_map, op->name(), make_shared(ng_input)); + auto ng_broadcast_distributed = ConstructNgNode(op->name(), ng_input); + SaveNgOp(ng_op_map, op->name(), ng_broadcast_distributed); return Status::OK(); } From b891183de3e2fa3dd45c7e44d0707affd9f151d1 Mon Sep 17 00:00:00 2001 From: jianyinglang Date: Thu, 18 Apr 2019 10:48:06 -0700 Subject: [PATCH 4/5] More exploring --- CMakeLists.txt | 2 +- build_ngtf.py | 2 +- src/ngraph_builder.cc | 6 +++++- src/ngraph_utils.cc | 23 +++++++++++++---------- src/ngraph_utils.h | 3 ++- 5 files changed, 22 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b250032a..bf58c74c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -239,7 +239,7 @@ if (NOT USE_PRE_BUILT_NGRAPH) ExternalProject_Add( ext_ngraph GIT_REPOSITORY https://github.com/NervanaSystems/ngraph - GIT_TAG v0.18.0-rc.1 + GIT_TAG langjian/MPI_test CMAKE_ARGS -DNGRAPH_DISTRIBUTED_ENABLE=${NGRAPH_DISTRIBUTED_ENABLE} -DNGRAPH_INSTALL_PREFIX=${NGRAPH_ARTIFACTS_DIR} diff --git a/build_ngtf.py b/build_ngtf.py index 55bb5fae..d7b866d2 100755 --- a/build_ngtf.py +++ b/build_ngtf.py @@ -24,7 +24,7 @@ def main(): ''' # Component versions - ngraph_version = "v0.18.0-rc.1" + ngraph_version = "langjian/MPI_test" tf_version = "v1.13.1" # Command line parser options diff --git a/src/ngraph_builder.cc b/src/ngraph_builder.cc index f9016459..1676a966 100644 --- a/src/ngraph_builder.cc +++ b/src/ngraph_builder.cc @@ -4554,6 +4554,9 @@ Status Builder::TranslateGraph( if (n->type_string() == "HorovodAllreduce") { NGRAPH_VLOG(1) << "[NGRAPH_TF RANK: " << rank_id << "]: " << n->name(); } + if (n->type_string() == "HorovodBroadcast") { + NGRAPH_VLOG(1) << "[NGRAPH_TF RANK: " << rank_id << "]: " << n->name(); + } #endif } } @@ -4653,7 +4656,8 @@ Status Builder::TranslateGraph( ng_function = make_shared(ng_result_list, ng_parameter_list); #if defined NGRAPH_DISTRIBUTED - AllreduceOpControlOrder(ng_function); + OpControlOrder(ng_function, "AllReduce"); + OpControlOrder(ng_function, "BroadcastDistributed"); #endif // diff --git a/src/ngraph_utils.cc b/src/ngraph_utils.cc index 9bf89d45..d98840e2 100644 --- a/src/ngraph_utils.cc +++ b/src/ngraph_utils.cc @@ -28,6 +28,9 @@ #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/default/logging.h" #include "tensorflow/core/platform/protobuf.h" +#if defined NGRAPH_DISTRIBUTED +#include "ngraph/distributed.hpp" +#endif using namespace std; namespace ng = ngraph; @@ -352,24 +355,24 @@ bool DumpTrackedGraphs() { std::getenv("NGRAPH_TF_DUMP_TRACKED_GRAPHS") != nullptr; } -void AllreduceOpControlOrder( - const std::shared_ptr& ng_function) { +void OpControlOrder( + const std::shared_ptr& ng_function, const std::string& op_name) { // Get the serialized ops and stored the allreduce ops to a vector and - ng::NodeVector allreduce_op_list; + ng::NodeVector op_list; for (const shared_ptr& node : ng_function->get_ordered_ops()) { - if (node->description() == "AllReduce") { - allreduce_op_list.push_back(node); + if (node->description() == op_name) { + op_list.push_back(node); } // Sort the allreduce ops according to the TF names - std::sort(allreduce_op_list.begin(), allreduce_op_list.end(), + std::sort(op_list.begin(), op_list.end(), [](const shared_ptr& x, const shared_ptr& y) { return x->get_friendly_name() < y->get_friendly_name(); }); // Add control dependency in for the allreduce ops - if (allreduce_op_list.size() > 1) { - for (size_t i = 1; i < allreduce_op_list.size(); ++i) { - auto pre_node = allreduce_op_list[i - 1]; - auto cur_node = allreduce_op_list[i]; + if (op_list.size() > 1) { + for (size_t i = 1; i < op_list.size(); ++i) { + auto pre_node = op_list[i - 1]; + auto cur_node = op_list[i]; cur_node->add_control_dependency(pre_node); } } diff --git a/src/ngraph_utils.h b/src/ngraph_utils.h index b586c1f3..a54ae3df 100644 --- a/src/ngraph_utils.h +++ b/src/ngraph_utils.h @@ -263,7 +263,8 @@ bool DumpEncapsulatedGraphs(); bool DumpTrackedGraphs(); // Insert constrol dependency for AllReduce ops to ensure execution order -void AllreduceOpControlOrder(const std::shared_ptr&); +void OpControlOrder(const std::shared_ptr&, + const std::string&); } // namespace ngraph_bridge } // namespace tensorflow From b15d7c8f8f094aef9d2ee8aeb5069d827ac36934 Mon Sep 17 00:00:00 2001 From: jianyinglang Date: Thu, 18 Apr 2019 14:01:46 -0700 Subject: [PATCH 5/5] Fix format --- src/ngraph_builder.cc | 3 ++- src/ngraph_utils.cc | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/ngraph_builder.cc b/src/ngraph_builder.cc index 1e724789..762284a8 100644 --- a/src/ngraph_builder.cc +++ b/src/ngraph_builder.cc @@ -1660,7 +1660,8 @@ static Status TranslateBroadcastDistributedOp( shared_ptr ng_input; TF_RETURN_IF_ERROR(GetInputNodes(ng_op_map, op, &ng_input)); - auto ng_broadcast_distributed = ConstructNgNode(op->name(), ng_input); + auto ng_broadcast_distributed = + ConstructNgNode(op->name(), ng_input); SaveNgOp(ng_op_map, op->name(), ng_broadcast_distributed); return Status::OK(); } diff --git a/src/ngraph_utils.cc b/src/ngraph_utils.cc index 71a9004d..d0e57a48 100644 --- a/src/ngraph_utils.cc +++ b/src/ngraph_utils.cc @@ -359,8 +359,8 @@ bool DumpTrackedGraphs() { std::getenv("NGRAPH_TF_DUMP_TRACKED_GRAPHS") != nullptr; } -void OpControlOrder( - const std::shared_ptr& ng_function, const std::string& op_name) { +void OpControlOrder(const std::shared_ptr& ng_function, + const std::string& op_name) { // Get the serialized ops and stored the allreduce ops to a vector and ng::NodeVector op_list; for (const shared_ptr& node : ng_function->get_ordered_ops()) {