
Commit 3865f67

remove debug prints, attempt to derive num partitions from lowering
1 parent e2d157b commit 3865f67

7 files changed: 6 additions, 128 deletions

torch_xla/csrc/lowering_context.cpp (-45)

@@ -93,7 +93,6 @@ LoweringContext::LoweringContext(const std::string& name,
                                  torch::lazy::BackendDevice device)
     : torch::lazy::LoweringContext(name, device),
       builder_(name),
-      num_computation_partitions_(1),
       stack_frame_index_builder_(std::make_shared<StackFrameIndexBuilder>()) {}
 
 LoweringContext::LoweringContext(
@@ -102,7 +101,6 @@ LoweringContext::LoweringContext(
     torch::lazy::Util::EmissionMap emit_status)
     : torch::lazy::LoweringContext(name, device, {}, emit_status),
       builder_(name),
-      num_computation_partitions_(1),
       stack_frame_index_builder_(std::make_shared<StackFrameIndexBuilder>()) {
   for (auto node : post_order) {
     LowerNode(node);
@@ -133,7 +131,6 @@ xla::XlaOp LoweringContext::GetParameter(
       xla::OpSharding sharding = data->GetSharding();
       xla::XlaScopedShardingAssignment scoped_sharding(builder(), sharding);
       param = xla::Parameter(builder(), param_index, shape, param_name);
-      UpdateNumPartitions(param);
     } else {
       param = xla::Parameter(builder(), param_index, shape, param_name);
     }
@@ -257,28 +254,6 @@ XlaOpVector LoweringContext::LowerNode(const torch::lazy::Node* node) {
         mutable_dims->Set(dim, kUnboundedSize);
       }
     }
-    std::for_each(result_ops.begin(), result_ops.end(),
-                  [this](xla::XlaOp xla_op) {
-                    UpdateNumPartitions(xla_op);  // Calling the member function
-                  });
-    // for (auto xla_op : result_ops) {
-    //   UpdateNumPartitions(xla_op);
-    //   // std::optional<OpSharding> op_sharding =
-    //   //     ConsumeValue(builder()->GetOpSharding(xla_op));
-    //   // if (op_sharding.has_value()) {
-    //   //   size_t curr_num_partitions =
-    //   //       op_sharding.value().tile_assignment_devices().size();
-    //   //   if (num_computation_partitions_ != 1) {
-    //   //     XLA_CHECK_EQ(curr_num_partitions, num_computation_partitions_)
-    //   <<
-    //   // "Number of partitions must be the same for all ops in a HLO
-    //   graph.";
-    //   //     continue;
-    //   //   }
-    //   //   num_computation_partitions_ =
-    //   op_sharding.value().tile_assignment_devices().size();
-    //   // }
-    // }
   } catch (const std::exception& ex) {
     ReportBuilderError(node, ex.what());
   }
@@ -349,24 +324,4 @@ torch::lazy::ComputationPtr LoweringContext::Build() {
       builder_.name(), std::move(xla_computation), device_);
 }
 
-void LoweringContext::UpdateNumPartitions(const xla::XlaOp& op) {
-  std::optional<xla::OpSharding> op_sharding =
-      ConsumeValue(builder()->GetOpSharding(op));
-  if (op_sharding.has_value()) {
-    size_t curr_num_partitions =
-        op_sharding.value().tile_assignment_devices().size();
-    if (curr_num_partitions == 0) {
-      return;
-    }
-    if (num_computation_partitions_ != 1) {
-      XLA_CHECK_EQ(curr_num_partitions, num_computation_partitions_)
-          << "Number of partitions must be the same for all ops in a HLO "
-             "graph.";
-      return;
-    }
-    std::cout << "curr_num_partitions: " << curr_num_partitions << std::endl;
-    num_computation_partitions_ = curr_num_partitions;
-  }
-}
-
 }  // namespace torch_xla

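Note: the UpdateNumPartitions helper removed above derived the computation's partition count by querying each op's sharding on the XlaBuilder; with it gone, the partition count is chosen in PjRtComputationClient::Compile (e.g. GetNumLocalDevices() when XLA_USE_LOCAL_SPMD is set). A minimal sketch of that derivation is kept below for reference; the free-function form and name are illustrative, not code from this commit.

// Sketch: read one op's sharding and derive a partition count, if present.
// Mirrors the removed LoweringContext::UpdateNumPartitions; assumes the
// usual XLA client headers (xla::XlaBuilder, xla::XlaOp, xla::OpSharding).
std::optional<size_t> GetNumPartitionsFromOp(xla::XlaBuilder* builder,
                                             const xla::XlaOp& op) {
  auto sharding = builder->GetOpSharding(op);  // StatusOr<optional<OpSharding>>
  if (!sharding.ok() || !sharding->has_value()) {
    return std::nullopt;  // No sharding recorded for this op.
  }
  size_t num_partitions = sharding->value().tile_assignment_devices().size();
  if (num_partitions == 0) {
    return std::nullopt;  // Sharding carries no explicit tile assignment.
  }
  return num_partitions;
}
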
torch_xla/csrc/lowering_context.h (-10)

@@ -113,18 +113,10 @@ class LoweringContext : public torch::lazy::LoweringContext {
     return emitted_outputs_;
   }
 
-  size_t GetComputationNumPartitions() const {
-    return num_computation_partitions_;
-  }
-
   // Return stack frame id
   int64_t AddStackFrameLocation(const torch::lazy::SourceLocation& source,
                                 int64_t parent_id);
 
- protected:
-  // Update the number of partitions from a XlaOp.
-  void UpdateNumPartitions(const xla::XlaOp& op);
-
  private:
   struct Parameter {
     xla::XlaOp param;
@@ -141,8 +133,6 @@ class LoweringContext : public torch::lazy::LoweringContext {
   std::vector<xla::XlaOp> root_tuple_;
   OutputMap<xla::XlaOp> emitted_outputs_;
   std::string name_;
-  // Number of partitions of the lowered XLA computation.
-  size_t num_computation_partitions_;
 
   std::shared_ptr<StackFrameIndexBuilder> stack_frame_index_builder_;
 };  // namespace torch_xla

torch_xla/csrc/runtime/computation_client.h (-3)

@@ -225,7 +225,6 @@ class ComputationClient {
         xla::XlaComputation computation, std::string compilation_device,
         std::vector<std::string> devices, const xla::Shape* output_shape,
         bool parameter_is_tupled_arguments = false, bool is_sharded = false,
-        size_t computation_num_partitions = 1,
         bool allow_spmd_sharding_propagation_to_output = true,
         bool use_auto_spmd_partitioning = false,
         std::vector<int64_t> auto_spmd_mesh_shape = {},
@@ -236,7 +235,6 @@
           output_shape(output_shape),
           parameter_is_tupled_arguments(parameter_is_tupled_arguments),
           is_sharded(is_sharded),
-          computation_num_partitions(computation_num_partitions),
           allow_spmd_sharding_propagation_to_output(
               allow_spmd_sharding_propagation_to_output),
           use_auto_spmd_partitioning(use_auto_spmd_partitioning),
@@ -250,7 +248,6 @@
     const xla::Shape* output_shape = nullptr;
     bool parameter_is_tupled_arguments;
     bool is_sharded;
-    size_t computation_num_partitions = 1;
     bool allow_spmd_sharding_propagation_to_output;
     bool use_auto_spmd_partitioning;
     std::vector<int64_t> auto_spmd_mesh_shape;

torch_xla/csrc/runtime/pjrt_computation_client.cc (+1, -23)

@@ -334,7 +334,6 @@ ComputationClient::DataPtr PjRtComputationClient::CopyToDevice(
 std::shared_ptr<PjRtComputationClient::PjRtData>
 PjRtComputationClient::ReplicateShardedData(
     const ComputationClient::DataPtr& handle) {
-  std::cout << "PjRtComputationClient::ReplicateShardedData" << std::endl;
   if (auto unsharded_data = std::dynamic_pointer_cast<PjRtData>(handle)) {
     return unsharded_data;
   } else if (auto sharded_data =
@@ -348,9 +347,7 @@ PjRtComputationClient::ReplicateShardedData(
   }
   xla::XlaBuilder builder("ReplicateShardedData");
   xla::Shape shape = sharded_data->shape();
-  xla::OpSharding sharding = sharded_data->GetSharding();
-  builder.SetSharding(sharding);
-  size_t num_partitions = sharding.tile_assignment_devices().size();
+  builder.SetSharding(sharded_data->GetSharding());
 
   // perform a simple identity calculation to reassemble the input as
   // replicated output.
@@ -374,7 +371,6 @@ PjRtComputationClient::ReplicateShardedData(
        GetCompilationDevices(device, {}), &shape,
        /*should_wrap_parameter=*/false,
        /*is_sharded=*/true,
-       /*computation_num_partitions*/ num_partitions,
        /*allow_spmd_sharding_propagation_to_output=*/false});
   std::vector<
       std::shared_ptr<torch_xla::runtime::ComputationClient::Computation>>
@@ -541,7 +537,6 @@ std::vector<xla::Literal> PjRtComputationClient::TransferFromDevice(
 
 std::vector<ComputationClient::ComputationPtr> PjRtComputationClient::Compile(
     std::vector<ComputationClient::CompileInstance> instances) {
-  std::cout << "in compile" << std::endl;
   auto metrics_fn = CompileMetric;
   if (instances[0].eager_mode) {
     metrics_fn = EagerCompileMetric;
@@ -551,9 +546,7 @@ std::vector<ComputationClient::ComputationPtr> PjRtComputationClient::Compile(
       tsl::profiler::TraceMeLevel::kInfo);
   std::vector<ComputationClient::ComputationPtr> computations;
 
-  std::cout << "instances.size(): " << instances.size() << std::endl;
   for (auto& instance : instances) {
-    std::cout << "instance devices " << instance.devices << std::endl;
     xla::CompileOptions compile_options;
     if (instance.is_sharded) {
       // TODO(yeounoh) multi-host, multi-slice configurations
@@ -570,8 +563,6 @@ std::vector<ComputationClient::ComputationPtr> PjRtComputationClient::Compile(
       if (runtime::sys_util::GetEnvBool("XLA_USE_LOCAL_SPMD", false)) {
         num_partitions = GetNumLocalDevices();
       }
-      // num_partitions = static_cast<int>(instance.computation_num_partitions);
-      std::cout << "num_partitions: " << num_partitions << std::endl;
       compile_options.executable_build_options.set_num_partitions(
           num_partitions);
       compile_options.executable_build_options.set_num_replicas(1);
@@ -668,7 +659,6 @@ std::vector<ComputationClient::ComputationPtr> PjRtComputationClient::Compile(
 
     CreateCompileHandlesCounter()->AddValue(1);
   }
-  std::cout << "finish compile" << std::endl;
   return computations;
 }
 
@@ -720,7 +710,6 @@ PjRtComputationClient::ExecuteComputation(
     const ComputationClient::Computation& computation,
     absl::Span<const ComputationClient::DataPtr> arguments,
     const std::string& device, const ExecuteComputationOptions& options) {
-  std::cout << "in execute" << std::endl;
   // Shared ownership of the timed section ensures that it will only get logged
   // once both `ExecuteComputation` and the async work in `ExecuteSharded` are
   // complete; a copy is held from the lambda that releases it when done.
@@ -788,7 +777,6 @@ PjRtComputationClient::ExecuteComputation(
   CreateDataHandlesCounter()->AddValue(datas.size());
 
   TF_VLOG(1) << "Returning " << datas.size() << " results";
-  std::cout << "finish execute" << std::endl;
   return datas;
 }
 
@@ -798,10 +786,6 @@ PjRtComputationClient::ExecuteReplicated(
     absl::Span<const ComputationClient::DataPtr> arguments,
     absl::Span<const std::string> devices,
     const ExecuteReplicatedOptions& options) {
-  std::cout << "in execute replicated" << std::endl;
-  for (auto d : devices) {
-    std::cout << "device: " << d << std::endl;
-  }
   // Shared ownership of the timed section ensures that it will only get logged
   // once both `ExecuteReplicated` and the async work in `Execute` are
   // complete; a copy is held from the lambda that releases it when done.
@@ -939,7 +923,6 @@ PjRtComputationClient::ExecuteReplicated(
   }
 
   TF_VLOG(1) << "Returning " << data_handles.size() << " sharded outputs.";
-  std::cout << "finish execute replicated" << std::endl;
   return data_handles;
 }
 
@@ -1002,17 +985,12 @@ xla::PjRtDevice* PjRtComputationClient::StringToPjRtDevice(
 
 void PjRtComputationClient::WaitDeviceOps(
     absl::Span<const std::string> devices) {
-  std::cout << "in wait device ops" << std::endl;
-  for (auto d : devices) {
-    std::cout << "device: " << d << std::endl;
-  }
   TF_VLOG(3) << "Waiting for " << absl::StrJoin(devices, ", ");
   operation_manager_.WaitForDevices(
       devices.empty()
          ? (UseVirtualDevice() ? std::vector<std::string>({spmd_device_str})
                                : GetLocalDevices())
          : devices);
-  std::cout << "finish wait device ops" << std::endl;
 }
 
 std::map<std::string, Metric> PjRtComputationClient::GetMetrics() const {

torch_xla/csrc/xla_graph_executor.cpp (+2, -12)

@@ -1391,16 +1391,12 @@ XLAGraphExecutor::CompilationResult XLAGraphExecutor::Compile(
   // Always execute sharded when running in SPMD mode
   bool is_sharded = (coll.device == GetVirtualDevice()) || UseVirtualDevice();
   // Annotate HLO sharding selectively in the compuation.
-  bool is_sharded_2 = ShardingUtil::SetHloSharding(&lowering_ctx);
-
-  std::cout << "is_sharded_2: " << is_sharded_2 << std::endl;
+  ShardingUtil::SetHloSharding(&lowering_ctx);
 
   SetBufferDonors(&lowering_ctx, buffer_donor_indices);
 
   xla::XlaComputation computation = ConsumeValue(lowering_ctx.BuildXla());
   xla::ProgramShape program_shape = ConsumeValue(computation.GetProgramShape());
-  size_t computation_num_partitions =
-      lowering_ctx.GetComputationNumPartitions();
 
   // TODO(yeounoh) enable wrapping with auto-sharding.
   bool should_wrap_parameter =
@@ -1426,15 +1422,11 @@ XLAGraphExecutor::CompilationResult XLAGraphExecutor::Compile(
       program_shape.result(), static_cast<XlaDeviceType>(coll.device.type()));
 
   std::vector<runtime::ComputationClient::CompileInstance> instances;
-  std::cout << "computation_num_partitions: " << computation_num_partitions
-            << std::endl;
   instances.emplace_back(std::move(computation), coll.device.toString(),
                          runtime::GetComputationClient()->GetCompilationDevices(
                              coll.device.toString(), devices),
-                         &shape, should_wrap_parameter, is_sharded,
-                         computation_num_partitions);
+                         &shape, should_wrap_parameter, is_sharded);
   instances.front().eager_mode = UseEagerMode();
-  instances.front().computation_num_partitions = computation_num_partitions;
   if (use_autosharding) {
     TF_VLOG(5) << "use_auto_spmd_partitioning is set.";
     TF_CHECK(is_sharded) << "Auto-sharding pass requires SPMD mode.";
@@ -1463,8 +1455,6 @@ XLAGraphExecutor::CompilationResult XLAGraphExecutor::Compile(
   TF_VLOG(3) << "Compiling IR graph hash "
              << torch::lazy::HashToString(coll.hash) << " on device "
             << coll.device << " ...";
-  std::cout << "check instance num partitions"
-            << instances.front().computation_num_partitions << std::endl;
   std::vector<std::shared_ptr<runtime::ComputationClient::Computation>>
       computations =
           runtime::GetComputationClient()->Compile(std::move(instances));

torch_xla/csrc/xla_sharding_util.cpp (+3, -31)

@@ -192,9 +192,6 @@ bool ShardingUtil::SetHloSharding(LoweringContext* lowering_ctx) {
         XlaBuilderFriend::GetInstruction(elem.second);
     const std::shared_ptr<xla::OpSharding> sharding =
         xla_node->GetSharding(elem.first.index);
-    if (sharding != nullptr) {
-      std::cout << "check opsharding " << sharding->DebugString() << std::endl;
-    }
     if (sharding != nullptr && sharding->type() != xla::OpSharding::UNKNOWN) {
       *instruction->mutable_sharding() = *sharding;
       is_sharded = true;
@@ -375,33 +372,15 @@ ShardingUtil::GetShardReplicaAndIndicesForDevices(
       shard_indices[i] = std::make_pair(global_ordinal, indices);
     }
   } else if (sharding.type() == xla::OpSharding::OTHER) {
-    std::vector<int64_t> tile_assignment_devices(
-        sharding.tile_assignment_devices().begin(),
-        sharding.tile_assignment_devices().end());
-    size_t num_local_devices =
-        runtime::GetComputationClient()->GetNumLocalDevices();
-    size_t num_global_devices =
-        runtime::GetComputationClient()->GetNumGlobalDevices();
-    // XLA_CHECK(tile_assignment_devices.size() == 0 ||
-    //           tile_assignment_devices.size() == num_global_devices ||
-    //           tile_assignment_devices.size() == num_local_devices)
-    //     << "Number of tile_assignment_devices must be the number of global "
-    //        "devices or local devices, or 0, got unexpected size of "
-    //     << tile_assignment_devices.size();
     size_t num_tiles =
         std::accumulate(sharding.tile_assignment_dimensions().begin(),
                         sharding.tile_assignment_dimensions().end(), 1,
                         [](int a, int b) { return a * b; });
-    std::cout << "Num local devices " << num_local_devices << std::endl;
-    std::cout << "Num tile assignment size " << tile_assignment_devices.size()
-              << std::endl;
     std::unordered_map<int, int> device_index =
         build_index_map(devices, num_tiles);
-    std::cout << "Check device_index " << std::endl;
-    for (const auto& pair : device_index) {
-      std::cout << "Key: " << pair.first << ", Value: " << pair.second
-                << std::endl;
-    }
+    std::vector<int64_t> tile_assignment_devices(
+        sharding.tile_assignment_devices().begin(),
+        sharding.tile_assignment_devices().end());
     if (!sharding.iota_reshape_dims().empty()) {
       auto tileAssignment = xla::TileAssignment(
           sharding.tile_assignment_dimensions(), sharding.iota_reshape_dims(),
@@ -411,10 +390,7 @@ ShardingUtil::GetShardReplicaAndIndicesForDevices(
     }
     for (size_t i = 0; i < tile_assignment_devices.size(); i++) {
       int64_t core = tile_assignment_devices[i];
-      std::cout << "Check core " << core << std::endl;
      if (device_index.find(core) == device_index.end()) {
-        std::cout << "current core " << core << " is not in device_index"
-                  << std::endl;
         // Skip any shards whose device is not part of the `devices` list.
         continue;
       }
@@ -464,8 +440,6 @@ ShardingUtil::GetShardReplicaAndIndicesForDevices(
 std::vector<at::Tensor> ShardingUtil::ShardTensor(
     const at::Tensor& tensor, const XLATensor::ShardingSpecPtr shardings,
     const std::vector<std::string>& devices, bool padded) {
-  std::cout << "ShardingUtil::ShardTensor check devices " << devices
-            << std::endl;
   xla::OpSharding sharding;
   bool minibatch = false;
   if (shardings != nullptr) {
@@ -496,8 +470,6 @@ std::vector<at::Tensor> ShardingUtil::ShardTensor(
                    std::back_inserter(shard_indices),
                    [](auto& pair) { return pair.second; });
   }
-  std::cout << "ShardingUtil::ShardTensor check shard_indices: "
-            << shard_indices << std::endl;
 
   for (size_t i = 0; i < shard_indices.size(); i++) {
     at::Tensor shard = tensor.index(

torch_xla/distributed/spmd/xla_sharding.py (-4)

@@ -130,10 +130,6 @@ def get_op_sharding(self,
 
     tile_assignment, group_assignment, replication_groups, sharding_type = self._get_op_sharding_args(
         partition_spec)
-    print(f"check tile_assignment: {tile_assignment}")
-    print(f"check group_assignment: {group_assignment}")
-    print(f"check replication_groups: {replication_groups}")
-    print(f"check sharding_type: {sharding_type}")
     return torch_xla._XLAC.OpSharding(tile_assignment, group_assignment,
                                       replication_groups, sharding_type)
