fix npu bug. #3136

Open · wants to merge 1 commit into main

19 changes: 17 additions & 2 deletions mmcv/ops/csrc/pytorch/npu/chamfer_distance_npu.cpp
@@ -5,19 +5,34 @@ using namespace std;

void chamfer_distance_forward_npu(Tensor XYZ1, Tensor XYZ2, Tensor dist1,
Tensor dist2, Tensor idx1, Tensor idx2) {
bool is_half = XYZ1.scalar_type() == at::kHalf;
at::Tensor xyz1 = at::ones_like(XYZ1);
at::Tensor xyz2 = at::ones_like(XYZ2);
at::Tensor distf1 = at::ones_like(dist1);
at::Tensor distf2 = at::ones_like(dist2);
xyz1 = XYZ1.transpose(1, 2).transpose(0, 1);
xyz2 = XYZ2.transpose(1, 2).transpose(0, 1);
if (is_half) {
xyz1 = xyz1.to(at::kFloat);
xyz2 = xyz2.to(at::kFloat);
distf1 = dist1.to(at::kFloat);
distf2 = dist2.to(at::kFloat);
}
OpCommand cmd;
cmd.Name("ChamferDistance")
.Input(xyz1)
.Input(xyz2)
.Output(dist1)
.Output(dist2)
.Output(distf1)
.Output(distf2)
.Output(idx1)
.Output(idx2)
.Run();
if (is_half) {
distf1 = distf1.to(at::kHalf);
distf2 = distf2.to(at::kHalf);
}
dist1.copy_(distf1);
dist2.copy_(distf2);
}

void chamfer_distance_backward_npu(Tensor xyz1, Tensor xyz2, Tensor idx1,
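
Note: the pattern above recurs throughout this PR: when the caller passes fp16 tensors, the NPU kernel is handed fp32 staging copies, and the result is cast back and copied into the original outputs. A minimal, NPU-free sketch of that round-trip in plain libtorch, where run_fp32_kernel is a hypothetical stand-in for the OpCommand call and is assumed to support only float32:

#include <torch/torch.h>
#include <functional>

// Sketch of the fp16 -> fp32 -> fp16 staging used in this PR.
void run_with_fp32_staging(
    const at::Tensor& input, at::Tensor& output,
    const std::function<void(const at::Tensor&, at::Tensor&)>& run_fp32_kernel) {
  const bool is_half = input.scalar_type() == at::kHalf;
  // Stage through fp32 only when the caller passed fp16 tensors.
  at::Tensor input_f = is_half ? input.to(at::kFloat) : input;
  at::Tensor output_f = is_half ? output.to(at::kFloat) : output;
  run_fp32_kernel(input_f, output_f);
  if (is_half) {
    // Cast back and write the result into the caller's fp16 output.
    output.copy_(output_f.to(at::kHalf));
  }
}
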
105 changes: 85 additions & 20 deletions mmcv/ops/csrc/pytorch/npu/focal_loss_npu.cpp
@@ -4,6 +4,21 @@ using namespace std;

void sigmoid_focal_loss_forward_npu(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
at::Tensor input_y = input;
at::Tensor output_y = output;
bool is_half = input.scalar_type() == at::kHalf;
if (is_half) {
input_y = input.to(at::kFloat);
output_y = output.to(at::kFloat);
}
int64_t weight_size = weight.size(0);
at::Tensor weight_y = at::ones_like(input_y);
if (weight_size > 0) {
weight_y = at::broadcast_to(weight, input.sizes());
if (is_half) {
weight_y = weight_y.to(at::kFloat);
}
}
int64_t n_class = input.size(1);
at::Tensor target_y = at::ones_like(input);
if (n_class == 1) {
@@ -12,24 +12,26 @@ void sigmoid_focal_loss_forward_npu(Tensor input, Tensor target, Tensor weight,
target_y = at::add(target_y, 1.0);
} else {
target_y = at::one_hot(target, n_class);
weight_y = at::mul(weight_y, target_y);
weight_y = at::sum(weight_y, 1, true);
weight_y = at::broadcast_to(weight_y, input.sizes());
}
target_y = target_y.to(at::kInt);
int64_t weight_size = weight.size(0);
at::Tensor weight_y = at::ones_like(input);
if (weight_size > 0) {
weight_y = at::broadcast_to(weight, input.sizes());
}
OpCommand cmd;
string reduction = "none";
cmd.Name("SigmoidFocalLoss")
.Input(input)
.Input(input_y)
.Input(target_y)
.Input(weight_y)
.Output(output)
.Output(output_y)
.Attr("gamma", gamma)
.Attr("alpha", alpha)
.Attr("reduction", reduction)
.Run();
if (is_half) {
output_y = output_y.to(at::kHalf);
}
output.copy_(output_y);
}

void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
@@ -38,34 +55,51 @@ void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
void sigmoid_focal_loss_backward_npu(Tensor input, Tensor target, Tensor weight,
Tensor grad_input, float gamma,
float alpha) {
at::Tensor input_y = input;
at::Tensor grad_input_y = grad_input;
bool is_half = input.scalar_type() == at::kHalf;
if (is_half) {
input_y = input.to(at::kFloat);
grad_input_y = grad_input.to(at::kFloat);
}
int64_t weight_size = weight.size(0);
at::Tensor weight_y = at::ones_like(input_y);
if (weight_size > 0) {
weight_y = at::broadcast_to(weight, input.sizes());
if (is_half) {
weight_y = weight_y.to(at::kFloat);
}
}
int64_t n_class = input.size(1);
at::Tensor target_y = at::ones_like(input);
if (n_class == 1) {
target_y = at::reshape(target, input.sizes());
} else {
target_y = at::one_hot(target, n_class);
weight_y = at::mul(weight_y, target_y);
weight_y = at::sum(weight_y, 1, true);
weight_y = at::broadcast_to(weight_y, input.sizes());
target_y = at::mul(target_y, -1.0);
target_y = at::add(target_y, 1.0);
}
target_y = target_y.to(at::kInt);
at::Tensor grad_up = at::ones_like(input);
int64_t weight_size = weight.size(0);
at::Tensor weight_y = at::ones_like(input);
if (weight_size > 0) {
weight_y = at::broadcast_to(weight, input.sizes());
}
OpCommand cmd;
string reduction = "none";
cmd.Name("SigmoidFocalLossGrad")
.Input(input)
.Input(input_y)
.Input(target_y)
.Input(grad_up)
.Input(weight_y)
.Output(grad_input)
.Output(grad_input_y)
.Attr("gamma", gamma)
.Attr("alpha", alpha)
.Attr("reduction", reduction)
.Run();
if (is_half) {
grad_input_y = grad_input_y.to(at::kHalf);
}
grad_input.copy_(grad_input_y);
}

void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,
@@ -74,26 +108,40 @@ void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,

void softmax_focal_loss_forward_npu(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
at::Tensor input_y = input;
bool is_half = input.scalar_type() == at::kHalf;
if (is_half) {
input_y = input.to(at::kFloat);
}
int64_t n_class = input.size(1);
at::Tensor target_y = at::one_hot(target, n_class);
target_y = target_y.to(at::kInt);
int64_t weight_size = weight.size(0);
at::Tensor weight_y = at::ones_like(input);
at::Tensor weight_y = at::ones_like(input_y);
if (weight_size > 0) {
weight_y = at::broadcast_to(weight, input.sizes());
if (is_half) {
weight_y = weight_y.to(at::kFloat);
}
weight_y = at::mul(weight_y, target_y);
weight_y = at::sum(weight_y, 1, true);
weight_y = at::broadcast_to(weight_y, input.sizes());
}
at::Tensor op_output = at::ones_like(input);
at::Tensor op_output = at::ones_like(input_y);
OpCommand cmd;
string reduction = "none";
cmd.Name("SoftmaxFocalLoss")
.Input(input)
.Input(input_y)
.Input(target_y)
.Input(weight_y)
.Output(op_output)
.Attr("gamma", gamma)
.Attr("alpha", alpha)
.Attr("reduction", reduction)
.Run();
if (is_half) {
op_output = op_output.to(at::kHalf);
}
int64_t n_batch = input.size(0);
c10::SmallVector<int64_t, 2> offsets = {0, 0};
c10::SmallVector<int64_t, 2> sizes = {n_batch, 1};
@@ -124,27 +172,44 @@ void softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
void softmax_focal_loss_backward_npu(Tensor input, Tensor target, Tensor weight,
Tensor buff, Tensor grad_input,
float gamma, float alpha) {
at::Tensor input_y = input;
at::Tensor grad_input_y = grad_input;
bool is_half = input.scalar_type() == at::kHalf;
if (is_half) {
input_y = input.to(at::kFloat);
grad_input_y = grad_input.to(at::kFloat);
}
int64_t n_class = input.size(1);
at::Tensor target_y = at::one_hot(target, n_class);
target_y = target_y.to(at::kInt);
at::Tensor grad_up = at::ones_like(input);
int64_t weight_size = weight.size(0);
at::Tensor weight_y = at::ones_like(input);
at::Tensor weight_y = at::ones_like(input_y);
if (weight_size > 0) {
weight_y = at::broadcast_to(weight, input.sizes());
if (is_half) {
weight_y = weight_y.to(at::kFloat);
}
weight_y = at::mul(weight_y, target_y);
weight_y = at::sum(weight_y, 1, true);
weight_y = at::broadcast_to(weight_y, input.sizes());
}
OpCommand cmd;
string reduction = "none";
cmd.Name("SoftmaxFocalLossGrad")
.Input(input)
.Input(input_y)
.Input(target_y)
.Input(grad_up)
.Input(weight_y)
.Output(grad_input)
.Output(grad_input_y)
.Attr("gamma", gamma)
.Attr("alpha", alpha)
.Attr("reduction", reduction)
.Run();
if (is_half) {
grad_input_y = grad_input_y.to(at::kHalf);
}
grad_input.copy_(grad_input_y);
}

void softmax_focal_loss_backward_impl(Tensor input, Tensor target,
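
Note: besides the same fp16 staging, these hunks move the weight handling so that the per-class weight vector is expanded into a per-element weight map before the kernel runs; with an empty weight the code keeps an all-ones map instead. A standalone ATen sketch of that expansion, with shapes assumed for illustration (input [N, C] logits, target [N] class indices, weight [C]):

#include <torch/torch.h>

// Expand a per-class weight vector to the per-element weight map consumed by
// the focal-loss kernels above. A sketch under the assumed shapes, not the
// NPU code path itself.
at::Tensor expand_class_weight(const at::Tensor& input,
                               const at::Tensor& target,
                               const at::Tensor& weight) {
  const int64_t n_class = input.size(1);
  // One-hot mask of each sample's target class, in the logits' dtype.
  at::Tensor one_hot = at::one_hot(target, n_class).to(input.scalar_type());
  // Broadcast the class weights over the batch, keep only the entry matching
  // the target class, then spread that value back across all classes.
  at::Tensor w = at::broadcast_to(weight, input.sizes());
  w = at::sum(w * one_hot, 1, /*keepdim=*/true);
  return at::broadcast_to(w, input.sizes()).contiguous();
}
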
16 changes: 13 additions & 3 deletions mmcv/ops/csrc/pytorch/npu/gather_points_npu.cpp
@@ -24,6 +24,12 @@ void gather_points_forward_npu(int b, int c, int n, int npoints,
void gather_points_backward_npu(int b, int c, int n, int npoints,
const Tensor grad_out, const Tensor idx,
Tensor grad_points) {
at::Tensor grad_out_cast = grad_out;
at::Tensor grad_points_cast = grad_points;
if (grad_out.scalar_type() == at::ScalarType::Half) {
grad_out_cast = grad_out.to(at::kFloat);
grad_points_cast = grad_points.to(at::kFloat);
}
at::Tensor indices = idx;
if (idx.scalar_type() != at::ScalarType::Int) {
indices = idx.to(at::kInt);
@@ -37,11 +43,11 @@ void gather_points_backward_npu(int b, int c, int n, int npoints,
for (uint64_t i = 0; i < shape.size(); i++) {
pad_size.emplace_back(shape[i]);
}
at::Tensor trans_grad_points = grad_points.transpose(1, 2).contiguous();
at::Tensor trans_grad_points = grad_points_cast.transpose(1, 2).contiguous();
at::Tensor grad_points_view = trans_grad_points.view(
{trans_grad_points.sizes()[0] * trans_grad_points.sizes()[1],
trans_grad_points.sizes()[2]});
at::Tensor trans_grad_out = grad_out.transpose(1, 2).contiguous();
at::Tensor trans_grad_out = grad_out_cast.transpose(1, 2).contiguous();
trans_grad_out = trans_grad_out.view(
{trans_grad_out.sizes()[0] * trans_grad_out.sizes()[1],
trans_grad_out.sizes()[2]});
@@ -63,7 +69,11 @@ void gather_points_backward_npu(int b, int c, int n, int npoints,
at::Tensor grad_points_result =
grad_points_view.view(trans_grad_points.sizes());
grad_points_result = grad_points_result.transpose(1, 2);
grad_points.copy_(grad_points_result);
at::Tensor grad_points_result_cast = grad_points_result;
if (grad_out.scalar_type() == at::ScalarType::Half) {
grad_points_result_cast = grad_points_result.to(at::kHalf);
}
grad_points.copy_(grad_points_result_cast);
}

void gather_points_forward_impl(int b, int c, int n, int npoints,
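
Note: the reshapes above only re-pack the (now fp32-staged) grad_out and grad_points for the NPU scatter op; the gradient itself is an index-add. A CPU reference sketch, not the NPU path, assuming grad_out [B, C, npoints], idx [B, npoints], grad_points [B, C, N]:

#include <torch/torch.h>

// CPU reference for gather_points backward:
//   grad_points[b, c, idx[b, j]] += grad_out[b, c, j]
void gather_points_backward_reference(const at::Tensor& grad_out,
                                      const at::Tensor& idx,
                                      at::Tensor& grad_points) {
  for (int64_t b = 0; b < grad_out.size(0); ++b) {
    // index_add_ along the point dimension accumulates each gradient column
    // into the source position recorded in idx.
    grad_points[b].index_add_(/*dim=*/1, idx[b].to(at::kLong), grad_out[b]);
  }
}
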
4 changes: 2 additions & 2 deletions mmcv/ops/csrc/pytorch/npu/knn_npu.cpp
@@ -8,11 +8,11 @@ using namespace std;
void knn_forward_npu(int b, int n, int m, int nsample, const Tensor xyz,
const Tensor new_xyz, Tensor idx, Tensor dist2) {
// transpose known from [B, N, 3] to [B, 3, N]
at::Tensor source = xyz.transpose(1, 2).contiguous();
at::Tensor source = xyz.transpose(2, 1).contiguous();
at::Tensor target = new_xyz.contiguous();

bool is_from_knn = true;
EXEC_NPU_CMD(aclnnKnn, source, target, nsample, is_from_knn, idx, dist2);
EXEC_NPU_CMD(aclnnKnn, source, target, is_from_knn, dist2);
}

void knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz,
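
Note: the substantive edit in this file is the shorter aclnnKnn argument list, presumably matching a revised operator interface; swapping transpose(1, 2) for transpose(2, 1) is cosmetic, since both exchange the same pair of dimensions, as the quick libtorch check below illustrates.

#include <torch/torch.h>

int main() {
  // transpose(1, 2) and transpose(2, 1) swap the same two dims, so the
  // [B, N, 3] -> [B, 3, N] comment above still describes the new code.
  at::Tensor xyz = at::rand({4, 128, 3});
  TORCH_CHECK(xyz.transpose(1, 2).equal(xyz.transpose(2, 1)));
  return 0;
}
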
14 changes: 10 additions & 4 deletions mmcv/ops/csrc/pytorch/npu/roi_pool_npu.cpp
@@ -50,23 +50,29 @@ void roi_pool_backward_npu(Tensor grad_output, Tensor rois, Tensor argmax,
int64_t pooled_height_64 = pooled_height;
int64_t pooled_width_64 = pooled_width;
int64_t pooled_channel = 1;
at::Tensor argmax_trans = argmax.transpose(1, 2).transpose(2, 3);
at::Tensor grad_output_trans = grad_output.transpose(1, 2).transpose(2, 3);
at::Tensor roi_actual_num =
at::empty_like(rois, rois.options().dtype(at::kInt));
at::Tensor x = at::ones_like(grad_input);
at::Tensor x = at::ones_like(grad_input).transpose(1, 2).transpose(2, 3);
at::Tensor y = at::zeros_like(x);
OpCommand cmd;
cmd.Name("RoiPoolingGradWithArgMax")
.Input(grad_output)
.Input(grad_output_trans)
.Input(x)
.Input(rois)
.Input(roi_actual_num)
.Input(argmax)
.Output(grad_input)
.Input(argmax_trans)
.Output(y)
.Attr("pooled_h", pooled_height_64)
.Attr("pooled_w", pooled_width_64)
.Attr("spatial_scale_h", spatial_scale)
.Attr("spatial_scale_w", spatial_scale)
.Attr("pool_channel", pooled_channel)
.Run();
at::Tensor result = y.transpose(2, 3).transpose(1, 2);
at::Tensor res = result.contiguous();
grad_input.copy_(res);
}

void roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
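
Note: this hunk hands RoiPoolingGradWithArgMax NHWC-ordered tensors and transposes the result back into the NCHW grad_input. A standalone sketch of that transpose round-trip (hypothetical sizes, plain libtorch):

#include <torch/torch.h>

int main() {
  // NCHW -> NHWC: swap C with H, then the displaced C with W; the reversed
  // chain undoes it. This mirrors the transpose pairs around the NPU op above.
  at::Tensor nchw = at::rand({2, 16, 7, 7});
  at::Tensor nhwc = nchw.transpose(1, 2).transpose(2, 3).contiguous();  // [2, 7, 7, 16]
  at::Tensor back = nhwc.transpose(2, 3).transpose(1, 2).contiguous();  // [2, 16, 7, 7]
  TORCH_CHECK(back.equal(nchw));
  return 0;
}
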
5 changes: 3 additions & 2 deletions mmcv/ops/csrc/pytorch/npu/stack_ball_query_npu.cpp
@@ -8,9 +8,10 @@ void stack_ball_query_forward_npu(float max_radius, int nsample,
const Tensor new_xyz_batch_cnt,
const Tensor xyz, const Tensor xyz_batch_cnt,
Tensor idx) {
at::Tensor xyz_transpose = xyz.transpose(0, 1).contiguous();
at::Tensor xyz_transpose = xyz.transpose(0, 1).contiguous().to(at::kFloat);
at::Tensor new_xyz_fp32 = new_xyz.to(at::kFloat);
double max_radius_double = double(max_radius);
EXEC_NPU_CMD(aclnnStackBallQuery, xyz_transpose, new_xyz, xyz_batch_cnt,
EXEC_NPU_CMD(aclnnStackBallQuery, xyz_transpose, new_xyz_fp32, xyz_batch_cnt,
new_xyz_batch_cnt, max_radius_double, nsample, idx);
}

29 changes: 19 additions & 10 deletions mmcv/ops/csrc/pytorch/npu/three_interpolate_npu.cpp
@@ -12,17 +12,21 @@ void three_interpolate_forward_npu(int b, int c, int m, int n,
TORCH_CHECK((originDtype == at::kFloat || originDtype == at::kHalf),
"three_interpolate_forward ascend only support fp32 and fp16.");

auto point_c_trans = points.transpose(1, 2);

auto point_c_trans = points.transpose(1, 2).to(at::kFloat);
auto weight_cast = weight.to(at::kFloat);
auto out_cast = out.to(at::kFloat);
OpCommand cmd;
cmd.Name("ThreeInterpolate")
.Input(point_c_trans)
.Input(idx)
.Input(weight)
.Output(out)
.Input(weight_cast)
.Output(out_cast)
.Run();

auto output = out.view({b, n, c}).transpose(1, 2);
if (originDtype == at::kHalf) {
out_cast = out_cast.to(at::kHalf);
}
auto output = out_cast.view({b, n, c}).transpose(1, 2);
auto res = output.contiguous();
out.copy_(res);
}
@@ -34,12 +38,17 @@ void three_interpolate_backward_npu(int b, int c, int n, int m,
TORCH_CHECK((originDtype == at::kFloat || originDtype == at::kHalf),
"three_interpolate_backward ascend only support fp32 and fp16.");

auto grad_x = at::unsqueeze(grad_out, 3);
auto grad_y = at::unsqueeze(grad_points, 3);

EXEC_NPU_CMD(aclnnThreeInterpolateBackward, grad_x, idx, weight, m, grad_y);
auto grad_x = at::unsqueeze(grad_out, 3).to(at::kFloat);
auto grad_y = at::unsqueeze(grad_points, 3).to(at::kFloat);
auto weight_cast = weight.to(at::kFloat);
EXEC_NPU_CMD(aclnnThreeInterpolateBackward, grad_x, idx, weight_cast, m,
grad_y);

auto output = at::squeeze(grad_y, 3);
auto grad_y_cast = grad_y;
if (originDtype == at::kHalf) {
grad_y_cast = grad_y.to(at::kHalf);
}
auto output = at::squeeze(grad_y_cast, 3);
auto res = output.contiguous();
grad_points.copy_(res);
}
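
Note: both hunks again stage fp16 tensors through fp32 around the NPU calls. For orientation, the operator itself is the usual PointNet++ three-point interpolation; a CPU sketch of the forward formula, assuming points [B, C, M], idx [B, N, 3], weight [B, N, 3] (an illustration, not the NPU implementation):

#include <torch/torch.h>

// CPU sketch of three_interpolate forward:
//   out[b, c, n] = sum_k points[b, c, idx[b, n, k]] * weight[b, n, k]
at::Tensor three_interpolate_reference(const at::Tensor& points,
                                       const at::Tensor& idx,
                                       const at::Tensor& weight) {
  const int64_t B = points.size(0), C = points.size(1), N = idx.size(1);
  // Gather the three neighbour features for every target point, then blend
  // them with the interpolation weights.
  at::Tensor flat_idx =
      idx.to(at::kLong).reshape({B, 1, N * 3}).expand({B, C, N * 3}).contiguous();
  at::Tensor gathered = at::gather(points, /*dim=*/2, flat_idx).reshape({B, C, N, 3});
  return (gathered * weight.unsqueeze(1)).sum(/*dim=*/3);  // [B, C, N]
}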