Skip to content

Commit 5713190

Browse files
seanxcwang authored and doxutx committed
fix layernorm and graph ir logic
1 parent 49ed37c commit 5713190

File tree

2 files changed

+16
-1
lines changed

2 files changed

+16
-1
lines changed

source/tnn/device/cuda/acc/cuda_layer_norm_layer_acc.cu

+1-1
Original file line number | Diff line number | Diff line change
@@ -63,7 +63,7 @@ template<typename T>
6363
__global__ void ln_mul_add_kernel(const T *input, T *output, const T *scale, const T *bias,
6464
const LNFloat2 *mean_var,
6565
const int count, const float eps) {
66-
int offset = blockIdx.y * blockDim.y + threadIdx.x;
66+
int offset = blockIdx.y * blockDim.x + threadIdx.x;
6767
int total_offset = blockIdx.x * count + offset;
6868
if (offset < count) {
6969
const float* mean_var_float = reinterpret_cast<const float*>(mean_var);

source/tnn/optimizer/graph_matcher/ir.cc

+15
Original file line number | Diff line number | Diff line change
@@ -995,6 +995,21 @@ namespace TNN_NS {
995995
// 5. remove unused Nodes
996996
// NB. we need to keep the original graph output tensor names un-changed.
997997

998+
auto return_check = [&]() {
999+
std::set<std::string> graph_outputs;
1000+
for (auto &output: g->outputs())
1001+
graph_outputs.insert(output->name);
1002+
1003+
return std::any_of(anchor->nodes.begin(), anchor->nodes.end(), [&](const std::shared_ptr<Node> &node) {
1004+
return std::any_of(node->info->outputs.begin(), node->info->outputs.end(), [&](const std::string &output) {
1005+
return (graph_outputs.count(output));
1006+
});
1007+
});
1008+
};
1009+
1010+
if (return_check())
1011+
return;
1012+
9981013
std::set<std::string> tensor_names;
9991014
for(auto & p : tensor_map) tensor_names.insert(p.first);
10001015
for(auto &name : tensor_names) renameTensor(name, name_prefix + name);

0 commit comments

Comments (0)