NVIDIA · terrykong · Oct 8, 2024
diff --git a/rosetta/rosetta/projects/maxtext/xla_flags/llama2-7b-1N8G.env b/rosetta/rosetta/projects/maxtext/xla_flags/llama2-7b-1N8G.env
@@ -6,15 +6,13 @@ export XLA_FLAGS="\
     --xla_gpu_enable_latency_hiding_scheduler=true \
     --xla_gpu_enable_triton_gemm=false \
     --xla_gpu_graph_level=0 \
-    --xla_gpu_enable_highest_priority_async_stream=true \
     --xla_gpu_all_reduce_combine_threshold_bytes=${THRESHOLD_BYTES} \
     --xla_gpu_all_gather_combine_threshold_bytes=$((THRESHOLD_BYTES/(NUM_NODES*NUM_GPUS))) \
     --xla_gpu_reduce_scatter_combine_threshold_bytes=$((THRESHOLD_BYTES/(NUM_NODES*NUM_GPUS*2))) \
     --xla_gpu_enable_pipelined_all_gather=true \
     --xla_gpu_enable_pipelined_reduce_scatter=true \
     --xla_gpu_enable_pipelined_all_reduce=true \
     --xla_gpu_enable_while_loop_double_buffering=true \
-    --xla_gpu_enable_triton_softmax_fusion=false \
     --xla_gpu_enable_all_gather_combine_by_dim=false \
     --xla_gpu_enable_reduce_scatter_combine_by_dim=false \
     --xla_disable_hlo_passes=rematerialization \

diff --git a/rosetta/rosetta/projects/pax/xla_flags/common.env b/rosetta/rosetta/projects/pax/xla_flags/common.env
@@ -3,8 +3,6 @@ THRESHOLD_BYTES=51200
 export XLA_FLAGS="\
     --xla_gpu_enable_latency_hiding_scheduler=true \
     --xla_allow_excess_precision \
-    --xla_gpu_enable_highest_priority_async_stream=true \
-    --xla_gpu_enable_triton_softmax_fusion=false \
     --xla_gpu_all_reduce_combine_threshold_bytes=${THRESHOLD_BYTES} \
     --xla_gpu_graph_level=0 \
     "

diff --git a/rosetta/rosetta/projects/pax/xla_flags/gpt-126m.env b/rosetta/rosetta/projects/pax/xla_flags/gpt-126m.env
@@ -3,8 +3,6 @@ THRESHOLD_BYTES=33554432
 export XLA_FLAGS="\
     --xla_gpu_enable_latency_hiding_scheduler=true \
     --xla_allow_excess_precision \
-    --xla_gpu_enable_highest_priority_async_stream=true \
-    --xla_gpu_enable_triton_softmax_fusion=false \
     --xla_gpu_all_reduce_combine_threshold_bytes=${THRESHOLD_BYTES} \
     --xla_gpu_graph_level=0 \
     --xla_gpu_enable_cudnn_fmha=false \

diff --git a/rosetta/rosetta/projects/pax/xla_flags/grok-proxy.env b/rosetta/rosetta/projects/pax/xla_flags/grok-proxy.env
@@ -5,8 +5,6 @@ REDUCE_SCATTER_THRESHOLD_BYTES=402653184
 export XLA_FLAGS="\
     --xla_gpu_enable_latency_hiding_scheduler=true \
     --xla_allow_excess_precision \
-    --xla_gpu_enable_highest_priority_async_stream=true \
-    --xla_gpu_enable_triton_softmax_fusion=false \
     --xla_gpu_all_reduce_combine_threshold_bytes=${ALL_REDUCE_THRESHOLD_BYTES} \
     --xla_gpu_graph_level=0 \
     --xla_gpu_all_gather_combine_threshold_bytes=${ALL_GATHER_THRESHOLD_BYTES} \