Skip to content

Commit 3225b24

Browse files
author
Akshi22
committed
Clamped indices to defend against out-of-bounds (OOB) access
1 parent 314b68d commit 3225b24

File tree

10 files changed

+50
-18
lines changed

10 files changed

+50
-18
lines changed

axlearn/common/mixture_of_experts.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,12 +180,18 @@ def calculate_token_position_to_id(block_position_indices, tokens_indices,
180180
group_indices = jnp.arange(G)[None, :, None, None]
181181
group_indices = jnp.broadcast_to(group_indices, (O, G, num_tokens, E))
182182

183+
# Clamp block_position_indices to prevent out-of-bounds access
184+
max_valid_index = num_blocks * block_size
185+
block_position_indices = jnp.clip(block_position_indices, 0, max_valid_index)
186+
183187
token_position_to_id = jnp.zeros((O, G, num_blocks * block_size + 1), dtype=jnp.int32)
184188
token_position_to_id = token_position_to_id.at[batch_indices, group_indices, block_position_indices].set(tokens_indices+1)
185189

186190
token_position_to_id = token_position_to_id[:, :, 1:]
187191
token_position_to_id = token_position_to_id - 1
188192
token_position_to_id = jnp.where(token_position_to_id==-1, total_tokens, token_position_to_id)
193+
# Clamp final result to prevent out-of-bounds access
194+
token_position_to_id = jnp.clip(token_position_to_id, 0, total_tokens)
189195
dest_output = dest_output.at[0].set(token_position_to_id)
190196
return dest_output
191197

@@ -215,6 +221,8 @@ def blockwise_mm_per_group_native(hidden_states, expert_affinities_masked, gate_
215221
def body_fun(b, carry):
216222
output_jax = carry
217223
local_token_position_to_id = token_position_to_id[b, :]
224+
# Clamp indices to prevent out-of-bounds access on Neuron hardware
225+
local_token_position_to_id = jnp.clip(local_token_position_to_id, 0, hidden_states.shape[0] - 1)
218226
hidden_states_padded = hidden_states
219227
expert_affinities_padded = expert_affinities
220228
local_hidden_states = hidden_states_padded[local_token_position_to_id].astype(jnp.float32)
@@ -965,6 +973,9 @@ def compute_token_assignments(token_permutation_idx, num_experts, expert_capacit
965973
group_indices = group_indices.reshape(O, G, -1)
966974

967975
token_permutation_idx = token_permutation_idx.reshape(O, G, -1)
976+
# Clamp token_permutation_idx to prevent out-of-bounds scatter access
977+
max_valid_index = expert_capacity * num_experts
978+
token_permutation_idx = jnp.clip(token_permutation_idx, 0, max_valid_index)
968979

969980
# Create scatter indices
970981
scatter_indices = jnp.stack(
@@ -1222,8 +1233,9 @@ def get_token_position_to_id(
12221233
group_indices = jnp.arange(G)[None, :, None, None]
12231234
group_indices = jnp.broadcast_to(group_indices, (O, G, num_tokens, E))
12241235

1225-
# (O, G, S*top_k, E)
1226-
# block_position_indices
1236+
# Clamp block_position_indices to prevent out-of-bounds scatter access
1237+
max_valid_index = num_blocks * block_size
1238+
block_position_indices = jnp.clip(block_position_indices, 0, max_valid_index)
12271239

12281240
# Create scatter indices
12291241
scatter_indices = jnp.stack([batch_indices, group_indices, block_position_indices], axis=-1, dtype=jnp.int32)
@@ -1247,6 +1259,8 @@ def get_token_position_to_id(
12471259

12481260
token_position_to_id = token_position_to_id - 1
12491261
token_position_to_id = jnp.where(token_position_to_id==-1, num_tokens,token_position_to_id)
1262+
# Clamp final token_position_to_id to prevent out-of-bounds access
1263+
token_position_to_id = jnp.clip(token_position_to_id, 0, num_tokens)
12501264
token_position_to_id = self._remat_name(token_position_to_id, "blockwisegating.token_position_to_id")
12511265
return token_position_to_id
12521266

@@ -1473,6 +1487,12 @@ def forward(self, logits):
14731487
check_rep=False
14741488
)
14751489
token_position_to_id = token_position_to_id_sm(expert_capacity, block_position_indices, local_num_experts)
1490+
# Clamp token_position_to_id indices
1491+
token_position_to_id = jnp.clip(token_position_to_id, 0, S - 1)
1492+
1493+
# Clamp block_to_expert indices
1494+
block_to_expert = jnp.clip(block_to_expert, 0, cfg.num_experts - 1)
1495+
14761496
router_z_loss = _router_z_loss(logits)
14771497

14781498
return self.Output(

axlearn/experiments/text/gpt/envy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@
9797

9898
MAX_SEQUENCE_LENGTH = {
9999
"test": 8192,
100-
"Switch-Base": 8192,
100+
"Switch-Base": 2048,
101101
"Switch-Large": 8192,
102102
"Switch-XXL": 8192,
103103
"Mistral-toy": 256,

profile.slurm

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,5 @@ export PROFILE_JOB_NAME=$2
1212
export PROFILE_JOB_ID=$1
1313
export AXLEARN_PROFILE_MODE=capture
1414

15-
srun -l setup_node.sh ../may-artifacts/
15+
srun -l setup_node.sh /fsx/akshiaws/jul-end-artifacts
1616
srun -l runner.sh

runner.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ export NEURON_CC_FLAGS="${NEURON_CC_FLAGS} --dump=${NEURON_DUMP_PATH}"
140140

141141
# use to add debug logging at module level in xla
142142
export TF_CPP_MIN_LOG_LEVEL=0
143-
export TF_CPP_VMODULE="neuron_token_threading=2"
143+
export TF_CPP_VMODULE="neuron_token_threading=2,neuron_repeated_dus_to_concat=3"
144144

145145
# JAX Cache
146146
# export JAX_COMPILATION_CACHE_DIR="cache/"

runners/full_convergence_16x10b.slurm

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ export RENAME_JOB=true
2828
export RENAME_JOB_PREFIX=rh
2929

3030
if [ ${1:-1} = "1" ]; then
31-
srun -l ./setup_node.sh ../may-artifacts/
31+
srun -l ./setup_node.sh /fsx/akshiaws/jul-end-artifacts
3232
else
3333
echo "Skip installing"
3434
fi

runners/full_convergence_8x20b.slurm

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ export RENAME_JOB=true
3535
export RENAME_JOB_PREFIX=rh
3636

3737
if [ ${1:-1} = "1" ]; then
38-
srun -l ./setup_node.sh /fsx/aahila/jul-artifacts/
38+
srun -l ./setup_node.sh /fsx/akshiaws/jul-end-artifacts
3939
else
4040
echo "Skip installing"
4141
fi

runners/full_convergence_8x7b.slurm

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ export RENAME_JOB=true
3535
export RENAME_JOB_PREFIX=rh
3636

3737
if [ ${1:-1} = "1" ]; then
38-
srun -l ./setup_node.sh ../may-artifacts/
38+
srun -l ./setup_node.sh /fsx/akshiaws/jul-end-artifacts
3939
else
4040
echo "Skip installing"
4141
fi

runners/run.slurm

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,12 @@ elif [ $mode = "repeated" ]; then
2929
echo "Using repeated"
3030
export AXLEARN_REPEATED=1
3131
export VENV_NAME=jaxmoe
32-
# Set to use repeated, make sure to use /fsx/huilgolr/may-artifacts/repeated/libneuronxla-2.2.20250521+7e624b6.dev-py3-none-linux_x86_64
32+
# Set to use repeated, make sure to use /fsx/huilgolr/jul-end-artifacts/repeated/libneuronxla-2.2.20250521+7e624b6.dev-py3-none-linux_x86_64
3333
fi
3434

3535
if [ ${2:-1} = "1" ]; then
3636
echo "Installing"
37-
srun -l ./setup_node.sh ../may-artifacts/
37+
srun -l ./setup_node.sh /fsx/akshiaws/jul-end-artifacts
3838
else
3939
echo "Skip installing"
4040
fi

runners/run_full.slurm

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,12 @@ elif [ $mode = "repeated" ]; then
2929
echo "Using repeated"
3030
export AXLEARN_REPEATED=1
3131
export VENV_NAME=jaxmoe
32-
# Set to use repeated, make sure to use /fsx/huilgolr/may-artifacts/repeated/libneuronxla-2.2.20250521+7e624b6.dev-py3-none-linux_x86_64
32+
# Set to use repeated, make sure to use /fsx/huilgolr/jul-end-artifacts/repeated/libneuronxla-2.2.20250521+7e624b6.dev-py3-none-linux_x86_64
3333
fi
3434

3535
if [ ${2:-1} = "1" ]; then
3636
echo "Installing"
37-
srun -l ./setup_node.sh ../may-artifacts/
37+
srun -l ./setup_node.sh /fsx/akshiaws/jul-end-artifacts
3838
else
3939
echo "Skip installing"
4040
fi

runners/switch_source.sh

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,26 @@
11
#!/bin/bash
2+
#export AXLEARN_JAX_BACKEND="cpu"
3+
24
# comment this out to run full model
3-
export AXLEARN_NUM_LAYERS=2
5+
export AXLEARN_NUM_LAYERS=4
46
export AXLEARN_REMAT_LAYER=selective
57
export AXLEARN_MODEL_NAME="envy-Switch-Base"
68
export AXLEARN_TP_DEGREE=4
79
export AXLEARN_EP_DEGREE=4
810
export AXLEARN_SEQ_DEGREE=4
11+
export AXLEARN_FSDP_DEGREE=1
912
export AXLEARN_TRAIN_BATCH_SIZE=4
13+
1014
# use v2 index calc
1115
export AXLEARN_USE_BLOCKWISE=2
1216

17+
# 0: dense, 1: sparse, 2: alternating
18+
export AXLEARN_MOE_LAYER_FREQ=2
19+
20+
export EP_WITHIN_NODE=1
21+
# export AXLEARN_PROFILE_MODE="tracerun"
22+
# export NEURON_RT_LOCAL_CORE_DUMP_DIRECTORY=""
23+
1324
if [ "${AXLEARN_SEQ_DEGREE:-0}" -gt 1 ]; then
1425
export AXLEARN_FLASH_ATTENTION=0
1526
else
@@ -18,11 +29,12 @@ fi
1829

1930
export AXLEARN_REPEATED=1
2031
# it expects the env to be at ../$VENV_NAME
21-
export VENV_NAME=jaxmoe
22-
32+
export VENV_NAME=akshiaws/jaxmoe
2333
# to simulate slurm job run
2434
export SLURM_PROCID=0
2535
# to output artifacts at this path ./artifacts/JOB_ID/
26-
export JOB_ID=switchbaseep64
27-
rm -rf ./artifacts/$JOB_ID/
28-
bash runner.sh 2>&1 | tee log_$JOB_ID.out
36+
export JOB_ID=dummy
37+
38+
rm -rf /fsx/akshiaws/artifacts/$JOB_ID/
39+
bash /fsx/akshiaws/axlearn/runner.sh
40+
2>&1 | tee log_$JOB_ID.out

0 commit comments

Comments (0)