intel
diff --git a/‎.github/workflows/sycl-linux-build.yml‎
Lines changed: 18 additions & 4 deletions b/‎.github/workflows/sycl-linux-build.yml‎
Lines changed: 18 additions & 4 deletions
diff --git a/‎.github/workflows/sycl-linux-precommit.yml‎
Lines changed: 52 additions & 0 deletions b/‎.github/workflows/sycl-linux-precommit.yml‎
Lines changed: 52 additions & 0 deletions
diff --git a/‎.github/workflows/sycl-nightly.yml‎
Lines changed: 11 additions & 2 deletions b/‎.github/workflows/sycl-nightly.yml‎
Lines changed: 11 additions & 2 deletions
diff --git a/‎.github/workflows/sycl-ur-perf-benchmarking.yml‎
Lines changed: 20 additions & 12 deletions b/‎.github/workflows/sycl-ur-perf-benchmarking.yml‎
Lines changed: 20 additions & 12 deletions
diff --git a/‎.github/workflows/sycl-windows-build.yml‎
Lines changed: 19 additions & 4 deletions b/‎.github/workflows/sycl-windows-build.yml‎
Lines changed: 19 additions & 4 deletions
diff --git a/‎.github/workflows/sycl-windows-precommit.yml‎
Lines changed: 28 additions & 0 deletions b/‎.github/workflows/sycl-windows-precommit.yml‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎clang/include/clang/Basic/OffloadArch.h‎
Lines changed: 1 addition & 1 deletion b/‎clang/include/clang/Basic/OffloadArch.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎clang/lib/CodeGen/BackendUtil.cpp‎
Lines changed: 10 additions & 4 deletions b/‎clang/lib/CodeGen/BackendUtil.cpp‎
Lines changed: 10 additions & 4 deletions
@@ -75,6 +75,9 @@ on:
       e2e_binaries_preview_artifact:
         type: string
         required: false
+      e2e_binaries_new_offload_model_artifact:
+        type: string
+        required: false
 
     outputs:
       build_conclusion:
@@ -260,10 +263,6 @@ jobs:
       if: ${{ !cancelled() && contains(inputs.changes, 'libclc') }}
       run: |
         cmake --build $GITHUB_WORKSPACE/build --target check-libclc
-    - name: check-libdevice
-      if: ${{ !cancelled() && contains(inputs.changes, 'sycl') }}
-      run: |
-        cmake --build $GITHUB_WORKSPACE/build --target check-libdevice
     - name: Check E2E test requirements
       if: ${{ !cancelled() && !contains(inputs.changes, 'sycl') }}
       run: |
@@ -339,6 +338,21 @@ jobs:
         sycl_compiler: $GITHUB_WORKSPACE/toolchain/bin/clang++
         extra_lit_opts: --param sycl_build_targets="spir;nvidia;amd"
 
+    # Build-only E2E pass with the new offload model enabled. Runs only when
+    # the caller supplies an artifact name, the `build` step succeeded, and the
+    # pull request carries the 'new-offload-model' label.
+    - name: Build E2E tests with NewOffloadModel
+      if: ${{ inputs.e2e_binaries_new_offload_model_artifact && !cancelled() && steps.build.conclusion == 'success' && contains(github.event.pull_request.labels.*.name, 'new-offload-model') }}
+      uses: ./devops/actions/run-tests/e2e
+      with:
+        ref: ${{ inputs.ref || github.sha }}
+        testing_mode: build-only
+        target_devices: all
+        # Binaries are stored under the dedicated new-offload-model artifact name.
+        binaries_artifact: ${{ inputs.e2e_binaries_new_offload_model_artifact }}
+        sycl_compiler: $GITHUB_WORKSPACE/toolchain/bin/clang++
+        extra_lit_opts: --param sycl_build_targets="spir;nvidia;amd" --param enable_new_offload_model=True
+
     - name: Build E2E tests with SPIR-V Backend
       if: ${{ inputs.e2e_binaries_spirv_backend_artifact && !cancelled() && steps.build.conclusion == 'success' }}
       uses: ./devops/actions/run-tests/e2e
 
@@ -66,6 +66,7 @@ jobs:
       toolchain_artifact: sycl_linux_default
       e2e_binaries_artifact: e2e_bin
       e2e_binaries_preview_artifact: e2e_bin_preview
+      e2e_binaries_new_offload_model_artifact: e2e_bin_new_offload_model
 
   # Build and run native cpu e2e tests separately as cannot currently
   # build all the e2e tests
@@ -236,6 +237,57 @@ jobs:
       skip_run: ${{matrix.use_igc_dev && contains(github.event.pull_request.labels.*.name, 'ci-no-devigc') || matrix.skip_run || 'false'}}
       env: ${{ matrix.env || (contains(needs.detect_changes.outputs.filters, 'esimd') && '{}' || '{"LIT_FILTER_OUT":"ESIMD/"}') }}
 
+  # Runs the prebuilt new-offload-model E2E binaries across the runner matrix.
+  # Only scheduled when the build succeeded and the pull request carries the
+  # 'new-offload-model' label.
+  E2E-with-new-offload-model:
+    needs: [build, detect_changes, compat_read_exclude]
+    if: ${{ !cancelled() && needs.build.outputs.build_conclusion == 'success' && contains(github.event.pull_request.labels.*.name, 'new-offload-model') }}
+    permissions:
+      contents: write
+      packages: read
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: Intel / GEN 12 Integrated
+            runner: '["Linux", "gen12"]'
+            target_devices: 'opencl:cpu;opencl:gpu'
+          - name: Intel / Arc A-Series Graphics
+            runner: '["Linux", "arc"]'
+            target_devices: 'level_zero:gpu'
+          - name: Intel / Ponte Vecchio GPU
+            runner: '["Linux", "pvc"]'
+            target_devices: 'level_zero:gpu'
+          - name: Intel / Battlemage Graphics
+            runner: '["Linux", "bmg"]'
+            target_devices: 'level_zero_v2:gpu'
+          - name: NVIDIA/CUDA
+            runner: '["Linux", "cuda"]'
+            image_options: '-u 1001 --gpus all --cap-add SYS_ADMIN'
+            target_devices: 'cuda:gpu'
+          - name: AMD/HIP
+            runner: '["Linux", "amdgpu"]'
+            image_options: '-u 1001 --device=/dev/dri --device=/dev/kfd'
+            target_devices: 'hip:gpu'
+            extra_lit_opts: '-j 1'
+
+    uses: ./.github/workflows/sycl-linux-run-tests.yml
+    with:
+      name: "${{ matrix.name }} with NewOffloadModel"
+      runner: ${{ matrix.runner }}
+      image: ${{ matrix.image }}
+      # Matrix entries without their own image_options fall back to this default.
+      image_options: "${{ matrix.image_options || '-u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN' }}"
+      target_devices: ${{ matrix.target_devices }}
+      # Per-entry lit options come first; the new offload model flag is always appended.
+      extra_lit_opts: "${{ matrix.extra_lit_opts }} --param enable_new_offload_model=True"
+      repo_ref: ${{ github.sha }}
+      toolchain_artifact: ${{ needs.build.outputs.toolchain_artifact }}
+      toolchain_artifact_filename: ${{ needs.build.outputs.toolchain_artifact_filename }}
+      toolchain_decompress_command: ${{ needs.build.outputs.toolchain_decompress_command }}
+      # Same artifact name that is passed as e2e_binaries_new_offload_model_artifact
+      # elsewhere in this workflow.
+      binaries_artifact: e2e_bin_new_offload_model
+      testing_mode: run-only
+
+
   test-perf:
     needs: [build, detect_changes]
     permissions:
 
@@ -92,6 +92,7 @@ jobs:
       toolchain_artifact_filename: sycl_linux_libcxx.tar.zst
 
   ubuntu2204_test:
+    name: ubuntu2204_test
     needs: [ubuntu2204_build]
     permissions:
       contents: write
@@ -100,6 +101,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
+        enable_new_offload_model: ['False', 'True']
         include:
           - name: AMD/HIP
             runner: '["Linux", "amdgpu"]'
@@ -111,6 +113,7 @@ jobs:
             image_options: -u 1001 --device=/dev/dri --device=/dev/kfd
             target_devices: hip:gpu
             extra_lit_opts: --param test-preview-mode=True
+            enable_new_offload_model: 'False'
 
           - name: NVIDIA/CUDA
             runner: '["Linux", "cuda"]'
@@ -122,6 +125,7 @@ jobs:
             image_options: -u 1001 --gpus all --cap-add SYS_ADMIN
             target_devices: cuda:gpu
             extra_lit_opts: --param test-preview-mode=True
+            enable_new_offload_model: 'False'
 
           - name: Intel L0 Gen12 GPU
             runner: '["Linux", "gen12"]'
@@ -139,6 +143,7 @@ jobs:
             runner: '["Linux", "bmg"]'
             target_devices: level_zero:gpu
             extra_lit_opts: --param test-preview-mode=True
+            enable_new_offload_model: 'False'
 
           - name: Intel L0 Arc A-Series GPU
             runner: '["Linux", "arc"]'
@@ -167,15 +172,16 @@ jobs:
             runner: '["Linux", "pvc"]'
             target_devices: level_zero:gpu
             extra_lit_opts: --param test-preview-mode=True
+            enable_new_offload_model: 'False'
 
     uses: ./.github/workflows/sycl-linux-run-tests.yml
     with:
-      name: ${{ matrix.name }}
+      name: ${{ matrix.name }} with ${{ matrix.enable_new_offload_model == 'True' && 'New Offload Model' || 'Old Offload Model' }}
       runner: ${{ matrix.runner }}
       image_options: ${{ matrix.image_options || '-u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN' }}
       target_devices: ${{ matrix.target_devices }}
       tests_selector: e2e
-      extra_lit_opts: "--param 'cxx_flags=-D_GLIBCXX_USE_CXX11_ABI=0' ${{ matrix.extra_lit_opts }}"
+      extra_lit_opts: "--param 'cxx_flags=-D_GLIBCXX_USE_CXX11_ABI=0' ${{ matrix.extra_lit_opts }} --param enable_new_offload_model=${{ matrix.enable_new_offload_model }}"
       repo_ref: ${{ github.sha }}
       toolchain_artifact: ${{ needs.ubuntu2204_build.outputs.toolchain_artifact }}
       toolchain_artifact_filename: ${{ needs.ubuntu2204_build.outputs.toolchain_artifact_filename }}
@@ -214,13 +220,15 @@ jobs:
 
   e2e-win:
     needs: build-win
+    name: E2E win with ${{ matrix.enable_new_offload_model == 'True' && 'New Offload Model' || 'Old Offload Model' }}
     # Continue if build was successful.
     if: |
       !cancelled()
       && needs.build-win.outputs.build_conclusion == 'success'
     strategy:
       fail-fast: false
       matrix:
+        enable_new_offload_model: ['False', 'True']
         include:
           - name: Intel Gen12 GPU
             runner: '["Windows", "gen12"]'
@@ -237,6 +245,7 @@ jobs:
       runner: ${{ matrix.runner }}
       target_devices: level_zero:gpu
       toolchain_artifact_filename: ${{ needs.build-win.outputs.toolchain_artifact_filename }}
+      extra_lit_opts: --param enable_new_offload_model=${{ matrix.enable_new_offload_model }}
 
   cuda-aws-start:
     needs: [ubuntu2204_build]
 
@@ -56,16 +56,9 @@ on:
         description: Save and upload results (to https://intel.github.io/llvm/benchmarks)
         type: choice
         options:
-          - false
-          - true
-        default: true
-      exit_on_failure:
-        description: Fail benchmark script on any error. Limit number of iterations to just test correctness.
-        type: choice
-        options:
-          - false
-          - true
-        default: false
+          - 'false'
+          - 'true'
+        default: 'true'
       runner:
         description: Self-hosted runner to use for the benchmarks
         type: choice
@@ -78,6 +71,21 @@ on:
         options:
           - 'level_zero:gpu'
           - 'level_zero_v2:gpu'
+      exit_on_failure:
+        description: Fail benchmark script on any error. If true, limit number of iterations to just test correctness.
+        type: choice
+        options:
+          - 'false'
+          - 'true'
+        default: 'false'
+      # Special input to trigger nightly benchmarking; rest of inputs are ignored in that case.
+      trigger_nightly:
+        description: Trigger nightly benchmarking (run and save Baseline)
+        type: choice
+        options:
+          - 'false'
+          - 'true'
+        default: 'false'
 
 concurrency:
   # Cancel a currently running workflow for:
@@ -96,7 +104,7 @@ jobs:
   # Manual trigger (dispatch) path:
   sanitize_inputs_dispatch:
     name: '[Dispatch] Sanitize inputs'
-    if: github.event_name == 'workflow_dispatch'
+    if: ${{ github.event_name == 'workflow_dispatch' && inputs.trigger_nightly == 'false' }}
     runs-on: ubuntu-latest
     env:
       COMMIT_HASH: ${{ inputs.commit_hash }}
@@ -194,7 +202,7 @@ jobs:
   # Nightly benchmarking path:
   build_nightly:
     name: '[Nightly] Build SYCL'
-    if: github.repository == 'intel/llvm' && github.event_name == 'schedule'
+    if: ${{ github.repository == 'intel/llvm' && (github.event_name == 'schedule' || inputs.trigger_nightly == 'true') }}
     uses: ./.github/workflows/sycl-linux-build.yml
     secrets: inherit
     with:
 
@@ -50,6 +50,10 @@ on:
         type: string
         required: false
 
+      e2e_binaries_new_offload_model_artifact:
+        type: string
+        required: false
+
     outputs:
       build_conclusion:
         value: ${{ jobs.build.outputs.build_conclusion }}
@@ -186,10 +190,6 @@ jobs:
       if: ${{ !cancelled() && contains(inputs.changes, 'xptifw') }}
       run: |
         cmake --build build --target check-xptifw
-    - name: check-libdevice
-      if: ${{ !cancelled() && contains(inputs.changes, 'sycl') }}
-      run: |
-        cmake --build build --target check-libdevice
     - name: Generate/diff new ABI symbols
       if: ${{ !cancelled() && contains(inputs.changes, 'sycl') }}
       shell: bash
@@ -255,6 +255,21 @@ jobs:
         extra_lit_opts: --param sycl_build_targets="spir"
         cxx: ${{ inputs.cxx }}
 
+    # Build-only E2E pass with the new offload model enabled. The guard checks
+    # the same input that is passed as `binaries_artifact` below, so the step
+    # is skipped whenever no new-offload-model artifact name was supplied.
+    - name: Build E2E tests with New Offload Model
+      if: |
+        inputs.e2e_binaries_new_offload_model_artifact &&
+        !cancelled() &&
+        steps.build.conclusion == 'success' &&
+        contains(github.event.pull_request.labels.*.name, 'new-offload-model')
+      uses: ./devops/actions/run-tests/windows/e2e
+      with:
+        ref: ${{ inputs.ref || github.sha }}
+        testing_mode: build-only
+        target_devices: all
+        binaries_artifact: ${{ inputs.e2e_binaries_new_offload_model_artifact }}
+        extra_lit_opts: --param sycl_build_targets="spir" --param enable_new_offload_model=True
+        cxx: ${{ inputs.cxx }}
+
     - name:  Detect hung tests
       if: always()
       shell: powershell
 
@@ -57,6 +57,7 @@ jobs:
     with:
       changes: ${{ needs.detect_changes.outputs.filters }}
       e2e_binaries_artifact: sycl_windows_e2ebin
+      e2e_binaries_new_offload_model_artifact: sycl_windows_e2ebin_with_new_offload_model
 
   run_prebuilt_e2e_tests:
     needs: build
@@ -82,3 +83,30 @@ jobs:
       toolchain_artifact_filename: ${{ needs.build.outputs.toolchain_artifact_filename }}
       testing_mode: run-only
       binaries_artifact: sycl_windows_e2ebin
+
+  # Runs the prebuilt new-offload-model E2E binaries on the Windows runners
+  # listed in the matrix below.
+  run_prebuilt_e2e_with_new_offload_model_tests:
+    needs: build
+    # Continue only if the build succeeded and the pull request carries the
+    # 'new-offload-model' label.
+    if: ${{ !cancelled() && needs.build.outputs.build_conclusion == 'success' && contains(github.event.pull_request.labels.*.name, 'new-offload-model') }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: Intel GEN12 Graphics with Level Zero
+            runner: '["Windows","gen12"]'
+          - name: Intel Arc Graphics with Level Zero
+            runner: '["Windows","arc"]'
+          - name: Intel Battlemage Graphics with Level Zero
+            runner: '["Windows","bmg"]'
+    uses: ./.github/workflows/sycl-windows-run-tests.yml
+    with:
+      name: ${{ matrix.name }}
+      runner: ${{ matrix.runner }}
+      target_devices: 'level_zero:gpu'
+      toolchain_artifact_filename: ${{ needs.build.outputs.toolchain_artifact_filename }}
+      testing_mode: run-only
+      # Same artifact name that the build job receives as
+      # e2e_binaries_new_offload_model_artifact.
+      binaries_artifact: sycl_windows_e2ebin_with_new_offload_model
+      extra_lit_opts: '--param enable_new_offload_model=True'
@@ -170,7 +170,7 @@ enum class OffloadArch {
   LNL_M,
   LAST,
 
-  CudaDefault = OffloadArch::SM_52,
+  CudaDefault = OffloadArch::SM_75,
   HIPDefault = OffloadArch::GFX906,
 };
 
 
@@ -1104,7 +1104,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
     // configure the pipeline.
     OptimizationLevel Level = mapToLevel(CodeGenOpts);
 
-    if (LangOpts.SYCLIsDevice)
+    if (LangOpts.SYCLIsDevice) {
       PB.registerPipelineStartEPCallback([&](ModulePassManager &MPM,
                                              OptimizationLevel Level) {
         MPM.addPass(SYCLVirtualFunctionsAnalysisPass());
@@ -1118,17 +1118,23 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
             /*FP64ConvEmu=*/CodeGenOpts.FP64ConvEmu,
             /*ExcludeAspects=*/{"fp64"}));
         MPM.addPass(SYCLPropagateJointMatrixUsagePass());
-        // Lowers static/dynamic local memory builtin calls.
-        MPM.addPass(SYCLLowerWGLocalMemoryPass());
         // Compile-time properties pass must create standard metadata as early
         // as possible to make them available for other passes.
         MPM.addPass(CompileTimePropertiesPass());
       });
-    else if (LangOpts.SYCLIsHost && !LangOpts.SYCLESIMDBuildHostCode)
+      PB.registerOptimizerEarlyEPCallback(
+          [](ModulePassManager &MPM, OptimizationLevel, ThinOrFullLTOPhase) {
+            // Allocate static local memory in SYCL kernel scope for each
+            // allocation call. This pass must run after AlwaysInline pass due
+            // to current implementation restriction.
+            MPM.addPass(SYCLLowerWGLocalMemoryPass());
+          });
+    } else if (LangOpts.SYCLIsHost && !LangOpts.SYCLESIMDBuildHostCode) {
       PB.registerPipelineStartEPCallback(
           [&](ModulePassManager &MPM, OptimizationLevel Level) {
             MPM.addPass(ESIMDRemoveHostCodePass());
           });
+    }
 
     // Add the InferAddressSpaces and SYCLOptimizeBarriers passes for all
     // the SPIR[V] targets