Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add torch 2.3, 2.4 #3447

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
name: acpt-pytorch-2.4-cuda12.1
version: auto
type: environment
spec: spec.yaml
extra_config: environment.yaml
test:
pytest:
enabled: true
pip_requirements: tests/requirements.txt
tests_dir: tests
categories: ["PyTorch", "Training"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Curated ACPT image: PyTorch 2.4 / CUDA 12.1 / Python 3.10 on Ubuntu 20.04.
# The tag placeholder is resolved by the asset tooling when the template is rendered.
FROM mcr.microsoft.com/aifx/acpt/stable-ubuntu2004-cu121-py310-torch24x:{{latest-image-tag:biweekly\.\d{6}\.\d{1}.*}}

# Install pip dependencies
COPY requirements.txt .
RUN pip install -r requirements.txt --no-cache-dir

# Inference requirements
COPY --from=mcr.microsoft.com/azureml/o16n-base/python-assets:20230419.v1 /artifacts /var/
RUN /var/requirements/install_system_requirements.sh && \
    cp /var/configuration/rsyslog.conf /etc/rsyslog.conf && \
    cp /var/configuration/nginx.conf /etc/nginx/sites-available/app && \
    ln -sf /etc/nginx/sites-available/app /etc/nginx/sites-enabled/app && \
    rm -f /etc/nginx/sites-enabled/default
ENV SVDIR=/var/runit
ENV WORKER_TIMEOUT=400
EXPOSE 5001 8883 8888

# Support the DeepSpeed launcher's requirement of passwordless ssh login.
# `apt-get update` and `apt-get install` run in the same layer so that a cached
# package index can never be combined with a newer install step; the index is
# removed afterwards to keep the layer small.
RUN apt-get update && \
    apt-get install -y openssh-server openssh-client && \
    rm -rf /var/lib/apt/lists/*
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Azure ML SDK packages; versions are filled in when the template is rendered.
azureml-core=={{latest-pypi-version}}
azureml-dataset-runtime=={{latest-pypi-version}}
azureml-defaults=={{latest-pypi-version}}
azure-ml=={{latest-pypi-version}}
azure-ml-component=={{latest-pypi-version}}
azureml-mlflow=={{latest-pypi-version}}
azureml-contrib-services=={{latest-pypi-version}}
torch-tb-profiler~=0.4.0
azureml-inference-server-http
inference-schema
MarkupSafe==2.1.2
regex
pybind11
# Minimum versions for these three packages.
urllib3>=1.26.18
cryptography>=42.0.4
aiohttp>=3.8.5
py-spy==0.3.12
debugpy~=1.6.3
ipykernel~=6.0
tensorboard
psutil~=5.8.0
matplotlib~=3.5.0
tqdm~=4.66.3
py-cpuinfo==5.0.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
image:
name: azureml/curated/acpt-pytorch-2.4-cuda12.1
os: linux
context:
dir: context
dockerfile: Dockerfile
template_files:
- Dockerfile
- requirements.txt
publish:
location: mcr
visibility: public
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json

description: >-
Recommended environment for Deep Learning in public preview with PyTorch on Azure containing the Azure ML SDK with the latest compatible versions of Ubuntu, Python, PyTorch, CUDA/ROCm, combined with optimizers like ORT Training, DeepSpeed, MSCCL, ORT MoE and more. The image introduces newly released PyTorch 2.4 for early testing, and a preview of a new fast checkpointing capability called Nebula.
Azure Container Registry:mcr.microsoft.com/aifx/acpt/stable-ubuntu2004-cu121-py310-torch24x

name: "{{asset.name}}"
version: "{{asset.version}}"

build:
path: "{{image.context.path}}"
dockerfile_path: "{{image.dockerfile.path}}"

os_type: linux

tags:
PyTorch: "2.4"
GPU: Cuda121
OS: Ubuntu20.04
Training: ""
Preview: ""
Python: "3.10"
DeepSpeed: "0.14.4"
ONNXRuntime: "1.18.0"
torch_ORT: "1.17.0"
Checkpointing:Nebula: "0.16.13"
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Tests running a sample job in the pytorch 2.4 environment."""
import os
import time
from pathlib import Path
from azure.ai.ml import command, Output, MLClient, PyTorchDistribution
from azure.ai.ml.entities import Environment, BuildContext, JobResourceConfiguration
from azure.identity import AzureCliCredential
import subprocess

# Docker build context of the environment under test; joined onto this
# file's directory by the test.
BUILD_CONTEXT = Path("../context")
# Sample training code submitted as the job's source; also joined onto this
# file's directory by the test.
JOB_SOURCE_CODE = "../../acpt-tests/src"
# Maximum time to wait for the job, in minutes. Environment variables are
# always strings, so convert explicitly: multiplying a str by 60 would repeat
# the string instead of producing a number of seconds.
TIMEOUT_MINUTES = int(os.environ.get("timeout_minutes", 60))
# Path checked for the job's std log after the job artifacts are downloaded.
STD_LOG = Path("artifacts/user_logs/std_log.txt")


def test_pytorch_2_4():
    """Build the PyTorch 2.4 environment and run a sample job in it.

    Registers an environment from the local Docker build context, submits a
    two-instance DeepSpeed BERT/GLUE command job that uses it, polls until
    the job reaches a terminal status or ``TIMEOUT_MINUTES`` elapses, prints
    the downloaded std log when the job failed or was cancelled, and finally
    asserts that the job completed.

    Workspace coordinates and the compute target are read from the
    ``subscription_id``, ``resource_group``, ``workspace`` and
    ``gpu_v100_cluster`` environment variables.
    """
    this_dir = Path(__file__).parent

    subscription_id = os.environ.get("subscription_id")
    resource_group = os.environ.get("resource_group")
    workspace_name = os.environ.get("workspace")

    ml_client = MLClient(
        AzureCliCredential(), subscription_id, resource_group, workspace_name
    )

    # Matches the asset under test (PyTorch 2.4 on CUDA 12.1).
    env_name = "acpt-pytorch-2_4-cuda12_1"

    env_docker_context = Environment(
        build=BuildContext(path=this_dir / BUILD_CONTEXT),
        name=env_name,
        description="Pytorch 2.4 environment created from a Docker context.",
    )
    ml_client.environments.create_or_update(env_docker_context)

    # create the command
    job = command(
        code=this_dir / JOB_SOURCE_CODE,  # local path where the code is stored
        command="pip install -r requirements.txt && pip install multiprocess==0.70.15"
                " && python pretrain_glue.py --tensorboard_log_dir \"/outputs/runs/\""
                " --deepspeed ds_config.json --num_train_epochs 5 --output_dir outputs --disable_tqdm 1"
                " --local_rank $RANK --evaluation_strategy \"epoch\" --logging_strategy \"epoch\""
                " --per_device_train_batch_size 93 --gradient_accumulation_steps 1"
                " --per_device_eval_batch_size 93 --learning_rate 3e-05 --adam_beta1 0.8 --adam_beta2 0.999"
                " --weight_decay 3e-07 --warmup_steps 500 --fp16 --logging_steps 1000"
                " --model_checkpoint \"bert-large-uncased\"",
        outputs={
            "output": Output(
                type="uri_folder",
                mode="rw_mount",
                path="azureml://datastores/workspaceblobstore/paths/outputs"
            )
        },
        environment=f"{env_name}@latest",
        compute=os.environ.get("gpu_v100_cluster"),
        display_name="bert-pretrain-GLUE",
        description="Pretrain the BERT model on the GLUE dataset.",
        experiment_name="pytorch24_Cuda121_py310_Experiment",
        distribution=PyTorchDistribution(process_count_per_instance=1),
        resources=JobResourceConfiguration(instance_count=2, shm_size='3100m'),
    )

    returned_job = ml_client.create_or_update(job)
    assert returned_job is not None

    # "Canceled" (single L) is the spelling used by azure-ai-ml's JobStatus;
    # "Cancelled" is kept so the original check still matches if it ever appears.
    cancelled_statuses = ("Canceled", "Cancelled")
    terminal_statuses = ("Completed", "Failed") + cancelled_statuses

    # Poll until final status is reached or timed out.
    # int() guards against TIMEOUT_MINUTES being a str read from the environment.
    deadline = time.time() + int(TIMEOUT_MINUTES) * 60
    current_status = None  # stays None only if the loop body never runs
    while time.time() <= deadline:
        current_status = ml_client.jobs.get(returned_job.name).status
        if current_status in terminal_statuses:
            break
        time.sleep(30)  # sleep 30 seconds

    # Debug aid: list the working directory the artifacts are downloaded into.
    # stderr is not captured, so the second print shows None.
    listing = subprocess.run(["ls"], stdout=subprocess.PIPE, check=False)
    print(listing.stdout)
    print(listing.stderr)

    if current_status == "Failed" or current_status in cancelled_statuses:
        ml_client.jobs.download(returned_job.name)
        if STD_LOG.exists():
            print(f"*** BEGIN {STD_LOG} ***")
            with open(STD_LOG, "r") as f:
                print(f.read(), end="")
            print(f"*** END {STD_LOG} ***")
    else:
        ml_client.jobs.stream(returned_job.name)

    assert current_status == "Completed"
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
azure-ai-ml==1.2.0
azure.identity==1.10.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
name: acpt-pytorch-2.5-cuda12.4
version: auto
type: environment
spec: spec.yaml
extra_config: environment.yaml
test:
pytest:
enabled: true
pip_requirements: tests/requirements.txt
tests_dir: tests
categories: ["PyTorch", "Training"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Curated ACPT image: PyTorch 2.5 / CUDA 12.4 / Python 3.10 on Ubuntu 22.04.
# The tag placeholder is resolved by the asset tooling when the template is rendered.
FROM mcr.microsoft.com/aifx/acpt/stable-ubuntu2204-cu124-py310-torch25x:{{latest-image-tag:biweekly\.\d{6}\.\d{1}.*}}

# Install pip dependencies
COPY requirements.txt .
RUN pip install -r requirements.txt --no-cache-dir

# Inference requirements
COPY --from=mcr.microsoft.com/azureml/o16n-base/python-assets:20230419.v1 /artifacts /var/
RUN /var/requirements/install_system_requirements.sh && \
    cp /var/configuration/rsyslog.conf /etc/rsyslog.conf && \
    cp /var/configuration/nginx.conf /etc/nginx/sites-available/app && \
    ln -sf /etc/nginx/sites-available/app /etc/nginx/sites-enabled/app && \
    rm -f /etc/nginx/sites-enabled/default
ENV SVDIR=/var/runit
ENV WORKER_TIMEOUT=400
EXPOSE 5001 8883 8888

# Support the DeepSpeed launcher's requirement of passwordless ssh login.
# `apt-get update` and `apt-get install` run in the same layer so that a cached
# package index can never be combined with a newer install step; the index is
# removed afterwards to keep the layer small.
RUN apt-get update && \
    apt-get install -y openssh-server openssh-client && \
    rm -rf /var/lib/apt/lists/*
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Azure ML SDK packages; versions are filled in when the template is rendered.
azureml-core=={{latest-pypi-version}}
azureml-dataset-runtime=={{latest-pypi-version}}
azureml-defaults=={{latest-pypi-version}}
azure-ml=={{latest-pypi-version}}
azure-ml-component=={{latest-pypi-version}}
azureml-mlflow=={{latest-pypi-version}}
azureml-contrib-services=={{latest-pypi-version}}
torch-tb-profiler~=0.4.0
azureml-inference-server-http
inference-schema
MarkupSafe==2.1.2
regex
pybind11
# Minimum versions for these three packages.
urllib3>=1.26.18
cryptography>=42.0.4
aiohttp>=3.8.5
py-spy==0.3.12
debugpy~=1.6.3
ipykernel~=6.0
tensorboard
psutil~=5.8.0
matplotlib~=3.5.0
tqdm~=4.66.3
py-cpuinfo==5.0.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
image:
name: azureml/curated/acpt-pytorch-2.5-cuda12.4
os: linux
context:
dir: context
dockerfile: Dockerfile
template_files:
- Dockerfile
- requirements.txt
publish:
location: mcr
visibility: public
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json

description: >-
Recommended environment for Deep Learning in public preview with PyTorch on Azure containing the Azure ML SDK with the latest compatible versions of Ubuntu, Python, PyTorch, CUDA/ROCm, combined with optimizers like ORT Training, DeepSpeed, MSCCL, ORT MoE and more. The image introduces newly released PyTorch 2.5 for early testing, and a preview of a new fast checkpointing capability called Nebula.
Azure Container Registry:mcr.microsoft.com/aifx/acpt/stable-ubuntu2204-cu124-py310-torch25x

name: "{{asset.name}}"
version: "{{asset.version}}"

build:
path: "{{image.context.path}}"
dockerfile_path: "{{image.dockerfile.path}}"

os_type: linux

tags:
PyTorch: "2.5.0"
GPU: Cuda124
OS: Ubuntu22.04
Training: ""
Preview: ""
Python: "3.10"
DeepSpeed: "0.14.4"
ONNXRuntime: "1.18.0"
torch_ORT: "1.17.0"
Checkpointing:Nebula: "0.16.13"
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Tests running a sample job in the pytorch 2.5 environment."""
import os
import time
from pathlib import Path
from azure.ai.ml import command, Output, MLClient, PyTorchDistribution
from azure.ai.ml.entities import Environment, BuildContext, JobResourceConfiguration
from azure.identity import AzureCliCredential
import subprocess

# Docker build context of the environment under test; joined onto this
# file's directory by the test.
BUILD_CONTEXT = Path("../context")
# Sample training code submitted as the job's source; also joined onto this
# file's directory by the test.
JOB_SOURCE_CODE = "../../acpt-tests/src"
# Maximum time to wait for the job, in minutes. Environment variables are
# always strings, so convert explicitly: multiplying a str by 60 would repeat
# the string instead of producing a number of seconds.
TIMEOUT_MINUTES = int(os.environ.get("timeout_minutes", 60))
# Path checked for the job's std log after the job artifacts are downloaded.
STD_LOG = Path("artifacts/user_logs/std_log.txt")


def test_pytorch_2_5():
    """Build the PyTorch 2.5 environment and run a sample job in it.

    Registers an environment from the local Docker build context, submits a
    two-instance DeepSpeed BERT/GLUE command job that uses it, polls until
    the job reaches a terminal status or ``TIMEOUT_MINUTES`` elapses, prints
    the downloaded std log when the job failed or was cancelled, and finally
    asserts that the job completed.

    Workspace coordinates and the compute target are read from the
    ``subscription_id``, ``resource_group``, ``workspace`` and
    ``gpu_v100_cluster`` environment variables.
    """
    this_dir = Path(__file__).parent

    subscription_id = os.environ.get("subscription_id")
    resource_group = os.environ.get("resource_group")
    workspace_name = os.environ.get("workspace")

    ml_client = MLClient(
        AzureCliCredential(), subscription_id, resource_group, workspace_name
    )

    env_name = "acpt-pytorch-2_5-cuda12_4"

    env_docker_context = Environment(
        build=BuildContext(path=this_dir / BUILD_CONTEXT),
        name=env_name,
        description="Pytorch 2.5 environment created from a Docker context.",
    )
    ml_client.environments.create_or_update(env_docker_context)

    # create the command
    job = command(
        code=this_dir / JOB_SOURCE_CODE,  # local path where the code is stored
        command="pip install -r requirements.txt && pip install multiprocess==0.70.15"
                " && python pretrain_glue.py --tensorboard_log_dir \"/outputs/runs/\""
                " --deepspeed ds_config.json --num_train_epochs 5 --output_dir outputs --disable_tqdm 1"
                " --local_rank $RANK --evaluation_strategy \"epoch\" --logging_strategy \"epoch\""
                " --per_device_train_batch_size 93 --gradient_accumulation_steps 1"
                " --per_device_eval_batch_size 93 --learning_rate 3e-05 --adam_beta1 0.8 --adam_beta2 0.999"
                " --weight_decay 3e-07 --warmup_steps 500 --fp16 --logging_steps 1000"
                " --model_checkpoint \"bert-large-uncased\"",
        outputs={
            "output": Output(
                type="uri_folder",
                mode="rw_mount",
                path="azureml://datastores/workspaceblobstore/paths/outputs"
            )
        },
        environment=f"{env_name}@latest",
        compute=os.environ.get("gpu_v100_cluster"),
        display_name="bert-pretrain-GLUE",
        description="Pretrain the BERT model on the GLUE dataset.",
        experiment_name="pytorch25_Cuda124_py310_Experiment",
        distribution=PyTorchDistribution(process_count_per_instance=1),
        resources=JobResourceConfiguration(instance_count=2, shm_size='3100m'),
    )

    returned_job = ml_client.create_or_update(job)
    assert returned_job is not None

    # "Canceled" (single L) is the spelling used by azure-ai-ml's JobStatus;
    # "Cancelled" is kept so the original check still matches if it ever appears.
    cancelled_statuses = ("Canceled", "Cancelled")
    terminal_statuses = ("Completed", "Failed") + cancelled_statuses

    # Poll until final status is reached or timed out.
    # int() guards against TIMEOUT_MINUTES being a str read from the environment.
    deadline = time.time() + int(TIMEOUT_MINUTES) * 60
    current_status = None  # stays None only if the loop body never runs
    while time.time() <= deadline:
        current_status = ml_client.jobs.get(returned_job.name).status
        if current_status in terminal_statuses:
            break
        time.sleep(30)  # sleep 30 seconds

    # Debug aid: list the working directory the artifacts are downloaded into.
    # stderr is not captured, so the second print shows None.
    listing = subprocess.run(["ls"], stdout=subprocess.PIPE, check=False)
    print(listing.stdout)
    print(listing.stderr)

    if current_status == "Failed" or current_status in cancelled_statuses:
        ml_client.jobs.download(returned_job.name)
        if STD_LOG.exists():
            print(f"*** BEGIN {STD_LOG} ***")
            with open(STD_LOG, "r") as f:
                print(f.read(), end="")
            print(f"*** END {STD_LOG} ***")
    else:
        ml_client.jobs.stream(returned_job.name)

    assert current_status == "Completed"
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
azure-ai-ml==1.2.0
azure.identity==1.10.0
Loading