Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add MXJob v1 api and controller #1296

Merged
merged 1 commit into from
Jul 11, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
bin/
/tf-operator
vendor/
testbin/*
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where is this being generated from?

Copy link
Member Author

@Jeffwan Jeffwan Jul 11, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


# IDEs
.vscode/

# Compiled python files.
Expand Down
8 changes: 8 additions & 0 deletions PROJECT
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,12 @@ resources:
kind: TFJob
path: github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1
version: v1
- api:
crdVersion: v1
namespaced: true
controller: true
group: kubeflow.org
kind: MXJob
path: github.com/kubeflow/tf-operator/pkg/apis/mxnet/v1
version: v1
version: "3"
6,899 changes: 6,899 additions & 0 deletions config/crd/bases/kubeflow.org_mxjobs.yaml

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions config/crd/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ resources:
- bases/kubeflow.org_xgboostjobs.yaml
- bases/kubeflow.org_pytorchjobs.yaml
- bases/kubeflow.org_tfjobs.yaml
- bases/kubeflow.org_mxjobs.yaml
#+kubebuilder:scaffold:crdkustomizeresource

patchesStrategicMerge:
Expand All @@ -13,13 +14,15 @@ patchesStrategicMerge:
#- patches/webhook_in_xgboostjobs.yaml
#- patches/webhook_in_pytorchjobs.yaml
#- patches/webhook_in_tfjobs.yaml
#- patches/webhook_in_mxjobs.yaml
#+kubebuilder:scaffold:crdkustomizewebhookpatch

# [CERTMANAGER] To enable webhook, uncomment all the sections with [CERTMANAGER] prefix.
# patches here are for enabling the CA injection for each CRD
#- patches/cainjection_in_xgboostjobs.yaml
#- patches/cainjection_in_pytorchjobs.yaml
#- patches/cainjection_in_tfjobs.yaml
#- patches/cainjection_in_mxjobs.yaml
#+kubebuilder:scaffold:crdkustomizecainjectionpatch

# the following config is for teaching kustomize how to do kustomization for CRDs.
Expand Down
7 changes: 7 additions & 0 deletions config/crd/patches/cainjection_in_mxjobs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# The following patch adds a directive for certmanager to inject CA into the CRD
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
cert-manager.io/inject-ca-from: $(CERTIFICATE_NAMESPACE)/$(CERTIFICATE_NAME)
name: mxjobs.kubeflow.org
14 changes: 14 additions & 0 deletions config/crd/patches/webhook_in_mxjobs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# The following patch enables a conversion webhook for the CRD
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
name: mxjobs.kubeflow.org
spec:
conversion:
strategy: Webhook
webhook:
clientConfig:
service:
namespace: system
name: webhook-service
path: /convert
24 changes: 24 additions & 0 deletions config/rbac/mxjob_editor_role.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# permissions for end users to edit mxjobs.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: mxjob-editor-role
rules:
- apiGroups:
- kubeflow.org
resources:
- mxjobs
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- kubeflow.org
resources:
- mxjobs/status
verbs:
- get
20 changes: 20 additions & 0 deletions config/rbac/mxjob_viewer_role.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# permissions for end users to view mxjobs.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: mxjob-viewer-role
rules:
- apiGroups:
- kubeflow.org
resources:
- mxjobs
verbs:
- get
- list
- watch
- apiGroups:
- kubeflow.org
resources:
- mxjobs/status
verbs:
- get
45 changes: 45 additions & 0 deletions examples/mxnet/mxjob_dist_v1.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
apiVersion: "kubeflow.org/v1"
kind: "MXJob"
metadata:
name: "mxnet-job"
spec:
jobMode: MXTrain
mxReplicaSpecs:
Scheduler:
replicas: 1
restartPolicy: Never
template:
spec:
containers:
- name: mxnet
image: mxjob/mxnet:gpu
ports:
- containerPort: 9991
name: mxjob-port
Server:
replicas: 1
restartPolicy: Never
template:
spec:
containers:
- name: mxnet
image: mxjob/mxnet:gpu
ports:
- containerPort: 9991
name: mxjob-port
Worker:
replicas: 1
restartPolicy: Never
template:
spec:
containers:
- name: mxnet
image: mxjob/mxnet:gpu
command: ["python"]
args: ["/incubator-mxnet/example/image-classification/train_mnist.py","--num-epochs","10","--num-layers","2","--kv-store","dist_device_sync","--gpus","0"]
resources:
limits:
nvidia.com/gpu: 1
ports:
- containerPort: 9991
name: mxjob-port
20 changes: 14 additions & 6 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,22 @@ import (

// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
// to ensure that exec-entrypoint and run can make use of them.
pytorchv1 "github.com/kubeflow/tf-operator/pkg/apis/pytorch/v1"
tensorflowv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1"
xgboostv1 "github.com/kubeflow/tf-operator/pkg/apis/xgboost/v1"
pytorchcontroller "github.com/kubeflow/tf-operator/pkg/controller.v1/pytorch"
tensorflowcontroller "github.com/kubeflow/tf-operator/pkg/controller.v1/tensorflow"
xgboostcontroller "github.com/kubeflow/tf-operator/pkg/controller.v1/xgboost"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
_ "k8s.io/client-go/plugin/pkg/client/auth"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log/zap"

mxnetv1 "github.com/kubeflow/tf-operator/pkg/apis/mxnet/v1"
pytorchv1 "github.com/kubeflow/tf-operator/pkg/apis/pytorch/v1"
tensorflowv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1"
xgboostv1 "github.com/kubeflow/tf-operator/pkg/apis/xgboost/v1"
mxnetcontroller "github.com/kubeflow/tf-operator/pkg/controller.v1/mxnet"
pytorchcontroller "github.com/kubeflow/tf-operator/pkg/controller.v1/pytorch"
tensorflowcontroller "github.com/kubeflow/tf-operator/pkg/controller.v1/tensorflow"
xgboostcontroller "github.com/kubeflow/tf-operator/pkg/controller.v1/xgboost"
//+kubebuilder:scaffold:imports
)

Expand All @@ -49,6 +52,7 @@ func init() {
utilruntime.Must(xgboostv1.AddToScheme(scheme))
utilruntime.Must(pytorchv1.AddToScheme(scheme))
utilruntime.Must(tensorflowv1.AddToScheme(scheme))
utilruntime.Must(mxnetv1.AddToScheme(scheme))
//+kubebuilder:scaffold:scheme
}

Expand Down Expand Up @@ -96,6 +100,10 @@ func main() {
setupLog.Error(err, "unable to create controller", "controller", "TFJob")
os.Exit(1)
}
if err = mxnetcontroller.NewReconciler(mgr).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "MXJob")
os.Exit(1)
}
//+kubebuilder:scaffold:builder

if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
Expand Down
18 changes: 18 additions & 0 deletions pkg/apis/mxnet/v1/constants.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package v1

import commonv1 "github.com/kubeflow/common/pkg/apis/common/v1"

const (
// EnvKubeflowNamespace is ENV for kubeflow namespace specified by user.
EnvKubeflowNamespace = "KUBEFLOW_NAMESPACE"

// DefaultPortName is name of the port used to communicate between scheduler and
// servers & workers.
DefaultPortName = "mxjob-port"
// DefaultContainerName is the name of the MXJob container.
DefaultContainerName = "mxnet"
// DefaultPort is default value of the port.
DefaultPort = 9091
// DefaultRestartPolicy is default RestartPolicy for MXReplicaSpec.
DefaultRestartPolicy = commonv1.RestartPolicyNever
)
34 changes: 34 additions & 0 deletions pkg/apis/mxnet/v1/groupversion_info.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Copyright YEAR The Kubeflow Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package v1 contains API Schema definitions for the kubeflow.org v1 API group
//+kubebuilder:object:generate=true
//+groupName=kubeflow.org
package v1

import (
"k8s.io/apimachinery/pkg/runtime/schema"
"sigs.k8s.io/controller-runtime/pkg/scheme"
)

var (
// GroupVersion is group version used to register these objects
GroupVersion = schema.GroupVersion{Group: "kubeflow.org", Version: "v1"}

// SchemeBuilder is used to add go types to the GroupVersionKind scheme
SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion}

// AddToScheme adds the types in this group-version to the given scheme.
AddToScheme = SchemeBuilder.AddToScheme
)
113 changes: 113 additions & 0 deletions pkg/apis/mxnet/v1/mxjob_types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
// Copyright YEAR The Kubeflow Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package v1

import (
common "github.com/kubeflow/common/pkg/apis/common/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// MXJobSpec defines the desired state of MXJob
type MXJobSpec struct {
// RunPolicy encapsulates various runtime policies of the distributed training
// job, for example how to clean up resources and how long the job can stay
// active.
RunPolicy common.RunPolicy `json:",inline"`

// JobMode specify the kind of MXjob to do. Different mode may have
// different MXReplicaSpecs request
JobMode JobModeType `json:"jobMode"`

// MXReplicaSpecs is map of common.ReplicaType and common.ReplicaSpec
// specifies the MX replicas to run.
// For example,
// {
// "Scheduler": common.ReplicaSpec,
// "Server": common.ReplicaSpec,
// "Worker": common.ReplicaSpec,
// }
MXReplicaSpecs map[common.ReplicaType]*common.ReplicaSpec `json:"mxReplicaSpecs"`
}

// JobModeType id the type for JobMode
type JobModeType string

const (
// Train Mode, in this mode requested MXReplicaSpecs need
// has Server, Scheduler, Worker
MXTrain JobModeType = "MXTrain"

// Tune Mode, in this mode requested MXReplicaSpecs need
// has Tuner
MXTune JobModeType = "MXTune"
)

const (
// MXReplicaTypeScheduler is the type for scheduler replica in MXNet.
MXReplicaTypeScheduler common.ReplicaType = "Scheduler"

// MXReplicaTypeServer is the type for parameter servers of distributed MXNet.
MXReplicaTypeServer common.ReplicaType = "Server"

// MXReplicaTypeWorker is the type for workers of distributed MXNet.
// This is also used for non-distributed MXNet.
MXReplicaTypeWorker common.ReplicaType = "Worker"

// MXReplicaTypeTunerTracker
// This the auto-tuning tracker e.g. autotvm tracker, it will dispatch tuning task to TunerServer
MXReplicaTypeTunerTracker common.ReplicaType = "TunerTracker"

// MXReplicaTypeTunerServer
MXReplicaTypeTunerServer common.ReplicaType = "TunerServer"

// MXReplicaTuner is the type for auto-tuning of distributed MXNet.
// This is also used for non-distributed MXNet.
MXReplicaTypeTuner common.ReplicaType = "Tuner"
)

// MXJobStatus defines the observed state of MXJob
type MXJobStatus struct {
// INSERT ADDITIONAL STATUS FIELD - define observed state of cluster
// Important: Run "make" to regenerate code after modifying this file
}

// +genclient
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
// +resource:path=mxjob

//+kubebuilder:object:root=true
//+kubebuilder:subresource:status

// MXJob is the Schema for the mxjobs API
type MXJob struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`

Spec MXJobSpec `json:"spec,omitempty"`
Status common.JobStatus `json:"status,omitempty"`
}

//+kubebuilder:object:root=true

// MXJobList contains a list of MXJob
type MXJobList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
Items []MXJob `json:"items"`
}

func init() {
SchemeBuilder.Register(&MXJob{}, &MXJobList{})
}
Loading