
Commit ea885c8

niall-turbitt, arpitjasa-db, vladimirk-db authored
Models uc update (#106)
* bump DBR 12.2 -> 13.3
* add input_include_models_in_unity_catalog option
* add schema name
* add enums in template schema
* update train notebook for models in UC
* bump mlflow
* update with models in uc functionality
* model validation updated for models in UC
* update validation workflow
* update deploy for models in UC
* remove line
* batch inference updated for models in UC
* update train with recipes for models in UC
* update condition for model_name
* update test config
* update ml artifacts with UC models
* update model variable for template
* add unity_catalog_read_user_group prompt
* update tests for additional prompts
* update example project configs
* add input_
* revert MLflow Recipes changes for models in UC
* remove url
* fix docstring
* add additional params
* update tests
* update feature store code
* fix schema name
* revert fs
* update readme
* update feature table names for UC
* add links to bundles files
* update UC read user group
* UC as default
* Update databricks_template_schema.json
  Co-authored-by: Arpit Jasapara <[email protected]>
* rearrange args
* Update template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/validation/notebooks/ModelValidation.py.tmpl
  Co-authored-by: Arpit Jasapara <[email protected]>
* clarify schema name
* validation test on schema name
* Update template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/Train.py.tmpl
  Co-authored-by: Vladimir Kolovski <[email protected]>
* Update template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/utils.py.tmpl
  Co-authored-by: Vladimir Kolovski <[email protected]>
* update alias for be consistent with UI
* Update README.md
  Co-authored-by: Vladimir Kolovski <[email protected]>
* update read user group description
* add validation for Feature Store and Unity Catalog
* Fix tests
* Fix additional tests
* Fix docs
* Fix more issues
* remove import
* update environment to target
* update batch inference input table name for FS & UC
* remove unused import
* fix FS train notebook imports
* missing comma
* update display
* update display
* missing comma
* revert aws test config
* add installs to feature store notebook
* update FS inference readme

---------

Co-authored-by: Arpit Jasapara <[email protected]>
Co-authored-by: Vladimir Kolovski <[email protected]>
Co-authored-by: Arpit Jasapara <[email protected]>
1 parent 6f59874 · commit ea885c8

32 files changed · +478 −163 lines changed

.gitignore

−3
@@ -5,9 +5,6 @@
 # local bundle files
 **/.databricks/*

-mlops-stacks-using-bundle.iml
-mlops-stacks-using-bundle.ipr
-mlops-stacks-using-bundle.iws
 *.hcl
 .idea/
 .vscode/

README.md

+7 −5
@@ -19,11 +19,11 @@ Your organization can use the default stack as is or customize it as needed, e.g
 adapt individual components to fit your organization's best practices. See the
 [stack customization guide](stack-customization.md) for more details.

-Using Databricks MLOps stacks, data scientists can quickly get started iterating on ML code for new projects while ops engineers set up CI/CD and ML service state
-management, with an easy transition to production. You can also use MLOps stacks as a building block
+Using Databricks MLOps stack, data scientists can quickly get started iterating on ML code for new projects while ops engineers set up CI/CD and ML service state
+management, with an easy transition to production. You can also use MLOps stack as a building block
 in automation for creating new data science projects with production-grade CI/CD pre-configured.

-![MLOps Stacks diagram](doc-images/mlops-stack.png)
+![MLOps Stack diagram](doc-images/mlops-stack.png)

 See the [FAQ](#FAQ) for questions on common use cases.


@@ -68,9 +68,11 @@ ready to productionize a model. We recommend specifying any known parameters upfront.
 * ``input_release_branch``: Name of the release branch. The production jobs (model training, batch inference) defined in this
 repo pull ML code from this branch.
 * ``input_read_user_group``: User group name to give READ permissions to for project resources (ML jobs, integration test job runs, and machine learning resources). A group with this name must exist in both the staging and prod workspaces. Defaults to "users", which grants read permission to all users in the staging/prod workspaces. You can specify a custom group name e.g. to restrict read permissions to members of the team working on the current ML project.
+* ``input_include_models_in_unity_catalog``: If selected, models will be registered to [Unity Catalog](https://docs.databricks.com/en/mlflow/models-in-uc.html#models-in-unity-catalog). Models will be registered under a three-level namespace of `<catalog>.<schema_name>.<model_name>`, according to the target environment in which the model registration code is executed. Thus, if model registration code runs in the `prod` environment, the model will be registered to the `prod` catalog under the namespace `prod.<schema_name>.<model_name>`. This assumes that the respective catalogs exist in Unity Catalog (e.g. `dev`, `staging` and `prod` catalogs). Target environment names, and the catalogs to be used, are defined in the Databricks bundles files and can be updated as needed.
+* ``input_schema_name``: If using [Models in Unity Catalog](https://docs.databricks.com/en/mlflow/models-in-uc.html#models-in-unity-catalog), specify the name of the schema under which the models should be registered. Defaults to "schema_name", but we recommend changing this during project initialization (e.g. a schema may map to a specific ML use case, such as `fraud_detection`). We default to using the same `schema_name` across catalogs, so this schema must exist in each catalog used. For example, the training pipeline executed in the staging environment will register the model to `staging.<schema_name>.<model_name>`, whereas the same pipeline executed in the prod environment will register the model to `prod.<schema_name>.<model_name>`.
+* ``input_unity_catalog_read_user_group``: If using [Models in Unity Catalog](https://docs.databricks.com/en/mlflow/models-in-uc.html#models-in-unity-catalog), define the name of the user group to grant `EXECUTE` (read & use model) privileges for the registered model. Defaults to "account users".
 * ``input_include_feature_store``: If selected, will provide [Databricks Feature Store](https://docs.databricks.com/machine-learning/feature-store/index.html) stack components including: project structure and sample feature Python modules, feature engineering notebooks, ML resource configs to provision and manage Feature Store jobs, and automated integration tests covering feature engineering and training.
 * ``input_include_mlflow_recipes``: If selected, will provide [MLflow Recipes](https://mlflow.org/docs/latest/recipes.html) stack components, dividing the training pipeline into configurable steps and profiles.
-

 See the generated ``README.md`` for next steps!


@@ -111,7 +113,7 @@ for details on how to do this.

 ### Does the MLOps stack cover data (ETL) pipelines?

-Since MLOps Stacks is based on [databricks CLI bundles](https://docs.databricks.com/dev-tools/cli/bundle-commands.html),
+Since MLOps Stack is based on [databricks CLI bundles](https://docs.databricks.com/dev-tools/cli/bundle-commands.html),
 it's not limited only to ML workflows and assets - it works for assets across the Databricks Lakehouse. For instance, while the existing ML
 code samples contain feature engineering, training, model validation, deployment and batch inference workflows,
 you can use it for Delta Live Tables pipelines as well.
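
For illustration, the registration flow described by the new `input_include_models_in_unity_catalog` bullet above might look like the following sketch. This is not code from this commit: the `dev` catalog, the `my_schema` schema, and the model name are placeholder values, and Unity Catalog requires a model signature at registration time.

```python
import mlflow
from mlflow.models import infer_signature
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor

# Route model registry calls to Unity Catalog instead of the
# workspace model registry.
mlflow.set_registry_uri("databricks-uc")

# Placeholder three-level name: in this stack the catalog tracks the
# bundle target (dev/staging/prod) and the schema comes from input_schema_name.
model_name = "dev.my_schema.my_mlops_model"

X, y = load_diabetes(return_X_y=True)
model = RandomForestRegressor(n_estimators=10).fit(X, y)

with mlflow.start_run():
    # Unity Catalog rejects model versions logged without a signature.
    signature = infer_signature(X, model.predict(X))
    mlflow.sklearn.log_model(
        model,
        "model",
        signature=signature,
        registered_model_name=model_name,  # creates a version under dev.my_schema
    )
```

Running the same code against a `staging` or `prod` catalog yields the per-environment registration the README describes.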

databricks_template_schema.json

+31 −13
@@ -4,66 +4,84 @@
     "order": 1,
     "type": "string",
     "default": "my-mlops-project",
-    "description": "Project Name"
+    "description": "Welcome to MLOps Stack. For detailed information on project generation, see the README at https://github.com/databricks/mlops-stack/blob/main/README.md. \n\nProject Name"
   },
   "input_root_dir": {
     "order": 2,
     "type": "string",
     "default": "my-mlops-project",
-    "description": "Root directory name. Use a name different from the project name if you intend to use monorepo"
+    "description": "\nRoot directory name. Use a name different from the project name if you intend to use monorepo"
   },
   "input_cloud": {
     "order": 3,
     "type": "string",
-    "description": "Select cloud. \nChoose from azure, aws",
+    "description": "\nSelect cloud. \nChoose from azure, aws",
     "default": "azure"
   },
   "input_cicd_platform": {
     "order": 4,
     "type": "string",
-    "description": "Select CICD platform. \nChoose from github_actions, github_actions_for_github_enterprise_servers, azure_devops",
+    "description": "\nSelect CICD platform. \nChoose from github_actions, github_actions_for_github_enterprise_servers, azure_devops",
     "default": "github_actions"
   },
   "input_databricks_staging_workspace_host": {
     "order": 5,
     "type": "string",
     "default": "",
-    "description": "URL of staging Databricks workspace, used to run CI tests on PRs and preview config changes before they're deployed to production. Default: \nAzure - https://adb-xxxx.xx.azuredatabricks.net\nAWS - https://your-staging-workspace.cloud.databricks.com\n"
+    "description": "\nURL of staging Databricks workspace, used to run CI tests on PRs and preview config changes before they're deployed to production. Default: \nAzure - https://adb-xxxx.xx.azuredatabricks.net\nAWS - https://your-staging-workspace.cloud.databricks.com\n"
   },
   "input_databricks_prod_workspace_host": {
     "order": 6,
     "type": "string",
     "default": "",
-    "description": "URL of production Databricks workspace. Default: \nAzure - https://adb-xxxx.xx.azuredatabricks.net\nAWS - https://your-prod-workspace.cloud.databricks.com\n"
+    "description": "\nURL of production Databricks workspace. Default: \nAzure - https://adb-xxxx.xx.azuredatabricks.net\nAWS - https://your-prod-workspace.cloud.databricks.com\n"
   },
   "input_default_branch": {
     "order": 7,
     "type": "string",
     "default": "main",
-    "description": "Name of the default branch, where the prod and staging ML resources are deployed from and the latest ML code is staged. Default:"
+    "description": "\nName of the default branch, where the prod and staging ML resources are deployed from and the latest ML code is staged. Default"
   },
   "input_release_branch": {
     "order": 8,
     "type": "string",
     "default": "release",
-    "description": "Name of the release branch. The production jobs (model training, batch inference) defined in this stack pull ML code from this branch. Default:"
+    "description": "\nName of the release branch. The production jobs (model training, batch inference) defined in this stack pull ML code from this branch. Default"
   },
   "input_read_user_group": {
     "order": 9,
     "type": "string",
     "default": "users",
-    "description": "User group name to give READ permissions to for project resources (ML jobs, integration test job runs, and machine learning resources). A group with this name must exist in both the staging and prod workspaces. Default:"
+    "description": "\nUser group name to give READ permissions to for project resources (ML jobs, integration test job runs, and machine learning resources). A group with this name must exist in both the staging and prod workspaces. Default"
   },
-  "input_include_feature_store": {
+  "input_include_models_in_unity_catalog": {
     "order": 10,
     "type": "string",
-    "description": "Whether to include feature store. \nChoose from no, yes",
+    "description": "\nWhether to use the Model Registry with Unity Catalog. \nChoose from no, yes",
+    "default": "yes"
+  },
+  "input_schema_name": {
+    "order": 11,
+    "type": "string",
+    "description": "\nName of schema to use when registering a model in Unity Catalog. \nNote that this schema must already exist. Default",
+    "default": "schema_name"
+  },
+  "input_unity_catalog_read_user_group": {
+    "order": 12,
+    "type": "string",
+    "default": "account users",
+    "description": "\nUser group name to give EXECUTE privileges to models in Unity Catalog. A group with this name must exist in the Unity Catalog that the staging and prod workspaces can access. Default"
+  },
+  "input_include_feature_store": {
+    "order": 13,
+    "type": "string",
+    "description": "\nWhether to include Feature Store. \nChoose from no, yes",
     "default": "no"
   },
   "input_include_mlflow_recipes": {
-    "order": 11,
+    "order": 14,
     "type": "string",
-    "description": "Whether to include mlflow recipes. \nChoose from no, yes",
+    "description": "\nWhether to include MLflow Recipes. \nChoose from no, yes",
     "default": "no"
   }
 }

library/input_validation.tmpl

+13
@@ -38,8 +38,21 @@
 {{ fail `Azure DevOps is not supported as a cicd_platform option with cloud=aws. If cloud=aws the currently supported cicd_platform is GitHub Actions.` }}
 {{- end -}}

+- Validate schema_name for invalid characters
+{{- if eq .input_include_models_in_unity_catalog `yes` -}}
+{{- if ((regexp `[ ./\\]+`).MatchString .input_schema_name) -}}
+{{ fail `schema_name contained invalid characters. Valid schema names cannot contain any of the following characters: " ", ".", "\", "/"` }}
+{{- end -}}
+{{- end -}}
+
 - Validate feature store and recipes
 {{- if and (eq .input_include_feature_store `yes`) (eq .input_include_mlflow_recipes `yes`) -}}
 {{ fail `Feature Store cannot be used with MLflow recipes. Please only use one of the two or neither.` }}
 {{- end -}}
+
+- Validate models in Unity Catalog and recipes
+{{- if and (eq .input_include_models_in_unity_catalog `yes`) (eq .input_include_mlflow_recipes `yes`) -}}
+{{ fail `The Model Registry in Unity Catalog cannot be used with MLflow recipes. Please only use one of the two or neither.` }}
+{{- end -}}
+
 {{- end -}}
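
The new schema_name check rejects names containing spaces, dots, or slashes. As a standalone sketch of the same rule outside the Go template (a hypothetical helper, not part of this commit):

```python
import re

# Same character class as the template's regexp: schema names may not
# contain spaces, ".", "/", or "\".
INVALID_SCHEMA_CHARS = re.compile(r"[ ./\\]+")

def validate_schema_name(schema_name: str) -> None:
    if INVALID_SCHEMA_CHARS.search(schema_name):
        raise ValueError(
            "schema_name contained invalid characters. Valid schema names "
            'cannot contain any of the following characters: " ", ".", "\\", "/"'
        )

validate_schema_name("fraud_detection")  # passes silently
# validate_schema_name("my.schema")      # would raise ValueError
```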

library/template_variables.tmpl

+26
@@ -113,6 +113,32 @@
 {{- end -}}
 {{- end }}

+{{ define `include_models_in_unity_catalog` -}}
+{{- if (eq .input_include_models_in_unity_catalog `no`) -}}
+no
+{{- else if (eq .input_include_models_in_unity_catalog `yes`) -}}
+yes
+{{- else -}}
+{{ fail `Invalid selection of include_models_in_unity_catalog. Please choose from [no, yes]` }}
+{{- end -}}
+{{- end }}
+
+{{ define `schema_name` -}}
+{{- if (eq .input_include_models_in_unity_catalog `yes`) -}}
+{{ .input_schema_name }}
+{{- else -}}
+{{ "" }}
+{{- end -}}
+{{- end }}
+
+{{ define `unity_catalog_read_user_group` -}}
+{{- if (eq .input_include_models_in_unity_catalog `yes`) -}}
+{{ .input_unity_catalog_read_user_group }}
+{{- else -}}
+{{ "account users" }}
+{{- end -}}
+{{- end }}
+
 {{ define `cloud_specific_node_type_id` -}}
 {{- if (eq .input_cloud `aws`) -}}
 i3.xlarge

template/{{.input_root_dir}}/_params_testing_only.txt.tmpl

+6
@@ -9,6 +9,9 @@ input_release_branch={{.input_release_branch}}
 input_read_user_group={{.input_read_user_group}}
 input_include_feature_store={{.input_include_feature_store}}
 input_include_mlflow_recipes={{.input_include_mlflow_recipes}}
+input_include_models_in_unity_catalog={{.input_include_models_in_unity_catalog}}
+input_schema_name={{.input_schema_name}}
+input_unity_catalog_read_user_group={{.input_unity_catalog_read_user_group}}

 root_dir={{ template `root_dir` . }}
 project_name={{ template `project_name` . }}

@@ -23,6 +26,9 @@ release_branch={{ template `release_branch` . }}
 read_user_group={{ template `read_user_group` . }}
 include_feature_store={{ template `include_feature_store` . }}
 include_mlflow_recipes={{ template `include_mlflow_recipes` . }}
+include_models_in_unity_catalog={{ template `include_models_in_unity_catalog` . }}
+schema_name={{ template `schema_name` . }}
+unity_catalog_read_user_group={{ template `unity_catalog_read_user_group` . }}
 cloud_specific_node_type_id={{ template `cloud_specific_node_type_id` . }}
 framework={{ template `framework` . }}
 model_name={{ template `model_name` . }}

template/{{.input_root_dir}}/docs/project-overview.md.tmpl

+2 −2
@@ -6,10 +6,10 @@
 This project defines an ML pipeline for automated retraining and batch inference of an ML model
 on tabular data.

-See the full pipeline structure below. The [stacks README](https://github.com/databricks/mlops-stack/blob/main/Pipeline.md)
+See the full pipeline structure below. The [stack README](https://github.com/databricks/mlops-stack/blob/main/Pipeline.md)
 contains additional details on how ML pipelines are tested and deployed across each of the dev, staging, prod environments below.

-![MLOps Stacks diagram](images/mlops-stack-summary.png)
+![MLOps Stack diagram](images/mlops-stack-summary.png)


 ## Code structure

template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/databricks.yml.tmpl

+2 −3
@@ -2,15 +2,14 @@
 bundle:
   name: {{template `project_name` .}}

-
 variables:
   experiment_name:
     description: Experiment name for the model training.
     default: /Users/${workspace.current_user.userName}/${bundle.target}-{{template `experiment_base_name` .}}
   model_name:
     description: Model name for the model training.
-    default: ${bundle.target}-{{template `model_name` .}}
-
+    {{ if (eq .input_include_models_in_unity_catalog `no`) }}default: ${bundle.target}-{{template `model_name` .}}
+    {{- else -}}default: {{template `model_name` .}}{{end}}

 include:
   # Resources folder contains ML artifact resources for the ml project that defines model and experiment

template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/README.md.tmpl

+2 −1
@@ -47,7 +47,8 @@ df = spark.table(
 ).drop("fare_amount")

 df.write.mode("overwrite").saveAsTable(
-  name="hive_metastore.default.taxi_scoring_sample"
+  {{ if (eq .input_include_models_in_unity_catalog `no`) }}name="hive_metastore.default.taxi_scoring_sample"
+  {{- else -}}name="<catalog>.{{template `schema_name` .}}.feature_store_inference_input"{{ end }}
 )
 ```
 {{ end }}

template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/notebooks/BatchInference.py.tmpl

+27 −6
@@ -26,12 +26,21 @@ dbutils.widgets.dropdown("env", "dev", ["dev", "staging", "prod"], "Environment
 dbutils.widgets.text("input_table_name", "", label="Input Table Name")
 # Delta table to store the output predictions.
 dbutils.widgets.text("output_table_name", "", label="Output Table Name")
+{{- if (eq .input_include_models_in_unity_catalog "no") }}
 # Batch inference model name
-dbutils.widgets.text("model_name", "", label="Model Name")
+dbutils.widgets.text(
+    "model_name", "dev-{{template `model_name` .}}", label="Model Name"
+)
+{{else}}
+# Unity Catalog registered model name to use for the trained model.
+dbutils.widgets.text(
+    "model_name", "dev.{{template `schema_name` .}}.{{template `model_name` .}}", label="Model Name"
+){{end}}

 # COMMAND ----------

 import os
+
 notebook_path = '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get())
 %cd $notebook_path

@@ -55,7 +64,9 @@ sys.path.append("../..")
 # COMMAND ----------

 # DBTITLE 1,Define input and output variables
-from utils import get_deployed_model_stage_for_env
+{{- if (eq .input_include_models_in_unity_catalog "no") }}
+from utils import get_deployed_model_stage_for_env{{else}}
+from utils import get_deployed_model_alias_for_env{{end}}

 env = dbutils.widgets.get("env")
 input_table_name = dbutils.widgets.get("input_table_name")
@@ -64,18 +75,28 @@ model_name = dbutils.widgets.get("model_name")
 assert input_table_name != "", "input_table_name notebook parameter must be specified"
 assert output_table_name != "", "output_table_name notebook parameter must be specified"
 assert model_name != "", "model_name notebook parameter must be specified"
+{{- if (eq .input_include_models_in_unity_catalog "no") }}
 stage = get_deployed_model_stage_for_env(env)
-model_uri = f"models:/{model_name}/{stage}"
+model_uri = f"models:/{model_name}/{stage}"{{else}}
+alias = get_deployed_model_alias_for_env(env)
+model_uri = f"models:/{model_name}@{alias}"{{end}}

-# Get model version from stage
-from mlflow import MlflowClient
+# COMMAND ----------

+from mlflow import MlflowClient
+{{ if (eq .input_include_models_in_unity_catalog "no") }}
+# Get model version from stage
 model_version_infos = MlflowClient().search_model_versions("name = '%s'" % model_name)
 model_version = max(
     int(version.version)
     for version in model_version_infos
     if version.current_stage == stage
-)
+){{else}}
+# Get model version from alias
+client = MlflowClient(registry_uri="databricks-uc")
+model_version = client.get_model_version_by_alias(model_name, alias).version{{end}}
+
+# COMMAND ----------

 # Get datetime
 from datetime import datetime
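
The hunk above switches batch inference from stage-based URIs (`models:/<name>/<stage>`) to alias-based URIs (`models:/<name>@<alias>`) when Unity Catalog is enabled. A minimal standalone sketch of the alias path follows; the model name and the `champion` alias are placeholder values, and `get_deployed_model_alias_for_env` lives in the generated `utils.py`, which this page does not show.

```python
import mlflow
from mlflow import MlflowClient

# Resolve models:/ URIs against Unity Catalog.
mlflow.set_registry_uri("databricks-uc")

# Placeholders: the notebook takes these from its widgets and utils helper.
model_name = "prod.my_schema.my_mlops_model"
alias = "champion"  # assumed alias for the prod deployment

# Look up the concrete version currently behind the alias...
client = MlflowClient(registry_uri="databricks-uc")
model_version = client.get_model_version_by_alias(model_name, alias).version

# ...and load through the alias so scoring always follows whichever
# version the alias is reassigned to.
model = mlflow.pyfunc.load_model(f"models:/{model_name}@{alias}")
```

Because the alias is reassigned at deployment time, promoting a new model version becomes a registry-side operation that needs no notebook changes.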

template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/predict.py.tmpl

+2 −1
@@ -9,6 +9,7 @@ def predict_batch(
 Apply the model at the specified URI for batch inference on the table with name input_table_name,
 writing results to the table with name output_table_name
 """
+{{ if (eq .input_include_models_in_unity_catalog "yes") }}mlflow.set_registry_uri("databricks-uc"){{ end }}
 table = spark_session.table(input_table_name)
 {{ if (eq .input_include_feature_store `yes`) }}
 from databricks.feature_store import FeatureStoreClient

@@ -26,7 +27,7 @@ def predict_batch(
 )
 {{ else }}
 predict = mlflow.pyfunc.spark_udf(
-    spark_session, model_uri, result_type="string", env_manager="conda"
+    spark_session, model_uri, result_type="string", env_manager="virtualenv"
 )
 output_df = (
     table.withColumn("prediction", predict(struct(*table.columns)))
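
The `env_manager` change above tells MLflow to recreate the model's logged Python environment with virtualenv rather than conda when building the scoring UDF. A standalone sketch of the same pattern (table and model names are placeholders, not values from this repo):

```python
import mlflow
from pyspark.sql import SparkSession
from pyspark.sql.functions import struct

spark = SparkSession.builder.getOrCreate()

# Placeholders: in the generated project these arrive as function arguments.
model_uri = "models:/prod.my_schema.my_mlops_model@champion"
input_table_name = "prod.my_schema.feature_store_inference_input"

mlflow.set_registry_uri("databricks-uc")  # resolve models:/ URIs against UC

table = spark.table(input_table_name)

# Rebuild the model's environment with virtualenv (no conda needed on
# the cluster) and wrap it as a Spark UDF for distributed scoring.
predict = mlflow.pyfunc.spark_udf(
    spark, model_uri, result_type="string", env_manager="virtualenv"
)

output_df = table.withColumn("prediction", predict(struct(*table.columns)))
```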
