Multitask jobs support
Please note: since the libraries attribute is not supported at the job level in multitask jobs, during deployment the dependencies will be propagated to every task definition (as sketched below).
You can read more about multitask jobs here (AWS, Azure, GCP).
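As an illustration of this propagation (the package name and version below are hypothetical), a dependency that dbx picks up from your project ends up attached to each task's libraries section rather than to the job object:

# hypothetical result of dependency propagation during deployment:
# the same library is attached to every task, not to the job itself
tasks:
  - task_key: "first-task"
    libraries:
      - pypi:
          package: "some-package==0.1.0"
  - task_key: "second-task"
    libraries:
      - pypi:
          package: "some-package==0.1.0"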
Please find some examples of multitask job definitions below.
Sample multitask jobs based on Jobs API 2.0
{
  "default": {
    "jobs": [
      {
        "name": "multitask-job-name",
        "tasks": [
          {
            "task_key": "first-task",
            "description": "some description",
            "new_cluster": {
              "spark_version": "7.3.x-cpu-ml-scala2.12",
              "node_type_id": "some-node-type",
              "num_workers": 2
            },
            "max_retries": 0,
            "spark_python_task": {
              "python_file": "placeholder_1.py"
            }
          },
          {
            "task_key": "second-task",
            "description": "some description",
            "new_cluster": {
              "spark_version": "7.3.x-cpu-ml-scala2.12",
              "node_type_id": "some-node-type",
              "num_workers": 2
            },
            "max_retries": 0,
            "spark_python_task": {
              "python_file": "placeholder_2.py"
            },
            "depends_on": [
              {
                "task_key": "first-task"
              }
            ]
          }
        ]
      }
    ]
  }
}
The same job expressed in YAML:

# http://yaml.org/spec/1.2/spec.html
# https://learnxinyminutes.com/docs/yaml/
custom:
  basic-cluster-props: &basic-cluster-props
    spark_version: "7.3.x-cpu-ml-scala2.12"
    node_type_id: "some-node-type"

  basic-static-cluster: &basic-static-cluster
    new_cluster:
      <<: *basic-cluster-props
      num_workers: 2

environments:
  default:
    jobs:
      - name: "your-job-name"
        tasks:
          - task_key: "first-task"
            <<: *basic-static-cluster
            spark_python_task:
              python_file: "./placeholder_1.py"
          - task_key: "second-task"
            <<: *basic-static-cluster
            spark_python_task:
              python_file: "./placeholder_2.py"
            depends_on:
              - task_key: "first-task"
Sample multitask jobs based on Jobs API 2.1
Jobs API 2.1 introduces a number of useful features for job management, and we encourage developers to use it. To enable this API, do one of the following:
- If you're using local Databricks CLI profiles, follow this documentation.
- In your CI pipeline, set the following environment variable to enable the latest features:

export DATABRICKS_JOBS_API_VERSION=2.1
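For example, in a GitHub Actions pipeline (shown purely as an illustration; any CI system that supports environment variables works the same way), the variable could be set per step:

# hypothetical CI step; dbx is assumed to be installed in a previous step
- name: Deploy job with Jobs API 2.1
  run: dbx deploy --deployment-file conf/deployment.yml
  env:
    DATABRICKS_JOBS_API_VERSION: "2.1"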
{
  "default": {
    "strict_path_adjustment_policy": true,
    "jobs": [
      {
        "name": "dbx_jobs_v21_test",
        "job_clusters": [
          {
            "new_cluster": {
              "spark_version": "9.1.x-cpu-ml-scala2.12",
              "num_workers": 1,
              "node_type_id": "{some-node-type-id}"
            },
            "job_cluster_key": "basic-cluster"
          }
        ],
        "tasks": [
          {
            "task_key": "first-task",
            "job_cluster_key": "basic-cluster",
            "spark_python_task": {
              "python_file": "file://some/entrypoint.py",
              "parameters": [
                "--conf-file",
                "file:fuse://some/conf/file.yml"
              ]
            }
          },
          {
            "task_key": "second-task",
            "job_cluster_key": "basic-cluster",
            "spark_python_task": {
              "python_file": "file://some/entrypoint.py",
              "parameters": [
                "--conf-file",
                "file:fuse://some/conf/file.yml"
              ]
            }
          },
          {
            "task_key": "third-task",
            "job_cluster_key": "basic-cluster",
            "depends_on": [
              {
                "task_key": "first-task"
              },
              {
                "task_key": "second-task"
              }
            ],
            "spark_python_task": {
              "python_file": "file://some/entrypoint.py",
              "parameters": [
                "--conf-file",
                "file:fuse://some/conf/file.yml"
              ]
            }
          }
        ]
      }
    ]
  }
}
And the equivalent YAML version:

custom:
  basic-cluster-props: &basic-cluster-props
    spark_version: "9.1.x-cpu-ml-scala2.12"

  basic-static-cluster: &basic-static-cluster
    new_cluster:
      <<: *basic-cluster-props
      num_workers: 1
      node_type_id: "{some-node-type-id}"

environments:
  default:
    strict_path_adjustment_policy: true
    jobs:
      - name: "dbx_jobs_v21_test"
        job_clusters:
          - job_cluster_key: "basic-cluster"
            <<: *basic-static-cluster
        tasks:
          - task_key: "first-task"
            job_cluster_key: "basic-cluster"
            spark_python_task:
              python_file: "file://some/entrypoint.py"
              parameters: ["--conf-file", "file:fuse://some/conf/file.yml"]
          - task_key: "second-task"
            job_cluster_key: "basic-cluster"
            spark_python_task:
              python_file: "file://some/entrypoint.py"
              parameters: ["--conf-file", "file:fuse://some/conf/file.yml"]
          - task_key: "third-task"
            job_cluster_key: "basic-cluster"
            depends_on:
              - task_key: "first-task"
              - task_key: "second-task"
            spark_python_task:
              python_file: "file://some/entrypoint.py"
              parameters: ["--conf-file", "file:fuse://some/conf/file.yml"]
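Note the file:// and file:fuse:// prefixes used together with strict_path_adjustment_policy: during deployment dbx uploads the referenced local files and rewrites the references to their remote counterparts, roughly as sketched below (the artifact location is illustrative):

# reference in the deployment file -> reference in the deployed job
# file://some/entrypoint.py        -> dbfs:/<artifact-location>/some/entrypoint.py
# file:fuse://some/conf/file.yml   -> /dbfs/<artifact-location>/some/conf/file.yml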