Multitask jobs support
Since version 0.2.0, you can also use dbx together with the multitask jobs feature.
Please note the following: since the `libraries` attribute is not supported at the job level in multitask jobs, the dependencies will be propagated to every task definition during deployment.
You can read more about multitask jobs here (AWS, Azure, GCP).
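For example, if your project depends on a third-party package, after deployment each task in the multitask job will carry its own `libraries` section. A minimal sketch of what the deployed task definitions effectively look like (the package name `some-dependency` is a hypothetical placeholder):

tasks:
  - task_key: "first-task"
    libraries:
      - pypi:
          package: "some-dependency"
  - task_key: "second-task"
    libraries:
      - pypi:
          package: "some-dependency"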
Here are some examples of multitask job definitions. First, in JSON format:
{
    "default": {
        "jobs": [
            {
                "name": "multitask-job-name",
                "tasks": [
                    {
                        "task_key": "first-task",
                        "description": "some description",
                        "new_cluster": {
                            "spark_version": "7.3.x-cpu-ml-scala2.12",
                            "node_type_id": "<some-node-type>",
                            "num_workers": 2
                        },
                        "max_retries": 0,
                        "spark_python_task": {
                            "python_file": "placeholder_1.py"
                        }
                    },
                    {
                        "task_key": "second",
                        "description": "some description",
                        "new_cluster": {
                            "spark_version": "7.3.x-cpu-ml-scala2.12",
                            "node_type_id": "<some-node-type>",
                            "num_workers": 2
                        },
                        "max_retries": 0,
                        "spark_python_task": {
                            "python_file": "placeholder_2.py"
                        },
                        "depends_on": [
                            {
                                "task_key": "first-task"
                            }
                        ]
                    }
                ]
            }
        ]
    }
}
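To deploy a definition like this, point dbx at the file, e.g. `dbx deploy --deployment-file conf/deployment.json` (flag names may vary slightly between dbx versions). The same job can also be described in YAML, where anchors and merge keys let you define cluster properties once and reuse them across tasks: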
# http://yaml.org/spec/1.2/spec.html
# https://learnxinyminutes.com/docs/yaml/
custom:
  basic-cluster-props: &basic-cluster-props
    spark_version: "7.3.x-cpu-ml-scala2.12"
    node_type_id: "<some-node-type>"

  basic-static-cluster: &basic-static-cluster
    new_cluster:
      <<: *basic-cluster-props
      num_workers: 2

environments:
  default:
    jobs:
      - name: "your-job-name"
        tasks:
          - task_key: "first-task"
            <<: *basic-static-cluster
            spark_python_task:
              python_file: "./placeholder_1.py"
          - task_key: "second-task"
            <<: *basic-static-cluster
            spark_python_task:
              python_file: "./placeholder_2.py"
            depends_on:
              - task_key: "first-task"
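Note how the YAML version uses an anchor (`&basic-static-cluster`) together with the merge key (`<<: *basic-static-cluster`) to define the cluster properties once and reuse them in every task. After a successful deployment, the job can be started as usual, for example with `dbx launch --job=your-job-name` (the exact launch syntax depends on your dbx version).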