Scala Multi Task Deployment YAML Example

Project structure

.
├── .dbx
│   ├── lock.json  # note: do not commit this file; add it to .gitignore
│   └── project.json
├── .gitignore
├── README.md
├── build.sbt
├── conf
│   └── deployment.yml
├── project
│   ├── assembly.sbt
│   ├── build.properties
│   ├── plugins.sbt
│   └── target        # generated by sbt
├── target            # sbt build output (contains the assembled jar)
└── src
    ├── main
    │   └── scala
    └── test
        └── scala
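
The tasks in the deployment file below attach a fat jar from target/scala-2.12/. As a rough, illustrative sketch (project name, Scala/Spark versions, and dependencies are assumptions to adapt to your own project), the sbt build files could look like the following; the jar itself is produced by running sbt assembly:

// build.sbt (illustrative sketch)
name := "myproject"
version := "0.1.0"
scalaVersion := "2.12.15"

// Spark is provided by the Databricks runtime, so it is excluded from the assembled jar
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "3.3.0" % "provided",
  "org.apache.spark" %% "spark-sql"  % "3.3.0" % "provided"
)

// project/plugins.sbt (illustrative sketch)
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "1.2.0")

Make sure the jar path used under libraries in the deployment file matches the actual name of the assembly output.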

Useful commands

  • Get the list of available DBR (Databricks Runtime) versions - $ databricks clusters spark-versions

  • Get the list of available node_type_ids - $ databricks clusters list-node-types
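
  • Deploy the job definitions from this project (assuming the dbx CLI is installed and configured) - $ dbx deploy --deployment-file conf/deployment.yml

  • Launch a deployed job by name (the exact syntax depends on your dbx version) - $ dbx launch --job "your-job-name"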

Example deployment file

Note

The documentation tries to list all the options; depending on your needs, some of them may not be relevant. For example, if instance_pool_name is used, then node_type_id may not be relevant, and so on.

custom:
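  # The keys below are reusable YAML anchors (&name) that are merged into the job
  # definitions further down via aliases (*name); dbx does not interpret this block itself.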
  basic-cluster-props: &basic-cluster-props
    spark_version: "your-spark-version"
    node_type_id: "your-node-type-id"
    spark_conf:
      spark.databricks.delta.preview.enabled: 'true'
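    # If the instance pool ids below are used, node_type_id above is typically not needed (see the note above).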
    instance_pool_id: <enter pool id>
    driver_instance_pool_id: <enter pool id>
    runtime_engine: STANDARD
    init_scripts:
      - dbfs:
          destination: dbfs:/<enter your path>

  basic-auto-scale-props: &basic-auto-scale-props
    autoscale:
      min_workers: 2
      max_workers: 4

  basic-static-cluster: &basic-static-cluster
    new_cluster:
      <<: *basic-cluster-props
      num_workers: 2

  basic-autoscale-cluster: &basic-autoscale-cluster
    new_cluster:
      <<: # merge these two maps and place the result here
        - *basic-cluster-props
        - *basic-auto-scale-props

environments:
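  # Each environment name here must match an environment defined in .dbx/project.json
  # ("default" is the name used by a standard dbx setup).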
  default:
    jobs:
      - name: "your-job-name"
        email_notifications:
          on_start: ["user@email.com"]
          on_success: ["user@email.com"]
          on_failure: ["user@email.com"]
          no_alert_for_skipped_runs: false

        # cron syntax reference: http://www.quartz-scheduler.org/documentation/quartz-2.3.0/tutorials/crontrigger.html
        schedule:
          quartz_cron_expression: "00 25 03 * * ?"
          timezone_id: "UTC"
          pause_status: "PAUSED"

        tags:
          your-key: "your-value"
          your-key1: "your-value1"

        format: MULTI_TASK
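        # MULTI_TASK is the Jobs API 2.1 format for jobs that define a list of tasks.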

        permissions:
          access_control_list:
            - user_name: "user@email.com"
              permission_level: "IS_OWNER"
            # - group_name: "your-group-name"
            #   permission_level: "CAN_VIEW"
            # - user_name: "user2@databricks.com"
            #   permission_level: "CAN_VIEW"
            # - user_name: "user3@databricks.com"
            #   permission_level: "CAN_VIEW"

        job_clusters:
          - job_cluster_key: "basic-cluster"
              <<: *basic-static-cluster
          - job_cluster_key: "basic-autoscale-cluster"
              <<: *basic-autoscale-cluster

        tasks:
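          # Each task runs main_class_name from the jar listed under "libraries".
          # The file:// path is relative to the project root; dbx uploads the locally
          # built assembly jar to the artifact location at deploy time.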
          - task_key: "your-task-01"
            job_cluster_key: "basic-cluster"
            max_retries: 1
            spark_jar_task:
              jar_uri: ''
              main_class_name: com.myorg.myproject.myclass
              run_as_repl: true
            libraries:
              - jar: file://target/scala-2.12/myproject.myclass-0.1.0.jar

            min_retry_interval_millis: 900000
            retry_on_timeout: false
            timeout_seconds: 0
            email_notifications:
              on_start:
                - user@email.com
              on_success:
                - user@email.com
              on_failure:
                - user1@email.com
                - user2@email.com

          - task_key: "your-task-02"
            job_cluster_key: "basic-cluster"
            max_retries: 1
            spark_jar_task:
              jar_uri: ''
              main_class_name: com.myorg.myproject.myclass2
              run_as_repl: true
            libraries:
              - jar: file://target/scala-2.12/myproject.myclass2-0.1.0.jar
            min_retry_interval_millis: 900000
            retry_on_timeout: false
            timeout_seconds: 0
            email_notifications:
              on_start:
                - user@email.com
              on_success:
                - user@email.com
              on_failure:
                - user1@email.com
                - user2@email.com
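            # This task starts only after "your-task-01" completes successfully.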
            depends_on:
              - task_key: "your-task-01"