From bee4647fd4e2e4ff9036afc17e9d607fecb740b4 Mon Sep 17 00:00:00 2001 From: Liam Perritt Date: Wed, 27 May 2026 16:33:36 +1000 Subject: [PATCH 1/3] Convert 'pipeline_bundle_template' into a DAB custom template --- pipeline_bundle_template/README.md | 151 +++++++++++++----- pipeline_bundle_template/databricks.yml | 30 ---- .../databricks_template_schema.json | 93 +++++++++++ pipeline_bundle_template/fixtures/.gitkeep | 22 --- .../dataflowspec/[flow]TARGET TABLE_main.json | 43 ----- .../[standard]TARGET TABLE_main.json | 29 ---- .../expectations/TARGET TABLE_dqe.json | 24 --- .../[standard]TARGET TABLE_main.json | 29 ---- .../schemas/TARGET TABLE_schema.json | 17 -- .../{{.project_name}}}/.gitignore | 0 .../template/{{.project_name}}/.skip.tmpl | 4 + .../template/{{.project_name}}/README.md.tmpl | 59 +++++++ .../{{.project_name}}/databricks.yml.tmpl | 37 +++++ .../{{.project_name}}/fixtures/.gitkeep | 0 .../{{.project_name}}}/pytest.ini | 0 .../{{.pipeline_name}}_pipeline.yml.tmpl} | 8 +- ...]{{.example_target_table}}_main.json.tmpl} | 26 +-- ...d]{{.example_target_table}}_main.json.tmpl | 24 +++ ...{.example_target_table}}_dqe.json.example} | 2 +- ...xample_target_table}}_schema.json.example} | 2 +- .../src/init/post/README.md | 0 .../{{.project_name}}}/src/init/pre/README.md | 0 .../src/libraries/README.md | 0 .../dev_substitutions.json.tmpl | 5 + .../prod_substitutions.json.example.tmpl} | 6 +- .../{{.project_name}}}/src/python/README.md | 0 .../{{.project_name}}/tests/main_test.py | 2 + pipeline_bundle_template/tests/main_test.py | 6 - 28 files changed, 357 insertions(+), 262 deletions(-) delete mode 100644 pipeline_bundle_template/databricks.yml create mode 100644 pipeline_bundle_template/databricks_template_schema.json delete mode 100644 pipeline_bundle_template/fixtures/.gitkeep delete mode 100644 pipeline_bundle_template/src/dataflows/PIPELINE_NAME_1/dataflowspec/[flow]TARGET TABLE_main.json delete mode 100644 
pipeline_bundle_template/src/dataflows/PIPELINE_NAME_1/dataflowspec/[standard]TARGET TABLE_main.json delete mode 100644 pipeline_bundle_template/src/dataflows/PIPELINE_NAME_1/expectations/TARGET TABLE_dqe.json delete mode 100644 pipeline_bundle_template/src/dataflows/PIPELINE_NAME_2/dataflowspec/[standard]TARGET TABLE_main.json delete mode 100644 pipeline_bundle_template/src/dataflows/PIPELINE_NAME_2/schemas/TARGET TABLE_schema.json rename pipeline_bundle_template/{ => template/{{.project_name}}}/.gitignore (100%) create mode 100644 pipeline_bundle_template/template/{{.project_name}}/.skip.tmpl create mode 100644 pipeline_bundle_template/template/{{.project_name}}/README.md.tmpl create mode 100644 pipeline_bundle_template/template/{{.project_name}}/databricks.yml.tmpl create mode 100644 pipeline_bundle_template/template/{{.project_name}}/fixtures/.gitkeep rename pipeline_bundle_template/{ => template/{{.project_name}}}/pytest.ini (100%) rename pipeline_bundle_template/{resources/PIPELINE NAME_pipeline.yml => template/{{.project_name}}/resources/{{.pipeline_name}}_pipeline.yml.tmpl} (67%) rename pipeline_bundle_template/{src/dataflows/PIPELINE_NAME_2/dataflowspec/[flow]TARGET TABLE_main.json => template/{{.project_name}}/src/dataflows/{{.pipeline_name}}/dataflowspec/[flow]{{.example_target_table}}_main.json.tmpl} (53%) create mode 100644 pipeline_bundle_template/template/{{.project_name}}/src/dataflows/{{.pipeline_name}}/dataflowspec/[standard]{{.example_target_table}}_main.json.tmpl rename pipeline_bundle_template/{src/dataflows/PIPELINE_NAME_2/expectations/TARGET TABLE_dqe.json => template/{{.project_name}}/src/dataflows/{{.pipeline_name}}/expectations/{{.example_target_table}}_dqe.json.example} (99%) rename pipeline_bundle_template/{src/dataflows/PIPELINE_NAME_1/schemas/TARGET TABLE_schema.json => template/{{.project_name}}/src/dataflows/{{.pipeline_name}}/schemas/{{.example_target_table}}_schema.json.example} (99%) rename pipeline_bundle_template/{ => 
template/{{.project_name}}}/src/init/post/README.md (100%) rename pipeline_bundle_template/{ => template/{{.project_name}}}/src/init/pre/README.md (100%) rename pipeline_bundle_template/{ => template/{{.project_name}}}/src/libraries/README.md (100%) create mode 100644 pipeline_bundle_template/template/{{.project_name}}/src/pipeline_configs/dev_substitutions.json.tmpl rename pipeline_bundle_template/{src/pipeline_configs/ENV_substitutions.json => template/{{.project_name}}/src/pipeline_configs/prod_substitutions.json.example.tmpl} (58%) rename pipeline_bundle_template/{ => template/{{.project_name}}}/src/python/README.md (100%) create mode 100644 pipeline_bundle_template/template/{{.project_name}}/tests/main_test.py delete mode 100644 pipeline_bundle_template/tests/main_test.py diff --git a/pipeline_bundle_template/README.md b/pipeline_bundle_template/README.md index cd7403b..2a40d4f 100644 --- a/pipeline_bundle_template/README.md +++ b/pipeline_bundle_template/README.md @@ -1,54 +1,125 @@ -# bronze_sample +# `pipeline_bundle_template` — Databricks Asset Bundle custom template -The 'bronze_sample' project was generated by using the default-python template. +This folder is a [DAB custom template][custom-templates] for scaffolding new Lakeflow Framework +pipeline bundles. End users **don't edit files here** — they run `databricks bundle init` against +this folder and get a new bundle populated from their answers. -## Prerequisites: -1. Execute the setup_data Notebook once bundle is deployed, to setup the Staging source tables and data. +[custom-templates]: https://docs.databricks.com/aws/en/dev-tools/bundles/templates#custom-templates -## Getting started +## Initializing a new bundle -1. Update the databricks.yml file with appropriate details (line 4 and line 23 and 25). +From the repo root: -1. Update the pipelines yml's in the resources folder accordingly: - - Change schemas. 
+```bash +databricks bundle init ./pipeline_bundle_template --output-dir /path/to/output +``` -1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html +Or against this folder hosted at a Git URL: -1. Authenticate to your Databricks workspace, if you have not done so already: - ``` - $ databricks configure - ``` +```bash +databricks bundle init https://github.com/liamperritt/lakeflow_framework --template-dir pipeline_bundle_template +``` -1. To deploy a development copy of this project, type: - ``` - $ databricks bundle deploy --target dev - ``` - (Note that "dev" is the default target, so the `--target` parameter - is optional here.) +The CLI will prompt for the values declared in `databricks_template_schema.json` (see below) +and emit a new bundle under `<output-dir>/<project_name>/`. - This deploys everything that's defined for this project. - For example, the default template would deploy a job called - `[dev yourname] silver_ar_job` to your workspace. - You can find that job by opening your workpace and clicking on **Workflows**. +Requires Databricks CLI `>= 0.218.0`. -1. Similarly, to deploy a production copy, type: - ``` - $ databricks bundle deploy --target prod - ``` +## Folder layout - Note that the default job from the template has a schedule that runs every day - (defined in resources/silver_ar_job.yml). The schedule - is paused when deploying in development mode (see - https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). 
+``` +pipeline_bundle_template/ +├── databricks_template_schema.json # prompt definitions +└── template/ # Go-templated source tree + └── {{.project_name}}/ # root folder is named from the project_name prompt + ├── databricks.yml.tmpl + ├── README.md.tmpl + ├── .skip.tmpl # conditional file-skip rules + ├── resources/ + │ └── {{.pipeline_name}}_pipeline.yml.tmpl + └── src/ + ├── dataflows/{{.pipeline_name}}/ + │ ├── dataflowspec/[flow]{{.example_target_table}}_main.json.tmpl + │ ├── dataflowspec/[standard]{{.example_target_table}}_main.json.tmpl + │ ├── schemas/{{.example_target_table}}_schema.json.example + │ └── expectations/{{.example_target_table}}_dqe.json.example + └── pipeline_configs/dev_substitutions.json.tmpl +``` -1. To run a job or pipeline, use the "run" command: - ``` - $ databricks bundle run - ``` +The Databricks CLI runs Go's `text/template` engine over every file under `template/` (and over +the path segments themselves). Files with a `.tmpl` suffix have their contents substituted and the +suffix stripped; non-`.tmpl` files are copied verbatim (path segments are still substituted). -1. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from - https://docs.databricks.com/dev-tools/vscode-ext.html. +## Prompts (`databricks_template_schema.json`) -1. For documentation on the Databricks asset bundles format used - for this project, and for CI/CD configuration, see - https://docs.databricks.com/dev-tools/bundles/index.html. 
+| Property | Type | Default | Purpose | +|---|---|---|---| +| `project_name` | string | `my_project` | bundle name + output root folder | +| `pipeline_name` | string | `{{.project_name}}` | first pipeline; drives `resources/*.yml` and `src/dataflows/*` folder names | +| `layer` | enum (bronze/silver/gold) | `bronze` | medallion layer; baked into `layer` DAB variable default | +| `catalog` | string | `main` | UC catalog; baked into `catalog` DAB variable default | +| `schema` | string | `{{.project_name}}` | UC schema; baked into `schema` DAB variable default | +| `include_example_dataflows` | enum (yes/no) | `yes` | if `no`, `.skip.tmpl` omits the `src/dataflows/{{.pipeline_name}}` folder | +| `example_target_table` | string | `my_target_table` | (skipped if no examples) target table; drives `dataFlowId`, `flowGroupId`, filenames | +| `example_source_table` | string | `my_source_table` | (skipped if no examples) upstream source table | +| `source_catalog` | string | `{{.catalog}}` | (skipped if no examples) pre-populated into `dev_substitutions.json` as the `SOURCE_CAT_SCHEMA` token | +| `source_schema` | string | `{{.schema}}` | (skipped if no examples) pre-populated into `dev_substitutions.json` | + +## What gets derived vs. what stays as scaffolding + +Every single-value placeholder in the source dataflow JSON files is **derived** from the prompts +above (no extra typing). 
For example, in the rendered `[flow]<example_target_table>_main.json`: +- `dataFlowId` = `<example_target_table>_flow` +- `dataFlowGroup` = `<pipeline_name>` +- `flowGroupId` = `fg_<example_target_table>` +- `view` key = `v_<example_source_table>` +- `sourceDetails.database` = `{SOURCE_CAT_SCHEMA}` (resolved at pipeline runtime via `dev_substitutions.json`) + +A few values are **hardcoded sensible defaults** the user edits if their data source differs: +- `sourceType` = `delta` +- `quarantineMode` = `off` + +A few variable-length lists **stay as literal `<...>` scaffolding** because they can't be cleanly +prompted (the count varies): +- Schema fields in `{{.example_target_table}}_schema.json.example` +- DQE constraints in `{{.example_target_table}}_dqe.json.example` +- `selectExp` column list in `[standard]{{.example_target_table}}_main.json` +- Extra tokens / `prefix_suffix` entries in `dev_substitutions.json` + +## Extending the template + +To add a new prompt: + +1. Add a property entry to `databricks_template_schema.json` (set `type`, `description`, `default`, + `order`, plus optional `enum`, `pattern`, `pattern_match_failure_message`, `skip_prompt_if`). +2. Reference it in any `.tmpl` file as `{{.your_new_property}}`. +3. Test with `databricks bundle init ./pipeline_bundle_template --output-dir /tmp/init-test` and + inspect the generated bundle. + +To conditionally skip files based on user answers, extend `template/{{.project_name}}/.skip.tmpl`: + +``` +{{- if eq .some_property "value" -}} +{{ skip (printf "path/to/%s" .other_property) }} +{{- end -}} +``` + +The `skip` function takes a glob pattern relative to `template/{{.project_name}}/`. To compose +paths from other properties, use Go template's `printf` — `{{...}}` inside string literals is +**not** re-processed. 
+ +## Verification (manual) + +```bash +# Init with examples +databricks bundle init ./pipeline_bundle_template --output-dir /tmp/test-init + +# Validate +cd /tmp/test-init/<project_name> +databricks bundle validate --target dev + +# Init without examples (verify skip path) +databricks bundle init ./pipeline_bundle_template --output-dir /tmp/test-init-skip +# answer 'no' to include_example_dataflows +# confirm src/dataflows/ is absent +``` diff --git a/pipeline_bundle_template/databricks.yml b/pipeline_bundle_template/databricks.yml deleted file mode 100644 index f728fa0..0000000 --- a/pipeline_bundle_template/databricks.yml +++ /dev/null @@ -1,30 +0,0 @@ -# This is a Databricks asset bundle definition for bronze_sample. -# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. -bundle: - name: bronze_sample - -include: - - resources/*.yml - -variables: - catalog: - description: The target UC catalog - framework_source_path: - description: The full workspace path to the framwework src folder - default: /Workspace/Users//.bundle/dlt_framework//current/files/src - schema: - description: The target UC schema - workspace_host: - description: workspace url used for API calls from Framework (usually same as deployment URL) e.g. https://e2-demo-field-eng.cloud.databricks.com/ - layer: - description: The target layer - default: bronze - logical_env: - description: The logical environment - default: "" - -targets: - # The 'dev' target, for development purposes. This target is the default. 
- dev: - mode: development - default: true diff --git a/pipeline_bundle_template/databricks_template_schema.json b/pipeline_bundle_template/databricks_template_schema.json new file mode 100644 index 0000000..afe3079 --- /dev/null +++ b/pipeline_bundle_template/databricks_template_schema.json @@ -0,0 +1,93 @@ +{ + "welcome_message": "\nWelcome to the Lakeflow Framework pipeline bundle template.\n\nYou'll be prompted for a few details to scaffold a new pipeline bundle.\nDefaults are provided in [brackets]; press Enter to accept them.\n", + "properties": { + "project_name": { + "type": "string", + "description": "Project Name (used as the DAB bundle name and the root folder of the generated project)", + "default": "my_project", + "order": 1, + "pattern": "^[a-z][a-z0-9_]{2,}$", + "pattern_match_failure_message": "Project name must start with a lowercase letter and contain only lowercase letters, digits, and underscores (minimum 3 characters)." + }, + "pipeline_name": { + "type": "string", + "description": "Pipeline Name (used in the initial pipeline resource yml filename and as the dataflow group folder under src/dataflows/)", + "default": "{{.project_name}}", + "order": 2, + "pattern": "^[a-z][a-z0-9_]+$", + "pattern_match_failure_message": "Pipeline name must start with a lowercase letter and contain only lowercase letters, digits, and underscores." 
+ }, + "layer": { + "type": "string", + "description": "Layer (medallion layer for this bundle's pipeline)", + "enum": ["bronze", "silver", "gold"], + "default": "bronze", + "order": 3 + }, + "catalog": { + "type": "string", + "description": "Catalog (target Unity Catalog catalog for this bundle's outputs - baked into the catalog DAB variable default)", + "default": "main", + "order": 4 + }, + "schema": { + "type": "string", + "description": "Schema (target Unity Catalog schema for this bundle's outputs - baked into the schema DAB variable default)", + "default": "{{.project_name}}", + "order": 5 + }, + "include_example_dataflows": { + "type": "string", + "description": "Include Example Dataflow? (recommended for new users)", + "enum": ["yes", "no"], + "default": "yes", + "order": 6 + }, + "example_target_table": { + "type": "string", + "description": "Example Target Table (name of the target table this example dataflow produces - drives dataFlowId, flowGroupId, filenames, etc.)", + "default": "my_target_table", + "order": 7, + "skip_prompt_if": { + "properties": { + "include_example_dataflows": { "const": "no" } + } + } + }, + "example_source_table": { + "type": "string", + "description": "Example Source Table (name of the upstream source table the example dataflow reads from)", + "default": "my_source_table", + "order": 8, + "skip_prompt_if": { + "properties": { + "include_example_dataflows": { "const": "no" } + } + } + }, + "source_catalog": { + "type": "string", + "description": "Source Catalog (Unity Catalog catalog where the example_source_table lives - pre-populated into dev_substitutions.json so the bundle works without manual edits)", + "default": "{{.catalog}}", + "order": 9, + "skip_prompt_if": { + "properties": { + "include_example_dataflows": { "const": "no" } + } + } + }, + "source_schema": { + "type": "string", + "description": "Source Schema (Unity Catalog schema where the example_source_table lives - pre-populated into dev_substitutions.json)", + 
"default": "{{.schema}}", + "order": 10, + "skip_prompt_if": { + "properties": { + "include_example_dataflows": { "const": "no" } + } + } + } + }, + "success_message": "\nProject '{{.project_name}}' created.\n\nNext steps:\n cd {{.project_name}}\n databricks bundle validate --target dev\n databricks bundle deploy --target dev\n\nWhat's left for you to fill in (the variable-length scaffolding):\n - src/dataflows/{{.pipeline_name}}/schemas/{{.example_target_table}}_schema.json.example\n (replace the / placeholders with your actual table columns, then remove the '.example' file suffix)\n - src/dataflows/{{.pipeline_name}}/expectations/{{.example_target_table}}_dqe.json.example\n (define your data quality constraints then remove the '.example' file suffix, or delete the file if not needed)\n - src/pipeline_configs/dev_substitutions.json\n (the SOURCE_CAT_SCHEMA token is already wired up; add more tokens here if you need them)\n - src/pipeline_configs/prod_substitutions.json.example\n (add prefix/suffix config and include more tokens here if you need them, then remove the '.example' file suffix)\n\nThe framework_source_path default in databricks.yml assumes the Lakeflow Framework's\n'dev' target is deployed. Override per-environment in your DAB targets if needed.\n", + "min_databricks_cli_version": "v0.218.0" +} diff --git a/pipeline_bundle_template/fixtures/.gitkeep b/pipeline_bundle_template/fixtures/.gitkeep deleted file mode 100644 index fa25d27..0000000 --- a/pipeline_bundle_template/fixtures/.gitkeep +++ /dev/null @@ -1,22 +0,0 @@ -# Fixtures - -This folder is reserved for fixtures, such as CSV files. 
- -Below is an example of how to load fixtures as a data frame: - -``` -import pandas as pd -import os - -def get_absolute_path(*relative_parts): - if 'dbutils' in globals(): - base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore - path = os.path.normpath(os.path.join(base_dir, *relative_parts)) - return path if path.startswith("/Workspace") else "/Workspace" + path - else: - return os.path.join(*relative_parts) - -csv_file = get_absolute_path("..", "fixtures", "mycsv.csv") -df = pd.read_csv(csv_file) -display(df) -``` diff --git a/pipeline_bundle_template/src/dataflows/PIPELINE_NAME_1/dataflowspec/[flow]TARGET TABLE_main.json b/pipeline_bundle_template/src/dataflows/PIPELINE_NAME_1/dataflowspec/[flow]TARGET TABLE_main.json deleted file mode 100644 index 555d733..0000000 --- a/pipeline_bundle_template/src/dataflows/PIPELINE_NAME_1/dataflowspec/[flow]TARGET TABLE_main.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "dataFlowId": "", - "dataFlowGroup": "", - "dataFlowType": "flow", - "targetFormat": "delta", - "targetDetails": { - "table": "", - "tableProperties": { - "delta.enableChangeDataFeed": "true" - }, - "schemaPath": "_schema.json" - }, - "dataQualityExpectationsEnabled": true, - "quarantineMode": "", - "quarantineTargetDetails": { - "targetFormat": "delta" - }, - "flowGroups": [ - { - "flowGroupId": "", - "flows": { - "f_target": { - "flowType": "append_view", - "flowDetails": { - "targetTable": "", - "sourceView": "" - }, - "views": { - "v_": { - "mode": "stream", - "sourceType": "", - "sourceDetails": { - "database": "{}", - "table": "", - "cdfEnabled": true - } - } - } - } - } - } - ] -} \ No newline at end of file diff --git a/pipeline_bundle_template/src/dataflows/PIPELINE_NAME_1/dataflowspec/[standard]TARGET TABLE_main.json b/pipeline_bundle_template/src/dataflows/PIPELINE_NAME_1/dataflowspec/[standard]TARGET TABLE_main.json deleted file mode 100644 index 3871f44..0000000 --- 
a/pipeline_bundle_template/src/dataflows/PIPELINE_NAME_1/dataflowspec/[standard]TARGET TABLE_main.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "dataFlowId": "", - "dataFlowGroup": "", - "dataFlowType": "standard", - "sourceSystem": "", - "sourceType": "", - "sourceViewName": "", - "sourceDetails": { - "database": "{}", - "table": "", - "cdfEnabled": true, - "selectExp": [ - "", - "", - "" - ] - }, - "mode": "stream", - "targetFormat": "delta", - "targetDetails": { - "table": "", - "tableProperties": { - "delta.enableChangeDataFeed": "true" - }, - "schemaPath": "_schema.json" - }, - "dataQualityExpectationsEnabled": false, - "quarantineMode": "" -} \ No newline at end of file diff --git a/pipeline_bundle_template/src/dataflows/PIPELINE_NAME_1/expectations/TARGET TABLE_dqe.json b/pipeline_bundle_template/src/dataflows/PIPELINE_NAME_1/expectations/TARGET TABLE_dqe.json deleted file mode 100644 index 808265a..0000000 --- a/pipeline_bundle_template/src/dataflows/PIPELINE_NAME_1/expectations/TARGET TABLE_dqe.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "expect_or_drop": [ - { - "name": "", - "constraint": "", - "tag": "", - "enabled": true - }, - { - "name": "", - "constraint": "", - "tag": "", - "enabled": false - } - ], - "expect_or_fail": [ - { - "name": "", - "constraint": "", - "tag": "", - "enabled": true - } - ] -} \ No newline at end of file diff --git a/pipeline_bundle_template/src/dataflows/PIPELINE_NAME_2/dataflowspec/[standard]TARGET TABLE_main.json b/pipeline_bundle_template/src/dataflows/PIPELINE_NAME_2/dataflowspec/[standard]TARGET TABLE_main.json deleted file mode 100644 index 3871f44..0000000 --- a/pipeline_bundle_template/src/dataflows/PIPELINE_NAME_2/dataflowspec/[standard]TARGET TABLE_main.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "dataFlowId": "", - "dataFlowGroup": "", - "dataFlowType": "standard", - "sourceSystem": "", - "sourceType": "", - "sourceViewName": "", - "sourceDetails": { - "database": "{}", - "table": "", - "cdfEnabled": true, - "selectExp": [ - 
"", - "", - "" - ] - }, - "mode": "stream", - "targetFormat": "delta", - "targetDetails": { - "table": "", - "tableProperties": { - "delta.enableChangeDataFeed": "true" - }, - "schemaPath": "_schema.json" - }, - "dataQualityExpectationsEnabled": false, - "quarantineMode": "" -} \ No newline at end of file diff --git a/pipeline_bundle_template/src/dataflows/PIPELINE_NAME_2/schemas/TARGET TABLE_schema.json b/pipeline_bundle_template/src/dataflows/PIPELINE_NAME_2/schemas/TARGET TABLE_schema.json deleted file mode 100644 index 340285d..0000000 --- a/pipeline_bundle_template/src/dataflows/PIPELINE_NAME_2/schemas/TARGET TABLE_schema.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "type": "struct", - "fields": [ - { - "name": "", - "type": "", - "nullable": true, - "metadata": {} - }, - { - "name": "", - "type": "", - "nullable": false, - "metadata": {} - } - ] -} \ No newline at end of file diff --git a/pipeline_bundle_template/.gitignore b/pipeline_bundle_template/template/{{.project_name}}/.gitignore similarity index 100% rename from pipeline_bundle_template/.gitignore rename to pipeline_bundle_template/template/{{.project_name}}/.gitignore diff --git a/pipeline_bundle_template/template/{{.project_name}}/.skip.tmpl b/pipeline_bundle_template/template/{{.project_name}}/.skip.tmpl new file mode 100644 index 0000000..811c735 --- /dev/null +++ b/pipeline_bundle_template/template/{{.project_name}}/.skip.tmpl @@ -0,0 +1,4 @@ +{{ skip ".skip" -}} +{{- if eq .include_example_dataflows "no" -}} +{{ skip (printf "src/dataflows/%s" .pipeline_name) }} +{{- end -}} diff --git a/pipeline_bundle_template/template/{{.project_name}}/README.md.tmpl b/pipeline_bundle_template/template/{{.project_name}}/README.md.tmpl new file mode 100644 index 0000000..62a145c --- /dev/null +++ b/pipeline_bundle_template/template/{{.project_name}}/README.md.tmpl @@ -0,0 +1,59 @@ +# {{.project_name}} + +A Lakeflow Framework pipeline bundle scaffolded from the `pipeline_bundle_template` custom template. 
+ +## Layout + +``` +{{.project_name}}/ +├── databricks.yml # bundle config + runtime DAB variables +├── resources/ +│ └── {{.pipeline_name}}_pipeline.yml # SDP/DLT pipeline resource +└── src/ + ├── dataflows/{{.pipeline_name}}/ # DataflowSpec JSON files + │ ├── dataflowspec/ # [flow] and [standard] specs + │ ├── schemas/ # Spark schema definitions + │ └── expectations/ # Data quality constraints + ├── pipeline_configs/ + │ └── dev_substitutions.json # token substitution map + ├── init/{pre,post}/ # optional lifecycle scripts + ├── python/ # user Python modules + └── libraries/ # wheels / loose modules +``` + +## Prerequisites + +The Lakeflow Framework must already be deployed to your workspace at the path referenced by the +`framework_source_path` DAB variable. By default this points at the framework's `dev` target: +`/Workspace/Users/<user_name>/.bundle/lakeflow_framework/dev/current/files/src`. Override per-environment +in your DAB targets if you need a different framework deployment. + +## What's left to fill in + +The template substituted every single-value placeholder for you, but four spots are +variable-length lists you'll edit by hand: + +1. **Schema fields** — `src/dataflows/{{.pipeline_name}}/schemas/{{.example_target_table}}_schema.json.example` — + replace the placeholder `name` / `type` entries with your actual columns, then remove the `.example` suffix. +2. **DQE constraints** — `src/dataflows/{{.pipeline_name}}/expectations/{{.example_target_table}}_dqe.json.example` — + define your data quality `expect_or_drop` / `expect_or_fail` constraints and remove the `.example` suffix (or delete the file). +3. **`selectExp` columns** — `src/dataflows/{{.pipeline_name}}/dataflowspec/[standard]{{.example_target_table}}_main.json` — + the generated spec has no `selectExp` list; add one under `sourceDetails` if you need to project specific source columns. +4. 
**`dev_substitutions.json`** — `src/pipeline_configs/dev_substitutions.json` — the `SOURCE_CAT_SCHEMA` + token is already wired up to `{{.source_catalog}}.{{.source_schema}}`; add more tokens here only + if your dataflow JSONs reference them. + +## Getting started + +1. 
Install the Databricks CLI: https://docs.databricks.com/dev-tools/cli/databricks-cli.html +2. Authenticate to your workspace: `databricks configure` +3. Validate the bundle: `databricks bundle validate --target dev` +4. Deploy: `databricks bundle deploy --target dev` +5. Run the pipeline: `databricks bundle run` + +## Targets + +- `dev` (default) — development mode, deployments are scoped to your user. +- `prod` — production mode; override variables per-environment as needed. + +See https://docs.databricks.com/dev-tools/bundles/index.html for the DAB reference. diff --git a/pipeline_bundle_template/template/{{.project_name}}/databricks.yml.tmpl b/pipeline_bundle_template/template/{{.project_name}}/databricks.yml.tmpl new file mode 100644 index 0000000..ba65763 --- /dev/null +++ b/pipeline_bundle_template/template/{{.project_name}}/databricks.yml.tmpl @@ -0,0 +1,37 @@ +# Databricks asset bundle definition for {{.project_name}}. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: {{.project_name}} + +include: + - resources/*.yml + +variables: + catalog: + description: The target UC catalog + default: {{.catalog}} + schema: + description: The target UC schema + default: {{.schema}} + framework_source_path: + description: The full workspace path to the lakeflow_framework src folder + default: /Workspace/Users/{{user_name}}/.bundle/lakeflow_framework/dev/current/files/src + workspace_host: + description: Workspace url used for API calls from the framework (usually the same as the deployment URL) + default: {{workspace_host}} + layer: + description: The medallion layer (bronze, silver, or gold) + default: {{.layer}} + logical_env: + description: Optional suffix appended to pipeline names to distinguish logical environments + default: "" + +targets: + # 'dev' target — development mode, default. + dev: + mode: development + default: true + + # 'prod' target — override variables per-environment as needed. 
+ prod: + mode: production diff --git a/pipeline_bundle_template/template/{{.project_name}}/fixtures/.gitkeep b/pipeline_bundle_template/template/{{.project_name}}/fixtures/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/pipeline_bundle_template/pytest.ini b/pipeline_bundle_template/template/{{.project_name}}/pytest.ini similarity index 100% rename from pipeline_bundle_template/pytest.ini rename to pipeline_bundle_template/template/{{.project_name}}/pytest.ini diff --git a/pipeline_bundle_template/resources/PIPELINE NAME_pipeline.yml b/pipeline_bundle_template/template/{{.project_name}}/resources/{{.pipeline_name}}_pipeline.yml.tmpl similarity index 67% rename from pipeline_bundle_template/resources/PIPELINE NAME_pipeline.yml rename to pipeline_bundle_template/template/{{.project_name}}/resources/{{.pipeline_name}}_pipeline.yml.tmpl index 20f52fa..06968eb 100644 --- a/pipeline_bundle_template/resources/PIPELINE NAME_pipeline.yml +++ b/pipeline_bundle_template/template/{{.project_name}}/resources/{{.pipeline_name}}_pipeline.yml.tmpl @@ -1,7 +1,7 @@ resources: pipelines: - dlt_framework__pipeline: - name: dlt_framework__pipeline${var.logical_env} + dlt_framework_{{.pipeline_name}}_pipeline: + name: dlt_framework_{{.pipeline_name}}_pipeline${var.logical_env} channel: CURRENT serverless: true catalog: ${var.catalog} @@ -11,10 +11,10 @@ resources: path: ${var.framework_source_path}/dlt_pipeline configuration: - bundle.sourcePath: /Workspace/${workspace.file_path}/src + bundle.sourcePath: ${workspace.file_path}/src bundle.target: ${bundle.target} framework.sourcePath: ${var.framework_source_path} workspace.host: ${var.workspace_host} pipeline.layer: ${var.layer} logicalEnv: ${var.logical_env} - pipeline.dataFlowIdFilter: + pipeline.dataFlowGroupFilter: {{.pipeline_name}} diff --git a/pipeline_bundle_template/src/dataflows/PIPELINE_NAME_2/dataflowspec/[flow]TARGET TABLE_main.json 
b/pipeline_bundle_template/template/{{.project_name}}/src/dataflows/{{.pipeline_name}}/dataflowspec/[flow]{{.example_target_table}}_main.json.tmpl similarity index 53% rename from pipeline_bundle_template/src/dataflows/PIPELINE_NAME_2/dataflowspec/[flow]TARGET TABLE_main.json rename to pipeline_bundle_template/template/{{.project_name}}/src/dataflows/{{.pipeline_name}}/dataflowspec/[flow]{{.example_target_table}}_main.json.tmpl index 555d733..aaf4677 100644 --- a/pipeline_bundle_template/src/dataflows/PIPELINE_NAME_2/dataflowspec/[flow]TARGET TABLE_main.json +++ b/pipeline_bundle_template/template/{{.project_name}}/src/dataflows/{{.pipeline_name}}/dataflowspec/[flow]{{.example_target_table}}_main.json.tmpl @@ -1,37 +1,37 @@ { - "dataFlowId": "", - "dataFlowGroup": "", + "dataFlowId": "{{.example_target_table}}_flow", + "dataFlowGroup": "{{.pipeline_name}}", "dataFlowType": "flow", "targetFormat": "delta", "targetDetails": { - "table": "", + "table": "{{.example_target_table}}", "tableProperties": { "delta.enableChangeDataFeed": "true" }, - "schemaPath": "_schema.json" + "schemaPath": "{{.example_target_table}}_schema.json" }, "dataQualityExpectationsEnabled": true, - "quarantineMode": "", + "quarantineMode": "off", "quarantineTargetDetails": { "targetFormat": "delta" }, "flowGroups": [ { - "flowGroupId": "", + "flowGroupId": "fg_{{.example_target_table}}", "flows": { "f_target": { "flowType": "append_view", "flowDetails": { - "targetTable": "", - "sourceView": "" + "targetTable": "{{.example_target_table}}", + "sourceView": "v_{{.example_source_table}}" }, "views": { - "v_": { + "v_{{.example_source_table}}": { "mode": "stream", - "sourceType": "", + "sourceType": "delta", "sourceDetails": { - "database": "{}", - "table": "", + "database": "{SOURCE_CAT_SCHEMA}", + "table": "{{.example_source_table}}", "cdfEnabled": true } } @@ -40,4 +40,4 @@ } } ] -} \ No newline at end of file +} diff --git 
a/pipeline_bundle_template/template/{{.project_name}}/src/dataflows/{{.pipeline_name}}/dataflowspec/[standard]{{.example_target_table}}_main.json.tmpl b/pipeline_bundle_template/template/{{.project_name}}/src/dataflows/{{.pipeline_name}}/dataflowspec/[standard]{{.example_target_table}}_main.json.tmpl new file mode 100644 index 0000000..d2ba9ae --- /dev/null +++ b/pipeline_bundle_template/template/{{.project_name}}/src/dataflows/{{.pipeline_name}}/dataflowspec/[standard]{{.example_target_table}}_main.json.tmpl @@ -0,0 +1,24 @@ +{ + "dataFlowId": "{{.example_target_table}}_standard", + "dataFlowGroup": "{{.pipeline_name}}", + "dataFlowType": "standard", + "sourceSystem": "example", + "sourceType": "delta", + "sourceViewName": "v_{{.example_source_table}}", + "sourceDetails": { + "database": "{SOURCE_CAT_SCHEMA}", + "table": "{{.example_source_table}}", + "cdfEnabled": true + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "{{.example_target_table}}", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "{{.example_target_table}}_schema.json" + }, + "dataQualityExpectationsEnabled": false, + "quarantineMode": "off" +} diff --git a/pipeline_bundle_template/src/dataflows/PIPELINE_NAME_2/expectations/TARGET TABLE_dqe.json b/pipeline_bundle_template/template/{{.project_name}}/src/dataflows/{{.pipeline_name}}/expectations/{{.example_target_table}}_dqe.json.example similarity index 99% rename from pipeline_bundle_template/src/dataflows/PIPELINE_NAME_2/expectations/TARGET TABLE_dqe.json rename to pipeline_bundle_template/template/{{.project_name}}/src/dataflows/{{.pipeline_name}}/expectations/{{.example_target_table}}_dqe.json.example index 808265a..e99236c 100644 --- a/pipeline_bundle_template/src/dataflows/PIPELINE_NAME_2/expectations/TARGET TABLE_dqe.json +++ b/pipeline_bundle_template/template/{{.project_name}}/src/dataflows/{{.pipeline_name}}/expectations/{{.example_target_table}}_dqe.json.example @@ -21,4 
+21,4 @@ "enabled": true } ] -} \ No newline at end of file +} diff --git a/pipeline_bundle_template/src/dataflows/PIPELINE_NAME_1/schemas/TARGET TABLE_schema.json b/pipeline_bundle_template/template/{{.project_name}}/src/dataflows/{{.pipeline_name}}/schemas/{{.example_target_table}}_schema.json.example similarity index 99% rename from pipeline_bundle_template/src/dataflows/PIPELINE_NAME_1/schemas/TARGET TABLE_schema.json rename to pipeline_bundle_template/template/{{.project_name}}/src/dataflows/{{.pipeline_name}}/schemas/{{.example_target_table}}_schema.json.example index 340285d..e3729b4 100644 --- a/pipeline_bundle_template/src/dataflows/PIPELINE_NAME_1/schemas/TARGET TABLE_schema.json +++ b/pipeline_bundle_template/template/{{.project_name}}/src/dataflows/{{.pipeline_name}}/schemas/{{.example_target_table}}_schema.json.example @@ -14,4 +14,4 @@ "metadata": {} } ] -} \ No newline at end of file +} diff --git a/pipeline_bundle_template/src/init/post/README.md b/pipeline_bundle_template/template/{{.project_name}}/src/init/post/README.md similarity index 100% rename from pipeline_bundle_template/src/init/post/README.md rename to pipeline_bundle_template/template/{{.project_name}}/src/init/post/README.md diff --git a/pipeline_bundle_template/src/init/pre/README.md b/pipeline_bundle_template/template/{{.project_name}}/src/init/pre/README.md similarity index 100% rename from pipeline_bundle_template/src/init/pre/README.md rename to pipeline_bundle_template/template/{{.project_name}}/src/init/pre/README.md diff --git a/pipeline_bundle_template/src/libraries/README.md b/pipeline_bundle_template/template/{{.project_name}}/src/libraries/README.md similarity index 100% rename from pipeline_bundle_template/src/libraries/README.md rename to pipeline_bundle_template/template/{{.project_name}}/src/libraries/README.md diff --git a/pipeline_bundle_template/template/{{.project_name}}/src/pipeline_configs/dev_substitutions.json.tmpl 
b/pipeline_bundle_template/template/{{.project_name}}/src/pipeline_configs/dev_substitutions.json.tmpl new file mode 100644 index 0000000..2e08c11 --- /dev/null +++ b/pipeline_bundle_template/template/{{.project_name}}/src/pipeline_configs/dev_substitutions.json.tmpl @@ -0,0 +1,5 @@ +{ + "tokens": { + "SOURCE_CAT_SCHEMA": "{{.source_catalog}}.{{.source_schema}}" + } +} diff --git a/pipeline_bundle_template/src/pipeline_configs/ENV_substitutions.json b/pipeline_bundle_template/template/{{.project_name}}/src/pipeline_configs/prod_substitutions.json.example.tmpl similarity index 58% rename from pipeline_bundle_template/src/pipeline_configs/ENV_substitutions.json rename to pipeline_bundle_template/template/{{.project_name}}/src/pipeline_configs/prod_substitutions.json.example.tmpl index ec2d947..368d59f 100644 --- a/pipeline_bundle_template/src/pipeline_configs/ENV_substitutions.json +++ b/pipeline_bundle_template/template/{{.project_name}}/src/pipeline_configs/prod_substitutions.json.example.tmpl @@ -1,7 +1,7 @@ { "tokens": { - "": "", - "": "" + "SOURCE_CAT_SCHEMA": "{{.source_catalog}}.{{.source_schema}}", + "": "" }, "prefix_suffix": { "": { @@ -9,4 +9,4 @@ "suffix": "" } } -} \ No newline at end of file +} diff --git a/pipeline_bundle_template/src/python/README.md b/pipeline_bundle_template/template/{{.project_name}}/src/python/README.md similarity index 100% rename from pipeline_bundle_template/src/python/README.md rename to pipeline_bundle_template/template/{{.project_name}}/src/python/README.md diff --git a/pipeline_bundle_template/template/{{.project_name}}/tests/main_test.py b/pipeline_bundle_template/template/{{.project_name}}/tests/main_test.py new file mode 100644 index 0000000..5a96945 --- /dev/null +++ b/pipeline_bundle_template/template/{{.project_name}}/tests/main_test.py @@ -0,0 +1,2 @@ +def test_main(): + assert 1 == 1 diff --git a/pipeline_bundle_template/tests/main_test.py b/pipeline_bundle_template/tests/main_test.py deleted file mode 100644
index 333ffa3..0000000 --- a/pipeline_bundle_template/tests/main_test.py +++ /dev/null @@ -1,6 +0,0 @@ -from bronze_sample.main import get_taxis, get_spark - - -def test_main(): - taxis = get_taxis(get_spark()) - assert taxis.count() > 5 From e5545034a1f5585b5b36a72baaa576288b30a6f1 Mon Sep 17 00:00:00 2001 From: Liam Perritt Date: Wed, 27 May 2026 16:34:00 +1000 Subject: [PATCH 2/3] Bump version to v0.16.0 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 5277a0e..49dd460 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v0.15.3 \ No newline at end of file +v0.16.0 \ No newline at end of file From 87c4d26c321ddf6de2276d526279e7670e4004d1 Mon Sep 17 00:00:00 2001 From: Liam Perritt Date: Wed, 27 May 2026 16:40:57 +1000 Subject: [PATCH 3/3] Set dataQualityExpectationsEnabled to false --- .../dataflowspec/[flow]{{.example_target_table}}_main.json.tmpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline_bundle_template/template/{{.project_name}}/src/dataflows/{{.pipeline_name}}/dataflowspec/[flow]{{.example_target_table}}_main.json.tmpl b/pipeline_bundle_template/template/{{.project_name}}/src/dataflows/{{.pipeline_name}}/dataflowspec/[flow]{{.example_target_table}}_main.json.tmpl index aaf4677..b86a398 100644 --- a/pipeline_bundle_template/template/{{.project_name}}/src/dataflows/{{.pipeline_name}}/dataflowspec/[flow]{{.example_target_table}}_main.json.tmpl +++ b/pipeline_bundle_template/template/{{.project_name}}/src/dataflows/{{.pipeline_name}}/dataflowspec/[flow]{{.example_target_table}}_main.json.tmpl @@ -10,7 +10,7 @@ }, "schemaPath": "{{.example_target_table}}_schema.json" }, - "dataQualityExpectationsEnabled": true, + "dataQualityExpectationsEnabled": false, "quarantineMode": "off", "quarantineTargetDetails": { "targetFormat": "delta"