From 98d001d363b041e784f6ae8d41ea5e9d0b687dea Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Tue, 19 Nov 2024 08:21:53 +0100 Subject: [PATCH 01/50] add macro to process new prejoin list syntax --- .../helpers/stage_processing_macros.sql | 73 ++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) diff --git a/macros/internal/helpers/stage_processing_macros.sql b/macros/internal/helpers/stage_processing_macros.sql index 6ecf2676..75a90978 100644 --- a/macros/internal/helpers/stage_processing_macros.sql +++ b/macros/internal/helpers/stage_processing_macros.sql @@ -123,4 +123,75 @@ {%- endif %} {%- endfor -%} -{%- endmacro -%} \ No newline at end of file +{%- endmacro -%} + + +{%- macro process_prejoined_columns(prejoined_columns=none) -%} +{# Check if the new list syntax is used for prejoined columns + If so parse it to dictionaries #} + +{% if not datavault4dbt.is_list(prejoined_columns) %} + {% do return(prejoined_columns) %} +{% else %} + {# if the (new) list syntax for prejoins is used + it needs to be converted to the old syntax #} + + {# Initialize emtpy dict which will be filled by each entry #} + {% set return_dict = {} %} + + {# Iterate over each dictionary in the prejoined_colums-list #} + {% for dict_item in prejoined_columns %} + + {# If column aliases are present they they have to map 1:1 to the extract_columns #} + {% if datavault4dbt.is_something(dict_item.aliases) + and not dict_item.aliases|length == dict_item.extract_columns|length%} + {{ exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns") }} + {% endif %} + + {# If multiple columns from the same source should be extracted each column has to be processed once #} + {% if datavault4dbt.is_list(dict_item.extract_columns) %} + {% for column in dict_item.extract_columns %} + {# If aliases are defined they should be used as dict keys + These will be used as new column names #} + {% if 
datavault4dbt.is_something(dict_item.aliases) %} + {% set dict_key = dict_item.aliases[loop.index-1] %} + {% else %} + {% set dict_key = dict_item.extract_columns[loop.index-1] %} + {% endif %} + + {% set tmp_dict %} + {{dict_key}}: + ref_model: {{dict_item.ref_model}} + bk: {{dict_item.extract_columns[loop.index-1]}} + this_column_name: {{dict_item.this_column_name}} + ref_column_name: {{dict_item.ref_column_name}} + {% endset %} + {% do return_dict.update(fromyaml(tmp_dict)) %} + {% endfor %} + + {% else %} + + {# If aliases are defined they should be used as dict keys + These will be used as new column names #} + {% if datavault4dbt.is_something(dict_item.aliases) %} + {% set dict_key = dict_item.aliases[loop.index-1] %} + {% else %} + {% set dict_key = dict_item.extract_columns[loop.index-1] %} + {% endif %} + + {% set tmp_dict %} + {{dict_key}}: + ref_model: {{dict_item.ref_model}} + bk: {{dict_item.extract_columns[loop.index-1]}} + this_column_name: {{dict_item.this_column_name}} + ref_column_name: {{dict_item.ref_column_name}} + {% endset %} + {% do return_dict.update(fromyaml(tmp_dict)) %} + {% endif %} + {% endfor %} + + {%- do return(return_dict) -%} + +{% endif %} + +{%- endmacro -%} From 4b0e02d5c27b82decb0c396318b26aae08000229 Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Tue, 19 Nov 2024 08:38:16 +0100 Subject: [PATCH 02/50] add process_prejoined_columns macro to top-level stage macro --- macros/staging/stage.sql | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/macros/staging/stage.sql b/macros/staging/stage.sql index 76b17ed0..403df72c 100644 --- a/macros/staging/stage.sql +++ b/macros/staging/stage.sql @@ -120,6 +120,11 @@ {%- if datavault4dbt.is_nothing(ldts) -%} {%- set ldts = datavault4dbt.current_timestamp() -%} {%- endif -%} + + {# To parse the list syntax of prejoined columns #} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {%- set prejoined_columns = 
datavault4dbt.process_prejoined_columns(prejoined_columns) -%} + {%- endif -%} {{- adapter.dispatch('stage', 'datavault4dbt')(include_source_columns=include_source_columns, ldts=ldts, From 07ec2dec22339ed4c54a70ce0f38b53ddb0eb13d Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 20 Nov 2024 18:29:51 +0100 Subject: [PATCH 03/50] change prejoin-logic to perform less joins --- macros/staging/bigquery/stage.sql | 37 +++++++++++++++++--------- macros/staging/databricks/stage.sql | 38 +++++++++++++++++---------- macros/staging/exasol/stage.sql | 37 +++++++++++++++++--------- macros/staging/fabric/stage.sql | 40 +++++++++++++++++++---------- macros/staging/oracle/stage.sql | 37 +++++++++++++++++--------- macros/staging/postgres/stage.sql | 37 +++++++++++++++++--------- macros/staging/redshift/stage.sql | 38 +++++++++++++++++---------- macros/staging/snowflake/stage.sql | 37 +++++++++++++++++--------- macros/staging/synapse/stage.sql | 39 ++++++++++++++++++---------- 9 files changed, 226 insertions(+), 114 deletions(-) diff --git a/macros/staging/bigquery/stage.sql b/macros/staging/bigquery/stage.sql index 8c94c387..0486c5f2 100644 --- a/macros/staging/bigquery/stage.sql +++ b/macros/staging/bigquery/stage.sql @@ -256,6 +256,7 @@ missing_columns AS ( ), {%- endif -%} + {%- if datavault4dbt.is_something(prejoined_columns) %} {# Prejoining Business Keys of other source objects for Link purposes #} prejoined_columns AS ( @@ -263,14 +264,12 @@ prejoined_columns AS ( SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte + {%- endif -%} - {% for col, vals in prejoined_columns.items() %} + {#- prepare join statements -#} + {%- 
set prejoin_statements_list = [] -%} + {%- set processed_prejoin_hashes = [] -%} + {%- for col, vals in prejoined_columns.items() -%} {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} @@ -309,15 +308,29 @@ prejoined_columns AS ( {%- set operator = vals['operator'] -%} {%- endif -%} - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - left join {{ relation }} as {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + {%- set prejoin_hash = '`' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '`' -%} - {% endfor %} + {%- if not prejoin_hash in processed_prejoin_hashes %} + {%- do processed_prejoin_hashes.append(prejoin_hash) %} + {%- set prejoin_join_statement_tmp -%} + left join {{ relation }} as {{ prejoin_hash }} + on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + + {% endset -%} + {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- endif -%} + +{# select the prejoined columns #} + ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} + {% endfor -%} + + FROM {{ last_cte }} lcte + + {{ prejoin_statements_list|join(' ')}} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} diff --git a/macros/staging/databricks/stage.sql b/macros/staging/databricks/stage.sql index fc76044b..a3ff3b28 100644 --- a/macros/staging/databricks/stage.sql +++ b/macros/staging/databricks/stage.sql @@ -253,6 +253,7 @@ missing_columns AS ( ), {%- endif -%} + {%- if datavault4dbt.is_something(prejoined_columns) 
%} {# Prejoining Business Keys of other source objects for Link purposes #} prejoined_columns AS ( @@ -260,14 +261,12 @@ prejoined_columns AS ( SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte + {%- endif -%} - {% for col, vals in prejoined_columns.items() %} + {#- prepare join statements -#} + {%- set prejoin_statements_list = [] -%} + {%- set processed_prejoin_hashes = [] -%} + {%- for col, vals in prejoined_columns.items() -%} {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} @@ -306,19 +305,32 @@ prejoined_columns AS ( {%- set operator = vals['operator'] -%} {%- endif -%} - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - left join {{ relation }} as {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + {%- set prejoin_hash = '`' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '`' -%} - {% endfor %} + {%- if not prejoin_hash in processed_prejoin_hashes %} + {%- do processed_prejoin_hashes.append(prejoin_hash) %} + {%- set prejoin_join_statement_tmp -%} + left join {{ relation }} as {{ prejoin_hash }} + on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + + {% endset -%} + {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- endif -%} + +{# select the prejoined columns #} + ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} + {% endfor -%} + + 
FROM {{ last_cte }} lcte + + {{ prejoin_statements_list|join(' ')}} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} - {%- if datavault4dbt.is_something(derived_columns) %} {# Adding derived columns to the selection #} derived_columns AS ( diff --git a/macros/staging/exasol/stage.sql b/macros/staging/exasol/stage.sql index 50dd35d3..9812f462 100644 --- a/macros/staging/exasol/stage.sql +++ b/macros/staging/exasol/stage.sql @@ -244,6 +244,7 @@ missing_columns AS ( ), {%- endif -%} + {%- if datavault4dbt.is_something(prejoined_columns) %} {# Prejoining Business Keys of other source objects for Link purposes #} prejoined_columns AS ( @@ -251,14 +252,12 @@ prejoined_columns AS ( SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{ vals['bk'] }} AS "{{ col | upper }}" - {% endfor -%} - - FROM {{ last_cte }} lcte + {%- endif -%} - {% for col, vals in prejoined_columns.items() %} + {#- prepare join statements -#} + {%- set prejoin_statements_list = [] -%} + {%- set processed_prejoin_hashes = [] -%} + {%- for col, vals in prejoined_columns.items() -%} {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} @@ -297,15 +296,29 @@ prejoined_columns AS ( {%- set operator = vals['operator'] -%} {%- endif -%} - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - left join {{ relation }} as {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, 
right_columns=vals['ref_column_name']) }} + {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - {% endfor %} + {%- if not prejoin_hash in processed_prejoin_hashes %} + {%- do processed_prejoin_hashes.append(prejoin_hash) %} + {%- set prejoin_join_statement_tmp -%} + left join {{ relation }} as {{ prejoin_hash }} + on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + + {% endset -%} + {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- endif -%} + +{# select the prejoined columns #} + ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} + {% endfor -%} + + FROM {{ last_cte }} lcte + + {{ prejoin_statements_list|join(' ')}} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} diff --git a/macros/staging/fabric/stage.sql b/macros/staging/fabric/stage.sql index 605f6861..d042d7a9 100644 --- a/macros/staging/fabric/stage.sql +++ b/macros/staging/fabric/stage.sql @@ -253,23 +253,21 @@ missing_columns AS ( ), {%- endif -%} -{%- if datavault4dbt.is_something(prejoined_columns) %} -{%- set final_columns_to_select = (final_columns_to_select + derived_input_columns) | unique | list -%} +{%- if datavault4dbt.is_something(prejoined_columns) %} {# Prejoining Business Keys of other source objects for Link purposes #} + prejoined_columns AS ( SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{datavault4dbt.escape_column_names(vals['bk'])}} AS 
{{datavault4dbt.escape_column_names(col)}} - {% endfor -%} - - FROM {{ last_cte }} lcte + {%- endif -%} - {% for col, vals in prejoined_columns.items() %} + {#- prepare join statements -#} + {%- set prejoin_statements_list = [] -%} + {%- set processed_prejoin_hashes = [] -%} + {%- for col, vals in prejoined_columns.items() -%} {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} @@ -308,15 +306,29 @@ prejoined_columns AS ( {%- set operator = vals['operator'] -%} {%- endif -%} - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - left join {{ relation }} as {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=datavault4dbt.escape_column_names(vals['this_column_name']), prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=datavault4dbt.escape_column_names(vals['ref_column_name'])) }} + {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - {% endfor %} + {%- if not prejoin_hash in processed_prejoin_hashes %} + {%- do processed_prejoin_hashes.append(prejoin_hash) %} + {%- set prejoin_join_statement_tmp -%} + left join {{ relation }} as {{ prejoin_hash }} + on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + + {% endset -%} + {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- endif -%} + +{# select the prejoined columns #} + ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} + {% endfor -%} + + FROM {{ last_cte }} lcte + + {{ prejoin_statements_list|join(' ')}} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} diff --git a/macros/staging/oracle/stage.sql 
b/macros/staging/oracle/stage.sql index c2be1409..40afc202 100644 --- a/macros/staging/oracle/stage.sql +++ b/macros/staging/oracle/stage.sql @@ -263,6 +263,7 @@ missing_columns AS ( ), {%- endif -%} + {%- if datavault4dbt.is_something(prejoined_columns) %} {# Prejoining Business Keys of other source objects for Link purposes #} prejoined_columns AS ( @@ -270,14 +271,12 @@ prejoined_columns AS ( SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte + {%- endif -%} - {% for col, vals in prejoined_columns.items() %} + {#- prepare join statements -#} + {%- set prejoin_statements_list = [] -%} + {%- set processed_prejoin_hashes = [] -%} + {%- for col, vals in prejoined_columns.items() -%} {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} @@ -316,15 +315,29 @@ prejoined_columns AS ( {%- set operator = vals['operator'] -%} {%- endif -%} - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - left join {{ relation }} {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - {% endfor %} + {%- if not prejoin_hash in processed_prejoin_hashes %} + {%- do processed_prejoin_hashes.append(prejoin_hash) %} + {%- set prejoin_join_statement_tmp -%} + left join {{ relation }} as {{ prejoin_hash }} + on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, 
right_columns=vals['ref_column_name']) }} + + {% endset -%} + {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- endif -%} + +{# select the prejoined columns #} + ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} + {% endfor -%} + + FROM {{ last_cte }} lcte + + {{ prejoin_statements_list|join(' ')}} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} diff --git a/macros/staging/postgres/stage.sql b/macros/staging/postgres/stage.sql index 9edd3a38..1f4ef548 100644 --- a/macros/staging/postgres/stage.sql +++ b/macros/staging/postgres/stage.sql @@ -256,6 +256,7 @@ missing_columns AS ( ), {%- endif -%} + {%- if datavault4dbt.is_something(prejoined_columns) %} {# Prejoining Business Keys of other source objects for Link purposes #} prejoined_columns AS ( @@ -263,14 +264,12 @@ prejoined_columns AS ( SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte + {%- endif -%} - {% for col, vals in prejoined_columns.items() %} + {#- prepare join statements -#} + {%- set prejoin_statements_list = [] -%} + {%- set processed_prejoin_hashes = [] -%} + {%- for col, vals in prejoined_columns.items() -%} {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} @@ -309,15 +308,29 @@ prejoined_columns AS ( {%- set operator = vals['operator'] -%} {%- endif -%} - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - left join {{ relation }} as {{ prejoin_alias }} - on {{ 
datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - {% endfor %} + {%- if not prejoin_hash in processed_prejoin_hashes %} + {%- do processed_prejoin_hashes.append(prejoin_hash) %} + {%- set prejoin_join_statement_tmp -%} + left join {{ relation }} as {{ prejoin_hash }} + on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + + {% endset -%} + {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- endif -%} + +{# select the prejoined columns #} + ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} + {% endfor -%} + + FROM {{ last_cte }} lcte + + {{ prejoin_statements_list|join(' ')}} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} diff --git a/macros/staging/redshift/stage.sql b/macros/staging/redshift/stage.sql index a7704c03..6c9238b0 100644 --- a/macros/staging/redshift/stage.sql +++ b/macros/staging/redshift/stage.sql @@ -260,16 +260,14 @@ missing_columns AS ( prejoined_columns AS ( SELECT - {% if final_columns_to_select | length > 0 -%} + {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte + {%- endif -%} - {% for col, vals in prejoined_columns.items() %} + {#- prepare join statements -#} + {%- set 
prejoin_statements_list = [] -%} + {%- set processed_prejoin_hashes = [] -%} + {%- for col, vals in prejoined_columns.items() -%} {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} @@ -308,15 +306,29 @@ prejoined_columns AS ( {%- set operator = vals['operator'] -%} {%- endif -%} - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - left join {{ relation }} as {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - {% endfor %} + {%- if not prejoin_hash in processed_prejoin_hashes %} + {%- do processed_prejoin_hashes.append(prejoin_hash) %} + {%- set prejoin_join_statement_tmp -%} + left join {{ relation }} as {{ prejoin_hash }} + on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + + {% endset -%} + {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- endif -%} + +{# select the prejoined columns #} + ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} + {% endfor -%} + + FROM {{ last_cte }} lcte + + {{ prejoin_statements_list|join(' ')}} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} diff --git a/macros/staging/snowflake/stage.sql b/macros/staging/snowflake/stage.sql index 956c632e..ed4c9d22 100644 --- a/macros/staging/snowflake/stage.sql +++ b/macros/staging/snowflake/stage.sql @@ -263,6 +263,7 @@ missing_columns AS ( ), {%- endif -%} + {%- if datavault4dbt.is_something(prejoined_columns) %} {# 
Prejoining Business Keys of other source objects for Link purposes #} prejoined_columns AS ( @@ -270,14 +271,12 @@ prejoined_columns AS ( SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte + {%- endif -%} - {% for col, vals in prejoined_columns.items() %} + {#- prepare join statements -#} + {%- set prejoin_statements_list = [] -%} + {%- set processed_prejoin_hashes = [] -%} + {%- for col, vals in prejoined_columns.items() -%} {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} @@ -316,15 +315,29 @@ prejoined_columns AS ( {%- set operator = vals['operator'] -%} {%- endif -%} - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - left join {{ relation }} as {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - {% endfor %} + {%- if not prejoin_hash in processed_prejoin_hashes %} + {%- do processed_prejoin_hashes.append(prejoin_hash) %} + {%- set prejoin_join_statement_tmp -%} + left join {{ relation }} as {{ prejoin_hash }} + on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + + {% endset -%} + {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- endif -%} + +{# select the prejoined columns #} + ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} + {% endfor -%} + + FROM {{ 
last_cte }} lcte + + {{ prejoin_statements_list|join(' ')}} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} diff --git a/macros/staging/synapse/stage.sql b/macros/staging/synapse/stage.sql index 84edee88..88356508 100644 --- a/macros/staging/synapse/stage.sql +++ b/macros/staging/synapse/stage.sql @@ -255,23 +255,20 @@ missing_columns AS ( ), {%- endif -%} -{%- if datavault4dbt.is_something(prejoined_columns) %} -{%- set final_columns_to_select = (final_columns_to_select + derived_input_columns) | unique | list -%} +{%- if datavault4dbt.is_something(prejoined_columns) %} {# Prejoining Business Keys of other source objects for Link purposes #} prejoined_columns AS ( SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte + {%- endif -%} - {% for col, vals in prejoined_columns.items() %} + {#- prepare join statements -#} + {%- set prejoin_statements_list = [] -%} + {%- set processed_prejoin_hashes = [] -%} + {%- for col, vals in prejoined_columns.items() -%} {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} @@ -310,15 +307,29 @@ prejoined_columns AS ( {%- set operator = vals['operator'] -%} {%- endif -%} - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - left join {{ relation }} as {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + {%- set 
prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - {% endfor %} + {%- if not prejoin_hash in processed_prejoin_hashes %} + {%- do processed_prejoin_hashes.append(prejoin_hash) %} + {%- set prejoin_join_statement_tmp -%} + left join {{ relation }} as {{ prejoin_hash }} + on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + + {% endset -%} + {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- endif -%} + +{# select the prejoined columns #} + ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} + {% endfor -%} + + FROM {{ last_cte }} lcte + + {{ prejoin_statements_list|join(' ')}} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} From fa4087e5aad952b5038305f972dbb9448f0982b3 Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 27 Nov 2024 15:43:22 +0100 Subject: [PATCH 04/50] add check and compilation error if a prejoined column is defined twice --- .../helpers/stage_processing_macros.sql | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/macros/internal/helpers/stage_processing_macros.sql b/macros/internal/helpers/stage_processing_macros.sql index 75a90978..71b0c718 100644 --- a/macros/internal/helpers/stage_processing_macros.sql +++ b/macros/internal/helpers/stage_processing_macros.sql @@ -144,7 +144,7 @@ {# If column aliases are present they they have to map 1:1 to the extract_columns #} {% if datavault4dbt.is_something(dict_item.aliases) - and not dict_item.aliases|length == dict_item.extract_columns|length%} + and not dict_item.aliases|length == dict_item.extract_columns|length %} {{ 
exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns") }} {% endif %} @@ -154,15 +154,20 @@ {# If aliases are defined they should be used as dict keys These will be used as new column names #} {% if datavault4dbt.is_something(dict_item.aliases) %} - {% set dict_key = dict_item.aliases[loop.index-1] %} + {% set dict_key = dict_item.aliases[loop.index0] %} {% else %} - {% set dict_key = dict_item.extract_columns[loop.index-1] %} + {% set dict_key = dict_item.extract_columns[loop.index0] %} + {% endif %} + + {# To make sure each column or alias is present only once #} + {% if dict_key|lower in return_dict.keys()|map('lower') %} + {{ exceptions.raise_compiler_error("Prejoined Column name or alias '" ~ dict_key ~ "' is defined twice.") }} {% endif %} {% set tmp_dict %} {{dict_key}}: ref_model: {{dict_item.ref_model}} - bk: {{dict_item.extract_columns[loop.index-1]}} + bk: {{dict_item.extract_columns[loop.index0]}} this_column_name: {{dict_item.this_column_name}} ref_column_name: {{dict_item.ref_column_name}} {% endset %} @@ -174,15 +179,20 @@ {# If aliases are defined they should be used as dict keys These will be used as new column names #} {% if datavault4dbt.is_something(dict_item.aliases) %} - {% set dict_key = dict_item.aliases[loop.index-1] %} + {% set dict_key = dict_item.aliases[loop.index0] %} {% else %} - {% set dict_key = dict_item.extract_columns[loop.index-1] %} + {% set dict_key = dict_item.extract_columns[loop.index0] %} + {% endif %} + + {# To make sure each column or alias is present only once #} + {% if dict_key|lower in return_dict.keys()|map('lower') %} + {{ exceptions.raise_compiler_error("Prejoined Column name or alias '" ~ dict_key ~ "' is defined twice.") }} {% endif %} {% set tmp_dict %} {{dict_key}}: ref_model: {{dict_item.ref_model}} - bk: {{dict_item.extract_columns[loop.index-1]}} + bk: {{dict_item.extract_columns[loop.index0]}} this_column_name: {{dict_item.this_column_name}} ref_column_name: 
{{dict_item.ref_column_name}} {% endset %} From 54b8720e9ccdb23e0f0fc0e282c6ec9c61ff5eb7 Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 27 Nov 2024 16:17:06 +0100 Subject: [PATCH 05/50] add amount of extract_columns and aliases to amount-mismatch compilation error message --- macros/internal/helpers/stage_processing_macros.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/macros/internal/helpers/stage_processing_macros.sql b/macros/internal/helpers/stage_processing_macros.sql index 71b0c718..ad84a86a 100644 --- a/macros/internal/helpers/stage_processing_macros.sql +++ b/macros/internal/helpers/stage_processing_macros.sql @@ -145,7 +145,8 @@ {# If column aliases are present they they have to map 1:1 to the extract_columns #} {% if datavault4dbt.is_something(dict_item.aliases) and not dict_item.aliases|length == dict_item.extract_columns|length %} - {{ exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns") }} + {{ exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. 
Got " + ~ dict_item.extract_columns|length ~ " extract_columns and " ~ dict_item.aliases|length ~ " aliases.") }} {% endif %} {# If multiple columns from the same source should be extracted each column has to be processed once #} From 60b2a985ef13f85f0ce80dbdc3dd93791a563c48 Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 27 Nov 2024 16:50:45 +0100 Subject: [PATCH 06/50] add prejoin with source to processing-macro --- .../internal/helpers/stage_processing_macros.sql | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/macros/internal/helpers/stage_processing_macros.sql b/macros/internal/helpers/stage_processing_macros.sql index ad84a86a..5881fc6a 100644 --- a/macros/internal/helpers/stage_processing_macros.sql +++ b/macros/internal/helpers/stage_processing_macros.sql @@ -167,7 +167,14 @@ {% set tmp_dict %} {{dict_key}}: + {%- if 'ref_model' in dict_item.keys()|map('lower') %} ref_model: {{dict_item.ref_model}} + {%- elif 'src_name' in dict_item.keys()|map('lower') and 'src_table' in dict_item.keys()|map('lower') %} + src_name: {{dict_item.src_name}} + src_table: {{dict_item.src_table}} + {%- else %} + {{ exceptions.raise_compiler_error("Either ref_model or src_name and src_table have to be defined for each prejoin") }} + {%- endif %} bk: {{dict_item.extract_columns[loop.index0]}} this_column_name: {{dict_item.this_column_name}} ref_column_name: {{dict_item.ref_column_name}} @@ -192,7 +199,14 @@ {% set tmp_dict %} {{dict_key}}: + {%- if 'ref_model' in dict_item.keys()|map('lower') %} ref_model: {{dict_item.ref_model}} + {%- elif 'src_name' in dict_item.keys()|map('lower') and 'src_table' in dict_item.keys()|map('lower') %} + src_name: {{dict_item.src_name}} + src_table: {{dict_item.src_table}} + {%- else %} + {{ exceptions.raise_compiler_error("Either ref_model or src_name and src_table have to be defined for each prejoin") }} + {%- endif %} bk: {{dict_item.extract_columns[loop.index0]}} this_column_name: 
{{dict_item.this_column_name}} ref_column_name: {{dict_item.ref_column_name}} From 9f1a06b718cb2a90b90af7ef0785ac33561b9859 Mon Sep 17 00:00:00 2001 From: Tim Kirschke Date: Mon, 2 Dec 2024 12:11:29 +0100 Subject: [PATCH 07/50] removed unnecessary md file --- General_Features.md | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 General_Features.md diff --git a/General_Features.md b/General_Features.md deleted file mode 100644 index f7de1ca4..00000000 --- a/General_Features.md +++ /dev/null @@ -1,8 +0,0 @@ -This package enforces a few features over all macros: - - - Ready for both Persistent Staging Areas and Transient Staging Areas, due to the allowance of multiple deltas in all macros, without losing any intermediate changes - - Enforcing standards in naming conventions by implementing global variables for technical columns - - Following the insert-only-approach by using a mix of tables and views - - Creating a snapshot-based Business interface by using a centralized snapshot table supporting logarithmic logic - - Optimizing incremental loads by implementing a high-water-mark that also works for entities that are loaded from multiple sources - From d18d28b2e8499cb755c9c626b3d1872d80ff854f Mon Sep 17 00:00:00 2001 From: Tim Kirschke Date: Mon, 2 Dec 2024 15:09:57 +0100 Subject: [PATCH 08/50] First version of yaml parsing, needs to be turned into macro --- macros/tables/control_snap_v0.sql | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/macros/tables/control_snap_v0.sql b/macros/tables/control_snap_v0.sql index 2a4a8f3e..b569c115 100644 --- a/macros/tables/control_snap_v0.sql +++ b/macros/tables/control_snap_v0.sql @@ -61,7 +61,36 @@ #} -{%- macro control_snap_v0(start_date, daily_snapshot_time, sdts_alias=none, end_date=none) -%} +{%- macro control_snap_v0(yaml_metadata, start_date, daily_snapshot_time, sdts_alias=none, end_date=none) -%} + + + {% set start_date_description = " + Description of 
Parameter 'start_date': + start_date::timestamp Defines the earliest timestamp that should be available inside the snapshot_table. The time part of this + timestamp needs to be set to '00:00:00'. The format of this timestamp must equal to the timestamp format + defined in the global variable 'datavault4dbt.timestamp_format'. + + Examples: + '2015-01-01T00-00-00' This snapshot table would hold daily snapshots beginning at 2015. + " %} + + + {% if datavault4dbt.is_something(yaml_metadata) %} + {%- set yaml_metadata = fromyaml(yaml_metadata) -%} + {% if 'start_date' in yaml_metadata.keys() %} + {% set start_date = yaml_metadata.get('start_date') %} + {% elif datavault4dbt.is_something(start_date) %} + {% set start_date = start_date %} + {% do exceptions.warn("[" ~ this ~ "] Warning: yaml-metadata given, but parameter 'start_date' not defined in there. Using 'start_date' parameter defined outside. We advise to use only one method of parameter passing.") %} + {% else %} + {{ exceptions.raise_compiler_error("[" ~ this ~ "] Error: yaml-metadata given, but required parameter 'start_date' not defined in there or outside in the parameter." ~ start_date_description ) }} + {% endif %} + {% elif datavault4dbt.is_something(start_date) %} + {% set start_date_start_dateoutput = start_date %} + {% else %} + {{ exceptions.raise_compiler_error("[" ~ this ~ "] Error: Required parameter 'start_date' not defined. Define it either directly, or inside yaml-metadata." ~ start_date_description ) }} + {% endif %} + {%- set sdts_alias = datavault4dbt.replace_standard(sdts_alias, 'datavault4dbt.sdts_alias', 'sdts') -%} From 24ee6baf5bc7784bdcee755413ef3ba990c29039 Mon Sep 17 00:00:00 2001 From: Tim Kirschke Date: Mon, 2 Dec 2024 16:41:19 +0100 Subject: [PATCH 09/50] First version of macro, and Control_snap_v0 adjusted to use it. 
--- .../metadata_processing.yml | 21 ++++++++ .../yaml_metadata_parser.sql | 27 ++++++++++ macros/tables/control_snap_v0.sql | 51 ++++++++++--------- 3 files changed, 76 insertions(+), 23 deletions(-) create mode 100644 macros/internal/metadata_processing/metadata_processing.yml create mode 100644 macros/internal/metadata_processing/yaml_metadata_parser.sql diff --git a/macros/internal/metadata_processing/metadata_processing.yml b/macros/internal/metadata_processing/metadata_processing.yml new file mode 100644 index 00000000..6fbd0a53 --- /dev/null +++ b/macros/internal/metadata_processing/metadata_processing.yml @@ -0,0 +1,21 @@ +version: 2 + +macros: + - name: yaml_metadata_parser + description: A macro to parse yaml-metadata into single parameters. Used in top-level front-end macros. + arguments: + - name: name + type: string + description: The name of the parameter you want to extract of the yaml-metadata. + - name: yaml_metadata + type: string + description: The yaml-string that holds the definition of other parameters. Needs to be in yaml format. + - name: parameter + type: variable + description: The forwarded parameter of the top-level macro. This is used, if the yaml-metadata is none. + - name: required + type: boolean + description: Whether this parameter is required for the top-level macro. Default is False. + - name: documentation + type: string + description: A string that holds documentation of this parameter. 
diff --git a/macros/internal/metadata_processing/yaml_metadata_parser.sql b/macros/internal/metadata_processing/yaml_metadata_parser.sql new file mode 100644 index 00000000..c0aced7b --- /dev/null +++ b/macros/internal/metadata_processing/yaml_metadata_parser.sql @@ -0,0 +1,27 @@ +{% macro yaml_metadata_parser(name=none, yaml_metadata=none, parameter=none, required=False, documentation=none) %} + + {% if datavault4dbt.is_something(yaml_metadata) %} + {%- set yaml_metadata = fromyaml(yaml_metadata) -%} + {% if name in yaml_metadata.keys() %} + {% set return_value = yaml_metadata.get(name) %} + {% elif datavault4dbt.is_something(parameter) %} + {% set return_value = parameter %} + + {% if required %} + {% do exceptions.warn("[" ~ this ~ "] Warning: yaml-metadata given, but parameter '" ~ name ~ "' not defined in there. Using '" ~ name ~ "' parameter defined outside. We advise to use only one method of parameter passing.") %} + {% else %} + + {% elif required %} + {{ exceptions.raise_compiler_error("[" ~ this ~ "] Error: yaml-metadata given, but required parameter '" ~ name ~ "' not defined in there or outside in the parameter. \n Description of parameter '" ~ name ~ "': \n" ~ documentation ) }} + {% endif %} + {% elif datavault4dbt.is_something(parameter) %} + {% set return_value = parameter %} + {% elif required %} + {{ exceptions.raise_compiler_error("[" ~ this ~ "] Error: Required parameter '" ~ name ~ "' not defined. Define it either directly, or inside yaml-metadata. 
\n Description of parameter '" ~ name ~ "': \n" ~ documentation ) }} + {% else %} + {% set return_value = None %} + {% endif %} + + {{ return(return_value) }} + +{% endmacro %} \ No newline at end of file diff --git a/macros/tables/control_snap_v0.sql b/macros/tables/control_snap_v0.sql index b569c115..06f54d69 100644 --- a/macros/tables/control_snap_v0.sql +++ b/macros/tables/control_snap_v0.sql @@ -65,33 +65,38 @@ {% set start_date_description = " - Description of Parameter 'start_date': - start_date::timestamp Defines the earliest timestamp that should be available inside the snapshot_table. The time part of this - timestamp needs to be set to '00:00:00'. The format of this timestamp must equal to the timestamp format - defined in the global variable 'datavault4dbt.timestamp_format'. + start_date::timestamp Defines the earliest timestamp that should be available inside the snapshot_table. The time part of this + timestamp needs to be set to '00:00:00'. The format of this timestamp must equal to the timestamp format + defined in the global variable 'datavault4dbt.timestamp_format'. - Examples: - '2015-01-01T00-00-00' This snapshot table would hold daily snapshots beginning at 2015. + Examples: + '2015-01-01T00-00-00' This snapshot table would hold daily snapshots beginning at 2015. + " %} + + {% set daily_snapshot_time_description = " + daily_snapshot_time::time Defines the time that your daily snapshots should have. Usually this is either something right before + daily business starts, or after daily business is over. + + Examples: + '07:30:00' The snapshots inside this table would all have the time '07:30:00'. + '23:00:00' The snapshots inside this table would all have the time '23:00:00'. + " %} + + {% set sdts_alias_description = " + sdts_alias::string Defines the name of the snapshot date timestamp column inside the snapshot_table. 
It is optional, + if not set will use the global variable `datavault4dbt.sdts_alias` set inside dbt_project.yml " %} + {% set end_date_description = " + end_date::timestamp Defines the latest timestamp that should be available inside the snapshot_table. + " %} + + + {%- set start_date = datavault4dbt.yaml_metadata_parser(name='start_date', yaml_metadata=yaml_metadata, parameter=start_date, required=True, documentation=start_date_description) -%} + {%- set daily_snapshot_time = datavault4dbt.yaml_metadata_parser(name='daily_snapshot_time', yaml_metadata=yaml_metadata, parameter=daily_snapshot_time, required=True, documentation=daily_snapshot_time_description) -%} + {%- set sdts_alias = datavault4dbt.yaml_metadata_parser(name='sdts_alias', yaml_metadata=yaml_metadata, parameter=sdts_alias, required=False, documentation=sdts_alias_description) -%} + {%- set end_date = datavault4dbt.yaml_metadata_parser(name='end_date', yaml_metadata=yaml_metadata, parameter=end_date, required=False, documentation=end_date_description) -%} - {% if datavault4dbt.is_something(yaml_metadata) %} - {%- set yaml_metadata = fromyaml(yaml_metadata) -%} - {% if 'start_date' in yaml_metadata.keys() %} - {% set start_date = yaml_metadata.get('start_date') %} - {% elif datavault4dbt.is_something(start_date) %} - {% set start_date = start_date %} - {% do exceptions.warn("[" ~ this ~ "] Warning: yaml-metadata given, but parameter 'start_date' not defined in there. Using 'start_date' parameter defined outside. We advise to use only one method of parameter passing.") %} - {% else %} - {{ exceptions.raise_compiler_error("[" ~ this ~ "] Error: yaml-metadata given, but required parameter 'start_date' not defined in there or outside in the parameter." 
~ start_date_description ) }} - {% endif %} - {% elif datavault4dbt.is_something(start_date) %} - {% set start_date_start_dateoutput = start_date %} - {% else %} - {{ exceptions.raise_compiler_error("[" ~ this ~ "] Error: Required parameter 'start_date' not defined. Define it either directly, or inside yaml-metadata." ~ start_date_description ) }} - {% endif %} - - {%- set sdts_alias = datavault4dbt.replace_standard(sdts_alias, 'datavault4dbt.sdts_alias', 'sdts') -%} {{ adapter.dispatch('control_snap_v0', 'datavault4dbt')(start_date=start_date, From 956e71abdbba5b8fbdeb2ce5e3cf4adce406a14c Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Tue, 3 Dec 2024 10:34:33 +0100 Subject: [PATCH 10/50] Update yaml_metadata_parser.sql, fix if-nesting, rename metadata_dict --- .../metadata_processing/yaml_metadata_parser.sql | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/macros/internal/metadata_processing/yaml_metadata_parser.sql b/macros/internal/metadata_processing/yaml_metadata_parser.sql index c0aced7b..c02c7a19 100644 --- a/macros/internal/metadata_processing/yaml_metadata_parser.sql +++ b/macros/internal/metadata_processing/yaml_metadata_parser.sql @@ -1,16 +1,12 @@ {% macro yaml_metadata_parser(name=none, yaml_metadata=none, parameter=none, required=False, documentation=none) %} {% if datavault4dbt.is_something(yaml_metadata) %} - {%- set yaml_metadata = fromyaml(yaml_metadata) -%} - {% if name in yaml_metadata.keys() %} - {% set return_value = yaml_metadata.get(name) %} + {%- set metadata_dict = fromyaml(yaml_metadata) -%} + {% if name in metadata_dict.keys() %} + {% set return_value = metadata_dict.get(name) %} {% elif datavault4dbt.is_something(parameter) %} {% set return_value = parameter %} - - {% if required %} - {% do exceptions.warn("[" ~ this ~ "] Warning: yaml-metadata given, but parameter '" ~ name ~ "' not defined in there. Using '" ~ name ~ "' parameter defined outside. 
We advise to use only one method of parameter passing.") %} - {% else %} - + {% do exceptions.warn("[" ~ this ~ "] Warning: yaml-metadata given, but parameter '" ~ name ~ "' not defined in there. Using '" ~ name ~ "' parameter defined outside. We advise to use only one method of parameter passing.") %} {% elif required %} {{ exceptions.raise_compiler_error("[" ~ this ~ "] Error: yaml-metadata given, but required parameter '" ~ name ~ "' not defined in there or outside in the parameter. \n Description of parameter '" ~ name ~ "': \n" ~ documentation ) }} {% endif %} @@ -24,4 +20,4 @@ {{ return(return_value) }} -{% endmacro %} \ No newline at end of file +{% endmacro %} From 3a94499cc813f04e45e81eae75f33f149720254a Mon Sep 17 00:00:00 2001 From: Tim Kirschke Date: Tue, 3 Dec 2024 11:00:29 +0100 Subject: [PATCH 11/50] Fixed warnings --- .../yaml_metadata_parser.sql | 9 ++++----- macros/tables/control_snap_v0.sql | 19 ------------------- 2 files changed, 4 insertions(+), 24 deletions(-) diff --git a/macros/internal/metadata_processing/yaml_metadata_parser.sql b/macros/internal/metadata_processing/yaml_metadata_parser.sql index c0aced7b..88bbbc52 100644 --- a/macros/internal/metadata_processing/yaml_metadata_parser.sql +++ b/macros/internal/metadata_processing/yaml_metadata_parser.sql @@ -4,13 +4,12 @@ {%- set yaml_metadata = fromyaml(yaml_metadata) -%} {% if name in yaml_metadata.keys() %} {% set return_value = yaml_metadata.get(name) %} + {% if datavault4dbt.is_something(parameter)%} + {% do exceptions.warn("[" ~ this ~ "] Warning: Parameter '" ~ name ~ "' defined both in yaml-metadata and separately. Definition in yaml-metadata will be used, and separate parameter is ignored.") %} + {% endif %} {% elif datavault4dbt.is_something(parameter) %} {% set return_value = parameter %} - - {% if required %} - {% do exceptions.warn("[" ~ this ~ "] Warning: yaml-metadata given, but parameter '" ~ name ~ "' not defined in there. Using '" ~ name ~ "' parameter defined outside. 
We advise to use only one method of parameter passing.") %} - {% else %} - + {% do exceptions.warn("[" ~ this ~ "] Warning: yaml-metadata given, but parameter '" ~ name ~ "' not defined in there. Using '" ~ name ~ "' parameter defined outside. We advise to use only one method of parameter passing.") %} {% elif required %} {{ exceptions.raise_compiler_error("[" ~ this ~ "] Error: yaml-metadata given, but required parameter '" ~ name ~ "' not defined in there or outside in the parameter. \n Description of parameter '" ~ name ~ "': \n" ~ documentation ) }} {% endif %} diff --git a/macros/tables/control_snap_v0.sql b/macros/tables/control_snap_v0.sql index 06f54d69..7d009ee3 100644 --- a/macros/tables/control_snap_v0.sql +++ b/macros/tables/control_snap_v0.sql @@ -40,25 +40,6 @@ logic that is applied in the version 1 snapshot table on top of this one. This column is automatically set to TRUE. - Parameters: - - start_date::timestamp Defines the earliest timestamp that should be available inside the snapshot_table. The time part of this - timestamp needs to be set to '00:00:00'. The format of this timestamp must equal to the timestamp format - defined in the global variable 'datavault4dbt.timestamp_format'. - - Examples: - '2015-01-01T00-00-00' This snapshot table would hold daily snapshots beginning at 2015. - - daily_snapshot_time::time Defines the time that your daily snapshots should have. Usually this is either something right before - daily business starts, or after daily business is over. - - Examples: - '07:30:00' The snapshots inside this table would all have the time '07:30:00'. - '23:00:00' The snapshots inside this table would all have the time '23:00:00'. - - sdts_alias::string Defines the name of the snapshot date timestamp column inside the snapshot_table. 
It is optional, - if not set will use the global variable `datavault4dbt.sdts_alias` set inside dbt_project.yml - #} {%- macro control_snap_v0(yaml_metadata, start_date, daily_snapshot_time, sdts_alias=none, end_date=none) -%} From 3fed9637774800dd34936ff0e3bcedb58eee3e87 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Mon, 9 Dec 2024 16:29:55 +0100 Subject: [PATCH 12/50] move stage_processing_macros.sql into staging folder --- macros/{internal/helpers => staging}/stage_processing_macros.sql | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename macros/{internal/helpers => staging}/stage_processing_macros.sql (100%) diff --git a/macros/internal/helpers/stage_processing_macros.sql b/macros/staging/stage_processing_macros.sql similarity index 100% rename from macros/internal/helpers/stage_processing_macros.sql rename to macros/staging/stage_processing_macros.sql From edf3dc472862a38dc2b17598066ddf903433f344 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 11 Dec 2024 15:09:34 +0100 Subject: [PATCH 13/50] change extract_input_columns, process_prejoined_columns. add extract_prejoin_column_names change extract_input_columns to handle the new prejoin list syntax. change process_prejoined_columns to parse the old dict syntax to the new list syntax, merging prejoins with the same conditions. add extract_prejoin_column_names which can handle the new list syntax --- macros/staging/stage_processing_macros.sql | 178 ++++++++++----------- 1 file changed, 85 insertions(+), 93 deletions(-) diff --git a/macros/staging/stage_processing_macros.sql b/macros/staging/stage_processing_macros.sql index 5881fc6a..3f77d594 100644 --- a/macros/staging/stage_processing_macros.sql +++ b/macros/staging/stage_processing_macros.sql @@ -54,24 +54,27 @@ {# Do nothing. No source column required. 
#} {%- elif value is mapping and value.is_hashdiff -%} {%- do extracted_input_columns.append(value['columns']) -%} - {%- elif value is mapping and 'this_column_name' in value.keys() -%} - {%- if datavault4dbt.is_list(value['this_column_name'])-%} - {%- for column in value['this_column_name'] -%} - {%- do extracted_input_columns.append(column) -%} - {%- endfor -%} - {%- else -%} - {%- do extracted_input_columns.append(value['this_column_name']) -%} - {%- endif -%} {%- else -%} {%- do extracted_input_columns.append(value) -%} {%- endif -%} {%- endfor -%} - - {%- do return(extracted_input_columns) -%} + + {%- elif datavault4dbt.is_list(columns_dict) -%} + {% for prejoin in columns_dict %} + {%- if datavault4dbt.is_list(prejoin['this_column_name'])-%} + {%- for column in prejoin['this_column_name'] -%} + {%- do extracted_input_columns.append(column) -%} + {%- endfor -%} + {%- else -%} + {%- do extracted_input_columns.append(prejoin['this_column_name']) -%} + {%- endif -%} + {% endfor %} {%- else -%} {%- do return([]) -%} {%- endif -%} + {%- do return(extracted_input_columns) -%} + {%- endmacro -%} @@ -127,96 +130,85 @@ {%- macro process_prejoined_columns(prejoined_columns=none) -%} -{# Check if the new list syntax is used for prejoined columns - If so parse it to dictionaries #} - -{% if not datavault4dbt.is_list(prejoined_columns) %} - {% do return(prejoined_columns) %} -{% else %} - {# if the (new) list syntax for prejoins is used - it needs to be converted to the old syntax #} - - {# Initialize emtpy dict which will be filled by each entry #} - {% set return_dict = {} %} - - {# Iterate over each dictionary in the prejoined_colums-list #} - {% for dict_item in prejoined_columns %} - - {# If column aliases are present they they have to map 1:1 to the extract_columns #} - {% if datavault4dbt.is_something(dict_item.aliases) - and not dict_item.aliases|length == dict_item.extract_columns|length %} - {{ exceptions.raise_compiler_error("Prejoin aliases must have the same 
length as extract_columns. Got " - ~ dict_item.extract_columns|length ~ " extract_columns and " ~ dict_item.aliases|length ~ " aliases.") }} - {% endif %} - - {# If multiple columns from the same source should be extracted each column has to be processed once #} - {% if datavault4dbt.is_list(dict_item.extract_columns) %} - {% for column in dict_item.extract_columns %} - {# If aliases are defined they should be used as dict keys - These will be used as new column names #} - {% if datavault4dbt.is_something(dict_item.aliases) %} - {% set dict_key = dict_item.aliases[loop.index0] %} - {% else %} - {% set dict_key = dict_item.extract_columns[loop.index0] %} + {# Check if the old syntax is used for prejoined columns + If so parse it to new list syntax #} + + {% if datavault4dbt.is_list(prejoined_columns) %} + {% do return(prejoined_columns) %} + {% else %} + {% set output = [] %} + + {% for key, value in prejoined_columns.items() %} + {% set ref_model = value.get('ref_model') %} + {% set src_name = value.get('src_name') %} + {% set src_table = value.get('src_table') %} + {%- if 'operator' not in value.keys() -%} + {%- do value.update({'operator': 'AND'}) -%} + {%- set operator = 'AND' -%} + {%- else -%} + {%- set operator = value.get('operator') -%} + {%- endif -%} + + {% set match_criteria = ( + ref_model and output | selectattr('ref_model', 'equalto', ref_model) or + src_name and output | selectattr('src_name', 'equalto', src_name) | selectattr('src_table', 'equalto', src_table) + ) | selectattr('this_column_name', 'equalto', value.this_column_name) + | selectattr('ref_column_name', 'equalto', value.ref_column_name) + | selectattr('operator', 'equalto', value.operator) + | list | first %} + + {% if match_criteria %} + {% do match_criteria['extract_columns'].append(value.bk) %} + {% do match_criteria['aliases'].append(key) %} + {% else %} + {% set new_item = { + 'extract_columns': [value.bk], + 'aliases': [key], + 'this_column_name': value.this_column_name, + 
'ref_column_name': value.ref_column_name, + 'operator': operator + } %} + + {% if ref_model %} + {% do new_item.update({'ref_model': ref_model}) %} + {% elif src_name and src_table %} + {% do new_item.update({'src_name': src_name, 'src_table': src_table}) %} {% endif %} + + {% do output.append(new_item) %} + {% endif %} + {% endfor %} + {% endif %} - {# To make sure each column or alias is present only once #} - {% if dict_key|lower in return_dict.keys()|map('lower') %} - {{ exceptions.raise_compiler_error("Prejoined Column name or alias '" ~ dict_key ~ "' is defined twice.") }} - {% endif %} + {%- do return(output) -%} - {% set tmp_dict %} - {{dict_key}}: - {%- if 'ref_model' in dict_item.keys()|map('lower') %} - ref_model: {{dict_item.ref_model}} - {%- elif 'src_name' in dict_item.keys()|map('lower') and 'src_table' in dict_item.keys()|map('lower') %} - src_name: {{dict_item.src_name}} - src_table: {{dict_item.src_table}} - {%- else %} - {{ exceptions.raise_compiler_error("Either ref_model or src_name and src_table have to be defined for each prejoin") }} - {%- endif %} - bk: {{dict_item.extract_columns[loop.index0]}} - this_column_name: {{dict_item.this_column_name}} - ref_column_name: {{dict_item.ref_column_name}} - {% endset %} - {% do return_dict.update(fromyaml(tmp_dict)) %} - {% endfor %} +{%- endmacro -%} - {% else %} - {# If aliases are defined they should be used as dict keys - These will be used as new column names #} - {% if datavault4dbt.is_something(dict_item.aliases) %} - {% set dict_key = dict_item.aliases[loop.index0] %} - {% else %} - {% set dict_key = dict_item.extract_columns[loop.index0] %} - {% endif %} +{%- macro extract_prejoin_column_names(prejoined_columns=none) -%} - {# To make sure each column or alias is present only once #} - {% if dict_key|lower in return_dict.keys()|map('lower') %} - {{ exceptions.raise_compiler_error("Prejoined Column name or alias '" ~ dict_key ~ "' is defined twice.") }} - {% endif %} + {%- set 
extracted_column_names = [] -%} + + {% if not datavault4dbt.is_something(prejoined_columns) %} + {%- do return(extracted_column_names) -%} + {% endif %} - {% set tmp_dict %} - {{dict_key}}: - {%- if 'ref_model' in dict_item.keys()|map('lower') %} - ref_model: {{dict_item.ref_model}} - {%- elif 'src_name' in dict_item.keys()|map('lower') and 'src_table' in dict_item.keys()|map('lower') %} - src_name: {{dict_item.src_name}} - src_table: {{dict_item.src_table}} - {%- else %} - {{ exceptions.raise_compiler_error("Either ref_model or src_name and src_table have to be defined for each prejoin") }} - {%- endif %} - bk: {{dict_item.extract_columns[loop.index0]}} - this_column_name: {{dict_item.this_column_name}} - ref_column_name: {{dict_item.ref_column_name}} - {% endset %} - {% do return_dict.update(fromyaml(tmp_dict)) %} + {% for prejoin in prejoined_columns %} + {% if datavault4dbt.is_list(prejoin['aliases']) %} + {% for alias in prejoin['aliases'] %} + {%- do extracted_column_names.append(alias) -%} + {% endfor %} + {% elif datavault4dbt.is_something(prejoin['aliases']) %} + {%- do extracted_column_names.append(prejoin['aliases']) -%} + {% elif datavault4dbt.is_list(prejoin['extract_columns']) %} + {% for column in prejoin['extract_columns'] %} + {%- do extracted_column_names.append(column) -%} + {% endfor %} + {% else %} + {%- do extracted_column_names.append(prejoin['extract_columns']) -%} {% endif %} - {% endfor %} - - {%- do return(return_dict) -%} - -{% endif %} + {%- endfor -%} + + {%- do return(extracted_column_names) -%} {%- endmacro -%} From da7d00047dc4add7f634f7c0e5b2ad5740a7f008 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 11 Dec 2024 15:10:05 +0100 Subject: [PATCH 14/50] add staging.yml with descriptions of process_prejoined_columns and extract_prejoin_column_names --- macros/staging/staging.yml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 macros/staging/staging.yml diff 
--git a/macros/staging/staging.yml b/macros/staging/staging.yml new file mode 100644 index 00000000..86988615 --- /dev/null +++ b/macros/staging/staging.yml @@ -0,0 +1,23 @@ +version: 2 + +macros: + - name: process_prejoined_columns + description: > + A macro to process prejoined columns. If a list of dictionaries (new syntax) is provided it will do nothing and return the list. + If a dictionary of dictionaries is provided (old syntax) it will be transformed to the new syntax. + When multiple columns are to be extracted from the same prejoin-target and with the same conditions (columns and operator) they will be combined into one item. + arguments: + - name: prejoined_columns + type: list or dictionary + description: The value of the prejoined_columns as defined in the yaml_metadata of the stage-model. + + - name: extract_prejoin_column_names + description: > + A macro to extract the names of the prejoined columns of each staging-model. + Takes a list of prejoins and will add the aliases of the prejoins to the return-list. + If no aliases are present it will return the names of the extracted columns. + Returns an empty list if the passed parameter is empty. 
+ arguments: + - name: prejoined_columns + type: list + description: The prejoined_columns as processed by the process_prejoined_columns-macro \ No newline at end of file From a970a0a08e43e776ae3bdc84a31c2a8a4fd0527f Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 11 Dec 2024 15:11:38 +0100 Subject: [PATCH 15/50] postgres: modify stage to handle new prejoin syntax and simplify setting of commas for ghost records --- macros/staging/postgres/stage.sql | 196 +++++++++++++++--------------- 1 file changed, 100 insertions(+), 96 deletions(-) diff --git a/macros/staging/postgres/stage.sql b/macros/staging/postgres/stage.sql index 1f4ef548..f5cd9898 100644 --- a/macros/staging/postgres/stage.sql +++ b/macros/staging/postgres/stage.sql @@ -96,7 +96,7 @@ {# Getting the column names for all additional columns #} {%- set derived_column_names = datavault4dbt.extract_column_names(derived_columns) -%} {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} {%- set exclude_column_names = hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} @@ -266,15 +266,41 @@ prejoined_columns AS ( {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} {%- endif -%} - {#- prepare join statements -#} - {%- set prejoin_statements_list = [] -%} - {%- set processed_prejoin_hashes = [] -%} - {%- for col, vals in prejoined_columns.items() -%} + {# Iterate over each prejoin, doing logic checks and generating the 
select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} + + {# If passed, make sure there are as many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. 
Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} + + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} + + FROM {{ last_cte }} lcte + + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} + + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -295,42 +321,25 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} - {%- do exceptions.raise_compiler_error(error_message) -%} + {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. 
#} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - - - {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - - {%- if not prejoin_hash in processed_prejoin_hashes %} - {%- do processed_prejoin_hashes.append(prejoin_hash) %} - {%- set prejoin_join_statement_tmp -%} - left join {{ relation }} as {{ prejoin_hash }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endset -%} - {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} - {%- endif -%} - -{# select the prejoined columns #} - ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte - - {{ prejoin_statements_list|join(' ')}} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} ), {%- endif -%} @@ -457,65 +466,61 @@ unknown_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - {%- if 
columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{- datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = 
source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} - {{ log('pj_relation_columns: ' ~ pj_relation_columns, false ) }} - - {% for column in pj_relation_columns -%} - - {% if column.name|lower == vals['bk']|lower -%} - {{ log('column found? yes, for column :' ~ column.name , false) }} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=col) }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} + + {%- for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{% endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} - {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} + {% for column_name, properties in derived_columns_with_datatypes_DICT.items() %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if 
datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -527,62 +532,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if 
datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} - {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=col) -}} - {%- endif -%} - {%- endfor -%} - {%- if not loop.last -%},{%- endif %} - {% endfor -%} + {% for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} + {%- endif -%} - {%- endif -%} + {%- endfor -%} + {% endfor -%} + {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} From f8767ae1e1c73f283db929f5b596d0727af739e4 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:46:20 +0100 Subject: [PATCH 16/50] bigquery: stage: implement new prejoin syntax --- macros/staging/bigquery/stage.sql | 188 +++++++++++++++--------------- 1 file changed, 97 insertions(+), 91 deletions(-) diff --git a/macros/staging/bigquery/stage.sql b/macros/staging/bigquery/stage.sql index 0486c5f2..8e9b1d47 100644 --- a/macros/staging/bigquery/stage.sql +++ b/macros/staging/bigquery/stage.sql @@ -96,7 +96,7 @@ {# Getting the column names for all additional columns #} {%- set derived_column_names = 
datavault4dbt.extract_column_names(derived_columns) -%} {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} {%- set exclude_column_names = hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} @@ -183,6 +183,8 @@ {# Setting the ldts default datatype #} {% set ldts_default_dtype = datavault4dbt.timestamp_default_dtype() %} +{{ datavault4dbt.prepend_generated_by() }} + WITH {# Selecting everything that we need from the source relation. #} @@ -266,15 +268,41 @@ prejoined_columns AS ( {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} {%- endif -%} - {#- prepare join statements -#} - {%- set prejoin_statements_list = [] -%} - {%- set processed_prejoin_hashes = [] -%} - {%- for col, vals in prejoined_columns.items() -%} + {# Iterate over each prejoin, doing logic checks and generating the select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} + + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} + + {# If passed, make sure there are as 
many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} + + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} + + FROM {{ last_cte }} lcte + + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -295,39 +323,22 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. 
#} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} - {%- endif -%} - - - {%- set prejoin_hash = '`' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '`' -%} - - {%- if not prejoin_hash in processed_prejoin_hashes %} - {%- do processed_prejoin_hashes.append(prejoin_hash) %} - {%- set prejoin_join_statement_tmp -%} - left join {{ relation }} as {{ prejoin_hash }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endset -%} - {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - -{# select the prejoined columns #} - ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte - - {{ prejoin_statements_list|join(' ')}} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} @@ -457,65 +468,61 @@ unknown_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if 
columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{- datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = 
adapter.get_columns_in_relation( relation ) -%} - {{ log('pj_relation_columns: ' ~ pj_relation_columns, false ) }} - - {% for column in pj_relation_columns -%} - - {% if column.name|lower == vals['bk']|lower -%} - {{ log('column found? yes, for column :' ~ column.name , false) }} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=col) }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} + + {%- for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{% endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} - {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} + {% for column_name, properties in derived_columns_with_datatypes_DICT.items() %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if 
datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -527,62 +534,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally 
generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=col) -}} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} + {%- endfor -%} - {%- if not loop.last -%},{%- endif %} {% endfor -%} + {%- endif %} - {%- endif -%} - - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} From 90dc5c8898730921baacfdc860f4d179f5bb94c7 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:47:12 +0100 Subject: [PATCH 17/50] databricks: stage: implement new prejoin syntax --- macros/staging/databricks/stage.sql | 198 +++++++++++++++------------- 1 file changed, 104 insertions(+), 94 deletions(-) diff --git a/macros/staging/databricks/stage.sql b/macros/staging/databricks/stage.sql index a3ff3b28..e7bd16f3 100644 --- a/macros/staging/databricks/stage.sql +++ b/macros/staging/databricks/stage.sql @@ -96,7 +96,7 @@ {# Getting the column names for all 
additional columns #} {%- set derived_column_names = datavault4dbt.extract_column_names(derived_columns) -%} {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} {%- set exclude_column_names = hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} @@ -177,8 +177,13 @@ {% set error_value_rsrc = var('datavault4dbt.default_error_rsrc', 'ERROR') %} {% set unknown_value_rsrc = var('datavault4dbt.default_unknown_rsrc', 'SYSTEM') %} -{# Setting the rsrc default datatype #} -{% set rsrc_default_dtype = datavault4dbt.string_default_dtype(type=rsrc) %} +{# Setting the rsrc default datatype and length #} +{% set rsrc_default_dtype = datavault4dbt.string_default_dtype(type='rsrc') %} + +{# Setting the ldts default datatype #} +{% set ldts_default_dtype = datavault4dbt.timestamp_default_dtype() %} + +{{ datavault4dbt.prepend_generated_by() }} WITH @@ -206,7 +211,7 @@ source_data AS ( ldts_rsrc_data AS ( SELECT - {{ ldts }} AS {{ load_datetime_col_name}}, + CAST( {{ ldts }} as {{ ldts_default_dtype }} ) AS {{ load_datetime_col_name }}, CAST( {{ rsrc }} as {{ rsrc_default_dtype }} ) AS {{ record_source_col_name }} {%- if datavault4dbt.is_something(sequence) %}, {{ sequence }} AS edwSequence @@ -263,15 +268,41 @@ prejoined_columns AS ( {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} {%- endif -%} - {#- prepare join statements -#} - {%- set prejoin_statements_list = [] -%} - {%- set processed_prejoin_hashes = [] -%} - {%- for 
col, vals in prejoined_columns.items() -%} + {# Iterate over each prejoin, doing logic checks and generating the select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} + + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {# If passed, make sure there are as many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. 
Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} + + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} + + FROM {{ last_cte }} lcte + + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} + + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -292,45 +323,29 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. 
#} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} - {%- endif -%} - - - {%- set prejoin_hash = '`' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '`' -%} - - {%- if not prejoin_hash in processed_prejoin_hashes %} - {%- do processed_prejoin_hashes.append(prejoin_hash) %} - {%- set prejoin_join_statement_tmp -%} - left join {{ relation }} as {{ prejoin_hash }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endset -%} - {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - -{# select the prejoined columns #} - ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte - - {{ prejoin_statements_list|join(' ')}} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} + {%- if datavault4dbt.is_something(derived_columns) %} {# Adding derived columns to the selection #} derived_columns AS ( @@ -451,65 +466,61 @@ unknown_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - 
{%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{- datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = 
source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} - {{ log('pj_relation_columns: ' ~ pj_relation_columns, false ) }} - - {% for column in pj_relation_columns -%} - - {% if column.name|lower == vals['bk']|lower -%} - {{ log('column found? yes, for column :' ~ column.name , false) }} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=col) }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} + + {%- for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{% endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} - {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} + {% for column_name, properties in derived_columns_with_datatypes_DICT.items() %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if 
datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -521,62 +532,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if 
datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=col) -}} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} + {%- endfor -%} - {%- if not loop.last -%},{%- endif %} {% endfor -%} + {%- endif %} - {%- endif -%} - - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} From 706b2af95cd9a1f4887b6451f69611f480f1c9f6 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:47:20 +0100 Subject: [PATCH 18/50] exasol stage: implement new prejoin syntax --- macros/staging/exasol/stage.sql | 194 +++++++++++++++++--------------- 1 file changed, 106 insertions(+), 88 deletions(-) diff --git a/macros/staging/exasol/stage.sql b/macros/staging/exasol/stage.sql index 9812f462..a654057c 100644 --- a/macros/staging/exasol/stage.sql +++ b/macros/staging/exasol/stage.sql @@ -178,6 +178,8 @@ {# Setting the ldts default datatype #} {% set 
ldts_default_dtype = datavault4dbt.timestamp_default_dtype() %} +{{ datavault4dbt.prepend_generated_by() }} + WITH {# Selecting everything that we need from the source relation. #} @@ -188,6 +190,12 @@ source_data AS ( FROM {{ source_relation }} + {% if is_incremental() %} + WHERE {{ ldts }} > (SELECT max({{ load_datetime_col_name}}) + FROM {{ this }} + WHERE {{ load_datetime_col_name}} != {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} ) + {%- endif -%} + {% set last_cte = "source_data" -%} ), @@ -254,15 +262,41 @@ prejoined_columns AS ( {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} {%- endif -%} - {#- prepare join statements -#} - {%- set prejoin_statements_list = [] -%} - {%- set processed_prejoin_hashes = [] -%} - {%- for col, vals in prejoined_columns.items() -%} + {# Iterate over each prejoin, doing logic checks and generating the select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} + + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} + + {# If passed, make sure there are as many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. 
Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} + + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + FROM {{ last_cte }} lcte + + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} + + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -283,42 +317,25 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. 
#} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - - - {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - - {%- if not prejoin_hash in processed_prejoin_hashes %} - {%- do processed_prejoin_hashes.append(prejoin_hash) %} - {%- set prejoin_join_statement_tmp -%} - left join {{ relation }} as {{ prejoin_hash }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endset -%} - {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} - {%- endif -%} - -{# select the prejoined columns #} - ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte - - {{ prejoin_statements_list|join(' ')}} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} ), {%- endif -%} @@ -442,63 +459,64 @@ hashed_columns AS ( {%- if enable_ghost_records and not is_incremental() %} {# Creating Ghost Record for unknown case, based on datatype #} unknown_values AS ( + SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, 
beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, col_size=column.char_size, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col|upper, datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{- datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + 
{%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} - {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col|upper, datatype=column.dtype, col_size=column.char_size, ghost_record_type='unknown') }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} + + {%- for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{% endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} - {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} + {% for column_name, properties in derived_columns_with_datatypes_DICT.items() %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, 
ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -510,62 +528,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, col_size=column.char_size, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col|upper, datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ 
datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col|upper, datatype=column.dtype, col_size=column.char_size, ghost_record_type='error') -}} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} + {%- endfor -%} - {%- if not loop.last -%},{%- endif %} {% endfor -%} + {%- endif %} - {%- endif -%} - - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -595,12 +612,13 @@ columns_to_select AS ( {%- if enable_ghost_records and not is_incremental() %} UNION ALL + SELECT {{ datavault4dbt.print_list(datavault4dbt.escape_column_names(final_columns_to_select)) }} FROM ghost_records -{%- endif -%} +{% endif %} ) SELECT * FROM columns_to_select From 20d012c0c18aee99fb7eb93d1090d65480b6b174 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:47:26 +0100 Subject: [PATCH 19/50] fabric stage: implement new prejoin syntax --- macros/staging/fabric/stage.sql | 185 +++++++++++++++++--------------- 
1 file changed, 96 insertions(+), 89 deletions(-) diff --git a/macros/staging/fabric/stage.sql b/macros/staging/fabric/stage.sql index d042d7a9..14b511c6 100644 --- a/macros/staging/fabric/stage.sql +++ b/macros/staging/fabric/stage.sql @@ -89,12 +89,11 @@ {# Getting the column names for all additional columns #} {%- set derived_column_names = datavault4dbt.extract_column_names(derived_columns) -%} {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} {%- set exclude_column_names = derived_column_names + hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} {%- set all_columns = adapter.get_columns_in_relation( source_relation ) -%} - {%- set columns_without_excluded_columns = [] -%} {%- set final_columns_to_select = [] -%} @@ -134,8 +133,11 @@ {%- set only_include_from_source = (derived_input_columns + hashed_input_columns + prejoined_input_columns + ma_keys) | unique | list -%} {%- else -%} + {%- set only_include_from_source = (derived_input_columns + hashed_input_columns + prejoined_input_columns) | unique | list -%} + {%- endif -%} + {%- set source_columns_to_select = only_include_from_source -%} {%- endif-%} @@ -264,15 +266,41 @@ prejoined_columns AS ( {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} {%- endif -%} - {#- prepare join statements -#} - {%- set prejoin_statements_list = [] -%} - {%- set processed_prejoin_hashes = [] -%} - {%- for col, vals in prejoined_columns.items() -%} + {# Iterate over 
each prejoin, doing logic checks and generating the select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} + + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} + + {# If passed, make sure there are as many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. 
Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} + + FROM {{ last_cte }} lcte + + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} + + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -293,39 +321,22 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. 
#} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} - {%- endif -%} - - - {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - - {%- if not prejoin_hash in processed_prejoin_hashes %} - {%- do processed_prejoin_hashes.append(prejoin_hash) %} - {%- set prejoin_join_statement_tmp -%} - left join {{ relation }} as {{ prejoin_hash }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endset -%} - {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - -{# select the prejoined columns #} - ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte - - {{ prejoin_statements_list|join(' ')}} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} @@ -455,65 +466,61 @@ unknown_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if 
columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='unknown', col_size=column.char_size) }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='unknown', col_size=column.char_size) }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(col), datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(col), datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in 
prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} - {{ log('pj_relation_columns: ' ~ pj_relation_columns, false ) }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} {%- for column in pj_relation_columns -%} - - {%- if column.name|lower == vals['bk']|lower -%} - {{- log('column found? yes, for column :' ~ column.name , false) -}} - {{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='unknown', alias=datavault4dbt.escape_column_names(col)) }} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{%- endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column_name), datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column_name), datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -525,62 +532,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is 
defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='error', col_size=column.char_size) }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='error', col_size=column.char_size) }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(col), datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(col), datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 
'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='error', alias=datavault4dbt.escape_column_names(col)) -}} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} + {%- endfor -%} - {%- if not loop.last -%},{%- endif %} {% endfor -%} + {%- endif %} - {%- endif -%} - - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column_name), datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column_name), datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if 
datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -607,6 +613,7 @@ columns_to_select AS ( {{ datavault4dbt.print_list(datavault4dbt.escape_column_names(final_columns_to_select)) }} FROM {{ last_cte }} + {% if enable_ghost_records and not is_incremental() %} UNION ALL From 36617f33d36b2cf2eda28437731e705afb89e0ea Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:47:37 +0100 Subject: [PATCH 20/50] oracle stage: implement new prejoin syntax --- macros/staging/oracle/stage.sql | 191 ++++++++++++++++---------------- 1 file changed, 98 insertions(+), 93 deletions(-) diff --git a/macros/staging/oracle/stage.sql b/macros/staging/oracle/stage.sql index 40afc202..e34df521 100644 --- a/macros/staging/oracle/stage.sql +++ b/macros/staging/oracle/stage.sql @@ -96,12 +96,11 @@ {# Getting the column names for all additional columns #} {%- set derived_column_names = datavault4dbt.extract_column_names(derived_columns) -%} {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} {%- set exclude_column_names = hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} {%- set all_columns = 
adapter.get_columns_in_relation( source_relation ) -%} - {%- set columns_without_excluded_columns = [] -%} {%- set final_columns_to_select = [] -%} @@ -189,6 +188,8 @@ {# Setting the ldts default datatype #} {% set ldts_default_dtype = datavault4dbt.timestamp_default_dtype() %} +{{ datavault4dbt.prepend_generated_by() }} + WITH {# Selecting everything that we need from the source relation. #} @@ -273,15 +274,41 @@ prejoined_columns AS ( {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} {%- endif -%} - {#- prepare join statements -#} - {%- set prejoin_statements_list = [] -%} - {%- set processed_prejoin_hashes = [] -%} - {%- for col, vals in prejoined_columns.items() -%} + {# Iterate over each prejoin, doing logic checks and generating the select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} + + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {# If passed, make sure there are as many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have 
the same length as extract_columns. Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} + + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} + + FROM {{ last_cte }} lcte + + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} + + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -302,39 +329,22 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. 
#} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} - {%- endif -%} - - - {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - - {%- if not prejoin_hash in processed_prejoin_hashes %} - {%- do processed_prejoin_hashes.append(prejoin_hash) %} - {%- set prejoin_join_statement_tmp -%} - left join {{ relation }} as {{ prejoin_hash }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endset -%} - {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - -{# select the prejoined columns #} - ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte - - {{ prejoin_statements_list|join(' ')}} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} @@ -465,65 +475,61 @@ unknown_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if 
columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{- datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = 
adapter.get_columns_in_relation( relation ) -%} - {{ log('pj_relation_columns: ' ~ pj_relation_columns, false ) }} - - {% for column in pj_relation_columns -%} - - {% if column.name|lower == vals['bk']|lower -%} - {{ log('column found? yes, for column :' ~ column.name , false) }} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=col) }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} + + {%- for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{% endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} - {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} + {% for column_name, properties in derived_columns_with_datatypes_DICT.items() %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if 
datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -536,62 +542,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally 
generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=col) -}} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} + {%- endfor -%} - {%- if not loop.last -%},{%- endif %} {% endfor -%} + {%- endif %} - {%- endif -%} - - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -634,4 +639,4 @@ columns_to_select AS ( SELECT * FROM columns_to_select -{%- endmacro -%} \ No newline at end of file +{%- endmacro -%} From 7adba6909cb635c141fb222e5056f689a4cfa1e2 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:47:56 +0100 Subject: [PATCH 21/50] postgres stage add prepend_generated_by() --- macros/staging/postgres/stage.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/macros/staging/postgres/stage.sql b/macros/staging/postgres/stage.sql index f5cd9898..c59cc9c6 100644 --- 
a/macros/staging/postgres/stage.sql +++ b/macros/staging/postgres/stage.sql @@ -101,7 +101,6 @@ {%- set exclude_column_names = hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} {%- set all_columns = adapter.get_columns_in_relation( source_relation ) -%} - {%- set columns_without_excluded_columns = [] -%} {%- set final_columns_to_select = [] -%} @@ -184,6 +183,8 @@ {# Setting the ldts default datatype #} {% set ldts_default_dtype = datavault4dbt.timestamp_default_dtype() %} +{{ datavault4dbt.prepend_generated_by() }} + WITH {# Selecting everything that we need from the source relation. #} From 3070e0a4c6afceceb578e8e7e5e701f63136c7e5 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:48:02 +0100 Subject: [PATCH 22/50] redshift stage: implement new prejoin syntax --- macros/staging/redshift/stage.sql | 191 ++++++++++++++++-------------- 1 file changed, 99 insertions(+), 92 deletions(-) diff --git a/macros/staging/redshift/stage.sql b/macros/staging/redshift/stage.sql index 6c9238b0..b9861cb1 100644 --- a/macros/staging/redshift/stage.sql +++ b/macros/staging/redshift/stage.sql @@ -95,7 +95,7 @@ {# Getting the column names for all additional columns #} {%- set derived_column_names = datavault4dbt.extract_column_names(derived_columns) -%} {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} {%- set exclude_column_names = hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set 
source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} @@ -183,6 +183,8 @@ {# Setting the ldts default datatype #} {% set ldts_default_dtype = datavault4dbt.timestamp_default_dtype() %} +{{ datavault4dbt.prepend_generated_by() }} + WITH {# Selecting everything that we need from the source relation. #} @@ -255,6 +257,7 @@ missing_columns AS ( ), {%- endif -%} + {%- if datavault4dbt.is_something(prejoined_columns) %} {# Prejoining Business Keys of other source objects for Link purposes #} prejoined_columns AS ( @@ -264,15 +267,41 @@ prejoined_columns AS ( {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} {%- endif -%} - {#- prepare join statements -#} - {%- set prejoin_statements_list = [] -%} - {%- set processed_prejoin_hashes = [] -%} - {%- for col, vals in prejoined_columns.items() -%} + {# Iterate over each prejoin, doing logic checks and generating the select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} + + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} + + {# If passed, make sure there are as many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. 
Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} + + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} + + FROM {{ last_cte }} lcte - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} + + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -293,39 +322,22 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. 
#} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} - {%- endif -%} - - - {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - - {%- if not prejoin_hash in processed_prejoin_hashes %} - {%- do processed_prejoin_hashes.append(prejoin_hash) %} - {%- set prejoin_join_statement_tmp -%} - left join {{ relation }} as {{ prejoin_hash }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endset -%} - {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - -{# select the prejoined columns #} - ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte - - {{ prejoin_statements_list|join(' ')}} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} @@ -467,65 +479,61 @@ unknown_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, - {# Generating Ghost Records for 
all source columns, except the ldts, rsrc & edwSequence column and derived_columns #} + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} + {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.data_type, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{- datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = 
source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} - {{ log('pj_relation_columns: ' ~ pj_relation_columns, false ) }} - - {% for column in pj_relation_columns -%} - - {% if column.name|lower == vals['bk']|lower -%} - {{ log('column found? yes, for column :' ~ column.name , false) }} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.data_type, ghost_record_type='unknown', alias=col) }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} + + {%- for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{% endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} - {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} + {% for column_name, properties in derived_columns_with_datatypes_DICT.items() %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if 
datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -537,62 +545,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.data_type, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if 
datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.data_type, ghost_record_type='error', alias=col) -}} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} + {%- endfor -%} - {%- if not loop.last -%},{%- endif %} {% endfor -%} + {%- endif %} - {%- endif -%} - - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} From ef8b3750ae44f4955dabcb4b5ef7da4c54b59fa6 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:48:08 +0100 Subject: [PATCH 23/50] snowflake stage: implement new prejoin syntax --- macros/staging/snowflake/stage.sql | 188 +++++++++++++++-------------- 1 file changed, 97 insertions(+), 91 deletions(-) diff --git a/macros/staging/snowflake/stage.sql b/macros/staging/snowflake/stage.sql index ed4c9d22..14810ed4 100644 --- a/macros/staging/snowflake/stage.sql +++ b/macros/staging/snowflake/stage.sql @@ -96,7 +96,7 @@ {# Getting the column names for all additional 
columns #} {%- set derived_column_names = datavault4dbt.extract_column_names(derived_columns) -%} {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} {%- set exclude_column_names = hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} @@ -189,6 +189,8 @@ {# Setting the ldts default datatype #} {% set ldts_default_dtype = datavault4dbt.timestamp_default_dtype() %} +{{ datavault4dbt.prepend_generated_by() }} + WITH {# Selecting everything that we need from the source relation. #} @@ -273,15 +275,41 @@ prejoined_columns AS ( {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} {%- endif -%} - {#- prepare join statements -#} - {%- set prejoin_statements_list = [] -%} - {%- set processed_prejoin_hashes = [] -%} - {%- for col, vals in prejoined_columns.items() -%} + {# Iterate over each prejoin, doing logic checks and generating the select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} + + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} 
+ + {# If passed, make sure there are as many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} + + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} + + FROM {{ last_cte }} lcte + + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -302,39 +330,22 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. 
#} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} - {%- endif -%} - - - {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - - {%- if not prejoin_hash in processed_prejoin_hashes %} - {%- do processed_prejoin_hashes.append(prejoin_hash) %} - {%- set prejoin_join_statement_tmp -%} - left join {{ relation }} as {{ prejoin_hash }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endset -%} - {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - -{# select the prejoined columns #} - ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte - - {{ prejoin_statements_list|join(' ')}} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} @@ -408,65 +419,61 @@ unknown_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if 
columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{- datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = 
adapter.get_columns_in_relation( relation ) -%} - {{ log('pj_relation_columns: ' ~ pj_relation_columns, false ) }} - - {% for column in pj_relation_columns -%} - - {% if column.name|lower == vals['bk']|lower -%} - {{ log('column found? yes, for column :' ~ column.name , false) }} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=col) }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} + + {%- for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{% endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} - {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} + {% for column_name, properties in derived_columns_with_datatypes_DICT.items() %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if 
datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -478,62 +485,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally 
generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=col) -}} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} + {%- endfor -%} - {%- if not loop.last -%},{%- endif %} {% endfor -%} + {%- endif %} - {%- endif -%} - - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} From e8910462b546e1921234eaab1086296407fc290c Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:48:16 +0100 Subject: [PATCH 24/50] synapse stage: implement new prejoin syntax --- macros/staging/synapse/stage.sql | 184 ++++++++++++++++--------------- 1 file changed, 95 insertions(+), 89 deletions(-) diff --git a/macros/staging/synapse/stage.sql b/macros/staging/synapse/stage.sql index 88356508..71525cf0 100644 --- a/macros/staging/synapse/stage.sql +++ b/macros/staging/synapse/stage.sql @@ -179,6 +179,8 @@ {# Setting the ldts default 
datatype #} {% set ldts_default_dtype = datavault4dbt.timestamp_default_dtype() %} +{{ datavault4dbt.prepend_generated_by() }} + WITH {# Selecting everything that we need from the source relation. #} @@ -265,15 +267,41 @@ prejoined_columns AS ( {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} {%- endif -%} - {#- prepare join statements -#} - {%- set prejoin_statements_list = [] -%} - {%- set processed_prejoin_hashes = [] -%} - {%- for col, vals in prejoined_columns.items() -%} + {# Iterate over each prejoin, doing logic checks and generating the select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} + + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} + + {# If passed, make sure there are as many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. 
Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} + + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} + + FROM {{ last_cte }} lcte + + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -294,39 +322,22 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. 
#} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} - {%- endif -%} - - - {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - - {%- if not prejoin_hash in processed_prejoin_hashes %} - {%- do processed_prejoin_hashes.append(prejoin_hash) %} - {%- set prejoin_join_statement_tmp -%} - left join {{ relation }} as {{ prejoin_hash }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endset -%} - {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - -{# select the prejoined columns #} - ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte - - {{ prejoin_statements_list|join(' ')}} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} @@ -456,65 +467,61 @@ unknown_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if 
columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', col_size=column.char_size) }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='unknown', col_size=column.char_size) }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(col), datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = 
source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} - {{ log('pj_relation_columns: ' ~ pj_relation_columns, false ) }} - - {% for column in pj_relation_columns -%} - - {% if column.name|lower == vals['bk']|lower -%} - {{ log('column found? yes, for column :' ~ column.name , false) }} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=col) }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} + + {%- for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{% endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column_name), datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if 
datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -526,62 +533,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', col_size=column.char_size) }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='error', col_size=column.char_size) }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ 
datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(col), datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=col) -}} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} + {%- endfor -%} - {%- if not loop.last -%},{%- endif %} {% endfor -%} + {%- endif %} - {%- endif -%} - - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column_name), datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} From a065dc1540c2cf700e3fec130e001db9b959fcb6 Mon Sep 17 00:00:00 2001 From: Tim Kirschke Date: Wed, 11 Dec 2024 17:24:56 +0100 Subject: [PATCH 25/50] First batch of modified front-end macros --- macros/tables/control_snap_v0.sql | 2 +- macros/tables/control_snap_v1.sql | 97 ++++++++++++++++++------------- macros/tables/eff_sat_v0.sql | 11 +++- macros/tables/hub.sql | 28 ++++++--- macros/tables/link.sql | 24 ++++++-- macros/tables/ma_sat_v0.sql | 31 +++++++--- macros/tables/ma_sat_v1.sql | 33 +++++++++-- 
macros/tables/nh_link.sql | 28 +++++++-- macros/tables/nh_sat.sql | 28 ++++++--- macros/tables/pit.sql | 53 +++++++++++++---- macros/tables/rec_track_sat.sql | 37 ++++++++---- macros/tables/ref_hub.sql | 25 +++++++- macros/tables/ref_table.sql | 41 ++++++++++++- 13 files changed, 333 insertions(+), 105 deletions(-) diff --git a/macros/tables/control_snap_v0.sql b/macros/tables/control_snap_v0.sql index 7d009ee3..fc800c4b 100644 --- a/macros/tables/control_snap_v0.sql +++ b/macros/tables/control_snap_v0.sql @@ -42,7 +42,7 @@ #} -{%- macro control_snap_v0(yaml_metadata, start_date, daily_snapshot_time, sdts_alias=none, end_date=none) -%} +{%- macro control_snap_v0(yaml_metadata=none, start_date=none, daily_snapshot_time=none, sdts_alias=none, end_date=none) -%} {% set start_date_description = " diff --git a/macros/tables/control_snap_v1.sql b/macros/tables/control_snap_v1.sql index d33775c9..53ada37b 100644 --- a/macros/tables/control_snap_v1.sql +++ b/macros/tables/control_snap_v1.sql @@ -29,38 +29,9 @@ Parameters: - control_snap_v0::string The name of the underlying version 0 control snapshot table. Needs to be - available as a dbt model. - - log_logic::dictionary Defining the desired durations of each granularity. Available granularities - are 'daily', 'weekly', 'monthly', and 'yearly'. For each granularity the - duration can be defined as an integer, and the time unit for that duration. - The units include (in BigQuery): DAY, WEEK, MONTH, QUARTER, YEAR. Besides - defining a duration and a unit for each granularity, there is also the option - to set a granularity to 'forever'. E.g. reporting requires daily snapshots - for 3 months, and after that the monthly snapshots should be kept forever. - - If log_logic is not set, no logic will be applied, and all snapshots will stay - active. The other dynamic columns are calculated anyway. - - The duration is always counted from the current date. 
- - EXASOL: Due to a missing "DAY OF WEEK" Function in Exasol, is_weekly is currently - not supported and needs to be left out of the log_logic definition. - - Examples: - {'daily': {'duration': 3, This configuration would keep daily - 'unit': 'MONTH', snapshots for 3 months, weekly snapshots - 'forever': 'FALSE'}, for 1 year, monthly snapshots for 5 - 'weekly': {'duration': 1, years and yearly snapshots forever. - 'unit': 'YEAR'}, If 'forever' is not defined here, it - 'monthly': {'duration': 5, is automatically set to 'FALSE'. - 'unit': 'YEAR'}, therefore it could have been left out - 'yearly': {'forever': 'TRUE'} } in the configuration for daily snapshots. - - {'daily': {'duration': 90, This would keep daily snapshots for 90 - 'unit': 'DAY'}, days, and monthly snapshots forever. - 'monthly': {'forever': 'TRUE'}} + + + sdts_alias::string Defines the name of the snapshot date timestamp column inside the snapshot_table. It is optional, if not set will use the global variable `datavault4dbt.sdts_alias` @@ -68,12 +39,60 @@ #} -{%- macro control_snap_v1(control_snap_v0, log_logic=none, sdts_alias=none) -%} - -{%- set sdts_alias = datavault4dbt.replace_standard(sdts_alias, 'datavault4dbt.sdts_alias', 'sdts') -%} - -{{ adapter.dispatch('control_snap_v1', 'datavault4dbt')(control_snap_v0=control_snap_v0, - log_logic=log_logic, - sdts_alias=sdts_alias) }} +{%- macro control_snap_v1(yaml_metadata=none, control_snap_v0=none, log_logic=none, sdts_alias=none) -%} + + {% set control_snap_v0_description = " + control_snap_v0::string The name of the underlying version 0 control snapshot table. Needs to be + available as a dbt model. + " %} + + {% set log_logic_description = " + log_logic::dictionary Defining the desired durations of each granularity. Available granularities + are 'daily', 'weekly', 'monthly', and 'yearly'. For each granularity the + duration can be defined as an integer, and the time unit for that duration. 
+ The units include (in BigQuery): DAY, WEEK, MONTH, QUARTER, YEAR. Besides + defining a duration and a unit for each granularity, there is also the option + to set a granularity to 'forever'. E.g. reporting requires daily snapshots + for 3 months, and after that the monthly snapshots should be kept forever. + + If log_logic is not set, no logic will be applied, and all snapshots will stay + active. The other dynamic columns are calculated anyway. + + The duration is always counted from the current date. + + EXASOL: Due to a missing 'DAY OF WEEK' Function in Exasol, is_weekly is currently + not supported and needs to be left out of the log_logic definition. + + Examples: + {'daily': {'duration': 3, This configuration would keep daily + 'unit': 'MONTH', snapshots for 3 months, weekly snapshots + 'forever': 'FALSE'}, for 1 year, monthly snapshots for 5 + 'weekly': {'duration': 1, years and yearly snapshots forever. + 'unit': 'YEAR'}, If 'forever' is not defined here, it + 'monthly': {'duration': 5, is automatically set to 'FALSE'. + 'unit': 'YEAR'}, therefore it could have been left out + 'yearly': {'forever': 'TRUE'} } in the configuration for daily snapshots. + + {'daily': {'duration': 90, This would keep daily snapshots for 90 + 'unit': 'DAY'}, days, and monthly snapshots forever. + 'monthly': {'forever': 'TRUE'}} + " %} + + {% set sdts_alias_description = " + sdts_alias::string Defines the name of the snapshot date timestamp column inside the snapshot_table. 
+ It is optional, if not set will use the global variable `datavault4dbt.sdts_alias` + set inside dbt_project.yml + " %} + + {% set control_snap_v0 = datavault4dbt.yaml_metadata_parser(name='control_snap_v0', yaml_metadata=yaml_metadata, parameter=control_snap_v0, required=True, documentation=control_snap_v0_description) %} + {% set log_logic = datavault4dbt.yaml_metadata_parser(name='log_logic', yaml_metadata=yaml_metadata, parameter=log_logic, required=False, documentation=log_logic_description) %} + {% set sdts_alias = datavault4dbt.yaml_metadata_parser(name='sdts_alias', yaml_metadata=yaml_metadata, parameter=sdts_alias, required=False, documentation=sdts_alias_description) %} + + + {%- set sdts_alias = datavault4dbt.replace_standard(sdts_alias, 'datavault4dbt.sdts_alias', 'sdts') -%} + + {{ adapter.dispatch('control_snap_v1', 'datavault4dbt')(control_snap_v0=control_snap_v0, + log_logic=log_logic, + sdts_alias=sdts_alias) }} {%- endmacro -%} diff --git a/macros/tables/eff_sat_v0.sql b/macros/tables/eff_sat_v0.sql index 2857e263..f5da8437 100644 --- a/macros/tables/eff_sat_v0.sql +++ b/macros/tables/eff_sat_v0.sql @@ -1,5 +1,13 @@ -{%- macro eff_sat_v0(source_model, tracked_hashkey, src_ldts=none, src_rsrc=none, is_active_alias=none, source_is_single_batch=true, disable_hwm=false) -%} +{%- macro eff_sat_v0(yaml_metadata=none, source_model=none, tracked_hashkey=none, src_ldts=none, src_rsrc=none, is_active_alias=none, source_is_single_batch=true, disable_hwm=false) -%} + {% set source_model = datavault4dbt.yaml_metadata_parser(name='source_model', yaml_metadata=yaml_metadata, parameter=source_model, required=True, documentation='Name of the source model') %} + {% set tracked_hashkey = datavault4dbt.yaml_metadata_parser(name='tracked_hashkey', yaml_metadata=yaml_metadata, parameter=tracked_hashkey, required=True, documentation='Name of the hashkey column to be tracked') %} + {% set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', 
yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation='Name of the loaddate column in the source model. Optional.') %} + {% set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation='Name of the record source column in the source model. Optional.') %} + {% set is_active_alias = datavault4dbt.yaml_metadata_parser(name='is_active_alias', yaml_metadata=yaml_metadata, parameter=is_active_alias, required=False, documentation='Name of the new active flag column. Optional.') %} + {% set source_is_single_batch = datavault4dbt.yaml_metadata_parser(name='source_is_single_batch', yaml_metadata=yaml_metadata, parameter=source_is_single_batch, required=False, documentation='Whether the source contains only one batch. Optional, default True.') %} + {% set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be disabled or not. Optional.') %} + {# Applying the default aliases as stored inside the global variables, if src_ldts, src_rsrc, and ledts_alias are not set. 
#} {%- set src_ldts = datavault4dbt.replace_standard(src_ldts, 'datavault4dbt.ldts_alias', 'ldts') -%} {%- set src_rsrc = datavault4dbt.replace_standard(src_rsrc, 'datavault4dbt.rsrc_alias', 'rsrc') -%} @@ -13,4 +21,5 @@ source_is_single_batch=source_is_single_batch, disable_hwm=disable_hwm) ) }} + {%- endmacro -%} \ No newline at end of file diff --git a/macros/tables/hub.sql b/macros/tables/hub.sql index fa9f00a9..02dd66ce 100644 --- a/macros/tables/hub.sql +++ b/macros/tables/hub.sql @@ -7,15 +7,19 @@ - Supports multiple updates per batch and therefore initial loading - Can use a dynamic high-water-mark to optimize loading performance of multiple loads - Allows source mappings for deviations between source column names and hub column names +#} + - Parameters: +{%- macro hub(yaml_metadata=none, hashkey=none, business_keys=none, source_models=none, src_ldts=none, src_rsrc=none, disable_hwm=false) -%} + {% set hashkey_description = " hashkey::string Name of the hashkey column inside the stage, that should be used as PK of the Hub. Examples: 'hk_account_h' This hashkey column was created before inside the corresponding staging area, using the stage macro. + " %} - + {% set business_keys_description = " business_keys::string|list of strings Name(s) of the business key columns that should be loaded into the hub and are the input of the hashkey column. Needs to be available inside the stage model. If the names differ between multiple sources, you should define here how the business keys should be called inside the final hub model. The actual input column names need to be defined inside the 'source_model' @@ -25,8 +29,9 @@ 'account_key' This hub only has one business key and therefore only one is defined here. ['account_key', 'account_number'] This hub has two business keys which are both defined here. + " %} - + {% set source_models_description = " source_models::dictionary Dictionary with information about the source models. 
The keys of the dict are the names of the source models, and the value of each source model is another dictionary. This inner dictionary requires the key 'bk_columns' to be set (which contains the name of the business keys of that source model), and can have the optional keys 'hk_column', 'rsrc_static'. @@ -68,17 +73,24 @@ If the record source is the same over all loads, then it might look something like this: 'SAP/Accounts/'. Here everything would be static over all loads and therefore the rsrc_static can be set to 'SAP/Accounts/' without any wildcards in place. + " %} + {% set src_ldts_description = " src_ldts::string Name of the ldts column inside the source models. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} + {% set src_rsrc_description = " src_rsrc::string Name of the rsrc column inside the source models. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. Needs to use the same column name as defined as alias inside the staging model. 
- -#} - - -{%- macro hub(hashkey, business_keys, source_models, src_ldts=none, src_rsrc=none, disable_hwm=false) -%} + " %} + + {%- set hashkey = datavault4dbt.yaml_metadata_parser(name='hashkey', yaml_metadata=yaml_metadata, parameter=hashkey, required=True, documentation=hashkey_description) -%} + {%- set business_keys = datavault4dbt.yaml_metadata_parser(name='business_keys', yaml_metadata=yaml_metadata, parameter=business_keys, required=True, documentation=business_keys_description) -%} + {%- set source_models = datavault4dbt.yaml_metadata_parser(name='source_models', yaml_metadata=yaml_metadata, parameter=source_models, required=True, documentation=source_models_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be turned off. Optional, default False.') -%} {# Applying the default aliases as stored inside the global variables, if src_ldts and src_rsrc are not set. #} diff --git a/macros/tables/link.sql b/macros/tables/link.sql index 7818f06e..9df0a4a2 100644 --- a/macros/tables/link.sql +++ b/macros/tables/link.sql @@ -3,16 +3,20 @@ if multiple sources share the same business definitions. Typically a link would only be loaded by multiple sources, if those multiple sources also share the business definitions of the hubs, and therefore load the connected hubs together as well. If multiple sources are used, it is required that they all have the same number of foreign keys inside, otherwise they would not share the same business definition of that link. 
+#} - Parameters: +{%- macro link(yaml_metadata=none, link_hashkey=none, foreign_hashkeys=none, source_models=none, src_ldts=none, src_rsrc=none, disable_hwm=false) -%} + {% set link_hashkey_description = " link_hashkey::string Name of the link hashkey column inside the stage. Should get calculated out of all business keys inside the link. Examples: 'hk_account_contact_l' This hashkey column belongs to the link between account and contact, and was created at the staging layer by the stage macro. + " %} + {% set foreign_hashkeys_description = " foreign_hashkeys::list of strings List of all hashkey columns inside the link, that refer to other hub entities. All hashkey columns must be available inside the stage area. @@ -20,7 +24,9 @@ ['hk_account_h', 'hk_contact_h'] The link between account and contact needs to contain both the hashkey of account and contact to enable joins the corresponding hub entities. + " %} + {% set source_models_description = " source_models::dictionary Dictionary with information about the source models. The keys of the dict are the names of the source models, and the value of each source model is another dictionary. This inner dictionary requires to have the keys 'rsrc_static', and optionally the keys 'hk_column' and 'fk_columns'. @@ -59,16 +65,24 @@ If my rsrc would be the same over all loads, then it might look something like this: 'SAP/Accounts/'. Here everything would be static over all loads and therefore I would set rsrc_static to 'SAP/Accounts/' without any wildcards in place. + " %} + {% set src_ldts_description = " src_ldts::string Name of the ldts column inside the source models. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} + {% set src_rsrc_description = " src_rsrc::string Name of the rsrc column inside the source models. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'.
Needs to use the same column name as defined as alias inside the staging model. - -#} - -{%- macro link(link_hashkey, foreign_hashkeys, source_models, src_ldts=none, src_rsrc=none, disable_hwm=false) -%} + " %} + + {%- set link_hashkey = datavault4dbt.yaml_metadata_parser(name='link_hashkey', yaml_metadata=yaml_metadata, parameter=link_hashkey, required=True, documentation=link_hashkey_description) -%} + {%- set foreign_hashkeys = datavault4dbt.yaml_metadata_parser(name='foreign_hashkeys', yaml_metadata=yaml_metadata, parameter=foreign_hashkeys, required=True, documentation=foreign_hashkeys_description) -%} + {%- set source_models = datavault4dbt.yaml_metadata_parser(name='source_models', yaml_metadata=yaml_metadata, parameter=source_models, required=True, documentation=source_models_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be turned off. Optional, default False.') -%} {# Applying the default aliases as stored inside the global variables, if src_ldts and src_rsrc are not set. #} diff --git a/macros/tables/ma_sat_v0.sql b/macros/tables/ma_sat_v0.sql index 4bdcf1c9..84330694 100644 --- a/macros/tables/ma_sat_v0.sql +++ b/macros/tables/ma_sat_v0.sql @@ -8,9 +8,11 @@ Features: - Can handle multiple updates per batch, without losing intermediate changes. therefore initial loading is supported. 
- Using a dynamic high-water-mark to optimize loading performance of multiple loads +#} - Parameters: +{%- macro ma_sat_v0(yaml_metadata=none, parent_hashkey=none, src_hashdiff=none, src_ma_key=none, src_payload=none, source_model=none, src_ldts=none, src_rsrc=none) -%} + {% set parent_hashkey_description = " parent_hashkey::string Name of the hashkey column inside the stage of the object that this satellite is attached to. Examples: @@ -19,7 +21,9 @@ 'hk_account_contact_l' The satellite would be attached to the link between account and contact, which has the column 'hk_account_contact_l' as a hashkey column. + " %} + {% set src_hashdiff_description = " src_hashdiff::string Name of the hashdiff column of this satellite, that was created inside the staging area and is calculated out of the entire payload of this satellite. The stage must hold one hashdiff per satellite entity. @@ -28,7 +32,9 @@ 'hd_account_data_sfdc_s' Since we recommend naming the hashdiff column similar to the name of the satellite entity, just with a prefix, this would be the hashdiff column of the data satellite for account. + " %} + {% set src_ma_key_description = " src_ma_key::string|list of strings Name(s) of the multi-active keys inside the staging area. Need to be the same ones, as defined in the stage model. @@ -39,7 +45,9 @@ ['phonetype', 'company'] In this case, the combination of the two columns 'phonetype' and 'company' is treated as the multi-active key. + " %} + {% set src_payload_description = " src_payload::list of strings A list of all the descriptive attributes that should be included in this satellite. Needs to be the columns that are fed into the hashdiff calculation of this satellite. Do not include the multi-active key in the payload of a multi-active satellite, it is included automatically! 
@@ -48,23 +56,32 @@ ['name', 'address', 'country', 'phone', 'email'] This satellite would hold the columns 'name', 'address', 'country', 'phone' and 'email', coming out of the underlying staging area. + " %} + {% set source_model_description = " source_model::string Name of the underlying staging model, must be available inside dbt as a model. Examples: 'stage_account' This satellite is loaded out of the stage for account. + " %} + {% set src_ldts_description = " src_ldts::string Name of the ldts column inside the source model. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} + {% set src_rsrc_description = " src_rsrc::string Name of the rsrc column inside the source model. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. Needs to use the same column name as defined as alias inside the staging model. - - - -#} - -{%- macro ma_sat_v0(parent_hashkey, src_hashdiff, src_ma_key, src_payload, source_model, src_ldts=none, src_rsrc=none) -%} + " %} + + {%- set parent_hashkey = datavault4dbt.yaml_metadata_parser(name='parent_hashkey', yaml_metadata=yaml_metadata, parameter=parent_hashkey, required=True, documentation=parent_hashkey_description) -%} + {%- set src_hashdiff = datavault4dbt.yaml_metadata_parser(name='src_hashdiff', yaml_metadata=yaml_metadata, parameter=src_hashdiff, required=True, documentation=src_hashdiff_description) -%} + {%- set src_ma_key = datavault4dbt.yaml_metadata_parser(name='src_ma_key', yaml_metadata=yaml_metadata, parameter=src_ma_key, required=True, documentation=src_ma_key_description) -%} + {%- set src_payload = datavault4dbt.yaml_metadata_parser(name='src_payload', yaml_metadata=yaml_metadata, parameter=src_payload, required=True, documentation=src_payload_description) -%} + {%- set source_model = datavault4dbt.yaml_metadata_parser(name='source_model', yaml_metadata=yaml_metadata, parameter=source_model, 
required=True, documentation=source_model_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} {# Applying the default aliases as stored inside the global variables, if src_ldts and src_rsrc are not set. #} diff --git a/macros/tables/ma_sat_v1.sql b/macros/tables/ma_sat_v1.sql index ce411472..0c663bd7 100644 --- a/macros/tables/ma_sat_v1.sql +++ b/macros/tables/ma_sat_v1.sql @@ -7,15 +7,19 @@ - Calculates virtualized load-end-dates to correctly identify multiple active records per batch - Enforces insert-only approach by view materialization - Allows multiple attributes to be used as the multi-active-attribute +#} - Parameters: +{%- macro ma_sat_v1(yaml_metadata=none, sat_v0=none, hashkey=none, hashdiff=none, ma_attribute=none, src_ldts=none, src_rsrc=none, ledts_alias=none, add_is_current_flag=false) -%} + {% set sat_v0_description = " sat_v0::string Name of the underlying version 0 multi-active satellite. Examples: 'contact_phonenumber_0_s' This satellite would be the version 1 satellite of the underlying version 0 phone number satellite for contacts. + " %} + {% set hashkey_description = " hashkey::string Name of the parent hashkey column inside the version 0 satellite. Would either be the hashkey of a hub or a link. Needs to be similar to the 'parent_hashkey' parameter inside the sat_v0 model. @@ -25,7 +29,9 @@ 'hk_order_contact_l' The satellite would be attached to the link between order and contact, which has the column 'hk_order_contact_l' as a hashkey column. + " %} + {% set hashdiff_description = " hashdiff::string Name of the hashdiff column inside the underlying version 0 satellite. 
Needs to be similar to the 'src_hashdiff' parameter inside the sat_v0 model. Must not include the ma_attribute in calculation. @@ -33,7 +39,9 @@ 'hd_contact_phonenumber_s' Since we recommend naming the hashdiff column similar to the name of the satellite entity, just with a prefix, this would be the hashdiff column of the phone number satellite for contacts. + " %} + {% set ma_attribute_description = " ma_attribute::string|list of strings Name of the multi active attribute inside the v0 satellite. This needs to be identified under the requirement that the combination of hashkey + ldts + ma_attribute is unique over the entire stage / satellite. @@ -46,25 +54,38 @@ ['phone_type', 'iid'] If a contact could have multiple mobile phone numbers, the phone_type alone would not be enough to uniquely identify a record inside a hashkey+ldts combination. Additionally the attribute iid, which is an increasing identifier within a phone_type, is added as a ma_attribute. + " %} - + {% set src_ldts_description = " src_ldts::string Name of the ldts column inside the source models. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} + {% set src_rsrc_description = " src_rsrc::string Name of the rsrc column inside the source models. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} + {% set ledts_alias_description = " ledts_alias::string Desired alias for the load end date column. Is optional, will use the global variable 'datavault4dbt.ledts_alias' if set here. + " %} + {% set add_is_current_flag_description = " add_is_current_flag::boolean Optional parameter to add a new column to the v1 sat based on the load end date timestamp (ledts). Default is false. If set to true it will add this is_current flag to the v1 sat. 
For each record this column will be set to true if the load end date time stamp is equal to the variable end of all times. If its not, then the record is not current therefore it will be set to false. - -#} - -{%- macro ma_sat_v1(sat_v0, hashkey, hashdiff, ma_attribute, src_ldts=none, src_rsrc=none, ledts_alias=none, add_is_current_flag=false) -%} + " %} + + {%- set sat_v0 = datavault4dbt.yaml_metadata_parser(name='sat_v0', yaml_metadata=yaml_metadata, parameter=sat_v0, required=True, documentation=sat_v0_description) -%} + {%- set hashkey = datavault4dbt.yaml_metadata_parser(name='hashkey', yaml_metadata=yaml_metadata, parameter=hashkey, required=True, documentation=hashkey_description) -%} + {%- set hashdiff = datavault4dbt.yaml_metadata_parser(name='hashdiff', yaml_metadata=yaml_metadata, parameter=hashdiff, required=True, documentation=hashdiff_description) -%} + {%- set ma_attribute = datavault4dbt.yaml_metadata_parser(name='ma_attribute', yaml_metadata=yaml_metadata, parameter=ma_attribute, required=True, documentation=ma_attribute_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set ledts_alias = datavault4dbt.yaml_metadata_parser(name='ledts_alias', yaml_metadata=yaml_metadata, parameter=ledts_alias, required=False, documentation=ledts_alias_description) -%} + {%- set add_is_current_flag = datavault4dbt.yaml_metadata_parser(name='add_is_current_flag', yaml_metadata=yaml_metadata, parameter=add_is_current_flag, required=False, documentation=add_is_current_flag_description) -%} {# Applying the default aliases as stored inside the global variables, if src_ldts, src_rsrc, and ledts_alias are not set. 
#} diff --git a/macros/tables/nh_link.sql b/macros/tables/nh_link.sql index d6ebef77..9eceecc7 100644 --- a/macros/tables/nh_link.sql +++ b/macros/tables/nh_link.sql @@ -4,23 +4,29 @@ number of foreign keys inside, otherwise they would not share the same business definition of that non-historized link. In the background a non-historized link uses exactly the same loading logic as a regular link, but adds the descriptive attributes as additional payload. +#} - Parameters: - +{%- macro nh_link(yaml_metadata=none, link_hashkey=none, payload=none, source_models=none, foreign_hashkeys=none, src_ldts=none, src_rsrc=none, disable_hwm=false, source_is_single_batch=false) -%} + + {% set link_hashkey_description = " link_hashkey::string Name of the non-historized link hashkey column inside the stage. Should get calculated out of all business keys inside the link. Examples: 'hk_transaction_account_nl' This hashkey column belongs to the non-historized link between transaction and account, and was created at the staging layer by the stage macro. + " %} + {% set foreign_hashkeys_description = " foreign_hashkeys::list of strings List of all hashkey columns inside the non-historized link, that refer to other hub entities. All hashkey columns must be available inside the stage area. Examples: ['hk_transaction_h', 'hk_account_h'] The non-historized link between transaction and account needs to contain both the hashkey of transaction and account to enable joins to the corresponding hub entities. + " %} + {% set payload_description = " payload::list of strings A list of all the descriptive attributes that should be the payload of this non-historized link. If the names differ between source models, this list will define how the columns are named inside the result non historized link. The mapping which columns to use from which source model then need to be defined inside the parameter 'payload' inside the variable 'source_models'. 
@@ -28,7 +34,9 @@ Examples: ['currency_isocode', 'amount', 'purpose', 'transaction_date'] The non-historized link will be enriched by the descriptive attributes 'currency_isocode', 'amount', 'purpose' and 'transaction_date'. + " %} + {% set source_models_description = " source_models::dictionary Dictionary with information about the source models. The keys of the dict are the names of the source models, and the value of each source model is another dictionary. This inner dictionary optionally has the keys 'hk_column', 'fk_columns', 'payload' and 'rsrc_static'. @@ -72,17 +80,27 @@ If the record source is the same over all loads, then it might look something like this: 'SAP/Accounts/'. Here everything would be static over all loads and therefore the rsrc_static would be set to 'SAP/Accounts/' without any wildcards in place. + " %} - + {% set src_ldts_description = " src_ldts::string Name of the ldts column inside the source models. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} + {% set src_rsrc_description = " src_rsrc::string Name of the rsrc column inside the source models. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. Needs to use the same column name as defined as alias inside the staging model. 
+ " %} -#} + {%- set link_hashkey = datavault4dbt.yaml_metadata_parser(name='link_hashkey', yaml_metadata=yaml_metadata, parameter=link_hashkey, required=True, documentation=link_hashkey_description) -%} + {%- set payload = datavault4dbt.yaml_metadata_parser(name='payload', yaml_metadata=yaml_metadata, parameter=payload, required=True, documentation=payload_description) -%} + {%- set source_models = datavault4dbt.yaml_metadata_parser(name='source_models', yaml_metadata=yaml_metadata, parameter=source_models, required=True, documentation=source_models_description) -%} + {%- set foreign_hashkeys = datavault4dbt.yaml_metadata_parser(name='foreign_hashkeys', yaml_metadata=yaml_metadata, parameter=foreign_hashkeys, required=False, documentation=foreign_hashkeys_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be turned off. Optional, default False.') -%} + {%- set source_is_single_batch = datavault4dbt.yaml_metadata_parser(name='source_is_single_batch', yaml_metadata=yaml_metadata, parameter=source_is_single_batch, required=False, documentation='Whether the source contains only one batch. Optional, default False.') -%} -{%- macro nh_link(link_hashkey, payload, source_models, foreign_hashkeys=none, src_ldts=none, src_rsrc=none, disable_hwm=false, source_is_single_batch=false) -%} {# Applying the default aliases as stored inside the global variables, if src_ldts and src_rsrc are not set.
#} {%- set src_ldts = datavault4dbt.replace_standard(src_ldts, 'datavault4dbt.ldts_alias', 'ldts') -%} diff --git a/macros/tables/nh_sat.sql b/macros/tables/nh_sat.sql index 2f059a6f..b5295c71 100644 --- a/macros/tables/nh_sat.sql +++ b/macros/tables/nh_sat.sql @@ -6,9 +6,11 @@ Features: - High-Perfomance loading of non-historized satellite data +#} - Parameters: +{%- macro nh_sat(yaml_metadata=none, parent_hashkey=none, src_payload=none, source_model=none, src_ldts=none, src_rsrc=none, source_is_single_batch=false) -%} + {% set parent_hashkey_description = " parent_hashkey::string Name of the hashkey column inside the stage of the object that this satellite is attached to. Examples: @@ -17,30 +19,40 @@ 'hk_account_contact_l' The satellite would be attached to the link between account and contact, which has the column 'hk_account_contact_l' as a hashkey column. + " %} + {% set src_payload_description = " src_payload::list of strings A list of all the descriptive attributes that should be included in this satellite. Examples: ['name', 'address', 'country', 'phone', 'email'] This satellite would hold the columns 'name', 'address', 'country', 'phone' and 'email', coming out of the underlying staging area. + "%} + {% set source_model_description = " source_model::string Name of the underlying staging model, must be available inside dbt as a model. Examples: 'stage_account' This satellite is loaded out of the stage for account. + "%} + {% set src_ldts_description = " src_ldts::string Name of the ldts column inside the source model. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} - src_rsrc::string Name of the rsrc column inside the source model. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. + {% set src_rsrc_description = " + src_rsrc::string Name of the rsrc column inside the source model. 
Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. Needs to use the same column name as defined as alias inside the staging model. - - - -#} - -{%- macro nh_sat(parent_hashkey, src_payload, source_model, src_ldts=none, src_rsrc=none, source_is_single_batch=false) -%} + " %} + + {%- set parent_hashkey = datavault4dbt.yaml_metadata_parser(name='parent_hashkey', yaml_metadata=yaml_metadata, parameter=parent_hashkey, required=True, documentation=parent_hashkey_description) -%} + {%- set src_payload = datavault4dbt.yaml_metadata_parser(name='src_payload', yaml_metadata=yaml_metadata, parameter=src_payload, required=True, documentation=src_payload_description) -%} + {%- set source_model = datavault4dbt.yaml_metadata_parser(name='source_model', yaml_metadata=yaml_metadata, parameter=source_model, required=True, documentation=source_model_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set source_is_single_batch = datavault4dbt.yaml_metadata_parser(name='source_is_single_batch', yaml_metadata=yaml_metadata, parameter=source_is_single_batch, required=False, documentation='Whether the source contains only one batch. Optional, default False.') -%} {# Applying the default aliases as stored inside the global variables, if src_ldts and src_rsrc are not set. 
#} diff --git a/macros/tables/pit.sql b/macros/tables/pit.sql index c2a42b3f..c087455c 100644 --- a/macros/tables/pit.sql +++ b/macros/tables/pit.sql @@ -9,49 +9,80 @@ - Strongly improves performance if upstream queries require many JOIN operations - Creates a unique dimension key to optimize loading performance of incremental loads - Allows to insert a static string as record source column, matching business vault definition of a record source +#} - Parameters: - pit_type::string String to insert into the 'pit_type' column. Has to be prefixed by a !. - Allows for future implementations of other PIT variants, like T-PITs etc. - Can be set freely, something like 'PIT' could be the default. - Is optional, if not set, no column will be added. +{%- macro pit(yaml_metadata=none, tracked_entity=none, hashkey=none, sat_names=none, snapshot_relation=none, dimension_key=none, snapshot_trigger_column=none, ldts=none, custom_rsrc=none, ledts=none, sdts=none, pit_type=none) -%} + + {% set tracked_entity_description = " tracked_entity::string Name of the tracked Hub entity. Must be available as a model inside the dbt project. + " %} + {% set hashkey_description = " hashkey::string The name of the hashkey column inside the previously referred Hub entity. + " %} + {% set sat_names_description = " sat_names::list of strings A list of all the satellites that should be included in this PIT table. Can only be satellites that are attached to the tracked Hub, and should typically include all those satellites. You should always refer here to the version 1 satellites, since those hold the load-end-date. The macro currently supports regular satellites and nh-satellites. + " %} + {% set snapshot_relation_description = " snapshot_relation::string The name of the snapshot relation. It needs to be available as a model inside this dbt project. 
+ " %} + {% set snapshot_trigger_column_description = " snapshot_trigger_column::string The name of the column inside the previously mentioned snapshot relation, that is boolean and identifies the snapshots that should be included in the PIT table. + " %} + {% set dimension_key_description = " dimension_key::string The desired name of the dimension key inside the PIT table. Should follow some naming conventions. Recommended is the name of the hashkey with a '_d' suffix. + " %} + {% set ldts_description = " ldts::string Name of the ldts column inside all source models. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} + {% set custom_rsrc_description = " custom_rsrc::string A custom string that should be inserted into the 'rsrc' column inside the PIT table. Since a PIT table is a business vault entity, the technical record source is no longer used here. Is optional, if not defined, no column is added. + " %} + {% set ledts_description = " ledts::string Name of the load-end-date column inside the satellites. Is optional, will use the global variable 'datavault4dbt.ledts_alias' if not set here. - + " %} + + {% set sdts_description = " sdts::string Name of the snapshot date timestamp column inside the snapshot table. It is optional, will use the global variable 'datavault4dbt.sdts_alias' if not set here. + " %} -#} - - - -{%- macro pit(tracked_entity, hashkey, sat_names, snapshot_relation, dimension_key, snapshot_trigger_column=none, ldts=none, custom_rsrc=none, ledts=none, sdts=none, pit_type=none) -%} + {% set pit_type_description = " + pit_type::string String to insert into the 'pit_type' column. Has to be prefixed by a !. + Allows for future implementations of other PIT variants, like T-PITs etc. + Can be set freely, something like 'PIT' could be the default. + Is optional, if not set, no column will be added. 
+ " %} + + {%- set tracked_entity = datavault4dbt.yaml_metadata_parser(name='tracked_entity', yaml_metadata=yaml_metadata, parameter=tracked_entity, required=True, documentation=tracked_entity_description) -%} + {%- set hashkey = datavault4dbt.yaml_metadata_parser(name='hashkey', yaml_metadata=yaml_metadata, parameter=hashkey, required=True, documentation=hashkey_description) -%} + {%- set sat_names = datavault4dbt.yaml_metadata_parser(name='sat_names', yaml_metadata=yaml_metadata, parameter=sat_names, required=True, documentation=sat_names_description) -%} + {%- set snapshot_relation = datavault4dbt.yaml_metadata_parser(name='snapshot_relation', yaml_metadata=yaml_metadata, parameter=snapshot_relation, required=True, documentation=snapshot_relation_description) -%} + {%- set dimension_key = datavault4dbt.yaml_metadata_parser(name='dimension_key', yaml_metadata=yaml_metadata, parameter=dimension_key, required=True, documentation=dimension_key_description) -%} + {%- set snapshot_trigger_column = datavault4dbt.yaml_metadata_parser(name='snapshot_trigger_column', yaml_metadata=yaml_metadata, parameter=snapshot_trigger_column, required=False, documentation=snapshot_trigger_column_description) -%} + {%- set ldts = datavault4dbt.yaml_metadata_parser(name='ldts', yaml_metadata=yaml_metadata, parameter=ldts, required=False, documentation=ldts_description) -%} + {%- set custom_rsrc = datavault4dbt.yaml_metadata_parser(name='custom_rsrc', yaml_metadata=yaml_metadata, parameter=custom_rsrc, required=False, documentation=custom_rsrc_description) -%} + {%- set ledts = datavault4dbt.yaml_metadata_parser(name='ledts', yaml_metadata=yaml_metadata, parameter=ledts, required=False, documentation=ledts_description) -%} + {%- set sdts = datavault4dbt.yaml_metadata_parser(name='sdts', yaml_metadata=yaml_metadata, parameter=sdts, required=False, documentation=sdts_description) -%} + {%- set pit_type = datavault4dbt.yaml_metadata_parser(name='pit_type', yaml_metadata=yaml_metadata, 
parameter=pit_type, required=False, documentation=pit_type_description) -%} {# Applying the default aliases as stored inside the global variables, if ldts, sdts and ledts are not set. #} diff --git a/macros/tables/rec_track_sat.sql b/macros/tables/rec_track_sat.sql index 13d024b2..161f054c 100644 --- a/macros/tables/rec_track_sat.sql +++ b/macros/tables/rec_track_sat.sql @@ -11,28 +11,32 @@ - Supports multiple updates per batch and therefore initial loading - Using a dynamic high-water-mark to optimize loading performance of multiple loads - Can either track link- or hub-hashkeys +#} - Parameters: +{%- macro rec_track_sat(yaml_metadata=none, tracked_hashkey=none, source_models=none, src_ldts=none, src_rsrc=none, src_stg=none, disable_hwm=false) -%} + {% set tracked_hashkey_description = " tracked_hashkey::string The name of the hashkey column you want to track. Needs to be available in the underlying staging layer. If you want to track multiple hashkeys out of one stage, you need to create one record tracking satellite for each hashkey. Examples: - "hk_contact_h" This record tracking satellite tracks the appearance of the hashkey for the contact hub. + 'hk_contact_h' This record tracking satellite tracks the appearance of the hashkey for the contact hub. - "hk_contact_account_l" This record tracking satellite tracks the appearance of the hashkey for the link between contacts and accounts. + 'hk_contact_account_l' This record tracking satellite tracks the appearance of the hashkey for the link between contacts and accounts. + " %} + {% set source_models_description = " source_models::dictionary Dictionary with information about the source model. The key of the dict is the name of the source model, and the value is another dictionary. This inner dictionary requires to have the keys 'rsrc_static', and optionally the key 'hk_column'. 
Examples: - {'stage_account': {'hk_column': 'hk_account_h', This record tracking satellite tracks the hashkey "hk_account_h" inside the - 'rsrc_static': '*/SAP/Accounts/*'}} source model named "stage_account". + {'stage_account': {'hk_column': 'hk_account_h', This record tracking satellite tracks the hashkey 'hk_account_h' inside the + 'rsrc_static': '*/SAP/Accounts/*'}} source model named 'stage_account'. {'stage_contact': {'rsrc_static': '*/SALESFORCE/Contact/*'}, This tracks the appearance of one hub hashkey that is loaded from the two source - 'stage_partner': {'hk_column': 'hk_partner_h', models "stage_contact" and "stage_partner". For "stage_account" no 'hk_column' is defined, + 'stage_partner': {'hk_column': 'hk_partner_h', models 'stage_contact' and 'stage_partner'. For 'stage_contact' no 'hk_column' is defined, 'rsrc_static': '*/SALESFORCE/Partners/*'}} and therefore the input of the upper level variable 'tracked_hashkey' will be used. - For "stage_partner" the name of the hashkey column differs from the upper level definition + For 'stage_partner' the name of the hashkey column differs from the upper level definition and therefore this other name is set under the variable 'hk_column.' The 'rsrc_static' attribute defines a STRING or a list of strings that will always be @@ -53,19 +57,28 @@ If the rsrc_static is not set in one of the source models, then the assumption is made that or this source there is always the same value for any record in the record source column. The macro will then get automatically this unique value querying the source model. - + " %} + {% set src_ldts_description = " src_ldts::string Name of the ldts column inside the source models. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} + {% set src_rsrc_description = " src_rsrc::string Name of the rsrc column inside the source models.
Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} + {% set src_stg_description = " src_stg::string Name of the source stage model. Is optional, will use the global variable 'datavault4dbt.stg_alias'. - -#} - -{%- macro rec_track_sat(tracked_hashkey, source_models, src_ldts=none, src_rsrc=none, src_stg=none, disable_hwm=false) -%} + " %} + + {%- set tracked_hashkey = datavault4dbt.yaml_metadata_parser(name='tracked_hashkey', yaml_metadata=yaml_metadata, parameter=tracked_hashkey, required=True, documentation=tracked_hashkey_description) -%} + {%- set source_models = datavault4dbt.yaml_metadata_parser(name='source_models', yaml_metadata=yaml_metadata, parameter=source_models, required=True, documentation=source_models_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set src_stg = datavault4dbt.yaml_metadata_parser(name='src_stg', yaml_metadata=yaml_metadata, parameter=src_stg, required=False, documentation=src_stg_description) -%} + {%- set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be turned off. Optional, default False.') -%} {# Applying the default aliases as stored inside the global variables, if src_ldts and src_rsrc are not set. 
#} diff --git a/macros/tables/ref_hub.sql b/macros/tables/ref_hub.sql index ccd7f488..e445f88c 100644 --- a/macros/tables/ref_hub.sql +++ b/macros/tables/ref_hub.sql @@ -29,7 +29,30 @@ ref_keys: N_NATIONKEY -{%- macro ref_hub(ref_keys, source_models, src_ldts=none, src_rsrc=none) -%} +{%- macro ref_hub(yaml_metadata=none, ref_keys=none, source_models=none, src_ldts=none, src_rsrc=none) -%} + + {% set ref_keys_description = " + ref_keys::string|list of strings Name of the reference key(s) available in the source model(s). + " %} + + {% set source_models_description = " + source_models::dictionary Similar to other source_models parameters, e.g. in Hubs or Links. + " %} + + {% set src_ldts_description = " + src_ldts::string Name of the ldts column inside the source models. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. + Needs to use the same column name as defined as alias inside the staging model. + " %} + + {% set src_rsrc_description = " + src_rsrc::string Name of the rsrc column inside the source models. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. + Needs to use the same column name as defined as alias inside the staging model. 
+ " %} + + {%- set ref_keys = datavault4dbt.yaml_metadata_parser(name='ref_keys', yaml_metadata=yaml_metadata, parameter=ref_keys, required=True, documentation=ref_keys_description) -%} + {%- set source_models = datavault4dbt.yaml_metadata_parser(name='source_models', yaml_metadata=yaml_metadata, parameter=source_models, required=True, documentation=source_models_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} {# Applying the default aliases as stored inside the global variables, if src_ldts and src_rsrc are not set. #} diff --git a/macros/tables/ref_table.sql b/macros/tables/ref_table.sql index 567851ac..efcd4640 100644 --- a/macros/tables/ref_table.sql +++ b/macros/tables/ref_table.sql @@ -86,7 +86,46 @@ Include / Exclude per Satellite: #} -{%- macro ref_table(ref_hub, ref_satellites, src_ldts=none, src_rsrc=none, historized='latest', snapshot_relation=none, snapshot_trigger_column=none) -%} +{%- macro ref_table(yaml_metadata=none, ref_hub=none, ref_satellites=none, src_ldts=none, src_rsrc=none, historized='latest', snapshot_relation=none, snapshot_trigger_column=none) -%} + + {% set ref_hub_description = " + ref_hub::string Name of the underlying ref_hub model. + " %} + + {% set ref_satellites_description = " + ref_satellites::string|list of strings Name(s) of the reference satellites to be included in this ref_table. Optional: 'include' & 'exclude' as dictionary keys for each satellite. + " %} + + {% set src_ldts_description = " + src_ldts::string Name of the ldts column inside the source models. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. + Needs to use the same column name as defined as alias inside the staging model.
+ " %} + + {% set src_rsrc_description = " + src_rsrc::string Name of the rsrc column inside the source models. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. + Needs to use the same column name as defined as alias inside the staging model. + " %} + + {% set historized_description = " + historized::string Possible values are 'full', 'latest', or 'snapshot'. Influences how much history this reference table will hold. + " %} + + {% set snapshot_relation_description = " + snapshot_relation::string Only required, if 'historized' set to 'snapshot'. Name of the snapshot_v1 model to be used. + " %} + + {% set snapshot_trigger_column_description = " + snapshot_trigger_column::string Only required, if 'historized' set to 'snapshot'. Defaults to global variable 'datavault4dbt.sdts_alias'. Only needs to be set if alias deviates from global variable. + " %} + + {%- set ref_hub = datavault4dbt.yaml_metadata_parser(name='ref_hub', yaml_metadata=yaml_metadata, parameter=ref_hub, required=True, documentation=ref_hub_description) -%} + {%- set ref_satellites = datavault4dbt.yaml_metadata_parser(name='ref_satellites', yaml_metadata=yaml_metadata, parameter=ref_satellites, required=True, documentation=ref_satellites_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set historized = datavault4dbt.yaml_metadata_parser(name='historized', yaml_metadata=yaml_metadata, parameter=historized, required=False, documentation=historized_description) -%} + {%- set snapshot_relation = datavault4dbt.yaml_metadata_parser(name='snapshot_relation', yaml_metadata=yaml_metadata, parameter=snapshot_relation, required=False, 
documentation=snapshot_relation_description) -%} + {%- set snapshot_trigger_column = datavault4dbt.yaml_metadata_parser(name='snapshot_trigger_column', yaml_metadata=yaml_metadata, parameter=snapshot_trigger_column, required=False, documentation=snapshot_trigger_column_description) -%} + {# Applying the default aliases as stored inside the global variables, if src_ldts and src_rsrc are not set. #} {%- set src_ldts = datavault4dbt.replace_standard(src_ldts, 'datavault4dbt.ldts_alias', 'ldts') -%} From 71bfee3fefeac66cd9549f6e6419ce062cf85437 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Thu, 12 Dec 2024 08:26:53 +0100 Subject: [PATCH 26/50] synapse stage: remove column name escaping in ghost record macro call --- macros/staging/synapse/stage.sql | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/macros/staging/synapse/stage.sql b/macros/staging/synapse/stage.sql index 71525cf0..9234f9e7 100644 --- a/macros/staging/synapse/stage.sql +++ b/macros/staging/synapse/stage.sql @@ -473,7 +473,7 @@ unknown_values AS ( {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='unknown', col_size=column.char_size) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', col_size=column.char_size) }} {%- endfor -%} {%- endif -%} @@ -481,7 +481,7 @@ unknown_values AS ( {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - ,{{ 
datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(col), datatype=dtype, ghost_record_type='unknown') }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} @@ -513,7 +513,7 @@ unknown_values AS ( {%- if datavault4dbt.is_something(derived_columns) -%} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column_name), datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} @@ -539,7 +539,7 @@ error_values AS ( {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='error', col_size=column.char_size) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', col_size=column.char_size) }} {%- endfor -%} {%- endif -%} @@ -547,7 +547,7 @@ error_values AS ( {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(col), datatype=dtype, ghost_record_type='error') }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- 
endif -%} @@ -579,7 +579,7 @@ error_values AS ( {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column_name), datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} From e683e45f3ef20b748fd8a6e7e6685458be5c2eb5 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Thu, 12 Dec 2024 10:18:49 +0100 Subject: [PATCH 27/50] fabric stage fix escape column names --- macros/staging/fabric/stage.sql | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/macros/staging/fabric/stage.sql b/macros/staging/fabric/stage.sql index 14b511c6..19d55efb 100644 --- a/macros/staging/fabric/stage.sql +++ b/macros/staging/fabric/stage.sql @@ -288,7 +288,7 @@ prejoined_columns AS ( {# Generate the columns for the SELECT-statement #} {%- for column in prejoin['extract_columns'] %} - ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + ,{{ prejoin_alias }}.{{ datavault4dbt.escape_column_names(column) }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ datavault4dbt.escape_column_names(prejoin['aliases'][loop.index0]) }} {% endif -%} {%- endfor -%} {%- endfor %} @@ -335,7 +335,7 @@ prejoined_columns AS ( {%- set prejoin_alias = 'pj_' + loop.index|string %} left join {{ relation }} as {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) 
}} + on {{ datavault4dbt.multikey(columns=datavault4dbt.escape_column_names(prejoin['this_column_name']), prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=datavault4dbt.escape_column_names(prejoin['ref_column_name'])) }} {%- endfor -%} {% set last_cte = "prejoined_columns" -%} @@ -502,7 +502,7 @@ unknown_values AS ( {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} {{ log('column found? yes, for column: ' ~ column.name , false) }} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='unknown', alias=datavault4dbt.escape_column_names(prejoin['aliases'][prejoin_col_index])) }} {%- endif -%} {%- endfor -%} @@ -568,7 +568,7 @@ error_values AS ( {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} {{ log('column found? 
yes, for column: ' ~ column.name , false) }} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='error', alias=datavault4dbt.escape_column_names(prejoin['aliases'][prejoin_col_index])) }} {%- endif -%} {%- endfor -%} From 3a86469d0e65bdfe7c0d877bee52e05082c87b45 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Thu, 12 Dec 2024 11:33:09 +0100 Subject: [PATCH 28/50] synapse, fabric stages: fix derived input columns --- macros/staging/fabric/stage.sql | 2 ++ macros/staging/synapse/stage.sql | 2 ++ 2 files changed, 4 insertions(+) diff --git a/macros/staging/fabric/stage.sql b/macros/staging/fabric/stage.sql index 19d55efb..ef94f2de 100644 --- a/macros/staging/fabric/stage.sql +++ b/macros/staging/fabric/stage.sql @@ -259,6 +259,8 @@ missing_columns AS ( {%- if datavault4dbt.is_something(prejoined_columns) %} {# Prejoining Business Keys of other source objects for Link purposes #} +{%- set final_columns_to_select = (final_columns_to_select + derived_input_columns) | unique | list -%} + prejoined_columns AS ( SELECT diff --git a/macros/staging/synapse/stage.sql b/macros/staging/synapse/stage.sql index 9234f9e7..a4b6f806 100644 --- a/macros/staging/synapse/stage.sql +++ b/macros/staging/synapse/stage.sql @@ -262,6 +262,8 @@ missing_columns AS ( {# Prejoining Business Keys of other source objects for Link purposes #} prejoined_columns AS ( +{%- set final_columns_to_select = (final_columns_to_select + derived_input_columns) | unique | list -%} + SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} From 
57e21daeb9c13121dcd4a0605ac6ea12d8d16cfc Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Thu, 12 Dec 2024 11:58:02 +0100 Subject: [PATCH 29/50] oracle stage: include col_size to ghost records --- macros/staging/oracle/stage.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/macros/staging/oracle/stage.sql b/macros/staging/oracle/stage.sql index e34df521..1d4ba239 100644 --- a/macros/staging/oracle/stage.sql +++ b/macros/staging/oracle/stage.sql @@ -521,7 +521,7 @@ unknown_values AS ( {%- if datavault4dbt.is_something(derived_columns) -%} {# Additionally generating Ghost Records for Derived Columns #} {% for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} @@ -588,7 +588,7 @@ error_values AS ( {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} From c8df4d3101677582564f9e70748a2afedeecf782 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Thu, 12 Dec 2024 13:53:36 +0100 Subject: [PATCH 30/50] synapse stage: fix prejoin_column_names --- macros/staging/synapse/stage.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/macros/staging/synapse/stage.sql b/macros/staging/synapse/stage.sql index a4b6f806..77c51daa 100644 --- a/macros/staging/synapse/stage.sql +++ b/macros/staging/synapse/stage.sql @@ -89,7 +89,7 @@ {# Getting the column names for all additional columns #} {%- set derived_column_names = datavault4dbt.extract_column_names(derived_columns) -%} {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} {%- set exclude_column_names = derived_column_names + hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} From f484bbb9f3a0538d6dea327c720343547565196b Mon Sep 17 00:00:00 2001 From: Tim Kirschke Date: Fri, 13 Dec 2024 12:31:38 +0100 Subject: [PATCH 31/50] Finished yaml implementation, not tested yet --- macros/staging/stage.sql | 103 +++++++++++++++++++++++------------ macros/tables/ref_sat_v0.sql | 53 +++++++++++++++++- macros/tables/ref_sat_v1.sql | 44 ++++++++++++++- macros/tables/sat_v0.sql | 54 ++++++++++++++++-- macros/tables/sat_v1.sql | 33 +++++++++-- 5 files changed, 238 insertions(+), 49 deletions(-) diff --git a/macros/staging/stage.sql b/macros/staging/stage.sql index 76b17ed0..39761149 100644 --- a/macros/staging/stage.sql +++ b/macros/staging/stage.sql @@ -1,15 +1,21 @@ {# This macro creates the staging layer for the Data Vault model. This layer is mainly for hashing, and additionally gives the option to create derived columns, conduct prejoins and add NULL values for missing columns. Always create one stage per source table that you want to add to the Data Vault model. 
The staging layer is not to harmonize data. That will be done in the later layers. + #} + - Parameters: +{%- macro stage(yaml_metadata=none, ldts=none, rsrc=none, source_model=none, include_source_columns=true, hashed_columns=none, derived_columns=none, sequence=none, prejoined_columns=none, missing_columns=none, multi_active_config=none, enable_ghost_records=true) -%} + + {% set ldts_description = " ldts::string Name of the column inside the source data, that holds information about the Load Date Timestamp. Can also be a SQL expression. Examples: 'edwLoadDate' Uses the column called 'edwLoadDate' as it is from the source model. 'PARSE_TIMESTAMP('%Y-%m-%dT%H-%M-%S', edwLoadDate)' Applies the SQL function 'PARSE_TIMESTAMP' on the input column 'edwLoadDate'. + " %} + {% set rsrc_description = " rsrc::string Name of the column inside the source data, that holds information about the Record Source. Can also be a SQL expression or a static string. A static string must begin with a '!'. @@ -17,7 +23,9 @@ 'edwRecordSource' Uses the column called 'edwRecordSource' as it is from the source model. '!SAP.Accounts' Uses the static string 'SAP.Customers' as rsrc. 'CONCAT(source_system, '||', source_object)' Applies the SQL function 'CONCAT' to concatenate two source columns. + " %} + {% set source_model_description = " source_model::string | dictionary Can be just a string holding the name of the referred dbt model to use as a source. But if the 'source' functionality inside the .yml file is used, it must be a dictionary with 'source_name': 'source_table'. @@ -25,10 +33,14 @@ 'source_account' The source model that you want to use for the stage is available as another dbt model with the name 'source_account'. {'source_data': 'source_account'} The source model that you want to use for the stage is available as a source defined inside the .yml file with the name 'source_data', and you select the table 'source_account' out of that source. 
+ " %} + {% set include_source_columns_description = " include_source_columns::boolean Defines if all columns from the referred source table should be included in the result table, or if only the added columns should be part of the result table. By default the source columns should be included. + " %} + {% set hashed_columns_description = " hashed_columns::dictionary Defines the names and input for all hashkeys and hashdiffs to create. The key of each hash column is the name of the hash column. The value for Hashkeys is a list of input Business Keys, for Hashdiffs another dictionary with the pairs 'is_hashdiff:true' and 'columns: '. @@ -38,7 +50,9 @@ 'hd_account_s': {'is_hashdiff': true, keys 'account_number' and 'account_key'. A hashdiff called 'hd_account_s' is calculated 'columns': ['name', 'address', 'phone', 'email']}} out of the descriptive attributes 'name', 'address', 'phone', and 'email'. More hashkeys and hashdiffs would be added as other keys of the dictionary. + " %} + {% set derived_columns_description = " derived_columns::dictionary Defines values and datatypes for derived ('added' or 'calculated') columns. The values of this dictionary are the desired column names, the value is another dictionary with the keys 'value' (holding a column name, a SQL expression, or a static string beginning with '!') and 'datatype' (holding a valid SQL datatype for the target database). @@ -49,14 +63,18 @@ 'country_isocode': {'value': '!GER', The column 'country_isocode' inserts the static string 'EUR' for all rows. 'datatype': 'STRING'}, The column 'account_name' duplicates an already existing column and gives 'account_name': {'value': 'name', it another name. More derived columns can be added as other keys of - 'datatype': 'String'}} the dictionary. + 'datatype': 'String'}} + " %} + {% set sequence_description = " sequence::string Name of the column inside the source data, that holds a sequence number that was generated during the data source extraction process. 
Optional and not required. Example: 'edwSequence' Uses the column 'edwSequence' that is available inside the source data. + " %} + {% set prejoined_columns_description = " prejoined_columns::dictionary Defines information about information that needs to be prejoined. Most commonly used to create links, when the source data does not hold the Business Key, but the technical key of the referred object. The values of the dict are the aliases you want to give the prejoined columns. Typically, but not always, this should be the same as the name of the prejoined column inside the prejoined entity. For each prejoined column @@ -77,7 +95,9 @@ 'ref_column_name': 'Id'}} we would now have a self-prejoin ON 'account.master_account_id = account.Id'. Because the table 'account' already has a column 'account_key', we rename the prejoined column to 'master_account_key'. More prejoined columns can be added as other keys of the dictionary. + " %} + {% set missing_columns_description = " missing_columns::dictionary If the schema of the source changes over time and columns are disappearing, this parameter gives you the option to create additional columns holding NULL values, that replace columns that were previously there. By this procedure, hashdiff calculations and satellite payloads wont break. The dictionary holds the names of those columns as keys, and the SQL datatypes of these columns as values. @@ -85,7 +105,9 @@ Example: {'legacy_account_uuid': 'INT64', Two additional columns are added to the source table holding NULL values. The column 'legacy_account_uuid' will 'shipping_address' : 'STRING'} have the datatype 'INT64' and the column 'shipping_address' will have the datatype 'STRING'. + " %} + {% set multi_active_config_description = " multi_active_config::dictionary If the source data holds multi-active data, define here the column(s) holding the multi-active key and the main hashkey column. 
If the source data is multi-active but has no natural multi-active key, create one using the row_number SQL function (or similar) one layer before. Then insert the name of that artificial column into the multi-active-key parameter. The combination of the multi-active key(s), the main-hashkey and the ldts column should be unique in the final result satellite. @@ -96,41 +118,50 @@ 'main_hashkey_column': 'hk_contact_h'} That means, that the combination of main_hashkey, ldts and 'phonetype' is unique inside the source system. {'multi_active_key': ['phonetype', 'company'], This source data comes with two multi-active keys. The combination of those two, the main_hashkey and ldts is unique - 'main_hashkey_column': 'hk_contact_h'} inside the source system. + 'main_hashkey_column': 'hk_contact_h'} inside the source system. + " %} + {% set enable_ghost_records_description = " enable_ghost_records::boolean If set to true, the stage will be created with ghost records. By default, ghost records are enabled. 
Optional Parameter - - #} - - - - {%- macro stage(ldts, rsrc, source_model, include_source_columns=true, hashed_columns=none, derived_columns=none, sequence=none, prejoined_columns=none, missing_columns=none, multi_active_config=none, enable_ghost_records=true) -%} - - {# If include_source_columns is passed but its empty then it is set with the default value (true) #} - {%- if include_source_columns is none or include_source_columns == "" -%} - {%- set include_source_columns = true -%} - {%- endif -%} - - {# If enable_ghost_records is passed but its empty then it is set with the default value (true) #} - {%- if enable_ghost_records is none or enable_ghost_records == "" -%} - {%- set enable_ghost_records = true -%} - {%- endif -%} - - {# If ldts is empty replace it with the current timestamp #} - {%- if datavault4dbt.is_nothing(ldts) -%} - {%- set ldts = datavault4dbt.current_timestamp() -%} - {%- endif -%} - - {{- adapter.dispatch('stage', 'datavault4dbt')(include_source_columns=include_source_columns, - ldts=ldts, - rsrc=rsrc, - source_model=source_model, - hashed_columns=hashed_columns, - derived_columns=derived_columns, - sequence=sequence, - prejoined_columns=prejoined_columns, - missing_columns=missing_columns, - multi_active_config=multi_active_config, - enable_ghost_records=enable_ghost_records) -}} + " %} + + {%- set ldts = datavault4dbt.yaml_metadata_parser(name='ldts', yaml_metadata=yaml_metadata, parameter=ldts, required=True, documentation=ldts_description) -%} + {%- set rsrc = datavault4dbt.yaml_metadata_parser(name='rsrc', yaml_metadata=yaml_metadata, parameter=rsrc, required=True, documentation=rsrc_description) -%} + {%- set source_model = datavault4dbt.yaml_metadata_parser(name='source_model', yaml_metadata=yaml_metadata, parameter=source_model, required=True, documentation=source_model_description) -%} + {%- set include_source_columns = datavault4dbt.yaml_metadata_parser(name='include_source_columns', yaml_metadata=yaml_metadata, 
parameter=include_source_columns, required=False, documentation=include_source_columns_description) -%} + {%- set hashed_columns = datavault4dbt.yaml_metadata_parser(name='hashed_columns', yaml_metadata=yaml_metadata, parameter=hashed_columns, required=False, documentation=hashed_columns_description) -%} + {%- set derived_columns = datavault4dbt.yaml_metadata_parser(name='derived_columns', yaml_metadata=yaml_metadata, parameter=derived_columns, required=False, documentation=derived_columns_description) -%} + {%- set sequence = datavault4dbt.yaml_metadata_parser(name='sequence', yaml_metadata=yaml_metadata, parameter=sequence, required=False, documentation=sequence_description) -%} + {%- set prejoined_columns = datavault4dbt.yaml_metadata_parser(name='prejoined_columns', yaml_metadata=yaml_metadata, parameter=prejoined_columns, required=False, documentation=prejoined_columns_description) -%} + {%- set missing_columns = datavault4dbt.yaml_metadata_parser(name='missing_columns', yaml_metadata=yaml_metadata, parameter=missing_columns, required=False, documentation=missing_columns_description) -%} + {%- set multi_active_config = datavault4dbt.yaml_metadata_parser(name='multi_active_config', yaml_metadata=yaml_metadata, parameter=multi_active_config, required=False, documentation=multi_active_config_description) -%} + {%- set enable_ghost_records = datavault4dbt.yaml_metadata_parser(name='enable_ghost_records', yaml_metadata=yaml_metadata, parameter=enable_ghost_records, required=False, documentation=enable_ghost_records_description) -%} + + {# If include_source_columns is passed but its empty then it is set with the default value (true) #} + {%- if include_source_columns is none or include_source_columns == "" -%} + {%- set include_source_columns = true -%} + {%- endif -%} + + {# If enable_ghost_records is passed but its empty then it is set with the default value (true) #} + {%- if enable_ghost_records is none or enable_ghost_records == "" -%} + {%- set 
enable_ghost_records = true -%} + {%- endif -%} + + {# If ldts is empty replace it with the current timestamp #} + {%- if datavault4dbt.is_nothing(ldts) -%} + {%- set ldts = datavault4dbt.current_timestamp() -%} + {%- endif -%} + + {{- adapter.dispatch('stage', 'datavault4dbt')(include_source_columns=include_source_columns, + ldts=ldts, + rsrc=rsrc, + source_model=source_model, + hashed_columns=hashed_columns, + derived_columns=derived_columns, + sequence=sequence, + prejoined_columns=prejoined_columns, + missing_columns=missing_columns, + multi_active_config=multi_active_config, + enable_ghost_records=enable_ghost_records) -}} {%- endmacro -%} diff --git a/macros/tables/ref_sat_v0.sql b/macros/tables/ref_sat_v0.sql index 474620d7..1c4ff2ec 100644 --- a/macros/tables/ref_sat_v0.sql +++ b/macros/tables/ref_sat_v0.sql @@ -26,7 +26,58 @@ src_payload: -{%- macro ref_sat_v0(parent_ref_keys, src_hashdiff, src_payload, source_model, src_ldts=none, src_rsrc=none, disable_hwm=false, source_is_single_batch=false) -%} +{%- macro ref_sat_v0(yaml_metadata=none, parent_ref_keys=none, src_hashdiff=none, src_payload=none, source_model=none, src_ldts=none, src_rsrc=none, disable_hwm=false, source_is_single_batch=false) -%} + + {% set parent_ref_keys_description = " + parent_ref_keys::string|list of strings Name of the reference key(s) of the parent ref_hub. + " %} + + {% set src_hashdiff_description = " + src_hashdiff::string Name of the hashdiff column of this ref satellite, that was created inside the staging area and is + calculated out of the entire payload of this ref satellite. The stage must hold one hashdiff per + ref satellite entity. + + Examples: + 'hd_nation_sfdc_rs' Since we recommend naming the hashdiff column similar to the name + of the ref satellite entity, just with a prefix, this would be the + hashdiff column of the ref satellite for nation.
+ " %} + + {% set src_payload_description = " + src_payload::list of strings A list of all the descriptive attributes that should be included in this ref satellite. Needs to be the + columns that are fed into the hashdiff calculation of this ref satellite. + + Examples: + ['name', 'continent', 'area'] This ref satellite would hold the columns 'name', 'continent', and 'area' + coming out of the underlying staging area. + " %} + + {% set source_model_description = " + source_model::string Name of the underlying staging model, must be available inside dbt as a model. + + Examples: + 'stage_nation' This ref satellite is loaded out of the stage for nation. + " %} + + {% set src_ldts_description = " + src_ldts::string Name of the ldts column inside the source models. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. + Needs to use the same column name as defined as alias inside the staging model. + " %} + + {% set src_rsrc_description = " + src_rsrc::string Name of the rsrc column inside the source models. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. + Needs to use the same column name as defined as alias inside the staging model.
+ " %} + + + {%- set parent_ref_keys = datavault4dbt.yaml_metadata_parser(name='parent_ref_keys', yaml_metadata=yaml_metadata, parameter=parent_ref_keys, required=True, documentation=parent_ref_keys_description) -%} + {%- set src_hashdiff = datavault4dbt.yaml_metadata_parser(name='src_hashdiff', yaml_metadata=yaml_metadata, parameter=src_hashdiff, required=True, documentation=src_hashdiff_description) -%} + {%- set src_payload = datavault4dbt.yaml_metadata_parser(name='src_payload', yaml_metadata=yaml_metadata, parameter=src_payload, required=True, documentation=src_payload_description) -%} + {%- set source_model = datavault4dbt.yaml_metadata_parser(name='source_model', yaml_metadata=yaml_metadata, parameter=source_model, required=True, documentation=source_model_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be turned off. Optional, default False.') -%} + {%- set source_is_single_batch = datavault4dbt.yaml_metadata_parser(name='source_is_single_batch', yaml_metadata=yaml_metadata, parameter=source_is_single_batch, required=False, documentation='Whether the source contains only one batch. Optional, default False.') -%} {# Applying the default aliases as stored inside the global variables, if src_ldts, src_rsrc, and ledts_alias are not set. 
#} {%- set src_ldts = datavault4dbt.replace_standard(src_ldts, 'datavault4dbt.ldts_alias', 'ldts') -%} diff --git a/macros/tables/ref_sat_v1.sql b/macros/tables/ref_sat_v1.sql index 7b3ea907..52ccfc3e 100644 --- a/macros/tables/ref_sat_v1.sql +++ b/macros/tables/ref_sat_v1.sql @@ -22,8 +22,50 @@ add_is_current_flag: true -{%- macro ref_sat_v1(ref_sat_v0, ref_keys, hashdiff, src_ldts=none, src_rsrc=none, ledts_alias=none, add_is_current_flag=false) -%} +{%- macro ref_sat_v1(yaml_metadata=none, ref_sat_v0=none, ref_keys=none, hashdiff=none, src_ldts=none, src_rsrc=none, ledts_alias=none, add_is_current_flag=false) -%} + {% set ref_sat_v0_description = " + ref_sat_v0::string Name of the underlying ref_sat_v0 dbt model + " %} + + {% set ref_keys_description = " + ref_keys::string | list of strings Name(s) of the reference key(s) in the underlying reference sat v0. + " %} + + {% set hashdiff_description = " + hashdiff::string Name of the Hashdiff column in the underlying reference sat v0. + "%} + + {% set src_ldts_description = " + src_ldts::string Name of the ldts column inside the source models. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. + Needs to use the same column name as defined as alias inside the staging model. + " %} + + {% set src_rsrc_description = " + src_rsrc::string Name of the rsrc column inside the source models. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. + Needs to use the same column name as defined as alias inside the staging model. + " %} + + {% set ledts_alias_description = " + ledts_alias::string Desired alias for the load end date column. Is optional, will use the global variable 'datavault4dbt.ledts_alias' if + not set here. + " %} + + {% set add_is_current_flag_description = " + add_is_current_flag::boolean Optional parameter to add a new column to the v1 sat based on the load end date timestamp (ledts). Default is false. If + set to true it will add this is_current flag to the v1 sat.
For each record this column will be set to true if the load + end date time stamp is equal to the variable end of all times. If its not, then the record is not current therefore it + will be set to false. + " %} + + {%- set ref_sat_v0 = datavault4dbt.yaml_metadata_parser(name='ref_sat_v0', yaml_metadata=yaml_metadata, parameter=ref_sat_v0, required=True, documentation=ref_sat_v0_description) -%} + {%- set ref_keys = datavault4dbt.yaml_metadata_parser(name='ref_keys', yaml_metadata=yaml_metadata, parameter=ref_keys, required=True, documentation=ref_keys_description) -%} + {%- set hashdiff = datavault4dbt.yaml_metadata_parser(name='hashdiff', yaml_metadata=yaml_metadata, parameter=hashdiff, required=True, documentation=hashdiff_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set ledts_alias = datavault4dbt.yaml_metadata_parser(name='ledts_alias', yaml_metadata=yaml_metadata, parameter=ledts_alias, required=False, documentation=ledts_alias_description) -%} + {%- set add_is_current_flag = datavault4dbt.yaml_metadata_parser(name='add_is_current_flag', yaml_metadata=yaml_metadata, parameter=add_is_current_flag, required=False, documentation=add_is_current_flag_description) -%} + {# Applying the default aliases as stored inside the global variables, if src_ldts, src_rsrc, and ledts_alias are not set. 
#} {%- set src_ldts = datavault4dbt.replace_standard(src_ldts, 'datavault4dbt.ldts_alias', 'ldts') -%} diff --git a/macros/tables/sat_v0.sql b/macros/tables/sat_v0.sql index cf1a8c71..e9f8cf8e 100644 --- a/macros/tables/sat_v0.sql +++ b/macros/tables/sat_v0.sql @@ -19,6 +19,34 @@ 'hk_account_contact_l' The satellite would be attached to the link between account and contact, which has the column 'hk_account_contact_l' as a hashkey column. + + + + + + + src_ldts::string Name of the ldts column inside the source model. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. + Needs to use the same column name as defined as alias inside the staging model. + + src_rsrc::string Name of the rsrc column inside the source model. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. + Needs to use the same column name as defined as alias inside the staging model. + +#} + +{%- macro sat_v0(yaml_metadata=none, parent_hashkey=none, src_hashdiff=none, src_payload=none, source_model=none, src_ldts=none, src_rsrc=none, disable_hwm=false, source_is_single_batch=false) -%} + + {% set parent_hashkey_description = " + parent_hashkey::string Name of the hashkey column inside the stage of the object that this satellite is attached to. + + Examples: + 'hk_account_h' The satellite would be attached to the hub account, which has the + column 'hk_account_h' as a hashkey column. + + 'hk_account_contact_l' The satellite would be attached to the link between account and contact, + which has the column 'hk_account_contact_l' as a hashkey column. + " %} + + {% set src_hashdiff_description = " src_hashdiff::string Name of the hashdiff column of this satellite, that was created inside the staging area and is calculated out of the entire payload of this satellite. The stage must hold one hashdiff per satellite entity. 
@@ -27,7 +55,9 @@ 'hd_account_data_sfdc_s' Since we recommend naming the hashdiff column similar to the name of the satellite entity, just with a prefix, this would be the hashdiff column of the data satellite for account. + " %} + {% set src_payload_description = " src_payload::list of strings A list of all the descriptive attributes that should be included in this satellite. Needs to be the columns that are fed into the hashdiff calculation of this satellite. @@ -35,21 +65,33 @@ ['name', 'address', 'country', 'phone', 'email'] This satellite would hold the columns 'name', 'address', 'country', 'phone' and 'email', coming out of the underlying staging area. + " %} + {% set source_model_description = " source_model::string Name of the underlying staging model, must be available inside dbt as a model. Examples: 'stage_account' This satellite is loaded out of the stage for account. + " %} - src_ldts::string Name of the ldts column inside the source model. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. + {% set src_ldts_description = " + src_ldts::string Name of the ldts column inside the source models. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} - src_rsrc::string Name of the rsrc column inside the source model. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. + {% set src_rsrc_description = " + src_rsrc::string Name of the rsrc column inside the source models. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. Needs to use the same column name as defined as alias inside the staging model. 
- -#} - -{%- macro sat_v0(parent_hashkey, src_hashdiff, src_payload, source_model, src_ldts=none, src_rsrc=none, disable_hwm=false, source_is_single_batch=false) -%} + " %} + + {%- set parent_hashkey = datavault4dbt.yaml_metadata_parser(name='parent_hashkey', yaml_metadata=yaml_metadata, parameter=parent_hashkey, required=True, documentation=parent_hashkey_description) -%} + {%- set src_hashdiff = datavault4dbt.yaml_metadata_parser(name='src_hashdiff', yaml_metadata=yaml_metadata, parameter=src_hashdiff, required=True, documentation=src_hashdiff_description) -%} + {%- set src_payload = datavault4dbt.yaml_metadata_parser(name='src_payload', yaml_metadata=yaml_metadata, parameter=src_payload, required=True, documentation=src_payload_description) -%} + {%- set source_model = datavault4dbt.yaml_metadata_parser(name='source_model', yaml_metadata=yaml_metadata, parameter=source_model, required=True, documentation=source_model_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be turned off. Optional, default False.') -%} + {%- set source_is_single_batch = datavault4dbt.yaml_metadata_parser(name='source_is_single_batch', yaml_metadata=yaml_metadata, parameter=source_is_single_batch, required=False, documentation='Whether the source contains only one batch. Optional, default False.') -%} {# Applying the default aliases as stored inside the global variables, if src_ldts, src_rsrc, and ledts_alias are not set. 
#} {%- set src_ldts = datavault4dbt.replace_standard(src_ldts, 'datavault4dbt.ldts_alias', 'ldts') -%} diff --git a/macros/tables/sat_v1.sql b/macros/tables/sat_v1.sql index 6f743ec0..13ad1425 100644 --- a/macros/tables/sat_v1.sql +++ b/macros/tables/sat_v1.sql @@ -2,15 +2,19 @@ This macro calculates a virtualized load end date on top of a version 0 satellite. This column is generated for usage in the PIT tables, and only virtualized to follow the insert-only approach. Usually one version 1 sat would be created for each version 0 sat. A version 1 satellite should be materialized as a view by default. +#} - Parameters: +{%- macro sat_v1(yaml_metadata=none, sat_v0=none, hashkey=none, hashdiff=none, src_ldts=none, src_rsrc=none, ledts_alias=none, add_is_current_flag=false, include_payload=true) -%} + {% set sat_v0_description = " sat_v0::string Name of the underlying version 0 satellite. Examples: 'account_data_sfdc_0_s' This satellite would be the version 1 satellite of the underlying version 0 data satellite for account. + " %} + {% set hashkey_description = " hashkey::string Name of the parent hashkey column inside the version 0 satellite. Would either be the hashkey of a hub or a link. Needs to be similar to the 'parent_hashkey' parameter inside the sat_v0 model. @@ -20,7 +24,9 @@ 'hk_account_contact_l' The satellite would be attached to the link between account and contact, which has the column 'hk_account_contact_l' as a hashkey column. + " %} + {% set hashdiff_description = " hashdiff::string Name of the hashdiff column inside the underlying version 0 satellite. Needs to be similar to the 'src_hashdiff' pararmeter inside the sat_v0 model. @@ -28,25 +34,42 @@ 'hd_account_data_sfdc_s' Since we recommend naming the hashdiff column similar to the name of the satellite entity, just with a prefix, this would be the hashdiff column of the data satellite for account. 
+ " %} + {% set src_ldts_description = " src_ldts::string Name of the ldts column inside the source models. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} + {% set src_rsrc_description = " src_rsrc::string Name of the rsrc column inside the source models. Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. Needs to use the same column name as defined as alias inside the staging model. + " %} + {% set ledts_alias_description = " ledts_alias::string Desired alias for the load end date column. Is optional, will use the global variable 'datavault4dbt.ledts_alias' if set here. - + " %} + + {% set add_is_current_flag_description = " add_is_current_flag::boolean Optional parameter to add a new column to the v1 sat based on the load end date timestamp (ledts). Default is false. If set to true it will add this is_current flag to the v1 sat. For each record this column will be set to true if the load end date time stamp is equal to the variable end of all times. If its not, then the record is not current therefore it will be set to false. - + " %} + + {% set include_payload_description = " include_payload::boolean Optional parameter to specify if the v1 sat should have the payload columns from sat v0 or not. Default is true. 
-#} + " %} -{%- macro sat_v1(sat_v0, hashkey, hashdiff, src_ldts=none, src_rsrc=none, ledts_alias=none, add_is_current_flag=false, include_payload=true) -%} + {%- set sat_v0 = datavault4dbt.yaml_metadata_parser(name='sat_v0', yaml_metadata=yaml_metadata, parameter=sat_v0, required=True, documentation=sat_v0_description) -%} + {%- set hashkey = datavault4dbt.yaml_metadata_parser(name='hashkey', yaml_metadata=yaml_metadata, parameter=hashkey, required=True, documentation=hashkey_description) -%} + {%- set hashdiff = datavault4dbt.yaml_metadata_parser(name='hashdiff', yaml_metadata=yaml_metadata, parameter=hashdiff, required=True, documentation=hashdiff_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set ledts_alias = datavault4dbt.yaml_metadata_parser(name='ledts_alias', yaml_metadata=yaml_metadata, parameter=ledts_alias, required=False, documentation=ledts_alias_description) -%} + {%- set add_is_current_flag = datavault4dbt.yaml_metadata_parser(name='add_is_current_flag', yaml_metadata=yaml_metadata, parameter=add_is_current_flag, required=False, documentation=add_is_current_flag_description) -%} + {%- set include_payload = datavault4dbt.yaml_metadata_parser(name='include_payload', yaml_metadata=yaml_metadata, parameter=include_payload, required=False, documentation=include_payload_description) -%} {# Applying the default aliases as stored inside the global variables, if src_ldts, src_rsrc, and ledts_alias are not set. 
#} From 164205770fb06cd2d674e0471e2853f252fd6645 Mon Sep 17 00:00:00 2001 From: Tim Kirschke Date: Fri, 13 Dec 2024 14:50:36 +0100 Subject: [PATCH 32/50] Fixed returning none for optional parameters --- macros/internal/metadata_processing/yaml_metadata_parser.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/macros/internal/metadata_processing/yaml_metadata_parser.sql b/macros/internal/metadata_processing/yaml_metadata_parser.sql index c3529c6a..86e53e73 100644 --- a/macros/internal/metadata_processing/yaml_metadata_parser.sql +++ b/macros/internal/metadata_processing/yaml_metadata_parser.sql @@ -12,6 +12,8 @@ {% do exceptions.warn("[" ~ this ~ "] Warning: yaml-metadata given, but parameter '" ~ name ~ "' not defined in there. Using '" ~ name ~ "' parameter defined outside. We advise to use only one method of parameter passing.") %} {% elif required %} {{ exceptions.raise_compiler_error("[" ~ this ~ "] Error: yaml-metadata given, but required parameter '" ~ name ~ "' not defined in there or outside in the parameter. \n Description of parameter '" ~ name ~ "': \n" ~ documentation ) }} + {% else %} + {% set return_value = None %} {% endif %} {% elif datavault4dbt.is_something(parameter) %} {% set return_value = parameter %} From 2f9e218f0fa7b829bb6eae62c6bce6fb8279e4c6 Mon Sep 17 00:00:00 2001 From: Tim Kirschke Date: Thu, 19 Dec 2024 09:50:28 +0100 Subject: [PATCH 33/50] Fixed ref satellite parameter --- macros/tables/ref_table.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/macros/tables/ref_table.sql b/macros/tables/ref_table.sql index efcd4640..807cc746 100644 --- a/macros/tables/ref_table.sql +++ b/macros/tables/ref_table.sql @@ -92,7 +92,7 @@ Include / Exclude per Satellite: ref_hub::string Name of the underlying ref_hub model. " %} - {% set ref_satellites = " + {% set ref_satellites_description = " ref_satellites::string|list of strings Name(s) of the reference satellites to be included in this ref_table. 
Optional: 'include' & 'exclude' as dictionary keys for each satellite. " %} From 5e203ca5f51e10d011c18fd02a37c112a29a4caf Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Mon, 6 Jan 2025 17:00:35 +0100 Subject: [PATCH 34/50] unify formatting of yaml_metadata_parser calls --- macros/staging/stage.sql | 32 +++++----- macros/tables/control_snap_v0.sql | 8 +-- macros/tables/control_snap_v1.sql | 17 +---- macros/tables/eff_sat_v0.sql | 14 ++-- macros/tables/hub.sql | 12 ++-- macros/tables/link.sql | 12 ++-- macros/tables/ma_sat_v0.sql | 14 ++-- macros/tables/ma_sat_v1.sql | 16 ++--- macros/tables/nh_link.sql | 16 ++--- macros/tables/nh_sat.sql | 12 ++-- macros/tables/pit.sql | 24 ++++--- macros/tables/rec_track_sat.sql | 12 ++-- macros/tables/ref_hub.sql | 39 ++---------- macros/tables/ref_sat_v0.sql | 44 +++---------- macros/tables/ref_sat_v1.sql | 38 ++--------- macros/tables/ref_table.sql | 102 ++---------------------------- macros/tables/sat_v0.sql | 40 +++--------- macros/tables/sat_v1.sql | 16 ++--- 18 files changed, 130 insertions(+), 338 deletions(-) diff --git a/macros/staging/stage.sql b/macros/staging/stage.sql index 39761149..d80cef64 100644 --- a/macros/staging/stage.sql +++ b/macros/staging/stage.sql @@ -62,8 +62,8 @@ 'datatype': 'INT64'}, the number of days between two columns available inside the source data. 'country_isocode': {'value': '!GER', The column 'country_isocode' inserts the static string 'EUR' for all rows. 'datatype': 'STRING'}, The column 'account_name' duplicates an already existing column and gives - 'account_name': {'value': 'name', it another name. More derived columns can be added as other keys of - 'datatype': 'String'}} + 'account_name': {'value': 'name', it another name. More derived columns can be added as additional keys of + 'datatype': 'String'}} the dictionary. 
" %} {% set sequence_description = " @@ -89,12 +89,12 @@ 'bk': 'contractnumber', name (specified in 'bk') from the source table 'contract' in the source 'source_data' 'this_column_name': 'ContractId', by joining on 'this.ContractId = contract.Id'. In this case the prejoined 'ref_column_name': 'Id'}, column alias equals the name of the original business key column, which should be - 'master_account_key' {'ref_model': 'account_prep', or a self-prejoin happens, and then you would have to rename the final columns to not + 'master_account_key': {'ref_model': 'account_prep', or a self-prejoin happens, and then you would have to rename the final columns to not 'bk': 'account_key', have duplicate column names. The column 'master_account_key' holds values of the column 'this_column_name': 'master_account_id', 'account_key' inside the pre-populated dbt model 'account_prep'. If this prejoin is done inside account, 'ref_column_name': 'Id'}} we would now have a self-prejoin ON 'account.master_account_id = account.Id'. Because the table 'account' already has a column 'account_key', we rename the prejoined column - to 'master_account_key'. More prejoined columns can be added as other keys of the dictionary. + to 'master_account_key'. More prejoined columns can be added as additional keys of the dictionary. " %} {% set missing_columns_description = " @@ -122,20 +122,20 @@ " %} {% set enable_ghost_records_description = " - enable_ghost_records::boolean If set to true, the stage will be created with ghost records. By default, ghost records are enabled. Optional Parameter + enable_ghost_records::boolean If set to true, the stage will be created with ghost records. By default, ghost records are enabled. Optional Parameter. 
" %} - {%- set ldts = datavault4dbt.yaml_metadata_parser(name='ldts', yaml_metadata=yaml_metadata, parameter=ldts, required=True, documentation=ldts_description) -%} - {%- set rsrc = datavault4dbt.yaml_metadata_parser(name='rsrc', yaml_metadata=yaml_metadata, parameter=rsrc, required=True, documentation=rsrc_description) -%} - {%- set source_model = datavault4dbt.yaml_metadata_parser(name='source_model', yaml_metadata=yaml_metadata, parameter=source_model, required=True, documentation=source_model_description) -%} - {%- set include_source_columns = datavault4dbt.yaml_metadata_parser(name='include_source_columns', yaml_metadata=yaml_metadata, parameter=include_source_columns, required=False, documentation=include_source_columns_description) -%} - {%- set hashed_columns = datavault4dbt.yaml_metadata_parser(name='hashed_columns', yaml_metadata=yaml_metadata, parameter=hashed_columns, required=False, documentation=hashed_columns_description) -%} - {%- set derived_columns = datavault4dbt.yaml_metadata_parser(name='derived_columns', yaml_metadata=yaml_metadata, parameter=derived_columns, required=False, documentation=derived_columns_description) -%} - {%- set sequence = datavault4dbt.yaml_metadata_parser(name='sequence', yaml_metadata=yaml_metadata, parameter=sequence, required=False, documentation=sequence_description) -%} - {%- set prejoined_columns = datavault4dbt.yaml_metadata_parser(name='prejoined_columns', yaml_metadata=yaml_metadata, parameter=prejoined_columns, required=False, documentation=prejoined_columns_description) -%} - {%- set missing_columns = datavault4dbt.yaml_metadata_parser(name='missing_columns', yaml_metadata=yaml_metadata, parameter=missing_columns, required=False, documentation=missing_columns_description) -%} - {%- set multi_active_config = datavault4dbt.yaml_metadata_parser(name='multi_active_config', yaml_metadata=yaml_metadata, parameter=multi_active_config, required=False, documentation=multi_active_config_description) -%} - {%- set 
enable_ghost_records = datavault4dbt.yaml_metadata_parser(name='enable_ghost_records', yaml_metadata=yaml_metadata, parameter=enable_ghost_records, required=False, documentation=enable_ghost_records_description) -%} + {%- set ldts = datavault4dbt.yaml_metadata_parser(name='ldts', yaml_metadata=yaml_metadata, parameter=ldts, required=True, documentation=ldts_description) -%} + {%- set rsrc = datavault4dbt.yaml_metadata_parser(name='rsrc', yaml_metadata=yaml_metadata, parameter=rsrc, required=True, documentation=rsrc_description) -%} + {%- set source_model = datavault4dbt.yaml_metadata_parser(name='source_model', yaml_metadata=yaml_metadata, parameter=source_model, required=True, documentation=source_model_description) -%} + {%- set include_source_columns = datavault4dbt.yaml_metadata_parser(name='include_source_columns', yaml_metadata=yaml_metadata, parameter=include_source_columns, required=False, documentation=include_source_columns_description) -%} + {%- set hashed_columns = datavault4dbt.yaml_metadata_parser(name='hashed_columns', yaml_metadata=yaml_metadata, parameter=hashed_columns, required=False, documentation=hashed_columns_description) -%} + {%- set derived_columns = datavault4dbt.yaml_metadata_parser(name='derived_columns', yaml_metadata=yaml_metadata, parameter=derived_columns, required=False, documentation=derived_columns_description) -%} + {%- set sequence = datavault4dbt.yaml_metadata_parser(name='sequence', yaml_metadata=yaml_metadata, parameter=sequence, required=False, documentation=sequence_description) -%} + {%- set prejoined_columns = datavault4dbt.yaml_metadata_parser(name='prejoined_columns', yaml_metadata=yaml_metadata, parameter=prejoined_columns, required=False, documentation=prejoined_columns_description) -%} + {%- set missing_columns = datavault4dbt.yaml_metadata_parser(name='missing_columns', yaml_metadata=yaml_metadata, parameter=missing_columns, required=False, documentation=missing_columns_description) -%} + {%- set 
multi_active_config = datavault4dbt.yaml_metadata_parser(name='multi_active_config', yaml_metadata=yaml_metadata, parameter=multi_active_config, required=False, documentation=multi_active_config_description) -%} + {%- set enable_ghost_records = datavault4dbt.yaml_metadata_parser(name='enable_ghost_records', yaml_metadata=yaml_metadata, parameter=enable_ghost_records, required=False, documentation=enable_ghost_records_description) -%} {# If include_source_columns is passed but its empty then it is set with the default value (true) #} {%- if include_source_columns is none or include_source_columns == "" -%} diff --git a/macros/tables/control_snap_v0.sql b/macros/tables/control_snap_v0.sql index fc800c4b..423899de 100644 --- a/macros/tables/control_snap_v0.sql +++ b/macros/tables/control_snap_v0.sql @@ -73,10 +73,10 @@ " %} - {%- set start_date = datavault4dbt.yaml_metadata_parser(name='start_date', yaml_metadata=yaml_metadata, parameter=start_date, required=True, documentation=start_date_description) -%} - {%- set daily_snapshot_time = datavault4dbt.yaml_metadata_parser(name='daily_snapshot_time', yaml_metadata=yaml_metadata, parameter=daily_snapshot_time, required=True, documentation=daily_snapshot_time_description) -%} - {%- set sdts_alias = datavault4dbt.yaml_metadata_parser(name='sdts_alias', yaml_metadata=yaml_metadata, parameter=sdts_alias, required=False, documentation=sdts_alias_description) -%} - {%- set end_date = datavault4dbt.yaml_metadata_parser(name='end_date', yaml_metadata=yaml_metadata, parameter=end_date, required=False, documentation=end_date_description) -%} + {%- set start_date = datavault4dbt.yaml_metadata_parser(name='start_date', yaml_metadata=yaml_metadata, parameter=start_date, required=True, documentation=start_date_description) -%} + {%- set daily_snapshot_time = datavault4dbt.yaml_metadata_parser(name='daily_snapshot_time', yaml_metadata=yaml_metadata, parameter=daily_snapshot_time, required=True, 
documentation=daily_snapshot_time_description) -%} + {%- set sdts_alias = datavault4dbt.yaml_metadata_parser(name='sdts_alias', yaml_metadata=yaml_metadata, parameter=sdts_alias, required=False, documentation=sdts_alias_description) -%} + {%- set end_date = datavault4dbt.yaml_metadata_parser(name='end_date', yaml_metadata=yaml_metadata, parameter=end_date, required=False, documentation=end_date_description) -%} {%- set sdts_alias = datavault4dbt.replace_standard(sdts_alias, 'datavault4dbt.sdts_alias', 'sdts') -%} diff --git a/macros/tables/control_snap_v1.sql b/macros/tables/control_snap_v1.sql index 53ada37b..146b1c5d 100644 --- a/macros/tables/control_snap_v1.sql +++ b/macros/tables/control_snap_v1.sql @@ -26,17 +26,6 @@ is_last_rolling_year::boolean Captures if a sdts is inside the range that starts two years ago (from the current date) and ranges until one year ago (from the current date). - - Parameters: - - - - - - sdts_alias::string Defines the name of the snapshot date timestamp column inside the snapshot_table. 
- It is optional, if not set will use the global variable `datavault4dbt.sdts_alias` - set inside dbt_project.yml - #} {%- macro control_snap_v1(yaml_metadata=none, control_snap_v0=none, log_logic=none, sdts_alias=none) -%} @@ -84,9 +73,9 @@ set inside dbt_project.yml " %} - {% set control_snap_v0 = datavault4dbt.yaml_metadata_parser(name='control_snap_v0', yaml_metadata=yaml_metadata, parameter=control_snap_v0, required=True, documentation=control_snap_v0_description) %} - {% set log_logic = datavault4dbt.yaml_metadata_parser(name='log_logic', yaml_metadata=yaml_metadata, parameter=log_logic, required=False, documentation=log_logic_description) %} - {% set sdts_alias = datavault4dbt.yaml_metadata_parser(name='sdts_alias', yaml_metadata=yaml_metadata, parameter=sdts_alias, required=False, documentation=sdts_alias_description) %} + {% set control_snap_v0 = datavault4dbt.yaml_metadata_parser(name='control_snap_v0', yaml_metadata=yaml_metadata, parameter=control_snap_v0, required=True, documentation=control_snap_v0_description) %} + {% set log_logic = datavault4dbt.yaml_metadata_parser(name='log_logic', yaml_metadata=yaml_metadata, parameter=log_logic, required=False, documentation=log_logic_description) %} + {% set sdts_alias = datavault4dbt.yaml_metadata_parser(name='sdts_alias', yaml_metadata=yaml_metadata, parameter=sdts_alias, required=False, documentation=sdts_alias_description) %} {%- set sdts_alias = datavault4dbt.replace_standard(sdts_alias, 'datavault4dbt.sdts_alias', 'sdts') -%} diff --git a/macros/tables/eff_sat_v0.sql b/macros/tables/eff_sat_v0.sql index f5da8437..2905065b 100644 --- a/macros/tables/eff_sat_v0.sql +++ b/macros/tables/eff_sat_v0.sql @@ -1,12 +1,12 @@ {%- macro eff_sat_v0(yaml_metadata=none, source_model=none, tracked_hashkey=none, src_ldts=none, src_rsrc=none, is_active_alias=none, source_is_single_batch=true, disable_hwm=false) -%} - {% set source_model = datavault4dbt.yaml_metadata_parser(name='source_model', yaml_metadata=yaml_metadata, 
parameter=source_model, required=True, documentation='Name of the source model') %} - {% set tracked_hashkey = datavault4dbt.yaml_metadata_parser(name='tracked_hashkey', yaml_metadata=yaml_metadata, parameter=tracked_hashkey, required=True, documentation='Name of the hashkey column to be tracked') %} - {% set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation='Name of the loaddate column in the source model. Optional.') %} - {% set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation='Name of the record source column in the source model. Optional.') %} - {% set is_active_alias = datavault4dbt.yaml_metadata_parser(name='is_active_alias', yaml_metadata=yaml_metadata, parameter=is_active_alias, required=False, documentation='Name of the new active flag column. Optional.') %} - {% set source_is_single_batch = datavault4dbt.yaml_metadata_parser(name='source_is_single_batch', yaml_metadata=yaml_metadata, parameter=source_is_single_batch, required=False, documentation='Whether the source contains only one batch. Optional, default True.') %} - {% set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be disabled or not. 
Optional.') %} + {% set source_model = datavault4dbt.yaml_metadata_parser(name='source_model', yaml_metadata=yaml_metadata, parameter=source_model, required=True, documentation='Name of the source model') %} + {% set tracked_hashkey = datavault4dbt.yaml_metadata_parser(name='tracked_hashkey', yaml_metadata=yaml_metadata, parameter=tracked_hashkey, required=True, documentation='Name of the hashkey column to be tracked') %} + {% set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation='Name of the loaddate column in the source model. Optional.') %} + {% set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation='Name of the record source column in the source model. Optional.') %} + {% set is_active_alias = datavault4dbt.yaml_metadata_parser(name='is_active_alias', yaml_metadata=yaml_metadata, parameter=is_active_alias, required=False, documentation='Name of the new active flag column. Optional.') %} + {% set source_is_single_batch = datavault4dbt.yaml_metadata_parser(name='source_is_single_batch', yaml_metadata=yaml_metadata, parameter=source_is_single_batch, required=False, documentation='Whether the source contains only one batch. Optional, default True.') %} + {% set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be disabled or not. Optional.') %} {# Applying the default aliases as stored inside the global variables, if src_ldts, src_rsrc, and ledts_alias are not set. 
#} {%- set src_ldts = datavault4dbt.replace_standard(src_ldts, 'datavault4dbt.ldts_alias', 'ldts') -%} diff --git a/macros/tables/hub.sql b/macros/tables/hub.sql index 02dd66ce..f6221320 100644 --- a/macros/tables/hub.sql +++ b/macros/tables/hub.sql @@ -85,12 +85,12 @@ Needs to use the same column name as defined as alias inside the staging model. " %} - {%- set hashkey = datavault4dbt.yaml_metadata_parser(name='hashkey', yaml_metadata=yaml_metadata, parameter=hashkey, required=True, documentation=hashkey_description) -%} - {%- set business_keys = datavault4dbt.yaml_metadata_parser(name='business_keys', yaml_metadata=yaml_metadata, parameter=business_keys, required=True, documentation=business_keys_description) -%} - {%- set source_models = datavault4dbt.yaml_metadata_parser(name='source_models', yaml_metadata=yaml_metadata, parameter=source_models, required=True, documentation=source_models_description) -%} - {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} - {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} - {%- set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be turned off. 
Optional, default False.') -%} + {%- set hashkey = datavault4dbt.yaml_metadata_parser(name='hashkey', yaml_metadata=yaml_metadata, parameter=hashkey, required=True, documentation=hashkey_description) -%} + {%- set business_keys = datavault4dbt.yaml_metadata_parser(name='business_keys', yaml_metadata=yaml_metadata, parameter=business_keys, required=True, documentation=business_keys_description) -%} + {%- set source_models = datavault4dbt.yaml_metadata_parser(name='source_models', yaml_metadata=yaml_metadata, parameter=source_models, required=True, documentation=source_models_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be turned off. Optional, default False.') -%} {# Applying the default aliases as stored inside the global variables, if src_ldts and src_rsrc are not set. #} diff --git a/macros/tables/link.sql b/macros/tables/link.sql index 9df0a4a2..38c0ba13 100644 --- a/macros/tables/link.sql +++ b/macros/tables/link.sql @@ -77,12 +77,12 @@ Needs to use the same column name as defined as alias inside the staging model. 
" %} - {%- set link_hashkey = datavault4dbt.yaml_metadata_parser(name='link_hashkey', yaml_metadata=yaml_metadata, parameter=link_hashkey, required=True, documentation=link_hashkey_description) -%} - {%- set foreign_hashkeys = datavault4dbt.yaml_metadata_parser(name='foreign_hashkeys', yaml_metadata=yaml_metadata, parameter=foreign_hashkeys, required=True, documentation=foreign_hashkeys_description) -%} - {%- set source_models = datavault4dbt.yaml_metadata_parser(name='source_models', yaml_metadata=yaml_metadata, parameter=source_models, required=True, documentation=source_models_description) -%} - {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} - {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} - {%- set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be turned off. 
Optional, default False.') -%} + {%- set link_hashkey = datavault4dbt.yaml_metadata_parser(name='link_hashkey', yaml_metadata=yaml_metadata, parameter=link_hashkey, required=True, documentation=link_hashkey_description) -%} + {%- set foreign_hashkeys = datavault4dbt.yaml_metadata_parser(name='foreign_hashkeys', yaml_metadata=yaml_metadata, parameter=foreign_hashkeys, required=True, documentation=foreign_hashkeys_description) -%} + {%- set source_models = datavault4dbt.yaml_metadata_parser(name='source_models', yaml_metadata=yaml_metadata, parameter=source_models, required=True, documentation=source_models_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be turned off. Optional, default False.') -%} {# Applying the default aliases as stored inside the global variables, if src_ldts and src_rsrc are not set. #} diff --git a/macros/tables/ma_sat_v0.sql b/macros/tables/ma_sat_v0.sql index 84330694..fa535870 100644 --- a/macros/tables/ma_sat_v0.sql +++ b/macros/tables/ma_sat_v0.sql @@ -75,13 +75,13 @@ Needs to use the same column name as defined as alias inside the staging model. 
" %} - {%- set parent_hashkey = datavault4dbt.yaml_metadata_parser(name='parent_hashkey', yaml_metadata=yaml_metadata, parameter=parent_hashkey, required=True, documentation=parent_hashkey_description) -%} - {%- set src_hashdiff = datavault4dbt.yaml_metadata_parser(name='src_hashdiff', yaml_metadata=yaml_metadata, parameter=src_hashdiff, required=True, documentation=src_hashdiff_description) -%} - {%- set src_ma_key = datavault4dbt.yaml_metadata_parser(name='src_ma_key', yaml_metadata=yaml_metadata, parameter=src_ma_key, required=True, documentation=src_ma_key_description) -%} - {%- set src_payload = datavault4dbt.yaml_metadata_parser(name='src_payload', yaml_metadata=yaml_metadata, parameter=src_payload, required=True, documentation=src_payload_description) -%} - {%- set source_model = datavault4dbt.yaml_metadata_parser(name='source_model', yaml_metadata=yaml_metadata, parameter=source_model, required=True, documentation=source_model_description) -%} - {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} - {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set parent_hashkey = datavault4dbt.yaml_metadata_parser(name='parent_hashkey', yaml_metadata=yaml_metadata, parameter=parent_hashkey, required=True, documentation=parent_hashkey_description) -%} + {%- set src_hashdiff = datavault4dbt.yaml_metadata_parser(name='src_hashdiff', yaml_metadata=yaml_metadata, parameter=src_hashdiff, required=True, documentation=src_hashdiff_description) -%} + {%- set src_ma_key = datavault4dbt.yaml_metadata_parser(name='src_ma_key', yaml_metadata=yaml_metadata, parameter=src_ma_key, required=True, documentation=src_ma_key_description) -%} + {%- set src_payload = datavault4dbt.yaml_metadata_parser(name='src_payload', 
yaml_metadata=yaml_metadata, parameter=src_payload, required=True, documentation=src_payload_description) -%} + {%- set source_model = datavault4dbt.yaml_metadata_parser(name='source_model', yaml_metadata=yaml_metadata, parameter=source_model, required=True, documentation=source_model_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} {# Applying the default aliases as stored inside the global variables, if src_ldts and src_rsrc are not set. #} diff --git a/macros/tables/ma_sat_v1.sql b/macros/tables/ma_sat_v1.sql index 0c663bd7..53ba79ef 100644 --- a/macros/tables/ma_sat_v1.sql +++ b/macros/tables/ma_sat_v1.sql @@ -78,14 +78,14 @@ will be set to false. " %} - {%- set sat_v0 = datavault4dbt.yaml_metadata_parser(name='sat_v0', yaml_metadata=yaml_metadata, parameter=sat_v0, required=True, documentation=sat_v0_description) -%} - {%- set hashkey = datavault4dbt.yaml_metadata_parser(name='hashkey', yaml_metadata=yaml_metadata, parameter=hashkey, required=True, documentation=hashkey_description) -%} - {%- set hashdiff = datavault4dbt.yaml_metadata_parser(name='hashdiff', yaml_metadata=yaml_metadata, parameter=hashdiff, required=True, documentation=hashdiff_description) -%} - {%- set ma_attribute = datavault4dbt.yaml_metadata_parser(name='ma_attribute', yaml_metadata=yaml_metadata, parameter=ma_attribute, required=True, documentation=ma_attribute_description) -%} - {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} - {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, 
parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} - {%- set ledts_alias = datavault4dbt.yaml_metadata_parser(name='ledts_alias', yaml_metadata=yaml_metadata, parameter=ledts_alias, required=False, documentation=ledts_alias_description) -%} - {%- set add_is_current_flag = datavault4dbt.yaml_metadata_parser(name='add_is_current_flag', yaml_metadata=yaml_metadata, parameter=add_is_current_flag, required=False, documentation=add_is_current_flag_description) -%} + {%- set sat_v0 = datavault4dbt.yaml_metadata_parser(name='sat_v0', yaml_metadata=yaml_metadata, parameter=sat_v0, required=True, documentation=sat_v0_description) -%} + {%- set hashkey = datavault4dbt.yaml_metadata_parser(name='hashkey', yaml_metadata=yaml_metadata, parameter=hashkey, required=True, documentation=hashkey_description) -%} + {%- set hashdiff = datavault4dbt.yaml_metadata_parser(name='hashdiff', yaml_metadata=yaml_metadata, parameter=hashdiff, required=True, documentation=hashdiff_description) -%} + {%- set ma_attribute = datavault4dbt.yaml_metadata_parser(name='ma_attribute', yaml_metadata=yaml_metadata, parameter=ma_attribute, required=True, documentation=ma_attribute_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set ledts_alias = datavault4dbt.yaml_metadata_parser(name='ledts_alias', yaml_metadata=yaml_metadata, parameter=ledts_alias, required=False, documentation=ledts_alias_description) -%} + {%- set add_is_current_flag = datavault4dbt.yaml_metadata_parser(name='add_is_current_flag', yaml_metadata=yaml_metadata, parameter=add_is_current_flag, required=False, documentation=add_is_current_flag_description) -%} {# Applying the default aliases 
as stored inside the global variables, if src_ldts, src_rsrc, and ledts_alias are not set. #} diff --git a/macros/tables/nh_link.sql b/macros/tables/nh_link.sql index 9eceecc7..dabbb0df 100644 --- a/macros/tables/nh_link.sql +++ b/macros/tables/nh_link.sql @@ -92,14 +92,14 @@ Needs to use the same column name as defined as alias inside the staging model. " %} - {%- set link_hashkey = datavault4dbt.yaml_metadata_parser(name='link_hashkey', yaml_metadata=yaml_metadata, parameter=link_hashkey, required=True, documentation=link_hashkey_description) -%} - {%- set payload = datavault4dbt.yaml_metadata_parser(name='payload', yaml_metadata=yaml_metadata, parameter=payload, required=True, documentation=payload_description) -%} - {%- set source_models = datavault4dbt.yaml_metadata_parser(name='source_models', yaml_metadata=yaml_metadata, parameter=source_models, required=True, documentation=source_models_description) -%} - {%- set foreign_hashkeys = datavault4dbt.yaml_metadata_parser(name='foreign_hashkeys', yaml_metadata=yaml_metadata, parameter=foreign_hashkeys, required=False, documentation=foreign_hashkeys_description) -%} - {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} - {%- set rsrc = datavault4dbt.yaml_metadata_parser(name='rsrc', yaml_metadata=yaml_metadata, parameter=rsrc, required=False, documentation=rsrc_description) -%} - {%- set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be turned off. Optional, default False.') -%} - {%- set source_is_single_batch = datavault4dbt.yaml_metadata_parser(name='source_is_single_batch', yaml_metadata=yaml_metadata, parameter=source_is_single_batch, required=False, documentation='Whether the source contains only one batch. 
Optional, default False.') -%} + {%- set link_hashkey = datavault4dbt.yaml_metadata_parser(name='link_hashkey', yaml_metadata=yaml_metadata, parameter=link_hashkey, required=True, documentation=link_hashkey_description) -%} + {%- set payload = datavault4dbt.yaml_metadata_parser(name='payload', yaml_metadata=yaml_metadata, parameter=payload, required=True, documentation=payload_description) -%} + {%- set source_models = datavault4dbt.yaml_metadata_parser(name='source_models', yaml_metadata=yaml_metadata, parameter=source_models, required=True, documentation=source_models_description) -%} + {%- set foreign_hashkeys = datavault4dbt.yaml_metadata_parser(name='foreign_hashkeys', yaml_metadata=yaml_metadata, parameter=foreign_hashkeys, required=False, documentation=foreign_hashkeys_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set rsrc = datavault4dbt.yaml_metadata_parser(name='rsrc', yaml_metadata=yaml_metadata, parameter=rsrc, required=False, documentation=rsrc_description) -%} + {%- set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be turned off. Optional, default False.') -%} + {%- set source_is_single_batch = datavault4dbt.yaml_metadata_parser(name='source_is_single_batch', yaml_metadata=yaml_metadata, parameter=source_is_single_batch, required=False, documentation='Whether the source contains only one batch. Optional, default False.') -%} {# Applying the default aliases as stored inside the global variables, if src_ldts and src_rsrc are not set. 
#} diff --git a/macros/tables/nh_sat.sql b/macros/tables/nh_sat.sql index b5295c71..3b1d1214 100644 --- a/macros/tables/nh_sat.sql +++ b/macros/tables/nh_sat.sql @@ -47,12 +47,12 @@ Needs to use the same column name as defined as alias inside the staging model. " %} - {%- set parent_hashkey = datavault4dbt.yaml_metadata_parser(name='parent_hashkey', yaml_metadata=yaml_metadata, parameter=parent_hashkey, required=True, documentation=parent_hashkey_description) -%} - {%- set src_payload = datavault4dbt.yaml_metadata_parser(name='src_payload', yaml_metadata=yaml_metadata, parameter=src_payload, required=True, documentation=src_payload_description) -%} - {%- set source_model = datavault4dbt.yaml_metadata_parser(name='source_model', yaml_metadata=yaml_metadata, parameter=source_model, required=True, documentation=source_model_description) -%} - {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} - {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} - {%- set source_is_single_batch = datavault4dbt.yaml_metadata_parser(name='source_is_single_batch', yaml_metadata=yaml_metadata, parameter=source_is_single_batch, required=False, documentation='Whether the source contains only one batch. 
Optional, default False.') -%} + {%- set parent_hashkey = datavault4dbt.yaml_metadata_parser(name='parent_hashkey', yaml_metadata=yaml_metadata, parameter=parent_hashkey, required=True, documentation=parent_hashkey_description) -%} + {%- set src_payload = datavault4dbt.yaml_metadata_parser(name='src_payload', yaml_metadata=yaml_metadata, parameter=src_payload, required=True, documentation=src_payload_description) -%} + {%- set source_model = datavault4dbt.yaml_metadata_parser(name='source_model', yaml_metadata=yaml_metadata, parameter=source_model, required=True, documentation=source_model_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set source_is_single_batch = datavault4dbt.yaml_metadata_parser(name='source_is_single_batch', yaml_metadata=yaml_metadata, parameter=source_is_single_batch, required=False, documentation='Whether the source contains only one batch. Optional, default False.') -%} {# Applying the default aliases as stored inside the global variables, if src_ldts and src_rsrc are not set. #} diff --git a/macros/tables/pit.sql b/macros/tables/pit.sql index c087455c..18c83652 100644 --- a/macros/tables/pit.sql +++ b/macros/tables/pit.sql @@ -11,8 +11,6 @@ - Allows to insert a static string as record source column, matching business vault definition of a record source #} - - {%- macro pit(yaml_metadata=none, tracked_entity=none, hashkey=none, sat_names=none, snapshot_relation=none, dimension_key=none, snapshot_trigger_column=none, ldts=none, custom_rsrc=none, ledts=none, sdts=none, pit_type=none) -%} {% set tracked_entity_description = " @@ -72,17 +70,17 @@ Is optional, if not set, no column will be added. 
" %} - {%- set tracked_entity = datavault4dbt.yaml_metadata_parser(name='tracked_entity', yaml_metadata=yaml_metadata, parameter=tracked_entity, required=True, documentation=tracked_entity_description) -%} - {%- set hashkey = datavault4dbt.yaml_metadata_parser(name='hashkey', yaml_metadata=yaml_metadata, parameter=hashkey, required=True, documentation=hashkey_description) -%} - {%- set sat_names = datavault4dbt.yaml_metadata_parser(name='sat_names', yaml_metadata=yaml_metadata, parameter=sat_names, required=True, documentation=sat_names_description) -%} - {%- set snapshot_relation = datavault4dbt.yaml_metadata_parser(name='snapshot_relation', yaml_metadata=yaml_metadata, parameter=snapshot_relation, required=True, documentation=snapshot_relation_description) -%} - {%- set dimension_key = datavault4dbt.yaml_metadata_parser(name='dimension_key', yaml_metadata=yaml_metadata, parameter=dimension_key, required=True, documentation=dimension_key_description) -%} - {%- set snapshot_trigger_column = datavault4dbt.yaml_metadata_parser(name='snapshot_trigger_column', yaml_metadata=yaml_metadata, parameter=snapshot_trigger_column, required=False, documentation=snapshot_trigger_column_description) -%} - {%- set ldts = datavault4dbt.yaml_metadata_parser(name='ldts', yaml_metadata=yaml_metadata, parameter=ldts, required=False, documentation=ldts_description) -%} - {%- set custom_rsrc = datavault4dbt.yaml_metadata_parser(name='custom_rsrc', yaml_metadata=yaml_metadata, parameter=custom_rsrc, required=False, documentation=custom_rsrc_description) -%} - {%- set ledts = datavault4dbt.yaml_metadata_parser(name='ledts', yaml_metadata=yaml_metadata, parameter=ledts, required=False, documentation=ledts_description) -%} - {%- set sdts = datavault4dbt.yaml_metadata_parser(name='sdts', yaml_metadata=yaml_metadata, parameter=sdts, required=False, documentation=sdts_description) -%} - {%- set pit_type = datavault4dbt.yaml_metadata_parser(name='pit_type', yaml_metadata=yaml_metadata, 
parameter=pit_type, required=False, documentation=pit_type_description) -%} + {%- set tracked_entity = datavault4dbt.yaml_metadata_parser(name='tracked_entity', yaml_metadata=yaml_metadata, parameter=tracked_entity, required=True, documentation=tracked_entity_description) -%} + {%- set hashkey = datavault4dbt.yaml_metadata_parser(name='hashkey', yaml_metadata=yaml_metadata, parameter=hashkey, required=True, documentation=hashkey_description) -%} + {%- set sat_names = datavault4dbt.yaml_metadata_parser(name='sat_names', yaml_metadata=yaml_metadata, parameter=sat_names, required=True, documentation=sat_names_description) -%} + {%- set snapshot_relation = datavault4dbt.yaml_metadata_parser(name='snapshot_relation', yaml_metadata=yaml_metadata, parameter=snapshot_relation, required=True, documentation=snapshot_relation_description) -%} + {%- set dimension_key = datavault4dbt.yaml_metadata_parser(name='dimension_key', yaml_metadata=yaml_metadata, parameter=dimension_key, required=True, documentation=dimension_key_description) -%} + {%- set snapshot_trigger_column = datavault4dbt.yaml_metadata_parser(name='snapshot_trigger_column', yaml_metadata=yaml_metadata, parameter=snapshot_trigger_column, required=False, documentation=snapshot_trigger_column_description) -%} + {%- set ldts = datavault4dbt.yaml_metadata_parser(name='ldts', yaml_metadata=yaml_metadata, parameter=ldts, required=False, documentation=ldts_description) -%} + {%- set custom_rsrc = datavault4dbt.yaml_metadata_parser(name='custom_rsrc', yaml_metadata=yaml_metadata, parameter=custom_rsrc, required=False, documentation=custom_rsrc_description) -%} + {%- set ledts = datavault4dbt.yaml_metadata_parser(name='ledts', yaml_metadata=yaml_metadata, parameter=ledts, required=False, documentation=ledts_description) -%} + {%- set sdts = datavault4dbt.yaml_metadata_parser(name='sdts', yaml_metadata=yaml_metadata, parameter=sdts, required=False, documentation=sdts_description) -%} + {%- set pit_type = 
datavault4dbt.yaml_metadata_parser(name='pit_type', yaml_metadata=yaml_metadata, parameter=pit_type, required=False, documentation=pit_type_description) -%} {# Applying the default aliases as stored inside the global variables, if ldts, sdts and ledts are not set. #} diff --git a/macros/tables/rec_track_sat.sql b/macros/tables/rec_track_sat.sql index 161f054c..b02bb75d 100644 --- a/macros/tables/rec_track_sat.sql +++ b/macros/tables/rec_track_sat.sql @@ -73,12 +73,12 @@ src_stg::string Name of the source stage model. Is optional, will use the global variable 'datavault4dbt.stg_alias'. " %} - {%- set tracked_hashkey = datavault4dbt.yaml_metadata_parser(name='tracked_hashkey', yaml_metadata=yaml_metadata, parameter=tracked_hashkey, required=True, documentation=tracked_hashkey_description) -%} - {%- set source_models = datavault4dbt.yaml_metadata_parser(name='source_models', yaml_metadata=yaml_metadata, parameter=source_models, required=True, documentation=source_models_description) -%} - {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} - {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} - {%- set src_stg = datavault4dbt.yaml_metadata_parser(name='src_stg', yaml_metadata=yaml_metadata, parameter=src_stg, required=False, documentation=src_stg_description) -%} - {%- set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be turned off. 
Optional, default False.') -%} + {%- set tracked_hashkey = datavault4dbt.yaml_metadata_parser(name='tracked_hashkey', yaml_metadata=yaml_metadata, parameter=tracked_hashkey, required=True, documentation=tracked_hashkey_description) -%} + {%- set source_models = datavault4dbt.yaml_metadata_parser(name='source_models', yaml_metadata=yaml_metadata, parameter=source_models, required=True, documentation=source_models_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set src_stg = datavault4dbt.yaml_metadata_parser(name='src_stg', yaml_metadata=yaml_metadata, parameter=src_stg, required=False, documentation=src_stg_description) -%} + {%- set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be turned off. Optional, default False.') -%} {# Applying the default aliases as stored inside the global variables, if src_ldts and src_rsrc are not set. 
#} diff --git a/macros/tables/ref_hub.sql b/macros/tables/ref_hub.sql index e445f88c..84d13687 100644 --- a/macros/tables/ref_hub.sql +++ b/macros/tables/ref_hub.sql @@ -1,34 +1,3 @@ -{# -Example model: - -{{ config(materialized='incremental', - schema='Core') }} - -{%- set yaml_metadata -%} -source_models: stg_nation -ref_keys: N_NATIONKEY -{%- endset -%} - -{% set metadata_dict = fromyaml(yaml_metadata) %} - -{{ datavault4dbt.ref_hub(source_models=metadata_dict['source_models'], - ref_keys=metadata_dict['ref_keys']) }} - -#} - - - - - - - - - - - - - - {%- macro ref_hub(yaml_metadata=none, ref_keys=none, source_models=none, src_ldts=none, src_rsrc=none) -%} {% set ref_keys_description = " @@ -49,10 +18,10 @@ ref_keys: N_NATIONKEY Needs to use the same column name as defined as alias inside the staging model. " %} - {%- set ref_keys = datavault4dbt.yaml_metadata_parser(name='ref_keys', yaml_metadata=yaml_metadata, parameter=ref_keys, required=True, documentation=ref_keys_description) -%} - {%- set source_models = datavault4dbt.yaml_metadata_parser(name='source_models', yaml_metadata=yaml_metadata, parameter=source_models, required=True, documentation=source_models_description) -%} - {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} - {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set ref_keys = datavault4dbt.yaml_metadata_parser(name='ref_keys', yaml_metadata=yaml_metadata, parameter=ref_keys, required=True, documentation=ref_keys_description) -%} + {%- set source_models = datavault4dbt.yaml_metadata_parser(name='source_models', yaml_metadata=yaml_metadata, parameter=source_models, required=True, documentation=source_models_description) -%} + {%- set src_ldts = 
datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} {# Applying the default aliases as stored inside the global variables, if src_ldts and src_rsrc are not set. #} diff --git a/macros/tables/ref_sat_v0.sql b/macros/tables/ref_sat_v0.sql index 1c4ff2ec..7c8e43f7 100644 --- a/macros/tables/ref_sat_v0.sql +++ b/macros/tables/ref_sat_v0.sql @@ -1,31 +1,3 @@ -{# -Example model: - -{{ config(materialized='incremental', - schema='Core') }} - -{%- set yaml_metadata -%} -source_model: stg_nation -parent_ref_keys: N_NATIONKEY -src_hashdiff: hd_nation_rs -src_payload: - - N_COMMENT - - N_NAME - - N_REGIONKEY -{%- endset -%} - -{% set metadata_dict = fromyaml(yaml_metadata) %} - -{{ datavault4dbt.ref_sat_v0(source_model=metadata_dict['source_model'], - parent_ref_keys=metadata_dict['parent_ref_keys'], - src_hashdiff=metadata_dict['src_hashdiff'], - src_payload=metadata_dict['src_payload']) }} - -#} - - - - {%- macro ref_sat_v0(yaml_metadata=none, parent_ref_keys=none, src_hashdiff=none, src_payload=none, source_model=none, src_ldts=none, src_rsrc=none, disable_hwm=false, source_is_single_batch=false) -%} {% set parent_ref_keys_description = " @@ -70,14 +42,14 @@ src_payload: " %} - {%- set parent_ref_keys = datavault4dbt.yaml_metadata_parser(name='parent_ref_keys', yaml_metadata=yaml_metadata, parameter=parent_ref_keys, required=True, documentation=parent_ref_keys_description) -%} - {%- set src_hashdiff = datavault4dbt.yaml_metadata_parser(name='src_hashdiff', yaml_metadata=yaml_metadata, parameter=src_hashdiff, required=True, documentation=src_hashdiff_description) -%} - {%- set src_payload = datavault4dbt.yaml_metadata_parser(name='src_payload', yaml_metadata=yaml_metadata, parameter=src_payload, 
required=True, documentation=src_payload_description) -%} - {%- set source_model = datavault4dbt.yaml_metadata_parser(name='source_model', yaml_metadata=yaml_metadata, parameter=source_model, required=True, documentation=source_model_description) -%} - {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} - {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} - {%- set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be turned off. Optional, default False.') -%} - {%- set source_is_single_batch = datavault4dbt.yaml_metadata_parser(name='source_is_single_batch', yaml_metadata=yaml_metadata, parameter=source_is_single_batch, required=False, documentation='Whether the source contains only one batch. 
Optional, default False.') -%} + {%- set parent_ref_keys = datavault4dbt.yaml_metadata_parser(name='parent_ref_keys', yaml_metadata=yaml_metadata, parameter=parent_ref_keys, required=True, documentation=parent_ref_keys_description) -%} + {%- set src_hashdiff = datavault4dbt.yaml_metadata_parser(name='src_hashdiff', yaml_metadata=yaml_metadata, parameter=src_hashdiff, required=True, documentation=src_hashdiff_description) -%} + {%- set src_payload = datavault4dbt.yaml_metadata_parser(name='src_payload', yaml_metadata=yaml_metadata, parameter=src_payload, required=True, documentation=src_payload_description) -%} + {%- set source_model = datavault4dbt.yaml_metadata_parser(name='source_model', yaml_metadata=yaml_metadata, parameter=source_model, required=True, documentation=source_model_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be turned off. Optional, default False.') -%} + {%- set source_is_single_batch = datavault4dbt.yaml_metadata_parser(name='source_is_single_batch', yaml_metadata=yaml_metadata, parameter=source_is_single_batch, required=False, documentation='Whether the source contains only one batch. Optional, default False.') -%} {# Applying the default aliases as stored inside the global variables, if src_ldts, src_rsrc, and ledts_alias are not set. 
#} {%- set src_ldts = datavault4dbt.replace_standard(src_ldts, 'datavault4dbt.ldts_alias', 'ldts') -%} diff --git a/macros/tables/ref_sat_v1.sql b/macros/tables/ref_sat_v1.sql index 52ccfc3e..abe56d81 100644 --- a/macros/tables/ref_sat_v1.sql +++ b/macros/tables/ref_sat_v1.sql @@ -1,27 +1,3 @@ -{# -Example model: - -{{ config(materialized='view', - schema='Core') }} - -{%- set yaml_metadata -%} -ref_sat_v0: nation_rs -ref_keys: N_NATIONKEY -hashdiff: hd_nation_rs -add_is_current_flag: true -{%- endset -%} - -{% set metadata_dict = fromyaml(yaml_metadata) %} - -{{ datavault4dbt.ref_sat_v1(ref_sat_v0=metadata_dict['ref_sat_v0'], - ref_keys=metadata_dict['ref_keys'], - hashdiff=metadata_dict['hashdiff'], - add_is_current_flag=metadata_dict['add_is_current_flag']) }} - -#} - - - {%- macro ref_sat_v1(yaml_metadata=none, ref_sat_v0=none, ref_keys=none, hashdiff=none, src_ldts=none, src_rsrc=none, ledts_alias=none, add_is_current_flag=false) -%} {% set ref_sat_v0_description = " @@ -58,13 +34,13 @@ add_is_current_flag: true will be set to false. 
" %} - {%- set ref_sat_v0 = datavault4dbt.yaml_metadata_parser(name='ref_sat_v0', yaml_metadata=yaml_metadata, parameter=ref_sat_v0, required=True, documentation=ref_sat_v0_description) -%} - {%- set ref_keys = datavault4dbt.yaml_metadata_parser(name='ref_keys', yaml_metadata=yaml_metadata, parameter=ref_keys, required=True, documentation=ref_keys_description) -%} - {%- set hashdiff = datavault4dbt.yaml_metadata_parser(name='hashdiff', yaml_metadata=yaml_metadata, parameter=hashdiff, required=True, documentation=hashdiff_description) -%} - {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} - {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} - {%- set ledts_alias = datavault4dbt.yaml_metadata_parser(name='ledts_alias', yaml_metadata=yaml_metadata, parameter=ledts_alias, required=False, documentation=ledts_alias_description) -%} - {%- set add_is_current_flag = datavault4dbt.yaml_metadata_parser(name='add_is_current_flag', yaml_metadata=yaml_metadata, parameter=add_is_current_flag, required=False, documentation=add_is_current_flag_description) -%} + {%- set ref_sat_v0 = datavault4dbt.yaml_metadata_parser(name='ref_sat_v0', yaml_metadata=yaml_metadata, parameter=ref_sat_v0, required=True, documentation=ref_sat_v0_description) -%} + {%- set ref_keys = datavault4dbt.yaml_metadata_parser(name='ref_keys', yaml_metadata=yaml_metadata, parameter=ref_keys, required=True, documentation=ref_keys_description) -%} + {%- set hashdiff = datavault4dbt.yaml_metadata_parser(name='hashdiff', yaml_metadata=yaml_metadata, parameter=hashdiff, required=True, documentation=hashdiff_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, 
documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set ledts_alias = datavault4dbt.yaml_metadata_parser(name='ledts_alias', yaml_metadata=yaml_metadata, parameter=ledts_alias, required=False, documentation=ledts_alias_description) -%} + {%- set add_is_current_flag = datavault4dbt.yaml_metadata_parser(name='add_is_current_flag', yaml_metadata=yaml_metadata, parameter=add_is_current_flag, required=False, documentation=add_is_current_flag_description) -%} {# Applying the default aliases as stored inside the global variables, if src_ldts, src_rsrc, and ledts_alias are not set. #} diff --git a/macros/tables/ref_table.sql b/macros/tables/ref_table.sql index 807cc746..0cdfe7d4 100644 --- a/macros/tables/ref_table.sql +++ b/macros/tables/ref_table.sql @@ -1,91 +1,3 @@ -{# -Example models: - -Fully historized: - - {{ config(schema='core', materialized='view') }} - - {%- set yaml_metadata -%} - ref_hub: 'nation_rh' - ref_satellites: - - nation_rs1 - - nation_p_rs - historized: 'full' - {%- endset -%} - - {% set metadata_dict = fromyaml(yaml_metadata) %} - - {{ datavault4dbt.ref_table(ref_hub=metadata_dict['ref_hub'], - ref_satellites=metadata_dict['ref_satellites'], - historized=metadata_dict['historized'], - snapshot_relation=metadata_dict['snapshot_relation']) }} - -Only latest data: - - {{ config(schema='core', materialized='view') }} - - {%- set yaml_metadata -%} - ref_hub: 'nation_rh' - ref_satellites: - - nation_rs1 - - nation_p_rs - historized: 'latest' - {%- endset -%} - - {% set metadata_dict = fromyaml(yaml_metadata) %} - - {{ datavault4dbt.ref_table(ref_hub=metadata_dict['ref_hub'], - ref_satellites=metadata_dict['ref_satellites'], - historized=metadata_dict['historized'], - snapshot_relation=metadata_dict['snapshot_relation']) }} - -Snapshot Based: - - {{ config(schema='core', 
materialized='view') }} - - {%- set yaml_metadata -%} - ref_hub: 'nation_rh' - ref_satellites: - - nation_rs1 - - nation_p_rs - historized: 'snapshot' - snapshot_relation: snap_v1 - {%- endset -%} - - {% set metadata_dict = fromyaml(yaml_metadata) %} - - {{ datavault4dbt.ref_table(ref_hub=metadata_dict['ref_hub'], - ref_satellites=metadata_dict['ref_satellites'], - historized=metadata_dict['historized'], - snapshot_relation=metadata_dict['snapshot_relation']) }} - -Include / Exclude per Satellite: - - {{ config(schema='core', materialized='view') }} - - {%- set yaml_metadata -%} - ref_hub: 'nation_rh' - ref_satellites: - nation_rs1: - exclude: - - N_NAME - nation_p_rs: - include: - - N_NAME - historized: 'full' - {%- endset -%} - - {% set metadata_dict = fromyaml(yaml_metadata) %} - - {{ datavault4dbt.ref_table(ref_hub=metadata_dict['ref_hub'], - ref_satellites=metadata_dict['ref_satellites'], - historized=metadata_dict['historized'], - snapshot_relation=metadata_dict['snapshot_relation']) }} - - -#} - - {%- macro ref_table(yaml_metadata=none, ref_hub=none, ref_satellites=none, src_ldts=none, src_rsrc=none, historized='latest', snapshot_relation=none, snapshot_trigger_column=none) -%} {% set ref_hub_description = " @@ -118,13 +30,13 @@ Include / Exclude per Satellite: snapshot_trigger_column::string Only required, if 'historized' set to 'snapshot'. Defaults to global variable 'datavault4dbt.sdts_alias'. Only needs to be set if alias deviates from global variable. 
" %} - {%- set ref_hub = datavault4dbt.yaml_metadata_parser(name='ref_hub', yaml_metadata=yaml_metadata, parameter=ref_hub, required=True, documentation=ref_hub_description) -%} - {%- set ref_satellites = datavault4dbt.yaml_metadata_parser(name='ref_satellites', yaml_metadata=yaml_metadata, parameter=ref_satellites, required=True, documentation=ref_satellites_description) -%} - {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} - {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} - {%- set historized = datavault4dbt.yaml_metadata_parser(name='historized', yaml_metadata=yaml_metadata, parameter=historized, required=False, documentation=historized_description) -%} - {%- set snapshot_relation = datavault4dbt.yaml_metadata_parser(name='snapshot_relation', yaml_metadata=yaml_metadata, parameter=snapshot_relation, required=False, documentation=snapshot_relation_description) -%} - {%- set snapshot_trigger_column = datavault4dbt.yaml_metadata_parser(name='snapshot_trigger_column', yaml_metadata=yaml_metadata, parameter=snapshot_trigger_column, required=False, documentation=snapshot_trigger_column_description) -%} + {%- set ref_hub = datavault4dbt.yaml_metadata_parser(name='ref_hub', yaml_metadata=yaml_metadata, parameter=ref_hub, required=True, documentation=ref_hub_description) -%} + {%- set ref_satellites = datavault4dbt.yaml_metadata_parser(name='ref_satellites', yaml_metadata=yaml_metadata, parameter=ref_satellites, required=True, documentation=ref_satellites_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', 
yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set historized = datavault4dbt.yaml_metadata_parser(name='historized', yaml_metadata=yaml_metadata, parameter=historized, required=False, documentation=historized_description) -%} + {%- set snapshot_relation = datavault4dbt.yaml_metadata_parser(name='snapshot_relation', yaml_metadata=yaml_metadata, parameter=snapshot_relation, required=False, documentation=snapshot_relation_description) -%} + {%- set snapshot_trigger_column = datavault4dbt.yaml_metadata_parser(name='snapshot_trigger_column', yaml_metadata=yaml_metadata, parameter=snapshot_trigger_column, required=False, documentation=snapshot_trigger_column_description) -%} {# Applying the default aliases as stored inside the global variables, if src_ldts and src_rsrc are not set. #} diff --git a/macros/tables/sat_v0.sql b/macros/tables/sat_v0.sql index e9f8cf8e..de5e156b 100644 --- a/macros/tables/sat_v0.sql +++ b/macros/tables/sat_v0.sql @@ -7,30 +7,6 @@ Features: - Can handle multiple updates per batch, without losing intermediate changes. therefore initial loading is supported. - Using a dynamic high-water-mark to optimize loading performance of multiple loads - - Parameters: - - parent_hashkey::string Name of the hashkey column inside the stage of the object that this satellite is attached to. - - Examples: - 'hk_account_h' The satellite would be attached to the hub account, which has the - column 'hk_account_h' as a hashkey column. - - 'hk_account_contact_l' The satellite would be attached to the link between account and contact, - which has the column 'hk_account_contact_l' as a hashkey column. - - - - - - - - src_ldts::string Name of the ldts column inside the source model. Is optional, will use the global variable 'datavault4dbt.ldts_alias'. - Needs to use the same column name as defined as alias inside the staging model. - - src_rsrc::string Name of the rsrc column inside the source model. 
Is optional, will use the global variable 'datavault4dbt.rsrc_alias'. - Needs to use the same column name as defined as alias inside the staging model. - #} {%- macro sat_v0(yaml_metadata=none, parent_hashkey=none, src_hashdiff=none, src_payload=none, source_model=none, src_ldts=none, src_rsrc=none, disable_hwm=false, source_is_single_batch=false) -%} @@ -84,14 +60,14 @@ Needs to use the same column name as defined as alias inside the staging model. " %} - {%- set parent_hashkey = datavault4dbt.yaml_metadata_parser(name='parent_hashkey', yaml_metadata=yaml_metadata, parameter=parent_hashkey, required=True, documentation=parent_hashkey_description) -%} - {%- set src_hashdiff = datavault4dbt.yaml_metadata_parser(name='src_hashdiff', yaml_metadata=yaml_metadata, parameter=src_hashdiff, required=True, documentation=src_hashdiff_description) -%} - {%- set src_payload = datavault4dbt.yaml_metadata_parser(name='src_payload', yaml_metadata=yaml_metadata, parameter=src_payload, required=True, documentation=src_payload_description) -%} - {%- set source_model = datavault4dbt.yaml_metadata_parser(name='source_model', yaml_metadata=yaml_metadata, parameter=source_model, required=True, documentation=source_model_description) -%} - {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} - {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} - {%- set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be turned off. 
Optional, default False.') -%} - {%- set source_is_single_batch = datavault4dbt.yaml_metadata_parser(name='source_is_single_batch', yaml_metadata=yaml_metadata, parameter=source_is_single_batch, required=False, documentation='Whether the source contains only one batch. Optional, default False.') -%} + {%- set parent_hashkey = datavault4dbt.yaml_metadata_parser(name='parent_hashkey', yaml_metadata=yaml_metadata, parameter=parent_hashkey, required=True, documentation=parent_hashkey_description) -%} + {%- set src_hashdiff = datavault4dbt.yaml_metadata_parser(name='src_hashdiff', yaml_metadata=yaml_metadata, parameter=src_hashdiff, required=True, documentation=src_hashdiff_description) -%} + {%- set src_payload = datavault4dbt.yaml_metadata_parser(name='src_payload', yaml_metadata=yaml_metadata, parameter=src_payload, required=True, documentation=src_payload_description) -%} + {%- set source_model = datavault4dbt.yaml_metadata_parser(name='source_model', yaml_metadata=yaml_metadata, parameter=source_model, required=True, documentation=source_model_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be turned off. Optional, default False.') -%} + {%- set source_is_single_batch = datavault4dbt.yaml_metadata_parser(name='source_is_single_batch', yaml_metadata=yaml_metadata, parameter=source_is_single_batch, required=False, documentation='Whether the source contains only one batch. 
Optional, default False.') -%} {# Applying the default aliases as stored inside the global variables, if src_ldts, src_rsrc, and ledts_alias are not set. #} {%- set src_ldts = datavault4dbt.replace_standard(src_ldts, 'datavault4dbt.ldts_alias', 'ldts') -%} diff --git a/macros/tables/sat_v1.sql b/macros/tables/sat_v1.sql index 13ad1425..a573974d 100644 --- a/macros/tables/sat_v1.sql +++ b/macros/tables/sat_v1.sql @@ -62,14 +62,14 @@ include_payload::boolean Optional parameter to specify if the v1 sat should have the payload columns from sat v0 or not. Default is true. " %} - {%- set sat_v0 = datavault4dbt.yaml_metadata_parser(name='sat_v0', yaml_metadata=yaml_metadata, parameter=sat_v0, required=True, documentation=sat_v0_description) -%} - {%- set hashkey = datavault4dbt.yaml_metadata_parser(name='hashkey', yaml_metadata=yaml_metadata, parameter=hashkey, required=True, documentation=hashkey_description) -%} - {%- set hashdiff = datavault4dbt.yaml_metadata_parser(name='hashdiff', yaml_metadata=yaml_metadata, parameter=hashdiff, required=True, documentation=hashdiff_description) -%} - {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} - {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} - {%- set ledts_alias = datavault4dbt.yaml_metadata_parser(name='ledts_alias', yaml_metadata=yaml_metadata, parameter=ledts_alias, required=False, documentation=ledts_alias_description) -%} - {%- set add_is_current_flag = datavault4dbt.yaml_metadata_parser(name='add_is_current_flag', yaml_metadata=yaml_metadata, parameter=add_is_current_flag, required=False, documentation=add_is_current_flag_description) -%} - {%- set include_payload = datavault4dbt.yaml_metadata_parser(name='include_payload', yaml_metadata=yaml_metadata, 
parameter=include_payload, required=False, documentation=include_payload_description) -%} + {%- set sat_v0 = datavault4dbt.yaml_metadata_parser(name='sat_v0', yaml_metadata=yaml_metadata, parameter=sat_v0, required=True, documentation=sat_v0_description) -%} + {%- set hashkey = datavault4dbt.yaml_metadata_parser(name='hashkey', yaml_metadata=yaml_metadata, parameter=hashkey, required=True, documentation=hashkey_description) -%} + {%- set hashdiff = datavault4dbt.yaml_metadata_parser(name='hashdiff', yaml_metadata=yaml_metadata, parameter=hashdiff, required=True, documentation=hashdiff_description) -%} + {%- set src_ldts = datavault4dbt.yaml_metadata_parser(name='src_ldts', yaml_metadata=yaml_metadata, parameter=src_ldts, required=False, documentation=src_ldts_description) -%} + {%- set src_rsrc = datavault4dbt.yaml_metadata_parser(name='src_rsrc', yaml_metadata=yaml_metadata, parameter=src_rsrc, required=False, documentation=src_rsrc_description) -%} + {%- set ledts_alias = datavault4dbt.yaml_metadata_parser(name='ledts_alias', yaml_metadata=yaml_metadata, parameter=ledts_alias, required=False, documentation=ledts_alias_description) -%} + {%- set add_is_current_flag = datavault4dbt.yaml_metadata_parser(name='add_is_current_flag', yaml_metadata=yaml_metadata, parameter=add_is_current_flag, required=False, documentation=add_is_current_flag_description) -%} + {%- set include_payload = datavault4dbt.yaml_metadata_parser(name='include_payload', yaml_metadata=yaml_metadata, parameter=include_payload, required=False, documentation=include_payload_description) -%} {# Applying the default aliases as stored inside the global variables, if src_ldts, src_rsrc, and ledts_alias are not set. 
#} From aa0ea49e35fb69210949a54ae966b0573dca95a9 Mon Sep 17 00:00:00 2001 From: Tim Kirschke <81677440+tkirschke@users.noreply.github.com> Date: Tue, 7 Jan 2025 08:32:05 +0100 Subject: [PATCH 35/50] Update hash_standardization.sql --- macros/supporting/hash_standardization.sql | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/macros/supporting/hash_standardization.sql b/macros/supporting/hash_standardization.sql index a2cbb75f..c2ee8a0e 100644 --- a/macros/supporting/hash_standardization.sql +++ b/macros/supporting/hash_standardization.sql @@ -447,15 +447,15 @@ CONCAT('\"', REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(TRIM(CAST([EXPRESSION] AS STR {%- else -%} {%- if case_sensitive -%} - {%- set standardise_prefix = "IFNULL({}(NULLIF(CAST(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(UPPER(CONCAT(".format(hash_alg)-%} + {%- set standardise_prefix = "IFNULL(CAST({}(NULLIF(CAST(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(UPPER(CONCAT(".format(hash_alg)-%} {%- if alias is not none -%} - {%- set standardise_suffix = "\n)), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]')), CAST({} AS {})) AS {}".format(zero_key, datatype, alias)-%} + {%- set standardise_suffix = "\n)), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]')) as {}), CAST({} AS {})) AS {}".format(datatype, zero_key, datatype, alias)-%} {%- else -%} - {%- set standardise_suffix = "\n)), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]')), CAST({} AS {}))".format(zero_key, datatype)-%} + {%- set standardise_suffix = "\n)), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]')) as {}), CAST({} AS {}))".format(datatype, zero_key, datatype)-%} {%- endif -%} {%- else -%} - {%- set standardise_prefix = "IFNULL({}(NULLIF(CAST(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(CONCAT(".format(hash_alg)-%} - {%- 
set standardise_suffix = "\n), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]')), CAST({} AS {})) AS {}".format(zero_key, datatype, alias)-%} + {%- set standardise_prefix = "IFNULL(CAST({}(NULLIF(CAST(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(CONCAT(".format(hash_alg)-%} + {%- set standardise_suffix = "\n), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]')) as {}), CAST({} AS {})) AS {}".format(datatype, zero_key, datatype, alias)-%} {%- endif -%} {%- endif -%} @@ -934,7 +934,7 @@ CONCAT('\"', REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(TRIM(CAST([EXPRESSION] AS STR {%- else -%} {%- if case_sensitive -%} - {%- set standardise_prefix = "IFNULL({}(ARRAY_JOIN(SORT_ARRAY(ARRAY_AGG(NULLIF(CAST(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(UPPER(CONCAT(".format(hash_alg)-%} + {%- set standardise_prefix = "IFNULL(CAST({}(ARRAY_JOIN(SORT_ARRAY(ARRAY_AGG(NULLIF(CAST(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(UPPER(CONCAT(".format(hash_alg)-%} {%- if alias is not none -%} {%- set standardise_suffix = "\n)), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]'))),',')), {}) AS {}".format(zero_key, alias)-%} @@ -942,12 +942,12 @@ CONCAT('\"', REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(TRIM(CAST([EXPRESSION] AS STR {%- set standardise_suffix = "\n)), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]'))),',')), {})".format(zero_key)-%} {%- endif -%} {%- else -%} - {%- set standardise_prefix = "IFNULL({}(ARRAY_JOIN(SORT_ARRAY(ARRAY_AGG(NULLIF(CAST(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(CONCAT(".format(hash_alg)-%} + {%- set standardise_prefix = "IFNULL(CAST({}(ARRAY_JOIN(SORT_ARRAY(ARRAY_AGG(NULLIF(CAST(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(CONCAT(".format(hash_alg)-%} {%- if alias is not none -%} - {%- set standardise_suffix = "\n), r'\\n', '') \n, r'\\t', '') 
\n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]'))),',')), {}) AS {}".format(zero_key, alias)-%} + {%- set standardise_suffix = "\n), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]'))),',')) AS {}), {}) AS {}".format(datatype, zero_key, alias)-%} {%- else -%} - {%- set standardise_suffix = "\n), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]'))),',')), {})".format(zero_key)-%} + {%- set standardise_suffix = "\n), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]'))),',')) AS {}), {})".format(datatype, zero_key)-%} {%- endif -%} {%- endif -%} @@ -984,4 +984,4 @@ CONCAT('\"', REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(TRIM(CAST([EXPRESSION] AS STR {%- endif -%} {%- do dict_result.update({"standardise_suffix": standardise_suffix, "standardise_prefix": standardise_prefix }) -%} {{ return(dict_result | tojson ) }} -{%- endmacro -%} \ No newline at end of file +{%- endmacro -%} From 1e2f721de125bc6b339574b9105a1833c0e47083 Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Tue, 7 Jan 2025 09:37:04 +0100 Subject: [PATCH 36/50] Update hash_standardization.sql add hashing datatype fix for sha1 on databricks also for multi-active hash standardization --- macros/supporting/hash_standardization.sql | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/macros/supporting/hash_standardization.sql b/macros/supporting/hash_standardization.sql index c2ee8a0e..bef0d50a 100644 --- a/macros/supporting/hash_standardization.sql +++ b/macros/supporting/hash_standardization.sql @@ -937,17 +937,17 @@ CONCAT('\"', REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(TRIM(CAST([EXPRESSION] AS STR {%- set standardise_prefix = "IFNULL(CAST({}(ARRAY_JOIN(SORT_ARRAY(ARRAY_AGG(NULLIF(CAST(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(UPPER(CONCAT(".format(hash_alg)-%} {%- if alias is not none -%} - {%- set standardise_suffix = 
"\n)), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]'))),',')), {}) AS {}".format(zero_key, alias)-%} + {%- set standardise_suffix = "\n)), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]'))),',')) AS {}), CAST({} AS {})) AS {}".format(datatype, zero_key, datatype, alias)-%} {%- else -%} - {%- set standardise_suffix = "\n)), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]'))),',')), {})".format(zero_key)-%} + {%- set standardise_suffix = "\n)), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]'))),',')) AS {}), CAST({} AS {}))".format(datatype, zero_key, datatype)-%} {%- endif -%} {%- else -%} {%- set standardise_prefix = "IFNULL(CAST({}(ARRAY_JOIN(SORT_ARRAY(ARRAY_AGG(NULLIF(CAST(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(CONCAT(".format(hash_alg)-%} {%- if alias is not none -%} - {%- set standardise_suffix = "\n), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]'))),',')) AS {}), {}) AS {}".format(datatype, zero_key, alias)-%} + {%- set standardise_suffix = "\n), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]'))),',')) AS {}), CAST({} AS {})) AS {}".format(datatype, zero_key, datatype, alias)-%} {%- else -%} - {%- set standardise_suffix = "\n), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]'))),',')) AS {}), {})".format(datatype, zero_key)-%} + {%- set standardise_suffix = "\n), r'\\n', '') \n, r'\\t', '') \n, r'\\v', '') \n, r'\\r', '') AS STRING), '[ALL_NULL]'))),',')) AS {}), CAST({} AS {}))".format(datatype, zero_key, datatype)-%} {%- endif -%} {%- endif -%} From de6861f5d4c8b68be357d2dd400d7ce4e3f6d908 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Tue, 7 Jan 2025 09:57:10 +0100 Subject: [PATCH 37/50] Move warnings to log and improve content of the 
messages --- macros/internal/metadata_processing/yaml_metadata_parser.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/macros/internal/metadata_processing/yaml_metadata_parser.sql b/macros/internal/metadata_processing/yaml_metadata_parser.sql index 86e53e73..1b35d0ee 100644 --- a/macros/internal/metadata_processing/yaml_metadata_parser.sql +++ b/macros/internal/metadata_processing/yaml_metadata_parser.sql @@ -5,11 +5,11 @@ {% if name in metadata_dict.keys() %} {% set return_value = metadata_dict.get(name) %} {% if datavault4dbt.is_something(parameter)%} - {% do exceptions.warn("[" ~ this ~ "] Warning: Parameter '" ~ name ~ "' defined both in yaml-metadata and separately. Definition in yaml-metadata will be used, and separate parameter is ignored.") %} + {{ log("[" ~ this ~ "] Parameter '" ~ name ~ "' defined both in yaml-metadata and separately. Value from yaml-metadata will be used, and separate parameter is ignored.", info=False) }} {% endif %} {% elif datavault4dbt.is_something(parameter) %} {% set return_value = parameter %} - {% do exceptions.warn("[" ~ this ~ "] Warning: yaml-metadata given, but parameter '" ~ name ~ "' not defined in there. Using '" ~ name ~ "' parameter defined outside. We advise to use only one method of parameter passing.") %} + {{ log("[" ~ this ~ "] yaml-metadata given, but parameter '" ~ name ~ "' not defined in there. Applying '" ~ parameter ~ "' which is either a parameter passed separately or the default value.", info=False) }} {% elif required %} {{ exceptions.raise_compiler_error("[" ~ this ~ "] Error: yaml-metadata given, but required parameter '" ~ name ~ "' not defined in there or outside in the parameter. 
\n Description of parameter '" ~ name ~ "': \n" ~ documentation ) }} {% else %} From ec1366c6a689a3e200839cf3bca4e228d046655a Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Tue, 7 Jan 2025 15:32:47 +0100 Subject: [PATCH 38/50] add ghost record for DATETIME on BigQuery --- macros/supporting/ghost_record_per_datatype.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/macros/supporting/ghost_record_per_datatype.sql b/macros/supporting/ghost_record_per_datatype.sql index 03698646..84973a54 100644 --- a/macros/supporting/ghost_record_per_datatype.sql +++ b/macros/supporting/ghost_record_per_datatype.sql @@ -32,6 +32,7 @@ {%- if ghost_record_type == 'unknown' -%} {%- if datatype == 'TIMESTAMP' %} {{ datavault4dbt.string_to_timestamp( timestamp_format , beginning_of_all_times) }} as {{ alias }} + {%- elif datatype == 'DATETIME'%} CAST({{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} AS {{ datatype }}) as {{ alias }} {%- elif datatype == 'DATE'-%} PARSE_DATE('{{date_format}}','{{ beginning_of_all_times_date }}') as {{ alias }} {%- elif datatype == 'STRING' %} '{{unknown_value__STRING}}' as {{ alias }} {%- elif datatype == 'INT64' %} CAST({{unknown_value__numeric}} as INT64) as {{ alias }} @@ -41,6 +42,7 @@ {% endif %} {%- elif ghost_record_type == 'error' -%} {%- if datatype == 'TIMESTAMP' %} {{ datavault4dbt.string_to_timestamp( timestamp_format , end_of_all_times) }} as {{ alias }} + {%- elif datatype == 'DATETIME'%} CAST({{ datavault4dbt.string_to_timestamp(timestamp_format, end_of_all_times) }} AS {{ datatype }}) as {{ alias }} {%- elif datatype == 'DATE'-%} PARSE_DATE('{{date_format}}', '{{ end_of_all_times_date }}') as {{ alias }} {%- elif datatype == 'STRING' %} '{{error_value__STRING}}' as {{ alias }} {%- elif datatype == 'INT64' %} CAST({{error_value__numeric}} as INT64) as {{ alias }} From bef34c1875aeb9627a4189b97617b10461a66a1f Mon Sep 17 00:00:00 2001 From: tkiehn 
<162969167+tkiehn@users.noreply.github.com> Date: Tue, 7 Jan 2025 16:25:38 +0100 Subject: [PATCH 39/50] fix oracle ghost records for timestamps --- macros/supporting/ghost_record_per_datatype.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/macros/supporting/ghost_record_per_datatype.sql b/macros/supporting/ghost_record_per_datatype.sql index 84973a54..d04110ea 100644 --- a/macros/supporting/ghost_record_per_datatype.sql +++ b/macros/supporting/ghost_record_per_datatype.sql @@ -607,7 +607,7 @@ {%- set error_value__numeric = var('datavault4dbt.error_value__numeric', '-2') -%} {%- if ghost_record_type == 'unknown' -%} - {%- if datatype == 'TIMESTAMP' or datatype == 'TIMESTAMP WITH TIMEZONE' or datatype == 'TIMESTAMP WITH LOCAL TIMEZONE' %} {{- datavault4dbt.string_to_timestamp( timestamp_format , beginning_of_all_times) }} as {{ alias }} + {%- if datatype == 'TIMESTAMP' or datatype == 'TIMESTAMP WITH TIME ZONE' or datatype == 'TIMESTAMP WITH LOCAL TIME ZONE' %} {{- datavault4dbt.string_to_timestamp( timestamp_format , beginning_of_all_times) }} as {{ alias }} {%- elif datatype == 'DATE'-%} TO_DATE('{{ beginning_of_all_times_date }}', '{{ date_format }}' ) as "{{ alias }}" {%- elif datatype == 'VARCHAR'-%} '{{unknown_value__STRING}}' as {{ alias }} {%- elif datatype == 'VARCHAR2'-%} '{{unknown_value__STRING}}' as {{ alias }} @@ -620,7 +620,7 @@ {%- else %} CAST(NULL as {{ datatype }}) as {{ alias }} {% endif %} {%- elif ghost_record_type == 'error' -%} - {%- if datatype == 'TIMESTAMP' or datatype == 'TIMESTAMP WITH TIMEZONE' or datatype == 'TIMESTAMP WITH LOCAL TIMEZONE' %} {{- datavault4dbt.string_to_timestamp( timestamp_format , end_of_all_times) }} as {{ alias }} + {%- if datatype == 'TIMESTAMP' or datatype == 'TIMESTAMP WITH TIME ZONE' or datatype == 'TIMESTAMP WITH LOCAL TIME ZONE' %} {{- datavault4dbt.string_to_timestamp( timestamp_format , end_of_all_times) }} as {{ alias }} {%- elif datatype == 'DATE'-%} TO_DATE('{{ 
end_of_all_times_date }}', '{{ date_format }}' ) as "{{ alias }}" {%- elif datatype == 'VARCHAR'-%} CAST('{{error_value__STRING}}' as VARCHAR2(40)) as {{ alias }} {%- elif datatype == 'VARCHAR2'-%} '{{error_value__STRING}}' as {{ alias }} From 659d8eaa53ab1921ffa16d38545d5b16430aecf7 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 8 Jan 2025 10:41:06 +0100 Subject: [PATCH 40/50] fix behaviour of derived columns throughout the CTEs for all adapters except synapse and fabric --- macros/staging/bigquery/stage.sql | 6 +++--- macros/staging/databricks/stage.sql | 7 ++++--- macros/staging/exasol/stage.sql | 2 ++ macros/staging/oracle/stage.sql | 5 +++-- macros/staging/postgres/stage.sql | 4 +++- macros/staging/redshift/stage.sql | 4 +++- macros/staging/snowflake/stage.sql | 5 +++-- 7 files changed, 21 insertions(+), 12 deletions(-) diff --git a/macros/staging/bigquery/stage.sql b/macros/staging/bigquery/stage.sql index 8c94c387..80e417b8 100644 --- a/macros/staging/bigquery/stage.sql +++ b/macros/staging/bigquery/stage.sql @@ -98,15 +98,17 @@ {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} {%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} -{%- set exclude_column_names = hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} +{%- set exclude_column_names = derived_column_names + hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} {%- set all_columns = adapter.get_columns_in_relation( source_relation ) -%} {%- set columns_without_excluded_columns = [] -%} {%- set final_columns_to_select = [] -%} +{%- set derived_input_columns = 
datavault4dbt.extract_input_columns(derived_columns) -%} {%- if include_source_columns -%} {%- set source_columns_to_select = datavault4dbt.process_columns_to_select(all_source_columns, exclude_column_names) | list -%} + {%- set source_columns_to_select = (source_columns_to_select + derived_input_columns) | unique | list -%} {%- for column in all_columns -%} @@ -224,8 +226,6 @@ ldts_rsrc_data AS ( {%- set last_cte = "ldts_rsrc_data" -%} {%- set final_columns_to_select = alias_columns + final_columns_to_select %} - {%- set final_columns_to_select = datavault4dbt.process_columns_to_select(final_columns_to_select, derived_column_names) | list -%} - {%- set columns_without_excluded_columns_tmp = [] -%} {%- for column in columns_without_excluded_columns -%} {%- if column.name | lower not in derived_column_names | map('lower') -%} diff --git a/macros/staging/databricks/stage.sql b/macros/staging/databricks/stage.sql index fc76044b..3048cd9f 100644 --- a/macros/staging/databricks/stage.sql +++ b/macros/staging/databricks/stage.sql @@ -98,15 +98,17 @@ {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} {%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} -{%- set exclude_column_names = hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} +{%- set exclude_column_names = derived_column_names + hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} {%- set all_columns = adapter.get_columns_in_relation( source_relation ) -%} {%- set columns_without_excluded_columns = [] -%} {%- set final_columns_to_select = [] -%} +{%- set derived_input_columns = datavault4dbt.extract_input_columns(derived_columns) -%} {%- if 
include_source_columns -%} {%- set source_columns_to_select = datavault4dbt.process_columns_to_select(all_source_columns, exclude_column_names) | list -%} + {%- set source_columns_to_select = (source_columns_to_select + derived_input_columns) | unique | list -%} {%- for column in all_columns -%} @@ -221,8 +223,7 @@ ldts_rsrc_data AS ( {%- set last_cte = "ldts_rsrc_data" -%} {%- set final_columns_to_select = alias_columns + final_columns_to_select %} - {%- set final_columns_to_select = datavault4dbt.process_columns_to_select(final_columns_to_select, derived_column_names) | list -%} - + {%- set columns_without_excluded_columns_tmp = [] -%} {%- for column in columns_without_excluded_columns -%} {%- if column.name | lower not in derived_column_names | map('lower') -%} diff --git a/macros/staging/exasol/stage.sql b/macros/staging/exasol/stage.sql index 50dd35d3..7ef3c2c7 100644 --- a/macros/staging/exasol/stage.sql +++ b/macros/staging/exasol/stage.sql @@ -99,9 +99,11 @@ {%- set columns_without_excluded_columns = [] -%} {%- set final_columns_to_select = [] -%} +{%- set derived_input_columns = datavault4dbt.extract_input_columns(derived_columns) -%} {%- if include_source_columns -%} {%- set source_columns_to_select = datavault4dbt.process_columns_to_select(all_source_columns, exclude_column_names) | list -%} + {%- set source_columns_to_select = (source_columns_to_select + derived_input_columns) | unique | list -%} {%- for column in all_columns -%} diff --git a/macros/staging/oracle/stage.sql b/macros/staging/oracle/stage.sql index c2be1409..a70a6485 100644 --- a/macros/staging/oracle/stage.sql +++ b/macros/staging/oracle/stage.sql @@ -98,17 +98,18 @@ {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} {%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} -{%- set exclude_column_names = hashed_column_names + 
prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} +{%- set exclude_column_names = derived_column_names + hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} {%- set all_columns = adapter.get_columns_in_relation( source_relation ) -%} {%- set columns_without_excluded_columns = [] -%} {%- set final_columns_to_select = [] -%} +{%- set derived_input_columns = datavault4dbt.extract_input_columns(derived_columns) -%} {%- if include_source_columns -%} {%- set source_columns_to_select = datavault4dbt.process_columns_to_select(all_source_columns, exclude_column_names) | list -%} - {{ log('source_columns_to_select when include_source_columns=true: '~ source_columns_to_select, false) }} + {%- set source_columns_to_select = (source_columns_to_select + derived_input_columns) | unique | list -%} {%- for column in all_columns -%} diff --git a/macros/staging/postgres/stage.sql b/macros/staging/postgres/stage.sql index 9edd3a38..ed09d25e 100644 --- a/macros/staging/postgres/stage.sql +++ b/macros/staging/postgres/stage.sql @@ -98,16 +98,18 @@ {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} {%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} -{%- set exclude_column_names = hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} +{%- set exclude_column_names = derived_column_names + hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} {%- set all_columns = adapter.get_columns_in_relation( source_relation ) -%} {%- set 
columns_without_excluded_columns = [] -%} {%- set final_columns_to_select = [] -%} +{%- set derived_input_columns = datavault4dbt.extract_input_columns(derived_columns) -%} {%- if include_source_columns -%} {%- set source_columns_to_select = datavault4dbt.process_columns_to_select(all_source_columns, exclude_column_names) | list -%} + {%- set source_columns_to_select = (source_columns_to_select + derived_input_columns) | unique | list -%} {%- for column in all_columns -%} diff --git a/macros/staging/redshift/stage.sql b/macros/staging/redshift/stage.sql index a7704c03..fdfb2568 100644 --- a/macros/staging/redshift/stage.sql +++ b/macros/staging/redshift/stage.sql @@ -97,16 +97,18 @@ {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} {%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} -{%- set exclude_column_names = hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} +{%- set exclude_column_names = derived_column_names + hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} {%- set all_columns = adapter.get_columns_in_relation( source_relation ) -%} {%- set columns_without_excluded_columns = [] -%} {%- set final_columns_to_select = [] -%} +{%- set derived_input_columns = datavault4dbt.extract_input_columns(derived_columns) -%} {%- if include_source_columns -%} {%- set source_columns_to_select = datavault4dbt.process_columns_to_select(all_source_columns, exclude_column_names) | list -%} + {%- set source_columns_to_select = (source_columns_to_select + derived_input_columns) | unique | list -%} {%- for column in all_columns -%} diff --git a/macros/staging/snowflake/stage.sql b/macros/staging/snowflake/stage.sql 
index 956c632e..e2d4d281 100644 --- a/macros/staging/snowflake/stage.sql +++ b/macros/staging/snowflake/stage.sql @@ -98,17 +98,18 @@ {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} {%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} -{%- set exclude_column_names = hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} +{%- set exclude_column_names = derived_column_names + hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} {%- set all_columns = adapter.get_columns_in_relation( source_relation ) -%} {%- set columns_without_excluded_columns = [] -%} {%- set final_columns_to_select = [] -%} +{%- set derived_input_columns = datavault4dbt.extract_input_columns(derived_columns) -%} {%- if include_source_columns -%} {%- set source_columns_to_select = datavault4dbt.process_columns_to_select(all_source_columns, exclude_column_names) | list -%} - {{ log('source_columns_to_select when include_source_columns=true: '~ source_columns_to_select, false) }} + {%- set source_columns_to_select = (source_columns_to_select + derived_input_columns) | unique | list -%} {%- for column in all_columns -%} From c8593111a93fb335339a610edf45382829dc08a7 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 8 Jan 2025 11:48:09 +0100 Subject: [PATCH 41/50] fix fabric ghost records for datetime2 --- macros/supporting/ghost_record_per_datatype.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/macros/supporting/ghost_record_per_datatype.sql b/macros/supporting/ghost_record_per_datatype.sql index d04110ea..4af62c6a 100644 --- a/macros/supporting/ghost_record_per_datatype.sql 
+++ b/macros/supporting/ghost_record_per_datatype.sql @@ -467,7 +467,7 @@ {%- if ghost_record_type == 'unknown' -%} - {%- if datatype in ['DATETIME2'] %} CONVERT(datetime2(6),{{- datavault4dbt.string_to_timestamp( timestamp_format , beginning_of_all_times) }}) as {{ alias }} + {%- if 'DATETIME2' in datatype %} CONVERT(datetime2(6),{{- datavault4dbt.string_to_timestamp( timestamp_format , beginning_of_all_times) }}) as {{ alias }} {%- elif datatype in ['DATETIMEOFFSET'] %} CONVERT({{ datatype }},{{- datavault4dbt.string_to_timestamp( timestamp_format , beginning_of_all_times) }}) as {{ alias }} {%- elif 'CHAR' in datatype -%} {%- if col_size is not none -%} @@ -499,7 +499,7 @@ {%- elif ghost_record_type == 'error' -%} - {%- if datatype in ['DATETIME2'] %} CONVERT(datetime2(6),{{- datavault4dbt.string_to_timestamp( timestamp_format , end_of_all_times) }}) as {{ alias }} + {%- if 'DATETIME2' in datatype %} CONVERT(datetime2(6),{{- datavault4dbt.string_to_timestamp( timestamp_format , end_of_all_times) }}) as {{ alias }} {%- elif datatype in ['DATETIMEOFFSET'] %} CONVERT({{ datatype }},{{- datavault4dbt.string_to_timestamp( timestamp_format , end_of_all_times) }}) as {{ alias }} {%- elif 'CHAR' in datatype -%} {%- if col_size is not none -%} From 6c380d399bc7180189aa780b6d8654d0e95af841 Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 8 Jan 2025 14:44:14 +0100 Subject: [PATCH 42/50] Update oracle stage, remove AS for alias of joined tables --- macros/staging/oracle/stage.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/macros/staging/oracle/stage.sql b/macros/staging/oracle/stage.sql index 1cba2216..6f9c2e53 100644 --- a/macros/staging/oracle/stage.sql +++ b/macros/staging/oracle/stage.sql @@ -343,7 +343,7 @@ prejoined_columns AS ( {%- endif -%} {%- set prejoin_alias = 'pj_' + loop.index|string %} - left join {{ relation }} as {{ prejoin_alias }} + left join {{ relation }} {{ prejoin_alias }} on {{ 
datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} {%- endfor -%} From e1703c7e3fb3815e2bc81f8e4eabb5f989eb5ba0 Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Thu, 9 Jan 2025 08:56:02 +0100 Subject: [PATCH 43/50] Update oracle ghost_record_per_datatype, simplify detection of (var)char types --- macros/supporting/ghost_record_per_datatype.sql | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/macros/supporting/ghost_record_per_datatype.sql b/macros/supporting/ghost_record_per_datatype.sql index 4af62c6a..1f78348b 100644 --- a/macros/supporting/ghost_record_per_datatype.sql +++ b/macros/supporting/ghost_record_per_datatype.sql @@ -609,11 +609,7 @@ {%- if ghost_record_type == 'unknown' -%} {%- if datatype == 'TIMESTAMP' or datatype == 'TIMESTAMP WITH TIME ZONE' or datatype == 'TIMESTAMP WITH LOCAL TIME ZONE' %} {{- datavault4dbt.string_to_timestamp( timestamp_format , beginning_of_all_times) }} as {{ alias }} {%- elif datatype == 'DATE'-%} TO_DATE('{{ beginning_of_all_times_date }}', '{{ date_format }}' ) as "{{ alias }}" - {%- elif datatype == 'VARCHAR'-%} '{{unknown_value__STRING}}' as {{ alias }} - {%- elif datatype == 'VARCHAR2'-%} '{{unknown_value__STRING}}' as {{ alias }} - {%- elif datatype == 'NVARCHAR2' %} '{{unknown_value__STRING}}' as {{ alias }} - {%- elif datatype == 'CHAR' %} '{{unknown_value__STRING}}' as {{ alias }} - {%- elif datatype == 'NCHAR' %} '{{unknown_value__STRING}}' as {{ alias }} + {%- elif 'CHAR' in datatype %} '{{unknown_value__STRING}}' as {{ alias }} {%- elif datatype == 'LONG' %} '{{unknown_value__STRING}}' as {{ alias }} {%- elif datatype == 'NUMBER' %} CAST('{{unknown_value__numeric}}' as NUMBER) as {{ alias }} {%- elif datatype == 'FLOAT' %} CAST('{{unknown_value__numeric}}' as FLOAT) as {{ alias }} @@ -622,11 +618,7 @@ {%- elif 
ghost_record_type == 'error' -%} {%- if datatype == 'TIMESTAMP' or datatype == 'TIMESTAMP WITH TIME ZONE' or datatype == 'TIMESTAMP WITH LOCAL TIME ZONE' %} {{- datavault4dbt.string_to_timestamp( timestamp_format , end_of_all_times) }} as {{ alias }} {%- elif datatype == 'DATE'-%} TO_DATE('{{ end_of_all_times_date }}', '{{ date_format }}' ) as "{{ alias }}" - {%- elif datatype == 'VARCHAR'-%} CAST('{{error_value__STRING}}' as VARCHAR2(40)) as {{ alias }} - {%- elif datatype == 'VARCHAR2'-%} '{{error_value__STRING}}' as {{ alias }} - {%- elif datatype == 'NVARCHAR2' %} '{{error_value__STRING}}' as {{ alias }} - {%- elif datatype == 'CHAR' %} '{{error_value__STRING}}' as {{ alias }} - {%- elif datatype == 'NCHAR' %} '{{error_value__STRING}}' as {{ alias }} + {%- elif 'CHAR' in datatype %} '{{error_value__STRING}}' as {{ alias }} {%- elif datatype == 'LONG' %} '{{error_value__STRING}}' as {{ alias }} {%- elif datatype == 'NUMBER' %} CAST('{{error_value__numeric}}' as NUMBER) as {{ alias }} {%- elif datatype == 'FLOAT' %} CAST('{{error_value__numeric}}' as FLOAT) as {{ alias }} @@ -638,4 +630,4 @@ {%- endif %} {%- endif -%} -{%- endmacro -%} \ No newline at end of file +{%- endmacro -%} From c0d40bf2091796a83f6433e072a715a78d259c81 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Thu, 9 Jan 2025 11:38:49 +0100 Subject: [PATCH 44/50] redshift: use qualify statement instead of prep CTEs throughout all macros --- macros/tables/redshift/eff_sat_v0.sql | 43 ++++++-------------------- macros/tables/redshift/hub.sql | 16 ++++------ macros/tables/redshift/link.sql | 16 ++++------ macros/tables/redshift/ma_sat_v0.sql | 31 +++++-------------- macros/tables/redshift/nh_link.sql | 18 +++++------ macros/tables/redshift/ref_sat_v0.sql | 44 ++++++--------------------- macros/tables/redshift/sat_v0.sql | 42 +++++++------------------ 7 files changed, 58 insertions(+), 152 deletions(-) diff --git a/macros/tables/redshift/eff_sat_v0.sql 
b/macros/tables/redshift/eff_sat_v0.sql index 3483db4a..5a660163 100644 --- a/macros/tables/redshift/eff_sat_v0.sql +++ b/macros/tables/redshift/eff_sat_v0.sql @@ -49,23 +49,13 @@ source_data AS ( In all incremental cases, the current status for each hashkey is selected from the existing Effectivity Satellite. #} {%- if is_incremental() %} -current_status_prep AS ( - - SELECT - {{ tracked_hashkey }}, - {{ is_active_alias}}, - ROW_NUMBER() OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} DESC) as rn - FROM {{ this }} - -), - current_status AS ( SELECT {{ tracked_hashkey }}, {{ is_active_alias }} - FROM current_status_prep - WHERE rn = 1 + FROM {{ this }} redshift_requires_an_alias_if_the_qualify_is_directly_after_the_from + QUALIFY ROW_NUMBER() OVER(PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }} DESC) = 1 ), {% endif %} @@ -136,32 +126,19 @@ current_status AS ( {# The rows are deduplicated on the is_active_alias, to only include status changes. - Additionally, a ROW_NUMBER() is calculated in incremental runs, to use it in the next step for comparison against the current status. 
#} - deduplicated_incoming_prep AS ( + deduplicated_incoming AS ( SELECT is_active.{{ tracked_hashkey }}, is_active.{{ src_ldts }}, - is_active.{{ is_active_alias }}, - LAG(is_active.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) as lag_is_active - - FROM is_active - - ), - - deduplicated_incoming AS ( - - SELECT - deduplicated_incoming_prep.{{ tracked_hashkey }}, - deduplicated_incoming_prep.{{ src_ldts }}, - deduplicated_incoming_prep.{{ is_active_alias }} - - FROM - deduplicated_incoming_prep - WHERE - deduplicated_incoming_prep.{{ is_active_alias }} != deduplicated_incoming_prep.lag_is_active - OR deduplicated_incoming_prep.lag_is_active IS NULL + is_active.{{ is_active_alias }} + FROM is_active redshift_requires_an_alias_if_the_qualify_is_directly_after_the_from + QUALIFY + CASE + WHEN is_active.{{ is_active_alias }} = LAG(is_active.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) THEN FALSE + ELSE TRUE + END ), diff --git a/macros/tables/redshift/hub.sql b/macros/tables/redshift/hub.sql index dc45edf9..9418f59e 100644 --- a/macros/tables/redshift/hub.sql +++ b/macros/tables/redshift/hub.sql @@ -207,21 +207,17 @@ source_new_union AS ( {%- endif %} -earliest_hk_over_all_sources_prep AS ( - SELECT - lcte.*, - ROW_NUMBER() OVER (PARTITION BY {{ hashkey }} ORDER BY {{ src_ldts - }}) as rn - FROM {{ ns.last_cte }} AS lcte), - earliest_hk_over_all_sources AS ( {#- Deduplicate the unionized records again to only insert the earliest one. 
#} SELECT lcte.* - FROM earliest_hk_over_all_sources_prep AS lcte - WHERE rn = 1 - {%- set ns.last_cte = 'earliest_hk_over_all_sources' -%}), + FROM {{ ns.last_cte }} AS lcte + QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ hashkey }} ORDER BY {{ src_ldts }}) = 1 + + {%- set ns.last_cte = 'earliest_hk_over_all_sources' -%} + +), records_to_insert AS ( {#- Select everything from the previous CTE, if incremental filter for hashkeys that are not already in the hub. #} diff --git a/macros/tables/redshift/link.sql b/macros/tables/redshift/link.sql index 05ccbe8a..9e67e1d9 100644 --- a/macros/tables/redshift/link.sql +++ b/macros/tables/redshift/link.sql @@ -210,21 +210,17 @@ source_new_union AS ( {%- endif %} -earliest_hk_over_all_sources_prep AS ( - SELECT - lcte.*, - ROW_NUMBER() OVER (PARTITION BY {{ link_hashkey }} ORDER BY {{ src_ldts - }}) as rn - FROM {{ ns.last_cte }} AS lcte), - earliest_hk_over_all_sources AS ( {#- Deduplicate the unionized records again to only insert the earliest one. #} SELECT lcte.* - FROM earliest_hk_over_all_sources_prep AS lcte - WHERE rn = 1 - {%- set ns.last_cte = 'earliest_hk_over_all_sources' -%}), + FROM {{ ns.last_cte }} AS lcte + QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ link_hashkey }} ORDER BY {{ src_ldts }}) = 1 + + {%- set ns.last_cte = 'earliest_hk_over_all_sources' -%} + +), records_to_insert AS ( {# Select everything from the previous CTE, if incremental filter for hashkeys that are not already in the link. #} diff --git a/macros/tables/redshift/ma_sat_v0.sql b/macros/tables/redshift/ma_sat_v0.sql index 5e322ab2..4f59994c 100644 --- a/macros/tables/redshift/ma_sat_v0.sql +++ b/macros/tables/redshift/ma_sat_v0.sql @@ -41,44 +41,29 @@ source_data AS ( {# Get the latest record for each parent hashkey in existing sat, if incremental. 
#} {%- if is_incremental() %} -latest_entries_in_sat_prep AS ( - - SELECT - {{ parent_hashkey }}, - {{ ns.hdiff_alias }}, - ROW_NUMBER() OVER(PARTITION BY {{ parent_hashkey|lower }} ORDER BY {{ src_ldts }} DESC) as rn - FROM - {{ this }} -), - latest_entries_in_sat AS ( SELECT {{ parent_hashkey }}, {{ ns.hdiff_alias }} FROM - latest_entries_in_sat_prep - WHERE rn = 1 + {{ this }} redshift_requires_an_alias_if_the_qualify_is_directly_after_the_from + QUALIFY ROW_NUMBER() OVER(PARTITION BY {{ parent_hashkey }} ORDER BY {{ src_ldts }} DESC) = 1 ), {%- endif %} {# Get a list of all distinct hashdiffs that exist for each parent_hashkey. #} - lag_source_data AS ( - SELECT - {{ parent_hashkey }}, - {{ src_ldts }}, - {{ ns.hdiff_alias }}, - LAG({{ ns.hdiff_alias }}) OVER (PARTITION BY {{ parent_hashkey }} ORDER BY {{ src_ldts }}) as prev_ns_hdiff_alias - FROM source_data -), - deduped_row_hashdiff AS ( + SELECT {{ parent_hashkey }}, {{ src_ldts }}, {{ ns.hdiff_alias }} - FROM lag_source_data - WHERE {{ ns.hdiff_alias }} != prev_ns_hdiff_alias OR prev_ns_hdiff_alias IS NULL + FROM source_data redshift_requires_an_alias_if_the_qualify_is_directly_after_the_from + QUALIFY CASE + WHEN {{ ns.hdiff_alias }} = LAG({{ ns.hdiff_alias }}) OVER (PARTITION BY {{ parent_hashkey }} ORDER BY {{ src_ldts }}) THEN FALSE + ELSE TRUE + END ), {# Dedupe the source data regarding non-delta groups. #} diff --git a/macros/tables/redshift/nh_link.sql b/macros/tables/redshift/nh_link.sql index 38ec0368..b4e895c8 100644 --- a/macros/tables/redshift/nh_link.sql +++ b/macros/tables/redshift/nh_link.sql @@ -226,21 +226,17 @@ source_new_union AS ( {%- if not source_is_single_batch %} -earliest_hk_over_all_sources_prep AS ( - SELECT - lcte.*, - ROW_NUMBER() OVER (PARTITION BY {{ link_hashkey }} ORDER BY {{ src_ldts - }}) as rn - FROM {{ ns.last_cte }} AS lcte), - earliest_hk_over_all_sources AS ( - {#- Deduplicate the unionized records again to only insert the earliest one. 
#} +{#- Deduplicate the unionized records again to only insert the earliest one. #} SELECT lcte.* - FROM earliest_hk_over_all_sources_prep AS lcte - WHERE rn = 1 - {%- set ns.last_cte = 'earliest_hk_over_all_sources' -%}), + FROM {{ ns.last_cte }} AS lcte + QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ link_hashkey }} ORDER BY {{ src_ldts }}) = 1 + + {%- set ns.last_cte = 'earliest_hk_over_all_sources' -%} + +), {%- endif %} diff --git a/macros/tables/redshift/ref_sat_v0.sql b/macros/tables/redshift/ref_sat_v0.sql index 6ce47239..4fceff2d 100644 --- a/macros/tables/redshift/ref_sat_v0.sql +++ b/macros/tables/redshift/ref_sat_v0.sql @@ -46,18 +46,6 @@ source_data AS ( {# Get the latest record for each parent ref key combination in existing sat, if incremental. #} {%- if is_incremental() %} -latest_entries_in_sat_prep AS ( - - SELECT - {% for ref_key in parent_ref_keys %} - {{ref_key}}, - {% endfor %} - {{ ns.hdiff_alias }}, - ROW_NUMBER() OVER(PARTITION BY {%- for ref_key in parent_ref_keys %} {{ref_key|lower}} {%- if not loop.last %}, {% endif %}{% endfor %} ORDER BY {{ src_ldts }} DESC) as rn - FROM - {{ this }} -), - latest_entries_in_sat AS ( SELECT @@ -66,8 +54,8 @@ latest_entries_in_sat AS ( {% endfor %} {{ ns.hdiff_alias }} FROM - latest_entries_in_sat_prep - WHERE rn = 1 + {{ this }} redshift_requires_an_alias_if_the_qualify_is_directly_after_the_from + QUALIFY ROW_NUMBER() OVER(PARTITION BY {%- for ref_key in parent_ref_keys %} {{ref_key}} {%- if not loop.last %}, {% endif %}{% endfor %} ORDER BY {{ src_ldts }} DESC) = 1 ), {%- endif %} @@ -75,7 +63,7 @@ latest_entries_in_sat AS ( Deduplicate source by comparing each hashdiff to the hashdiff of the previous record, for each parent ref key combination. Additionally adding a row number based on that order, if incremental. 
#} -deduplicated_numbered_source_prep AS ( +deduplicated_numbered_source AS ( SELECT {% for ref_key in parent_ref_keys %} @@ -86,24 +74,12 @@ deduplicated_numbered_source_prep AS ( {% if is_incremental() -%} , ROW_NUMBER() OVER(PARTITION BY {%- for ref_key in parent_ref_keys %} {{ref_key}} {%- if not loop.last %}, {% endif %}{% endfor %} ORDER BY {{ src_ldts }}) as rn {%- endif %} - , LAG({{ ns.hdiff_alias }}) OVER(PARTITION BY {%- for ref_key in parent_ref_keys %} {{ref_key|lower}} {%- if not loop.last %}, {% endif %}{% endfor %} ORDER BY {{ src_ldts }}) as prev_hashdiff - FROM source_data -), - -deduplicated_numbered_source AS ( - - SELECT - {% for ref_key in parent_ref_keys %} - {{ref_key}}, - {% endfor %} - {{ ns.hdiff_alias }}, - {{ datavault4dbt.print_list(source_cols) }} - FROM deduplicated_numbered_source_prep - WHERE 1=1 - AND {{ ns.hdiff_alias }} <> prev_hashdiff OR prev_hashdiff IS NULL - {% if is_incremental() -%} - AND rn = 1 - {%- endif %} + FROM source_data redshift_requires_an_alias_if_the_qualify_is_directly_after_the_from + QUALIFY + CASE + WHEN {{ ns.hdiff_alias }} = LAG({{ ns.hdiff_alias }}) OVER(PARTITION BY {%- for ref_key in parent_ref_keys %} {{ref_key}} {%- if not loop.last %}, {% endif %}{% endfor %} ORDER BY {{ src_ldts }}) THEN FALSE + ELSE TRUE + END ), {# @@ -128,7 +104,7 @@ records_to_insert AS ( AND {{ datavault4dbt.multikey(ref_key, prefix=['latest_entries_in_sat', 'deduplicated_numbered_source'], condition='=') }} {% endfor %} AND {{ datavault4dbt.multikey(ns.hdiff_alias, prefix=['latest_entries_in_sat', 'deduplicated_numbered_source'], condition='=') }} - ) + AND deduplicated_numbered_source.rn = 1) {%- endif %} ) diff --git a/macros/tables/redshift/sat_v0.sql b/macros/tables/redshift/sat_v0.sql index e04d4f66..8833c635 100644 --- a/macros/tables/redshift/sat_v0.sql +++ b/macros/tables/redshift/sat_v0.sql @@ -42,24 +42,14 @@ source_data AS ( {# Get the latest record for each parent hashkey in existing sat, if incremental. 
#} {%- if is_incremental() %} -latest_entries_in_sat_prep AS ( - - SELECT - {{ parent_hashkey }}, - {{ ns.hdiff_alias }}, - ROW_NUMBER() OVER(PARTITION BY {{ parent_hashkey|lower }} ORDER BY {{ src_ldts }} DESC) as rn - FROM - {{ this }} -), - latest_entries_in_sat AS ( SELECT {{ parent_hashkey }}, {{ ns.hdiff_alias }} FROM - latest_entries_in_sat_prep - WHERE rn = 1 + {{ this }} redshift_requires_an_alias_if_the_qualify_is_directly_after_the_from + QUALIFY ROW_NUMBER() OVER(PARTITION BY {{ parent_hashkey }} ORDER BY {{ src_ldts }} DESC) = 1 ), {%- endif %} @@ -67,7 +57,7 @@ latest_entries_in_sat AS ( Deduplicate source by comparing each hashdiff to the hashdiff of the previous record, for each hashkey. Additionally adding a row number based on that order, if incremental. #} -deduplicated_numbered_source_prep AS ( +deduplicated_numbered_source AS ( SELECT {{ parent_hashkey }}, @@ -76,23 +66,12 @@ deduplicated_numbered_source_prep AS ( {% if is_incremental() -%} , ROW_NUMBER() OVER(PARTITION BY {{ parent_hashkey }} ORDER BY {{ src_ldts }}) as rn {%- endif %} - , LAG({{ ns.hdiff_alias }}) OVER(PARTITION BY {{ parent_hashkey|lower }} ORDER BY {{ src_ldts }}) as prev_hashdiff - FROM source_data - -), - -deduplicated_numbered_source AS ( - - SELECT - {{ parent_hashkey }}, - {{ ns.hdiff_alias }}, - {{ datavault4dbt.print_list(source_cols) }} - FROM deduplicated_numbered_source_prep - WHERE 1=1 - AND {{ ns.hdiff_alias }} <> prev_hashdiff OR prev_hashdiff IS NULL - {% if is_incremental() -%} - AND rn = 1 - {%- endif %} + FROM source_data redshift_requires_an_alias_if_the_qualify_is_directly_after_the_from + QUALIFY + CASE + WHEN {{ ns.hdiff_alias }} = LAG({{ ns.hdiff_alias }}) OVER(PARTITION BY {{ parent_hashkey }} ORDER BY {{ src_ldts }}) THEN FALSE + ELSE TRUE + END ), {# @@ -111,7 +90,8 @@ records_to_insert AS ( SELECT 1 FROM latest_entries_in_sat WHERE {{ datavault4dbt.multikey(parent_hashkey, prefix=['latest_entries_in_sat', 'deduplicated_numbered_source'], 
condition='=') }} - AND {{ datavault4dbt.multikey(ns.hdiff_alias, prefix=['latest_entries_in_sat', 'deduplicated_numbered_source'], condition='=') }}) + AND {{ datavault4dbt.multikey(ns.hdiff_alias, prefix=['latest_entries_in_sat', 'deduplicated_numbered_source'], condition='=') }} + AND deduplicated_numbered_source.rn = 1) {%- endif %} ) From 7f13d91197d6566c4e8ca143ffb6dc89b6408c3e Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Thu, 9 Jan 2025 16:08:15 +0100 Subject: [PATCH 45/50] change placeholder alias to actual alias in deduplicated_incoming cte for redshift eff_sat --- macros/tables/redshift/eff_sat_v0.sql | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/macros/tables/redshift/eff_sat_v0.sql b/macros/tables/redshift/eff_sat_v0.sql index 5a660163..8e734972 100644 --- a/macros/tables/redshift/eff_sat_v0.sql +++ b/macros/tables/redshift/eff_sat_v0.sql @@ -130,13 +130,13 @@ current_status AS ( deduplicated_incoming AS ( SELECT - is_active.{{ tracked_hashkey }}, - is_active.{{ src_ldts }}, - is_active.{{ is_active_alias }} - FROM is_active redshift_requires_an_alias_if_the_qualify_is_directly_after_the_from + ia.{{ tracked_hashkey }}, + ia.{{ src_ldts }}, + ia.{{ is_active_alias }} + FROM is_active ia QUALIFY CASE - WHEN is_active.{{ is_active_alias }} = LAG(is_active.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) THEN FALSE + WHEN ia.{{ is_active_alias }} = LAG(ia.{{ is_active_alias }}) OVER (PARTITION BY {{ tracked_hashkey }} ORDER BY {{ src_ldts }}) THEN FALSE ELSE TRUE END From d14373207ea04172cf0efdf2c522e07678bdb3b9 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Thu, 9 Jan 2025 16:43:39 +0100 Subject: [PATCH 46/50] small fixes on exasol stage --- macros/staging/exasol/stage.sql | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/macros/staging/exasol/stage.sql 
b/macros/staging/exasol/stage.sql index c3874711..f919ec9f 100644 --- a/macros/staging/exasol/stage.sql +++ b/macros/staging/exasol/stage.sql @@ -90,7 +90,7 @@ {# Getting the column names for all additional columns #} {%- set derived_column_names = datavault4dbt.extract_column_names(derived_columns) -%} {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} {%- set exclude_column_names = derived_column_names + hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} @@ -470,7 +470,7 @@ unknown_values AS ( {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, col_size=column.char_size, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} @@ -478,7 +478,7 @@ unknown_values AS ( {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - ,{{- datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} + ,{{- datavault4dbt.ghost_record_per_datatype(column_name=col|upper, datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif 
-%} @@ -500,7 +500,7 @@ unknown_values AS ( {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} {{ log('column found? yes, for column: ' ~ column.name , false) }} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name|upper, datatype=column.dtype, col_size=column.char_size, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} @@ -510,7 +510,7 @@ unknown_values AS ( {%- if datavault4dbt.is_something(derived_columns) -%} {# Additionally generating Ghost Records for Derived Columns #} {% for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} @@ -536,7 +536,7 @@ error_values AS ( {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, col_size=column.char_size, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} @@ -544,7 +544,7 @@ error_values AS ( {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in 
missing_columns.items() %} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col|upper, datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} @@ -566,7 +566,7 @@ error_values AS ( {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} {{ log('column found? yes, for column: ' ~ column.name , false) }} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, col_size=column.char_size, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} @@ -576,7 +576,7 @@ error_values AS ( {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=column.char_size, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} From 15708008bebff91c482476a07ff2fffc77373f82 Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Thu, 9 Jan 2025 18:01:58 +0100 Subject: [PATCH 47/50] Update ghost_record_per_datatype.sql, fix databricks hash_default_values --- macros/supporting/ghost_record_per_datatype.sql | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/macros/supporting/ghost_record_per_datatype.sql b/macros/supporting/ghost_record_per_datatype.sql 
index 03698646..991b212b 100644 --- a/macros/supporting/ghost_record_per_datatype.sql +++ b/macros/supporting/ghost_record_per_datatype.sql @@ -553,8 +553,8 @@ {%- set error_value__numeric = var('datavault4dbt.error_value__numeric', -2) -%} {%- set hash = datavault4dbt.hash_method() -%} -{%- set hash_default_values = datavault4dbt.hash_default_values(hash_function=hash) -%} -{%- set unknown_value__HASHTYPE = hash_default_values['unknown_key'] -%} +{%- set hash_default_values = fromjson(datavault4dbt.hash_default_values(hash_function=hash)) -%} +{%- set unknown_value__HASHTYPE = hash_default_values.get('unknown_key') -%} {%- set error_value__HASHTYPE = hash_default_values['error_key'] -%} {%- set datatype = datatype | string | upper | trim -%} @@ -566,7 +566,7 @@ {%- elif datatype in ['INT', 'SMALLINT', 'TINYINT', 'BIGINT', 'DOUBLE', 'FLOAT'] %} CAST('{{unknown_value__numeric}}' as {{ datatype}}) as {{ alias }} {%- elif datatype.upper().startswith('DECIMAL') %} CAST('{{unknown_value__numeric}}' as DECIMAL) as {{ alias }} {%- elif datatype == 'BOOLEAN' %} CAST('FALSE' as BOOLEAN) as {{ alias }} - {%- elif datatype == 'BINARY' %} CAST('{{ unknown_value__HASHTYPE }}') as {{ alias }} + {%- elif datatype == 'BINARY' %} CAST('{{ unknown_value__HASHTYPE }}' as BINARY) as {{ alias }} {%- else %} CAST(NULL as {{ datatype }}) as {{ alias }} {% endif %} {%- elif ghost_record_type == 'error' -%} @@ -576,7 +576,7 @@ {%- elif datatype in ['INT', 'SMALLINT', 'TINYINT', 'BIGINT', 'DOUBLE', 'FLOAT'] %} CAST('{{error_value__numeric}}' as {{ datatype}}) as {{ alias }} {%- elif datatype.upper().startswith('DECIMAL') %} CAST('{{error_value__numeric}}' as DECIMAL) as {{ alias }} {%- elif datatype == 'BOOLEAN' %} CAST('FALSE' as BOOLEAN) as {{ alias }} - {%- elif datatype == 'BINARY' %} CAST('{{ error_value__HASHTYPE }}') as {{ alias }} + {%- elif datatype == 'BINARY' %} CAST('{{ error_value__HASHTYPE }}' as BINARY) as {{ alias }} {%- else %} CAST(NULL as {{ datatype }}) as {{ alias }} {% 
endif %} {%- else -%} @@ -636,4 +636,4 @@ {%- endif %} {%- endif -%} -{%- endmacro -%} \ No newline at end of file +{%- endmacro -%} From c524a57d41e5b25a8ef47b03f72ce05d4b8ffea9 Mon Sep 17 00:00:00 2001 From: Tim Kirschke Date: Mon, 13 Jan 2025 10:57:13 +0100 Subject: [PATCH 48/50] Added optional parameter "union_strategy" and implemented it in all nh_link macros --- macros/tables/bigquery/nh_link.sql | 16 ++++++++++++++-- macros/tables/databricks/nh_link.sql | 15 +++++++++++++-- macros/tables/exasol/nh_link.sql | 15 +++++++++++++-- macros/tables/fabric/nh_link.sql | 15 +++++++++++++-- macros/tables/nh_link.sql | 11 +++++++++-- macros/tables/oracle/nh_link.sql | 14 ++++++++++++-- macros/tables/postgres/nh_link.sql | 14 ++++++++++++-- macros/tables/redshift/nh_link.sql | 14 ++++++++++++-- macros/tables/snowflake/nh_link.sql | 15 +++++++++++++-- 9 files changed, 111 insertions(+), 18 deletions(-) diff --git a/macros/tables/bigquery/nh_link.sql b/macros/tables/bigquery/nh_link.sql index e05de7dc..f6fa56c8 100644 --- a/macros/tables/bigquery/nh_link.sql +++ b/macros/tables/bigquery/nh_link.sql @@ -1,4 +1,4 @@ -{%- macro default__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch) -%} +{%- macro default__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch, union_strategy) -%} {%- set ns = namespace(last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} @@ -25,6 +25,18 @@ {%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} {{ log('source_models: '~source_models, false) }} +{% if union_strategy|lower = 'all' %} + {% set union_command = 'UNION ALL' %} +{% elif union_strategy|lower == 'distinct' %} + {% set union_command = 'UNION DISTINCT' %} +{% else %} + {%- if execute -%} + {%- do exceptions.warn("[" ~ this ~ "] Warning: Parameter 'union_strategy' set 
to '" ~ union_strategy ~ "' which is not a supported choice. Set to 'all' or 'distinct' instead. UNION ALL is used now.") -%} + {% endif %} + {% set union_command = 'UNION ALL' %} +{% endif %} + + {%- if not datavault4dbt.is_something(foreign_hashkeys) -%} {%- set foreign_hashkeys = [] -%} @@ -213,7 +225,7 @@ source_new_union AS ( FROM src_new_{{ source_number }} {%- if not loop.last %} - UNION ALL + {{ union_command }} {% endif -%} {%- endfor -%} diff --git a/macros/tables/databricks/nh_link.sql b/macros/tables/databricks/nh_link.sql index 450f362a..36ca65e6 100644 --- a/macros/tables/databricks/nh_link.sql +++ b/macros/tables/databricks/nh_link.sql @@ -1,4 +1,4 @@ -{%- macro databricks__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch) -%} +{%- macro databricks__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch, union_strategy) -%} {%- set ns = namespace(last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} @@ -26,6 +26,17 @@ {%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} {{ log('source_models: '~source_models, false) }} +{% if union_strategy|lower = 'all' %} + {% set union_command = 'UNION ALL' %} +{% elif union_strategy|lower == 'distinct' %} + {% set union_command = 'UNION DISTINCT' %} +{% else %} + {%- if execute -%} + {%- do exceptions.warn("[" ~ this ~ "] Warning: Parameter 'union_strategy' set to '" ~ union_strategy ~ "' which is not a supported choice. Set to 'all' or 'distinct' instead. 
UNION ALL is used now.") -%} + {% endif %} + {% set union_command = 'UNION ALL' %} +{% endif %} + {%- if not datavault4dbt.is_something(foreign_hashkeys) -%} {%- set foreign_hashkeys = [] -%} {%- endif -%} @@ -213,7 +224,7 @@ source_new_union AS ( FROM src_new_{{ source_number }} {%- if not loop.last %} - UNION ALL + {{ union_command }} {% endif -%} {%- endfor -%} diff --git a/macros/tables/exasol/nh_link.sql b/macros/tables/exasol/nh_link.sql index 24bffbee..b94050ff 100644 --- a/macros/tables/exasol/nh_link.sql +++ b/macros/tables/exasol/nh_link.sql @@ -1,4 +1,4 @@ -{%- macro exasol__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch) -%} +{%- macro exasol__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch, union_strategy) -%} {%- set ns = namespace(last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} @@ -26,6 +26,17 @@ {{ log('source_models: '~source_models, false) }} +{% if union_strategy|lower = 'all' %} + {% set union_command = 'UNION ALL' %} +{% elif union_strategy|lower == 'distinct' %} + {% set union_command = 'UNION' %} +{% else %} + {%- if execute -%} + {%- do exceptions.warn("[" ~ this ~ "] Warning: Parameter 'union_strategy' set to '" ~ union_strategy ~ "' which is not a supported choice. Set to 'all' or 'distinct' instead. 
UNION ALL is used now.") -%} + {% endif %} + {% set union_command = 'UNION ALL' %} +{% endif %} + {%- if not datavault4dbt.is_something(foreign_hashkeys) -%} {%- set foreign_hashkeys = [] -%} {%- endif -%} @@ -213,7 +224,7 @@ source_new_union AS ( FROM src_new_{{ source_number }} {%- if not loop.last %} - UNION ALL + {{ union_command }} {% endif -%} {%- endfor -%} diff --git a/macros/tables/fabric/nh_link.sql b/macros/tables/fabric/nh_link.sql index 1317f2b1..01d3b0ba 100644 --- a/macros/tables/fabric/nh_link.sql +++ b/macros/tables/fabric/nh_link.sql @@ -1,4 +1,4 @@ -{%- macro fabric__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch) -%} +{%- macro fabric__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch, union_strategy) -%} {%- set ns = namespace(last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} @@ -27,6 +27,17 @@ {%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} {{ log('source_models: '~source_models, false) }} +{% if union_strategy|lower = 'all' %} + {% set union_command = 'UNION ALL' %} +{% elif union_strategy|lower == 'distinct' %} + {% set union_command = 'UNION' %} +{% else %} + {%- if execute -%} + {%- do exceptions.warn("[" ~ this ~ "] Warning: Parameter 'union_strategy' set to '" ~ union_strategy ~ "' which is not a supported choice. Set to 'all' or 'distinct' instead. 
UNION ALL is used now.") -%} + {% endif %} + {% set union_command = 'UNION ALL' %} +{% endif %} + {%- if not datavault4dbt.is_something(foreign_hashkeys) -%} {%- set foreign_hashkeys = [] -%} {%- endif -%} @@ -221,7 +232,7 @@ source_new_union AS ( FROM src_new_{{ source_number }} {%- if not loop.last %} - UNION ALL + {{ union_command }} {% endif -%} {%- endfor -%} diff --git a/macros/tables/nh_link.sql b/macros/tables/nh_link.sql index dabbb0df..1a9ff691 100644 --- a/macros/tables/nh_link.sql +++ b/macros/tables/nh_link.sql @@ -6,7 +6,7 @@ In the background a non-historized link uses exactly the same loading logic as a regular link, but adds the descriptive attributes as additional payload. #} -{%- macro nh_link(yaml_metadata=none, link_hashkey=none, payload=none, source_models=none, foreign_hashkeys=none, src_ldts=none, src_rsrc=none, disable_hwm=false, source_is_single_batch=false) -%} +{%- macro nh_link(yaml_metadata=none, link_hashkey=none, payload=none, source_models=none, foreign_hashkeys=none, src_ldts=none, src_rsrc=none, disable_hwm=false, source_is_single_batch=false, union_strategy='all') -%} {% set link_hashkey_description = " link_hashkey::string Name of the non-historized link hashkey column inside the stage. Should get calculated out of all business keys inside @@ -92,6 +92,11 @@ Needs to use the same column name as defined as alias inside the staging model. " %} + {% set union_strategy_description = " + union_strategy::'all' | 'distinct' Defines how multiple sources should be unioned. 'all' will result in a UNION ALL and represents the default value. Should only be changed, if you have duplicates across + source systems, and don't want to deduplicate them upfront. 
+ " %} + {%- set link_hashkey = datavault4dbt.yaml_metadata_parser(name='link_hashkey', yaml_metadata=yaml_metadata, parameter=link_hashkey, required=True, documentation=link_hashkey_description) -%} {%- set payload = datavault4dbt.yaml_metadata_parser(name='payload', yaml_metadata=yaml_metadata, parameter=payload, required=True, documentation=payload_description) -%} {%- set source_models = datavault4dbt.yaml_metadata_parser(name='source_models', yaml_metadata=yaml_metadata, parameter=source_models, required=True, documentation=source_models_description) -%} @@ -100,6 +105,7 @@ {%- set rsrc = datavault4dbt.yaml_metadata_parser(name='rsrc', yaml_metadata=yaml_metadata, parameter=rsrc, required=False, documentation=rsrc_description) -%} {%- set disable_hwm = datavault4dbt.yaml_metadata_parser(name='disable_hwm', yaml_metadata=yaml_metadata, parameter=disable_hwm, required=False, documentation='Whether the High Water Mark should be turned off. Optional, default False.') -%} {%- set source_is_single_batch = datavault4dbt.yaml_metadata_parser(name='source_is_single_batch', yaml_metadata=yaml_metadata, parameter=source_is_single_batch, required=False, documentation='Whether the source contains only one batch. Optional, default False.') -%} + {%- set union_strategy = datavault4dbt.yaml_metadata_parser(name='union_strategy', yaml_metadata=yaml_metadata, parameter=union_strategy, required=False, documentation=union_strategy_description) -%} {# Applying the default aliases as stored inside the global variables, if src_ldts and src_rsrc are not set. 
#} @@ -113,6 +119,7 @@ src_rsrc=src_rsrc, source_models=source_models, disable_hwm=disable_hwm, - source_is_single_batch=source_is_single_batch) -}} + source_is_single_batch=source_is_single_batch, + union_strategy=union_strategy) -}} {%- endmacro -%} diff --git a/macros/tables/oracle/nh_link.sql b/macros/tables/oracle/nh_link.sql index 86a77276..fd4937e4 100644 --- a/macros/tables/oracle/nh_link.sql +++ b/macros/tables/oracle/nh_link.sql @@ -1,4 +1,4 @@ -{%- macro oracle__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch) -%} +{%- macro oracle__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch, union_strategy) -%} {%- set ns = namespace(last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} @@ -26,6 +26,16 @@ {%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} {{ log('source_models: '~source_models, false) }} +{% if union_strategy|lower = 'all' %} + {% set union_command = 'UNION ALL' %} +{% elif union_strategy|lower == 'distinct' %} + {% set union_command = 'UNION' %} +{% else %} + {%- if execute -%} + {%- do exceptions.warn("[" ~ this ~ "] Warning: Parameter 'union_strategy' set to '" ~ union_strategy ~ "' which is not a supported choice. Set to 'all' or 'distinct' instead. 
UNION ALL is used now.") -%} + {% endif %} + {% set union_command = 'UNION ALL' %} +{% endif %} {%- if not datavault4dbt.is_something(foreign_hashkeys) -%} {%- set foreign_hashkeys = [] -%} @@ -214,7 +224,7 @@ source_new_union AS ( FROM src_new_{{ source_number }} {%- if not loop.last %} - UNION ALL + {{ union_command }} {% endif -%} {%- endfor -%} diff --git a/macros/tables/postgres/nh_link.sql b/macros/tables/postgres/nh_link.sql index 24c4e3d1..f6c52a94 100644 --- a/macros/tables/postgres/nh_link.sql +++ b/macros/tables/postgres/nh_link.sql @@ -1,4 +1,4 @@ -{%- macro postgres__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch) -%} +{%- macro postgres__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch, union_strategy) -%} {%- set ns = namespace(last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} @@ -26,6 +26,16 @@ {%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} {{ log('source_models: '~source_models, false) }} +{% if union_strategy|lower = 'all' %} + {% set union_command = 'UNION ALL' %} +{% elif union_strategy|lower == 'distinct' %} + {% set union_command = 'UNION' %} +{% else %} + {%- if execute -%} + {%- do exceptions.warn("[" ~ this ~ "] Warning: Parameter 'union_strategy' set to '" ~ union_strategy ~ "' which is not a supported choice. Set to 'all' or 'distinct' instead. 
UNION ALL is used now.") -%} + {% endif %} + {% set union_command = 'UNION ALL' %} +{% endif %} {%- if not datavault4dbt.is_something(foreign_hashkeys) -%} {%- set foreign_hashkeys = [] -%} @@ -214,7 +224,7 @@ source_new_union AS ( FROM src_new_{{ source_number }} {%- if not loop.last %} - UNION ALL + {{ union_command }} {% endif -%} {%- endfor -%} diff --git a/macros/tables/redshift/nh_link.sql b/macros/tables/redshift/nh_link.sql index b4e895c8..38644184 100644 --- a/macros/tables/redshift/nh_link.sql +++ b/macros/tables/redshift/nh_link.sql @@ -1,4 +1,4 @@ -{%- macro redshift__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch) -%} +{%- macro redshift__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch, union_strategy) -%} {%- set ns = namespace(last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} @@ -25,6 +25,16 @@ {%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} {{ log('source_models: '~source_models, false) }} +{% if union_strategy|lower = 'all' %} + {% set union_command = 'UNION ALL' %} +{% elif union_strategy|lower == 'distinct' %} + {% set union_command = 'UNION' %} +{% else %} + {%- if execute -%} + {%- do exceptions.warn("[" ~ this ~ "] Warning: Parameter 'union_strategy' set to '" ~ union_strategy ~ "' which is not a supported choice. Set to 'all' or 'distinct' instead. 
UNION ALL is used now.") -%} + {% endif %} + {% set union_command = 'UNION ALL' %} +{% endif %} {%- if not datavault4dbt.is_something(foreign_hashkeys) -%} {%- set foreign_hashkeys = [] -%} @@ -213,7 +223,7 @@ source_new_union AS ( FROM src_new_{{ source_number }} {%- if not loop.last %} - UNION ALL + {{ union_command }} {% endif -%} {%- endfor -%} diff --git a/macros/tables/snowflake/nh_link.sql b/macros/tables/snowflake/nh_link.sql index 701041d9..0965bfa4 100644 --- a/macros/tables/snowflake/nh_link.sql +++ b/macros/tables/snowflake/nh_link.sql @@ -1,4 +1,4 @@ -{%- macro snowflake__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch) -%} +{%- macro snowflake__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch, union_strategy) -%} {%- set ns = namespace(last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} @@ -25,6 +25,17 @@ {%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} {{ log('source_models: '~source_models, false) }} +{% if union_strategy|lower = 'all' %} + {% set union_command = 'UNION ALL' %} +{% elif union_strategy|lower == 'distinct' %} + {% set union_command = 'UNION' %} +{% else %} + {%- if execute -%} + {%- do exceptions.warn("[" ~ this ~ "] Warning: Parameter 'union_strategy' set to '" ~ union_strategy ~ "' which is not a supported choice. Set to 'all' or 'distinct' instead. 
UNION ALL is used now.") -%} + {% endif %} + {% set union_command = 'UNION ALL' %} +{% endif %} + {%- if not datavault4dbt.is_something(foreign_hashkeys) -%} {%- set foreign_hashkeys = [] -%} {%- endif -%} @@ -212,7 +223,7 @@ source_new_union AS ( FROM src_new_{{ source_number }} {%- if not loop.last %} - UNION ALL + {{ union_command }} {% endif -%} {%- endfor -%} From 72e0085d6861d2617fc87e610ac0eae4236001c3 Mon Sep 17 00:00:00 2001 From: Tim Kirschke Date: Mon, 13 Jan 2025 10:57:24 +0100 Subject: [PATCH 49/50] Added optional parameter "union_strategy" and implemented it in all nh_link macros --- macros/tables/synapse/nh_link.sql | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/macros/tables/synapse/nh_link.sql b/macros/tables/synapse/nh_link.sql index bf66d726..b6258ae3 100644 --- a/macros/tables/synapse/nh_link.sql +++ b/macros/tables/synapse/nh_link.sql @@ -1,4 +1,4 @@ -{%- macro synapse__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch) -%} +{%- macro synapse__nh_link(link_hashkey, foreign_hashkeys, payload, source_models, src_ldts, src_rsrc, disable_hwm, source_is_single_batch, union_strategy) -%} {%- set ns = namespace(last_cte= "", source_included_before = {}, has_rsrc_static_defined=true, source_models_rsrc_dict={}) -%} @@ -26,6 +26,16 @@ {%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} {{ log('source_models: '~source_models, false) }} +{% if union_strategy|lower = 'all' %} + {% set union_command = 'UNION ALL' %} +{% elif union_strategy|lower == 'distinct' %} + {% set union_command = 'UNION' %} +{% else %} + {%- if execute -%} + {%- do exceptions.warn("[" ~ this ~ "] Warning: Parameter 'union_strategy' set to '" ~ union_strategy ~ "' which is not a supported choice. Set to 'all' or 'distinct' instead. 
UNION ALL is used now.") -%} + {% endif %} + {% set union_command = 'UNION ALL' %} +{% endif %} {%- if not datavault4dbt.is_something(foreign_hashkeys) -%} {%- set foreign_hashkeys = [] -%} @@ -214,7 +224,7 @@ source_new_union AS ( FROM src_new_{{ source_number }} {%- if not loop.last %} - UNION ALL + {{ union_command }} {% endif -%} {%- endfor -%} From 78d636470aea72c88bfc172c505d69293b3f0575 Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Mon, 13 Jan 2025 13:58:47 +0100 Subject: [PATCH 50/50] fix nh_link union strategy condition syntax --- macros/tables/bigquery/nh_link.sql | 2 +- macros/tables/databricks/nh_link.sql | 2 +- macros/tables/exasol/nh_link.sql | 2 +- macros/tables/fabric/nh_link.sql | 2 +- macros/tables/oracle/nh_link.sql | 2 +- macros/tables/postgres/nh_link.sql | 2 +- macros/tables/redshift/nh_link.sql | 2 +- macros/tables/snowflake/nh_link.sql | 2 +- macros/tables/synapse/nh_link.sql | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/macros/tables/bigquery/nh_link.sql b/macros/tables/bigquery/nh_link.sql index f6fa56c8..59584184 100644 --- a/macros/tables/bigquery/nh_link.sql +++ b/macros/tables/bigquery/nh_link.sql @@ -25,7 +25,7 @@ {%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} {{ log('source_models: '~source_models, false) }} -{% if union_strategy|lower = 'all' %} +{% if union_strategy|lower == 'all' %} {% set union_command = 'UNION ALL' %} {% elif union_strategy|lower == 'distinct' %} {% set union_command = 'UNION DISTINCT' %} diff --git a/macros/tables/databricks/nh_link.sql b/macros/tables/databricks/nh_link.sql index 36ca65e6..00f02889 100644 --- a/macros/tables/databricks/nh_link.sql +++ b/macros/tables/databricks/nh_link.sql @@ -26,7 +26,7 @@ {%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} {{ log('source_models: '~source_models, false) }} -{% if union_strategy|lower = 'all' %} +{% if 
union_strategy|lower == 'all' %} {% set union_command = 'UNION ALL' %} {% elif union_strategy|lower == 'distinct' %} {% set union_command = 'UNION DISTINCT' %} diff --git a/macros/tables/exasol/nh_link.sql b/macros/tables/exasol/nh_link.sql index b94050ff..fb28e493 100644 --- a/macros/tables/exasol/nh_link.sql +++ b/macros/tables/exasol/nh_link.sql @@ -26,7 +26,7 @@ {{ log('source_models: '~source_models, false) }} -{% if union_strategy|lower = 'all' %} +{% if union_strategy|lower == 'all' %} {% set union_command = 'UNION ALL' %} {% elif union_strategy|lower == 'distinct' %} {% set union_command = 'UNION' %} diff --git a/macros/tables/fabric/nh_link.sql b/macros/tables/fabric/nh_link.sql index 01d3b0ba..e73615a8 100644 --- a/macros/tables/fabric/nh_link.sql +++ b/macros/tables/fabric/nh_link.sql @@ -27,7 +27,7 @@ {%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} {{ log('source_models: '~source_models, false) }} -{% if union_strategy|lower = 'all' %} +{% if union_strategy|lower == 'all' %} {% set union_command = 'UNION ALL' %} {% elif union_strategy|lower == 'distinct' %} {% set union_command = 'UNION' %} diff --git a/macros/tables/oracle/nh_link.sql b/macros/tables/oracle/nh_link.sql index fd4937e4..c60a794e 100644 --- a/macros/tables/oracle/nh_link.sql +++ b/macros/tables/oracle/nh_link.sql @@ -26,7 +26,7 @@ {%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} {{ log('source_models: '~source_models, false) }} -{% if union_strategy|lower = 'all' %} +{% if union_strategy|lower == 'all' %} {% set union_command = 'UNION ALL' %} {% elif union_strategy|lower == 'distinct' %} {% set union_command = 'UNION' %} diff --git a/macros/tables/postgres/nh_link.sql b/macros/tables/postgres/nh_link.sql index f6c52a94..7147fb0e 100644 --- a/macros/tables/postgres/nh_link.sql +++ b/macros/tables/postgres/nh_link.sql @@ -26,7 +26,7 @@ {%- set ns.source_models_rsrc_dict = 
source_model_values['source_models_rsrc_dict'] -%} {{ log('source_models: '~source_models, false) }} -{% if union_strategy|lower = 'all' %} +{% if union_strategy|lower == 'all' %} {% set union_command = 'UNION ALL' %} {% elif union_strategy|lower == 'distinct' %} {% set union_command = 'UNION' %} diff --git a/macros/tables/redshift/nh_link.sql b/macros/tables/redshift/nh_link.sql index 38644184..74e5e9dc 100644 --- a/macros/tables/redshift/nh_link.sql +++ b/macros/tables/redshift/nh_link.sql @@ -25,7 +25,7 @@ {%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} {{ log('source_models: '~source_models, false) }} -{% if union_strategy|lower = 'all' %} +{% if union_strategy|lower == 'all' %} {% set union_command = 'UNION ALL' %} {% elif union_strategy|lower == 'distinct' %} {% set union_command = 'UNION' %} diff --git a/macros/tables/snowflake/nh_link.sql b/macros/tables/snowflake/nh_link.sql index 0965bfa4..0b38718d 100644 --- a/macros/tables/snowflake/nh_link.sql +++ b/macros/tables/snowflake/nh_link.sql @@ -25,7 +25,7 @@ {%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} {{ log('source_models: '~source_models, false) }} -{% if union_strategy|lower = 'all' %} +{% if union_strategy|lower == 'all' %} {% set union_command = 'UNION ALL' %} {% elif union_strategy|lower == 'distinct' %} {% set union_command = 'UNION' %} diff --git a/macros/tables/synapse/nh_link.sql b/macros/tables/synapse/nh_link.sql index b6258ae3..2f44bdf1 100644 --- a/macros/tables/synapse/nh_link.sql +++ b/macros/tables/synapse/nh_link.sql @@ -26,7 +26,7 @@ {%- set ns.source_models_rsrc_dict = source_model_values['source_models_rsrc_dict'] -%} {{ log('source_models: '~source_models, false) }} -{% if union_strategy|lower = 'all' %} +{% if union_strategy|lower == 'all' %} {% set union_command = 'UNION ALL' %} {% elif union_strategy|lower == 'distinct' %} {% set union_command = 'UNION' %}