From 98d001d363b041e784f6ae8d41ea5e9d0b687dea Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Tue, 19 Nov 2024 08:21:53 +0100 Subject: [PATCH 01/25] add macro to process new prejoin list syntax --- .../helpers/stage_processing_macros.sql | 73 ++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) diff --git a/macros/internal/helpers/stage_processing_macros.sql b/macros/internal/helpers/stage_processing_macros.sql index 6ecf2676..75a90978 100644 --- a/macros/internal/helpers/stage_processing_macros.sql +++ b/macros/internal/helpers/stage_processing_macros.sql @@ -123,4 +123,75 @@ {%- endif %} {%- endfor -%} -{%- endmacro -%} \ No newline at end of file +{%- endmacro -%} + + +{%- macro process_prejoined_columns(prejoined_columns=none) -%} +{# Check if the new list syntax is used for prejoined columns + If so parse it to dictionaries #} + +{% if not datavault4dbt.is_list(prejoined_columns) %} + {% do return(prejoined_columns) %} +{% else %} + {# if the (new) list syntax for prejoins is used + it needs to be converted to the old syntax #} + + {# Initialize empty dict which will be filled by each entry #} + {% set return_dict = {} %} + + {# Iterate over each dictionary in the prejoined_columns-list #} + {% for dict_item in prejoined_columns %} + + {# If column aliases are present they they have to map 1:1 to the extract_columns #} + {% if datavault4dbt.is_something(dict_item.aliases) + and not dict_item.aliases|length == dict_item.extract_columns|length%} + {{ exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns") }} + {% endif %} + + {# If multiple columns from the same source should be extracted each column has to be processed once #} + {% if datavault4dbt.is_list(dict_item.extract_columns) %} + {% for column in dict_item.extract_columns %} + {# If aliases are defined they should be used as dict keys + These will be used as new column names #} + {% if 
datavault4dbt.is_something(dict_item.aliases) %} + {% set dict_key = dict_item.aliases[loop.index-1] %} + {% else %} + {% set dict_key = dict_item.extract_columns[loop.index-1] %} + {% endif %} + + {% set tmp_dict %} + {{dict_key}}: + ref_model: {{dict_item.ref_model}} + bk: {{dict_item.extract_columns[loop.index-1]}} + this_column_name: {{dict_item.this_column_name}} + ref_column_name: {{dict_item.ref_column_name}} + {% endset %} + {% do return_dict.update(fromyaml(tmp_dict)) %} + {% endfor %} + + {% else %} + + {# If aliases are defined they should be used as dict keys + These will be used as new column names #} + {% if datavault4dbt.is_something(dict_item.aliases) %} + {% set dict_key = dict_item.aliases[loop.index-1] %} + {% else %} + {% set dict_key = dict_item.extract_columns[loop.index-1] %} + {% endif %} + + {% set tmp_dict %} + {{dict_key}}: + ref_model: {{dict_item.ref_model}} + bk: {{dict_item.extract_columns[loop.index-1]}} + this_column_name: {{dict_item.this_column_name}} + ref_column_name: {{dict_item.ref_column_name}} + {% endset %} + {% do return_dict.update(fromyaml(tmp_dict)) %} + {% endif %} + {% endfor %} + + {%- do return(return_dict) -%} + +{% endif %} + +{%- endmacro -%} From 4b0e02d5c27b82decb0c396318b26aae08000229 Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Tue, 19 Nov 2024 08:38:16 +0100 Subject: [PATCH 02/25] add process_prejoined_columns macro to top-level stage macro --- macros/staging/stage.sql | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/macros/staging/stage.sql b/macros/staging/stage.sql index 76b17ed0..403df72c 100644 --- a/macros/staging/stage.sql +++ b/macros/staging/stage.sql @@ -120,6 +120,11 @@ {%- if datavault4dbt.is_nothing(ldts) -%} {%- set ldts = datavault4dbt.current_timestamp() -%} {%- endif -%} + + {# To parse the list syntax of prejoined columns #} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {%- set prejoined_columns = 
datavault4dbt.process_prejoined_columns(prejoined_columns) -%} + {%- endif -%} {{- adapter.dispatch('stage', 'datavault4dbt')(include_source_columns=include_source_columns, ldts=ldts, From 07ec2dec22339ed4c54a70ce0f38b53ddb0eb13d Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 20 Nov 2024 18:29:51 +0100 Subject: [PATCH 03/25] change prejoin-logic to perform less joins --- macros/staging/bigquery/stage.sql | 37 +++++++++++++++++--------- macros/staging/databricks/stage.sql | 38 +++++++++++++++++---------- macros/staging/exasol/stage.sql | 37 +++++++++++++++++--------- macros/staging/fabric/stage.sql | 40 +++++++++++++++++++---------- macros/staging/oracle/stage.sql | 37 +++++++++++++++++--------- macros/staging/postgres/stage.sql | 37 +++++++++++++++++--------- macros/staging/redshift/stage.sql | 38 +++++++++++++++++---------- macros/staging/snowflake/stage.sql | 37 +++++++++++++++++--------- macros/staging/synapse/stage.sql | 39 ++++++++++++++++++---------- 9 files changed, 226 insertions(+), 114 deletions(-) diff --git a/macros/staging/bigquery/stage.sql b/macros/staging/bigquery/stage.sql index 8c94c387..0486c5f2 100644 --- a/macros/staging/bigquery/stage.sql +++ b/macros/staging/bigquery/stage.sql @@ -256,6 +256,7 @@ missing_columns AS ( ), {%- endif -%} + {%- if datavault4dbt.is_something(prejoined_columns) %} {# Prejoining Business Keys of other source objects for Link purposes #} prejoined_columns AS ( @@ -263,14 +264,12 @@ prejoined_columns AS ( SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte + {%- endif -%} - {% for col, vals in prejoined_columns.items() %} + {#- prepare join statements -#} + {%- 
set prejoin_statements_list = [] -%} + {%- set processed_prejoin_hashes = [] -%} + {%- for col, vals in prejoined_columns.items() -%} {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} @@ -309,15 +308,29 @@ prejoined_columns AS ( {%- set operator = vals['operator'] -%} {%- endif -%} - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - left join {{ relation }} as {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + {%- set prejoin_hash = '`' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '`' -%} - {% endfor %} + {%- if not prejoin_hash in processed_prejoin_hashes %} + {%- do processed_prejoin_hashes.append(prejoin_hash) %} + {%- set prejoin_join_statement_tmp -%} + left join {{ relation }} as {{ prejoin_hash }} + on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + + {% endset -%} + {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- endif -%} + +{# select the prejoined columns #} + ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} + {% endfor -%} + + FROM {{ last_cte }} lcte + + {{ prejoin_statements_list|join(' ')}} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} diff --git a/macros/staging/databricks/stage.sql b/macros/staging/databricks/stage.sql index fc76044b..a3ff3b28 100644 --- a/macros/staging/databricks/stage.sql +++ b/macros/staging/databricks/stage.sql @@ -253,6 +253,7 @@ missing_columns AS ( ), {%- endif -%} + {%- if datavault4dbt.is_something(prejoined_columns) 
%} {# Prejoining Business Keys of other source objects for Link purposes #} prejoined_columns AS ( @@ -260,14 +261,12 @@ prejoined_columns AS ( SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte + {%- endif -%} - {% for col, vals in prejoined_columns.items() %} + {#- prepare join statements -#} + {%- set prejoin_statements_list = [] -%} + {%- set processed_prejoin_hashes = [] -%} + {%- for col, vals in prejoined_columns.items() -%} {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} @@ -306,19 +305,32 @@ prejoined_columns AS ( {%- set operator = vals['operator'] -%} {%- endif -%} - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - left join {{ relation }} as {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + {%- set prejoin_hash = '`' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '`' -%} - {% endfor %} + {%- if not prejoin_hash in processed_prejoin_hashes %} + {%- do processed_prejoin_hashes.append(prejoin_hash) %} + {%- set prejoin_join_statement_tmp -%} + left join {{ relation }} as {{ prejoin_hash }} + on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + + {% endset -%} + {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- endif -%} + +{# select the prejoined columns #} + ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} + {% endfor -%} + + 
FROM {{ last_cte }} lcte + + {{ prejoin_statements_list|join(' ')}} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} - {%- if datavault4dbt.is_something(derived_columns) %} {# Adding derived columns to the selection #} derived_columns AS ( diff --git a/macros/staging/exasol/stage.sql b/macros/staging/exasol/stage.sql index 50dd35d3..9812f462 100644 --- a/macros/staging/exasol/stage.sql +++ b/macros/staging/exasol/stage.sql @@ -244,6 +244,7 @@ missing_columns AS ( ), {%- endif -%} + {%- if datavault4dbt.is_something(prejoined_columns) %} {# Prejoining Business Keys of other source objects for Link purposes #} prejoined_columns AS ( @@ -251,14 +252,12 @@ prejoined_columns AS ( SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{ vals['bk'] }} AS "{{ col | upper }}" - {% endfor -%} - - FROM {{ last_cte }} lcte + {%- endif -%} - {% for col, vals in prejoined_columns.items() %} + {#- prepare join statements -#} + {%- set prejoin_statements_list = [] -%} + {%- set processed_prejoin_hashes = [] -%} + {%- for col, vals in prejoined_columns.items() -%} {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} @@ -297,15 +296,29 @@ prejoined_columns AS ( {%- set operator = vals['operator'] -%} {%- endif -%} - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - left join {{ relation }} as {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, 
right_columns=vals['ref_column_name']) }} + {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - {% endfor %} + {%- if not prejoin_hash in processed_prejoin_hashes %} + {%- do processed_prejoin_hashes.append(prejoin_hash) %} + {%- set prejoin_join_statement_tmp -%} + left join {{ relation }} as {{ prejoin_hash }} + on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + + {% endset -%} + {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- endif -%} + +{# select the prejoined columns #} + ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} + {% endfor -%} + + FROM {{ last_cte }} lcte + + {{ prejoin_statements_list|join(' ')}} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} diff --git a/macros/staging/fabric/stage.sql b/macros/staging/fabric/stage.sql index 605f6861..d042d7a9 100644 --- a/macros/staging/fabric/stage.sql +++ b/macros/staging/fabric/stage.sql @@ -253,23 +253,21 @@ missing_columns AS ( ), {%- endif -%} -{%- if datavault4dbt.is_something(prejoined_columns) %} -{%- set final_columns_to_select = (final_columns_to_select + derived_input_columns) | unique | list -%} +{%- if datavault4dbt.is_something(prejoined_columns) %} {# Prejoining Business Keys of other source objects for Link purposes #} + prejoined_columns AS ( SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{datavault4dbt.escape_column_names(vals['bk'])}} AS 
{{datavault4dbt.escape_column_names(col)}} - {% endfor -%} - - FROM {{ last_cte }} lcte + {%- endif -%} - {% for col, vals in prejoined_columns.items() %} + {#- prepare join statements -#} + {%- set prejoin_statements_list = [] -%} + {%- set processed_prejoin_hashes = [] -%} + {%- for col, vals in prejoined_columns.items() -%} {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} @@ -308,15 +306,29 @@ prejoined_columns AS ( {%- set operator = vals['operator'] -%} {%- endif -%} - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - left join {{ relation }} as {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=datavault4dbt.escape_column_names(vals['this_column_name']), prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=datavault4dbt.escape_column_names(vals['ref_column_name'])) }} + {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - {% endfor %} + {%- if not prejoin_hash in processed_prejoin_hashes %} + {%- do processed_prejoin_hashes.append(prejoin_hash) %} + {%- set prejoin_join_statement_tmp -%} + left join {{ relation }} as {{ prejoin_hash }} + on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + + {% endset -%} + {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- endif -%} + +{# select the prejoined columns #} + ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} + {% endfor -%} + + FROM {{ last_cte }} lcte + + {{ prejoin_statements_list|join(' ')}} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} diff --git a/macros/staging/oracle/stage.sql 
b/macros/staging/oracle/stage.sql index c2be1409..40afc202 100644 --- a/macros/staging/oracle/stage.sql +++ b/macros/staging/oracle/stage.sql @@ -263,6 +263,7 @@ missing_columns AS ( ), {%- endif -%} + {%- if datavault4dbt.is_something(prejoined_columns) %} {# Prejoining Business Keys of other source objects for Link purposes #} prejoined_columns AS ( @@ -270,14 +271,12 @@ prejoined_columns AS ( SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte + {%- endif -%} - {% for col, vals in prejoined_columns.items() %} + {#- prepare join statements -#} + {%- set prejoin_statements_list = [] -%} + {%- set processed_prejoin_hashes = [] -%} + {%- for col, vals in prejoined_columns.items() -%} {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} @@ -316,15 +315,29 @@ prejoined_columns AS ( {%- set operator = vals['operator'] -%} {%- endif -%} - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - left join {{ relation }} {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - {% endfor %} + {%- if not prejoin_hash in processed_prejoin_hashes %} + {%- do processed_prejoin_hashes.append(prejoin_hash) %} + {%- set prejoin_join_statement_tmp -%} + left join {{ relation }} {{ prejoin_hash }} {# no AS keyword: Oracle rejects AS before a table alias #} + on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, 
right_columns=vals['ref_column_name']) }} + + {% endset -%} + {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- endif -%} + +{# select the prejoined columns #} + ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} + {% endfor -%} + + FROM {{ last_cte }} lcte + + {{ prejoin_statements_list|join(' ')}} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} diff --git a/macros/staging/postgres/stage.sql b/macros/staging/postgres/stage.sql index 9edd3a38..1f4ef548 100644 --- a/macros/staging/postgres/stage.sql +++ b/macros/staging/postgres/stage.sql @@ -256,6 +256,7 @@ missing_columns AS ( ), {%- endif -%} + {%- if datavault4dbt.is_something(prejoined_columns) %} {# Prejoining Business Keys of other source objects for Link purposes #} prejoined_columns AS ( @@ -263,14 +264,12 @@ prejoined_columns AS ( SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte + {%- endif -%} - {% for col, vals in prejoined_columns.items() %} + {#- prepare join statements -#} + {%- set prejoin_statements_list = [] -%} + {%- set processed_prejoin_hashes = [] -%} + {%- for col, vals in prejoined_columns.items() -%} {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} @@ -309,15 +308,29 @@ prejoined_columns AS ( {%- set operator = vals['operator'] -%} {%- endif -%} - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - left join {{ relation }} as {{ prejoin_alias }} - on {{ 
datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - {% endfor %} + {%- if not prejoin_hash in processed_prejoin_hashes %} + {%- do processed_prejoin_hashes.append(prejoin_hash) %} + {%- set prejoin_join_statement_tmp -%} + left join {{ relation }} as {{ prejoin_hash }} + on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + + {% endset -%} + {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- endif -%} + +{# select the prejoined columns #} + ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} + {% endfor -%} + + FROM {{ last_cte }} lcte + + {{ prejoin_statements_list|join(' ')}} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} diff --git a/macros/staging/redshift/stage.sql b/macros/staging/redshift/stage.sql index a7704c03..6c9238b0 100644 --- a/macros/staging/redshift/stage.sql +++ b/macros/staging/redshift/stage.sql @@ -260,16 +260,14 @@ missing_columns AS ( prejoined_columns AS ( SELECT - {% if final_columns_to_select | length > 0 -%} + {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte + {%- endif -%} - {% for col, vals in prejoined_columns.items() %} + {#- prepare join statements -#} + {%- set 
prejoin_statements_list = [] -%} + {%- set processed_prejoin_hashes = [] -%} + {%- for col, vals in prejoined_columns.items() -%} {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} @@ -308,15 +306,29 @@ prejoined_columns AS ( {%- set operator = vals['operator'] -%} {%- endif -%} - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - left join {{ relation }} as {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - {% endfor %} + {%- if not prejoin_hash in processed_prejoin_hashes %} + {%- do processed_prejoin_hashes.append(prejoin_hash) %} + {%- set prejoin_join_statement_tmp -%} + left join {{ relation }} as {{ prejoin_hash }} + on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + + {% endset -%} + {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- endif -%} + +{# select the prejoined columns #} + ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} + {% endfor -%} + + FROM {{ last_cte }} lcte + + {{ prejoin_statements_list|join(' ')}} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} diff --git a/macros/staging/snowflake/stage.sql b/macros/staging/snowflake/stage.sql index 956c632e..ed4c9d22 100644 --- a/macros/staging/snowflake/stage.sql +++ b/macros/staging/snowflake/stage.sql @@ -263,6 +263,7 @@ missing_columns AS ( ), {%- endif -%} + {%- if datavault4dbt.is_something(prejoined_columns) %} {# 
Prejoining Business Keys of other source objects for Link purposes #} prejoined_columns AS ( @@ -270,14 +271,12 @@ prejoined_columns AS ( SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte + {%- endif -%} - {% for col, vals in prejoined_columns.items() %} + {#- prepare join statements -#} + {%- set prejoin_statements_list = [] -%} + {%- set processed_prejoin_hashes = [] -%} + {%- for col, vals in prejoined_columns.items() -%} {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} @@ -316,15 +315,29 @@ prejoined_columns AS ( {%- set operator = vals['operator'] -%} {%- endif -%} - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - left join {{ relation }} as {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - {% endfor %} + {%- if not prejoin_hash in processed_prejoin_hashes %} + {%- do processed_prejoin_hashes.append(prejoin_hash) %} + {%- set prejoin_join_statement_tmp -%} + left join {{ relation }} as {{ prejoin_hash }} + on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + + {% endset -%} + {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- endif -%} + +{# select the prejoined columns #} + ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} + {% endfor -%} + + FROM {{ 
last_cte }} lcte + + {{ prejoin_statements_list|join(' ')}} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} diff --git a/macros/staging/synapse/stage.sql b/macros/staging/synapse/stage.sql index 84edee88..88356508 100644 --- a/macros/staging/synapse/stage.sql +++ b/macros/staging/synapse/stage.sql @@ -255,23 +255,20 @@ missing_columns AS ( ), {%- endif -%} -{%- if datavault4dbt.is_something(prejoined_columns) %} -{%- set final_columns_to_select = (final_columns_to_select + derived_input_columns) | unique | list -%} +{%- if datavault4dbt.is_something(prejoined_columns) %} {# Prejoining Business Keys of other source objects for Link purposes #} prejoined_columns AS ( SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} - {% endif %} - {%- for col, vals in prejoined_columns.items() -%} - ,pj_{{loop.index}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte + {%- endif -%} - {% for col, vals in prejoined_columns.items() %} + {#- prepare join statements -#} + {%- set prejoin_statements_list = [] -%} + {%- set processed_prejoin_hashes = [] -%} + {%- for col, vals in prejoined_columns.items() -%} {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} @@ -310,15 +307,29 @@ prejoined_columns AS ( {%- set operator = vals['operator'] -%} {%- endif -%} - {%- set prejoin_alias = 'pj_' + loop.index|string -%} - left join {{ relation }} as {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + {%- set 
prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - {% endfor %} + {%- if not prejoin_hash in processed_prejoin_hashes %} + {%- do processed_prejoin_hashes.append(prejoin_hash) %} + {%- set prejoin_join_statement_tmp -%} + left join {{ relation }} as {{ prejoin_hash }} + on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} + + {% endset -%} + {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- endif -%} + +{# select the prejoined columns #} + ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} + {% endfor -%} + + FROM {{ last_cte }} lcte + + {{ prejoin_statements_list|join(' ')}} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} From fa4087e5aad952b5038305f972dbb9448f0982b3 Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 27 Nov 2024 15:43:22 +0100 Subject: [PATCH 04/25] add check and compilation error if a prejoined column is defined twice --- .../helpers/stage_processing_macros.sql | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/macros/internal/helpers/stage_processing_macros.sql b/macros/internal/helpers/stage_processing_macros.sql index 75a90978..71b0c718 100644 --- a/macros/internal/helpers/stage_processing_macros.sql +++ b/macros/internal/helpers/stage_processing_macros.sql @@ -144,7 +144,7 @@ {# If column aliases are present they they have to map 1:1 to the extract_columns #} {% if datavault4dbt.is_something(dict_item.aliases) - and not dict_item.aliases|length == dict_item.extract_columns|length%} + and not dict_item.aliases|length == dict_item.extract_columns|length %} {{ 
exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns") }} {% endif %} @@ -154,15 +154,20 @@ {# If aliases are defined they should be used as dict keys These will be used as new column names #} {% if datavault4dbt.is_something(dict_item.aliases) %} - {% set dict_key = dict_item.aliases[loop.index-1] %} + {% set dict_key = dict_item.aliases[loop.index0] %} {% else %} - {% set dict_key = dict_item.extract_columns[loop.index-1] %} + {% set dict_key = dict_item.extract_columns[loop.index0] %} + {% endif %} + + {# To make sure each column or alias is present only once #} + {% if dict_key|lower in return_dict.keys()|map('lower') %} + {{ exceptions.raise_compiler_error("Prejoined Column name or alias '" ~ dict_key ~ "' is defined twice.") }} {% endif %} {% set tmp_dict %} {{dict_key}}: ref_model: {{dict_item.ref_model}} - bk: {{dict_item.extract_columns[loop.index-1]}} + bk: {{dict_item.extract_columns[loop.index0]}} this_column_name: {{dict_item.this_column_name}} ref_column_name: {{dict_item.ref_column_name}} {% endset %} @@ -174,15 +179,20 @@ {# If aliases are defined they should be used as dict keys These will be used as new column names #} {% if datavault4dbt.is_something(dict_item.aliases) %} - {% set dict_key = dict_item.aliases[loop.index-1] %} + {% set dict_key = dict_item.aliases[loop.index0] %} {% else %} - {% set dict_key = dict_item.extract_columns[loop.index-1] %} + {% set dict_key = dict_item.extract_columns[loop.index0] %} + {% endif %} + + {# To make sure each column or alias is present only once #} + {% if dict_key|lower in return_dict.keys()|map('lower') %} + {{ exceptions.raise_compiler_error("Prejoined Column name or alias '" ~ dict_key ~ "' is defined twice.") }} {% endif %} {% set tmp_dict %} {{dict_key}}: ref_model: {{dict_item.ref_model}} - bk: {{dict_item.extract_columns[loop.index-1]}} + bk: {{dict_item.extract_columns[loop.index0]}} this_column_name: {{dict_item.this_column_name}} ref_column_name: 
{{dict_item.ref_column_name}} {% endset %} From 54b8720e9ccdb23e0f0fc0e282c6ec9c61ff5eb7 Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 27 Nov 2024 16:17:06 +0100 Subject: [PATCH 05/25] add amount of extract_columns and aliases to amount-mismatch compilation error message --- macros/internal/helpers/stage_processing_macros.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/macros/internal/helpers/stage_processing_macros.sql b/macros/internal/helpers/stage_processing_macros.sql index 71b0c718..ad84a86a 100644 --- a/macros/internal/helpers/stage_processing_macros.sql +++ b/macros/internal/helpers/stage_processing_macros.sql @@ -145,7 +145,8 @@ {# If column aliases are present they they have to map 1:1 to the extract_columns #} {% if datavault4dbt.is_something(dict_item.aliases) and not dict_item.aliases|length == dict_item.extract_columns|length %} - {{ exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns") }} + {{ exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. 
Got " + ~ dict_item.extract_columns|length ~ " extract_columns and " ~ dict_item.aliases|length ~ " aliases.") }} {% endif %} {# If multiple columns from the same source should be extracted each column has to be processed once #} From 60b2a985ef13f85f0ce80dbdc3dd93791a563c48 Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 27 Nov 2024 16:50:45 +0100 Subject: [PATCH 06/25] add prejoin with source to processing-macro --- .../internal/helpers/stage_processing_macros.sql | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/macros/internal/helpers/stage_processing_macros.sql b/macros/internal/helpers/stage_processing_macros.sql index ad84a86a..5881fc6a 100644 --- a/macros/internal/helpers/stage_processing_macros.sql +++ b/macros/internal/helpers/stage_processing_macros.sql @@ -167,7 +167,14 @@ {% set tmp_dict %} {{dict_key}}: + {%- if 'ref_model' in dict_item.keys()|map('lower') %} ref_model: {{dict_item.ref_model}} + {%- elif 'src_name' in dict_item.keys()|map('lower') and 'src_table' in dict_item.keys()|map('lower') %} + src_name: {{dict_item.src_name}} + src_table: {{dict_item.src_table}} + {%- else %} + {{ exceptions.raise_compiler_error("Either ref_model or src_name and src_table have to be defined for each prejoin") }} + {%- endif %} bk: {{dict_item.extract_columns[loop.index0]}} this_column_name: {{dict_item.this_column_name}} ref_column_name: {{dict_item.ref_column_name}} @@ -192,7 +199,14 @@ {% set tmp_dict %} {{dict_key}}: + {%- if 'ref_model' in dict_item.keys()|map('lower') %} ref_model: {{dict_item.ref_model}} + {%- elif 'src_name' in dict_item.keys()|map('lower') and 'src_table' in dict_item.keys()|map('lower') %} + src_name: {{dict_item.src_name}} + src_table: {{dict_item.src_table}} + {%- else %} + {{ exceptions.raise_compiler_error("Either ref_model or src_name and src_table have to be defined for each prejoin") }} + {%- endif %} bk: {{dict_item.extract_columns[loop.index0]}} this_column_name: 
{{dict_item.this_column_name}} ref_column_name: {{dict_item.ref_column_name}} From 3fed9637774800dd34936ff0e3bcedb58eee3e87 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Mon, 9 Dec 2024 16:29:55 +0100 Subject: [PATCH 07/25] move stage_processing_macros.sql into staging folder --- macros/{internal/helpers => staging}/stage_processing_macros.sql | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename macros/{internal/helpers => staging}/stage_processing_macros.sql (100%) diff --git a/macros/internal/helpers/stage_processing_macros.sql b/macros/staging/stage_processing_macros.sql similarity index 100% rename from macros/internal/helpers/stage_processing_macros.sql rename to macros/staging/stage_processing_macros.sql From edf3dc472862a38dc2b17598066ddf903433f344 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 11 Dec 2024 15:09:34 +0100 Subject: [PATCH 08/25] change extract_input_columns, process_prejoined_columns. add extract_prejoin_column_names change extract_input_columns to handle the new prejoin list syntax. change process_prejoined_columns to parse the old dict syntax to the new list syntax, merging prejoins with the same conditions. add extract_prejoin_column_names which can handle the new list syntax --- macros/staging/stage_processing_macros.sql | 178 ++++++++++----------- 1 file changed, 85 insertions(+), 93 deletions(-) diff --git a/macros/staging/stage_processing_macros.sql b/macros/staging/stage_processing_macros.sql index 5881fc6a..3f77d594 100644 --- a/macros/staging/stage_processing_macros.sql +++ b/macros/staging/stage_processing_macros.sql @@ -54,24 +54,27 @@ {# Do nothing. No source column required. 
#} {%- elif value is mapping and value.is_hashdiff -%} {%- do extracted_input_columns.append(value['columns']) -%} - {%- elif value is mapping and 'this_column_name' in value.keys() -%} - {%- if datavault4dbt.is_list(value['this_column_name'])-%} - {%- for column in value['this_column_name'] -%} - {%- do extracted_input_columns.append(column) -%} - {%- endfor -%} - {%- else -%} - {%- do extracted_input_columns.append(value['this_column_name']) -%} - {%- endif -%} {%- else -%} {%- do extracted_input_columns.append(value) -%} {%- endif -%} {%- endfor -%} - - {%- do return(extracted_input_columns) -%} + + {%- elif datavault4dbt.is_list(columns_dict) -%} + {% for prejoin in columns_dict %} + {%- if datavault4dbt.is_list(prejoin['this_column_name'])-%} + {%- for column in prejoin['this_column_name'] -%} + {%- do extracted_input_columns.append(column) -%} + {%- endfor -%} + {%- else -%} + {%- do extracted_input_columns.append(prejoin['this_column_name']) -%} + {%- endif -%} + {% endfor %} {%- else -%} {%- do return([]) -%} {%- endif -%} + {%- do return(extracted_input_columns) -%} + {%- endmacro -%} @@ -127,96 +130,85 @@ {%- macro process_prejoined_columns(prejoined_columns=none) -%} -{# Check if the new list syntax is used for prejoined columns - If so parse it to dictionaries #} - -{% if not datavault4dbt.is_list(prejoined_columns) %} - {% do return(prejoined_columns) %} -{% else %} - {# if the (new) list syntax for prejoins is used - it needs to be converted to the old syntax #} - - {# Initialize emtpy dict which will be filled by each entry #} - {% set return_dict = {} %} - - {# Iterate over each dictionary in the prejoined_colums-list #} - {% for dict_item in prejoined_columns %} - - {# If column aliases are present they they have to map 1:1 to the extract_columns #} - {% if datavault4dbt.is_something(dict_item.aliases) - and not dict_item.aliases|length == dict_item.extract_columns|length %} - {{ exceptions.raise_compiler_error("Prejoin aliases must have the same 
length as extract_columns. Got " - ~ dict_item.extract_columns|length ~ " extract_columns and " ~ dict_item.aliases|length ~ " aliases.") }} - {% endif %} - - {# If multiple columns from the same source should be extracted each column has to be processed once #} - {% if datavault4dbt.is_list(dict_item.extract_columns) %} - {% for column in dict_item.extract_columns %} - {# If aliases are defined they should be used as dict keys - These will be used as new column names #} - {% if datavault4dbt.is_something(dict_item.aliases) %} - {% set dict_key = dict_item.aliases[loop.index0] %} - {% else %} - {% set dict_key = dict_item.extract_columns[loop.index0] %} + {# Check if the old syntax is used for prejoined columns + If so parse it to new list syntax #} + + {% if datavault4dbt.is_list(prejoined_columns) %} + {% do return(prejoined_columns) %} + {% else %} + {% set output = [] %} + + {% for key, value in prejoined_columns.items() %} + {% set ref_model = value.get('ref_model') %} + {% set src_name = value.get('src_name') %} + {% set src_table = value.get('src_table') %} + {%- if 'operator' not in value.keys() -%} + {%- do value.update({'operator': 'AND'}) -%} + {%- set operator = 'AND' -%} + {%- else -%} + {%- set operator = value.get('operator') -%} + {%- endif -%} + + {% set match_criteria = ( + ref_model and output | selectattr('ref_model', 'equalto', ref_model) or + src_name and output | selectattr('src_name', 'equalto', src_name) | selectattr('src_table', 'equalto', src_table) + ) | selectattr('this_column_name', 'equalto', value.this_column_name) + | selectattr('ref_column_name', 'equalto', value.ref_column_name) + | selectattr('operator', 'equalto', value.operator) + | list | first %} + + {% if match_criteria %} + {% do match_criteria['extract_columns'].append(value.bk) %} + {% do match_criteria['aliases'].append(key) %} + {% else %} + {% set new_item = { + 'extract_columns': [value.bk], + 'aliases': [key], + 'this_column_name': value.this_column_name, + 
'ref_column_name': value.ref_column_name, + 'operator': operator + } %} + + {% if ref_model %} + {% do new_item.update({'ref_model': ref_model}) %} + {% elif src_name and src_table %} + {% do new_item.update({'src_name': src_name, 'src_table': src_table}) %} {% endif %} + + {% do output.append(new_item) %} + {% endif %} + {% endfor %} + {% endif %} - {# To make sure each column or alias is present only once #} - {% if dict_key|lower in return_dict.keys()|map('lower') %} - {{ exceptions.raise_compiler_error("Prejoined Column name or alias '" ~ dict_key ~ "' is defined twice.") }} - {% endif %} + {%- do return(output) -%} - {% set tmp_dict %} - {{dict_key}}: - {%- if 'ref_model' in dict_item.keys()|map('lower') %} - ref_model: {{dict_item.ref_model}} - {%- elif 'src_name' in dict_item.keys()|map('lower') and 'src_table' in dict_item.keys()|map('lower') %} - src_name: {{dict_item.src_name}} - src_table: {{dict_item.src_table}} - {%- else %} - {{ exceptions.raise_compiler_error("Either ref_model or src_name and src_table have to be defined for each prejoin") }} - {%- endif %} - bk: {{dict_item.extract_columns[loop.index0]}} - this_column_name: {{dict_item.this_column_name}} - ref_column_name: {{dict_item.ref_column_name}} - {% endset %} - {% do return_dict.update(fromyaml(tmp_dict)) %} - {% endfor %} +{%- endmacro -%} - {% else %} - {# If aliases are defined they should be used as dict keys - These will be used as new column names #} - {% if datavault4dbt.is_something(dict_item.aliases) %} - {% set dict_key = dict_item.aliases[loop.index0] %} - {% else %} - {% set dict_key = dict_item.extract_columns[loop.index0] %} - {% endif %} +{%- macro extract_prejoin_column_names(prejoined_columns=none) -%} - {# To make sure each column or alias is present only once #} - {% if dict_key|lower in return_dict.keys()|map('lower') %} - {{ exceptions.raise_compiler_error("Prejoined Column name or alias '" ~ dict_key ~ "' is defined twice.") }} - {% endif %} + {%- set 
extracted_column_names = [] -%} + + {% if not datavault4dbt.is_something(prejoined_columns) %} + {%- do return(extracted_column_names) -%} + {% endif %} - {% set tmp_dict %} - {{dict_key}}: - {%- if 'ref_model' in dict_item.keys()|map('lower') %} - ref_model: {{dict_item.ref_model}} - {%- elif 'src_name' in dict_item.keys()|map('lower') and 'src_table' in dict_item.keys()|map('lower') %} - src_name: {{dict_item.src_name}} - src_table: {{dict_item.src_table}} - {%- else %} - {{ exceptions.raise_compiler_error("Either ref_model or src_name and src_table have to be defined for each prejoin") }} - {%- endif %} - bk: {{dict_item.extract_columns[loop.index0]}} - this_column_name: {{dict_item.this_column_name}} - ref_column_name: {{dict_item.ref_column_name}} - {% endset %} - {% do return_dict.update(fromyaml(tmp_dict)) %} + {% for prejoin in prejoined_columns %} + {% if datavault4dbt.is_list(prejoin['aliases']) %} + {% for alias in prejoin['aliases'] %} + {%- do extracted_column_names.append(alias) -%} + {% endfor %} + {% elif datavault4dbt.is_something(prejoin['aliases']) %} + {%- do extracted_column_names.append(prejoin['aliases']) -%} + {% elif datavault4dbt.is_list(prejoin['extract_columns']) %} + {% for column in prejoin['extract_columns'] %} + {%- do extracted_column_names.append(column) -%} + {% endfor %} + {% else %} + {%- do extracted_column_names.append(prejoin['extract_columns']) -%} {% endif %} - {% endfor %} - - {%- do return(return_dict) -%} - -{% endif %} + {%- endfor -%} + + {%- do return(extracted_column_names) -%} {%- endmacro -%} From da7d00047dc4add7f634f7c0e5b2ad5740a7f008 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 11 Dec 2024 15:10:05 +0100 Subject: [PATCH 09/25] add staging.yml with descriptions of process_prejoined_columns and extract_prejoin_column_names --- macros/staging/staging.yml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 macros/staging/staging.yml diff 
--git a/macros/staging/staging.yml b/macros/staging/staging.yml new file mode 100644 index 00000000..86988615 --- /dev/null +++ b/macros/staging/staging.yml @@ -0,0 +1,23 @@ +version: 2 + +macros: + - name: process_prejoined_columns + description: > + A macro to process prejoined columns. If a list of dictionaries (new syntax) is provided it will do nothing and return the list. + If a dictionary of dictionaries is provided (old syntax) it will be transformed to the new syntax. + When multiple columns are to be extracted from the same prejoin-target and with the same conditions (columns and operator) they will be combined into one item. + arguments: + - name: prejoined_columns + type: list or dictionary + description: The value of the prejoined_columns as defined in the yaml_metadata of the stage-model. + + - name: extract_prejoin_column_names + description: > + A macro to extract the names of the prejoined columns of each staging-model. + Takes a list of prejoins and will add the aliases of the prejoins to the return-list. + If no aliases are present it will return the names of the extracted columns. + Returns an empty list if the passed parameter is empty. 
+ arguments: + - name: prejoined_columns + type: list + description: The prejoined_columns as processed by the process_prejoined_columns-macro \ No newline at end of file From a970a0a08e43e776ae3bdc84a31c2a8a4fd0527f Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 11 Dec 2024 15:11:38 +0100 Subject: [PATCH 10/25] postgres: modify stage to handle new prejoin syntax and simplify setting of commas for ghost records --- macros/staging/postgres/stage.sql | 196 +++++++++++++++--------------- 1 file changed, 100 insertions(+), 96 deletions(-) diff --git a/macros/staging/postgres/stage.sql b/macros/staging/postgres/stage.sql index 1f4ef548..f5cd9898 100644 --- a/macros/staging/postgres/stage.sql +++ b/macros/staging/postgres/stage.sql @@ -96,7 +96,7 @@ {# Getting the column names for all additional columns #} {%- set derived_column_names = datavault4dbt.extract_column_names(derived_columns) -%} {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} {%- set exclude_column_names = hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} @@ -266,15 +266,41 @@ prejoined_columns AS ( {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} {%- endif -%} - {#- prepare join statements -#} - {%- set prejoin_statements_list = [] -%} - {%- set processed_prejoin_hashes = [] -%} - {%- for col, vals in prejoined_columns.items() -%} + {# Iterate over each prejoin, doing logic checks and generating the 
select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} + + {# If passed, make sure there are as many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. 
Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} + + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} + + FROM {{ last_cte }} lcte + + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} + + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -295,42 +321,25 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} - {%- do exceptions.raise_compiler_error(error_message) -%} + {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. 
#} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - - - {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - - {%- if not prejoin_hash in processed_prejoin_hashes %} - {%- do processed_prejoin_hashes.append(prejoin_hash) %} - {%- set prejoin_join_statement_tmp -%} - left join {{ relation }} as {{ prejoin_hash }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endset -%} - {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} - {%- endif -%} - -{# select the prejoined columns #} - ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte - - {{ prejoin_statements_list|join(' ')}} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} ), {%- endif -%} @@ -457,65 +466,61 @@ unknown_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - {%- if 
columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{- datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = 
source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} - {{ log('pj_relation_columns: ' ~ pj_relation_columns, false ) }} - - {% for column in pj_relation_columns -%} - - {% if column.name|lower == vals['bk']|lower -%} - {{ log('column found? yes, for column :' ~ column.name , false) }} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=col) }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} + + {%- for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{% endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} - {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} + {% for column_name, properties in derived_columns_with_datatypes_DICT.items() %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if 
datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -527,62 +532,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if 
datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} - {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=col) -}} - {%- endif -%} - {%- endfor -%} - {%- if not loop.last -%},{%- endif %} - {% endfor -%} + {% for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} + {%- endif -%} - {%- endif -%} + {%- endfor -%} + {% endfor -%} + {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} From f8767ae1e1c73f283db929f5b596d0727af739e4 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:46:20 +0100 Subject: [PATCH 11/25] bigquery: stage: implement new prejoin syntax --- macros/staging/bigquery/stage.sql | 188 +++++++++++++++--------------- 1 file changed, 97 insertions(+), 91 deletions(-) diff --git a/macros/staging/bigquery/stage.sql b/macros/staging/bigquery/stage.sql index 0486c5f2..8e9b1d47 100644 --- a/macros/staging/bigquery/stage.sql +++ b/macros/staging/bigquery/stage.sql @@ -96,7 +96,7 @@ {# Getting the column names for all additional columns #} {%- set derived_column_names = 
datavault4dbt.extract_column_names(derived_columns) -%} {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} {%- set exclude_column_names = hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} @@ -183,6 +183,8 @@ {# Setting the ldts default datatype #} {% set ldts_default_dtype = datavault4dbt.timestamp_default_dtype() %} +{{ datavault4dbt.prepend_generated_by() }} + WITH {# Selecting everything that we need from the source relation. #} @@ -266,15 +268,41 @@ prejoined_columns AS ( {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} {%- endif -%} - {#- prepare join statements -#} - {%- set prejoin_statements_list = [] -%} - {%- set processed_prejoin_hashes = [] -%} - {%- for col, vals in prejoined_columns.items() -%} + {# Iterate over each prejoin, doing logic checks and generating the select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} + + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} + + {# If passed, make sure there are as 
many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} + + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} + + FROM {{ last_cte }} lcte + + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -295,39 +323,22 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. 
#} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} - {%- endif -%} - - - {%- set prejoin_hash = '`' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '`' -%} - - {%- if not prejoin_hash in processed_prejoin_hashes %} - {%- do processed_prejoin_hashes.append(prejoin_hash) %} - {%- set prejoin_join_statement_tmp -%} - left join {{ relation }} as {{ prejoin_hash }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endset -%} - {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - -{# select the prejoined columns #} - ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte - - {{ prejoin_statements_list|join(' ')}} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} @@ -457,65 +468,61 @@ unknown_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if 
columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{- datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = 
adapter.get_columns_in_relation( relation ) -%} - {{ log('pj_relation_columns: ' ~ pj_relation_columns, false ) }} - - {% for column in pj_relation_columns -%} - - {% if column.name|lower == vals['bk']|lower -%} - {{ log('column found? yes, for column :' ~ column.name , false) }} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=col) }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} + + {%- for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{% endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} - {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} + {% for column_name, properties in derived_columns_with_datatypes_DICT.items() %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if 
datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -527,62 +534,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally 
generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=col) -}} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} + {%- endfor -%} - {%- if not loop.last -%},{%- endif %} {% endfor -%} + {%- endif %} - {%- endif -%} - - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} From 90dc5c8898730921baacfdc860f4d179f5bb94c7 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:47:12 +0100 Subject: [PATCH 12/25] databricks: stage: implement new prejoin syntax --- macros/staging/databricks/stage.sql | 198 +++++++++++++++------------- 1 file changed, 104 insertions(+), 94 deletions(-) diff --git a/macros/staging/databricks/stage.sql b/macros/staging/databricks/stage.sql index a3ff3b28..e7bd16f3 100644 --- a/macros/staging/databricks/stage.sql +++ b/macros/staging/databricks/stage.sql @@ -96,7 +96,7 @@ {# Getting the column names for all 
additional columns #} {%- set derived_column_names = datavault4dbt.extract_column_names(derived_columns) -%} {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} {%- set exclude_column_names = hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} @@ -177,8 +177,13 @@ {% set error_value_rsrc = var('datavault4dbt.default_error_rsrc', 'ERROR') %} {% set unknown_value_rsrc = var('datavault4dbt.default_unknown_rsrc', 'SYSTEM') %} -{# Setting the rsrc default datatype #} -{% set rsrc_default_dtype = datavault4dbt.string_default_dtype(type=rsrc) %} +{# Setting the rsrc default datatype and length #} +{% set rsrc_default_dtype = datavault4dbt.string_default_dtype(type='rsrc') %} + +{# Setting the ldts default datatype #} +{% set ldts_default_dtype = datavault4dbt.timestamp_default_dtype() %} + +{{ datavault4dbt.prepend_generated_by() }} WITH @@ -206,7 +211,7 @@ source_data AS ( ldts_rsrc_data AS ( SELECT - {{ ldts }} AS {{ load_datetime_col_name}}, + CAST( {{ ldts }} as {{ ldts_default_dtype }} ) AS {{ load_datetime_col_name }}, CAST( {{ rsrc }} as {{ rsrc_default_dtype }} ) AS {{ record_source_col_name }} {%- if datavault4dbt.is_something(sequence) %}, {{ sequence }} AS edwSequence @@ -263,15 +268,41 @@ prejoined_columns AS ( {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} {%- endif -%} - {#- prepare join statements -#} - {%- set prejoin_statements_list = [] -%} - {%- set processed_prejoin_hashes = [] -%} - {%- for 
col, vals in prejoined_columns.items() -%} + {# Iterate over each prejoin, doing logic checks and generating the select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} + + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {# If passed, make sure there are as many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. 
Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} + + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} + + FROM {{ last_cte }} lcte + + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} + + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -292,45 +323,29 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. 
#} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} - {%- endif -%} - - - {%- set prejoin_hash = '`' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '`' -%} - - {%- if not prejoin_hash in processed_prejoin_hashes %} - {%- do processed_prejoin_hashes.append(prejoin_hash) %} - {%- set prejoin_join_statement_tmp -%} - left join {{ relation }} as {{ prejoin_hash }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endset -%} - {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - -{# select the prejoined columns #} - ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte - - {{ prejoin_statements_list|join(' ')}} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} ), {%- endif -%} + {%- if datavault4dbt.is_something(derived_columns) %} {# Adding derived columns to the selection #} derived_columns AS ( @@ -451,65 +466,61 @@ unknown_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - 
{%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{- datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = 
source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} - {{ log('pj_relation_columns: ' ~ pj_relation_columns, false ) }} - - {% for column in pj_relation_columns -%} - - {% if column.name|lower == vals['bk']|lower -%} - {{ log('column found? yes, for column :' ~ column.name , false) }} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=col) }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} + + {%- for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{% endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} - {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} + {% for column_name, properties in derived_columns_with_datatypes_DICT.items() %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if 
datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -521,62 +532,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if 
datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=col) -}} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} + {%- endfor -%} - {%- if not loop.last -%},{%- endif %} {% endfor -%} + {%- endif %} - {%- endif -%} - - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} From 706b2af95cd9a1f4887b6451f69611f480f1c9f6 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:47:20 +0100 Subject: [PATCH 13/25] exasol stage: implement new prejoin syntax --- macros/staging/exasol/stage.sql | 194 +++++++++++++++++--------------- 1 file changed, 106 insertions(+), 88 deletions(-) diff --git a/macros/staging/exasol/stage.sql b/macros/staging/exasol/stage.sql index 9812f462..a654057c 100644 --- a/macros/staging/exasol/stage.sql +++ b/macros/staging/exasol/stage.sql @@ -178,6 +178,8 @@ {# Setting the ldts default datatype #} {% set 
ldts_default_dtype = datavault4dbt.timestamp_default_dtype() %} +{{ datavault4dbt.prepend_generated_by() }} + WITH {# Selecting everything that we need from the source relation. #} @@ -188,6 +190,12 @@ source_data AS ( FROM {{ source_relation }} + {% if is_incremental() %} + WHERE {{ ldts }} > (SELECT max({{ load_datetime_col_name}}) + FROM {{ this }} + WHERE {{ load_datetime_col_name}} != {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} ) + {%- endif -%} + {% set last_cte = "source_data" -%} ), @@ -254,15 +262,41 @@ prejoined_columns AS ( {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} {%- endif -%} - {#- prepare join statements -#} - {%- set prejoin_statements_list = [] -%} - {%- set processed_prejoin_hashes = [] -%} - {%- for col, vals in prejoined_columns.items() -%} + {# Iterate over each prejoin, doing logic checks and generating the select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} + + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} + + {# If passed, make sure there are as many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. 
Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} + + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + FROM {{ last_cte }} lcte + + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} + + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -283,42 +317,25 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. 
#} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - - - {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - - {%- if not prejoin_hash in processed_prejoin_hashes %} - {%- do processed_prejoin_hashes.append(prejoin_hash) %} - {%- set prejoin_join_statement_tmp -%} - left join {{ relation }} as {{ prejoin_hash }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endset -%} - {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} - {%- endif -%} - -{# select the prejoined columns #} - ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte - - {{ prejoin_statements_list|join(' ')}} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} - {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} + {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names %} ), {%- endif -%} @@ -442,63 +459,64 @@ hashed_columns AS ( {%- if enable_ghost_records and not is_incremental() %} {# Creating Ghost Record for unknown case, based on datatype #} unknown_values AS ( + SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, 
beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, col_size=column.char_size, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col|upper, datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{- datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + 
{%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} - {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col|upper, datatype=column.dtype, col_size=column.char_size, ghost_record_type='unknown') }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} + + {%- for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{% endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} - {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} + {% for column_name, properties in derived_columns_with_datatypes_DICT.items() %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, 
ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -510,62 +528,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, col_size=column.char_size, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col|upper, datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ 
datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col|upper, datatype=column.dtype, col_size=column.char_size, ghost_record_type='error') -}} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} + {%- endfor -%} - {%- if not loop.last -%},{%- endif %} {% endfor -%} + {%- endif %} - {%- endif -%} - - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -595,12 +612,13 @@ columns_to_select AS ( {%- if enable_ghost_records and not is_incremental() %} UNION ALL + SELECT {{ datavault4dbt.print_list(datavault4dbt.escape_column_names(final_columns_to_select)) }} FROM ghost_records -{%- endif -%} +{% endif %} ) SELECT * FROM columns_to_select From 20d012c0c18aee99fb7eb93d1090d65480b6b174 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:47:26 +0100 Subject: [PATCH 14/25] fabric stage: implement new prejoin syntax --- macros/staging/fabric/stage.sql | 185 +++++++++++++++++--------------- 
1 file changed, 96 insertions(+), 89 deletions(-) diff --git a/macros/staging/fabric/stage.sql b/macros/staging/fabric/stage.sql index d042d7a9..14b511c6 100644 --- a/macros/staging/fabric/stage.sql +++ b/macros/staging/fabric/stage.sql @@ -89,12 +89,11 @@ {# Getting the column names for all additional columns #} {%- set derived_column_names = datavault4dbt.extract_column_names(derived_columns) -%} {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} {%- set exclude_column_names = derived_column_names + hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} {%- set all_columns = adapter.get_columns_in_relation( source_relation ) -%} - {%- set columns_without_excluded_columns = [] -%} {%- set final_columns_to_select = [] -%} @@ -134,8 +133,11 @@ {%- set only_include_from_source = (derived_input_columns + hashed_input_columns + prejoined_input_columns + ma_keys) | unique | list -%} {%- else -%} + {%- set only_include_from_source = (derived_input_columns + hashed_input_columns + prejoined_input_columns) | unique | list -%} + {%- endif -%} + {%- set source_columns_to_select = only_include_from_source -%} {%- endif-%} @@ -264,15 +266,41 @@ prejoined_columns AS ( {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} {%- endif -%} - {#- prepare join statements -#} - {%- set prejoin_statements_list = [] -%} - {%- set processed_prejoin_hashes = [] -%} - {%- for col, vals in prejoined_columns.items() -%} + {# Iterate over 
each prejoin, doing logic checks and generating the select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} + + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} + + {# If passed, make sure there are as many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. 
Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} + + FROM {{ last_cte }} lcte + + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} + + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -293,39 +321,22 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. 
#} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} - {%- endif -%} - - - {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - - {%- if not prejoin_hash in processed_prejoin_hashes %} - {%- do processed_prejoin_hashes.append(prejoin_hash) %} - {%- set prejoin_join_statement_tmp -%} - left join {{ relation }} as {{ prejoin_hash }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endset -%} - {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - -{# select the prejoined columns #} - ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte - - {{ prejoin_statements_list|join(' ')}} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} @@ -455,65 +466,61 @@ unknown_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if 
columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='unknown', col_size=column.char_size) }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='unknown', col_size=column.char_size) }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(col), datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(col), datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in 
prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} - {{ log('pj_relation_columns: ' ~ pj_relation_columns, false ) }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} {%- for column in pj_relation_columns -%} - - {%- if column.name|lower == vals['bk']|lower -%} - {{- log('column found? yes, for column :' ~ column.name , false) -}} - {{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='unknown', alias=datavault4dbt.escape_column_names(col)) }} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{%- endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column_name), datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column_name), datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -525,62 +532,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is 
defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='error', col_size=column.char_size) }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='error', col_size=column.char_size) }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(col), datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(col), datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 
'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='error', alias=datavault4dbt.escape_column_names(col)) -}} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} + {%- endfor -%} - {%- if not loop.last -%},{%- endif %} {% endfor -%} + {%- endif %} - {%- endif -%} - - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column_name), datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column_name), datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if 
datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -607,6 +613,7 @@ columns_to_select AS ( {{ datavault4dbt.print_list(datavault4dbt.escape_column_names(final_columns_to_select)) }} FROM {{ last_cte }} + {% if enable_ghost_records and not is_incremental() %} UNION ALL From 36617f33d36b2cf2eda28437731e705afb89e0ea Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:47:37 +0100 Subject: [PATCH 15/25] oracle stage: implement new prejoin syntax --- macros/staging/oracle/stage.sql | 191 ++++++++++++++++---------------- 1 file changed, 98 insertions(+), 93 deletions(-) diff --git a/macros/staging/oracle/stage.sql b/macros/staging/oracle/stage.sql index 40afc202..e34df521 100644 --- a/macros/staging/oracle/stage.sql +++ b/macros/staging/oracle/stage.sql @@ -96,12 +96,11 @@ {# Getting the column names for all additional columns #} {%- set derived_column_names = datavault4dbt.extract_column_names(derived_columns) -%} {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} {%- set exclude_column_names = hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} {%- set all_columns = 
adapter.get_columns_in_relation( source_relation ) -%} - {%- set columns_without_excluded_columns = [] -%} {%- set final_columns_to_select = [] -%} @@ -189,6 +188,8 @@ {# Setting the ldts default datatype #} {% set ldts_default_dtype = datavault4dbt.timestamp_default_dtype() %} +{{ datavault4dbt.prepend_generated_by() }} + WITH {# Selecting everything that we need from the source relation. #} @@ -273,15 +274,41 @@ prejoined_columns AS ( {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} {%- endif -%} - {#- prepare join statements -#} - {%- set prejoin_statements_list = [] -%} - {%- set processed_prejoin_hashes = [] -%} - {%- for col, vals in prejoined_columns.items() -%} + {# Iterate over each prejoin, doing logic checks and generating the select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} + + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {# If passed, make sure there are as many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have 
the same length as extract_columns. Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} + + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} + + FROM {{ last_cte }} lcte + + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} + + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -302,39 +329,22 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. 
#} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} - {%- endif -%} - - - {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - - {%- if not prejoin_hash in processed_prejoin_hashes %} - {%- do processed_prejoin_hashes.append(prejoin_hash) %} - {%- set prejoin_join_statement_tmp -%} - left join {{ relation }} as {{ prejoin_hash }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endset -%} - {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - -{# select the prejoined columns #} - ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte - - {{ prejoin_statements_list|join(' ')}} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} @@ -465,65 +475,61 @@ unknown_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if 
columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{- datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = 
adapter.get_columns_in_relation( relation ) -%} - {{ log('pj_relation_columns: ' ~ pj_relation_columns, false ) }} - - {% for column in pj_relation_columns -%} - - {% if column.name|lower == vals['bk']|lower -%} - {{ log('column found? yes, for column :' ~ column.name , false) }} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=col) }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} + + {%- for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{% endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} - {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} + {% for column_name, properties in derived_columns_with_datatypes_DICT.items() %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if 
datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -536,62 +542,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally 
generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=col) -}} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} + {%- endfor -%} - {%- if not loop.last -%},{%- endif %} {% endfor -%} + {%- endif %} - {%- endif -%} - - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -634,4 +639,4 @@ columns_to_select AS ( SELECT * FROM columns_to_select -{%- endmacro -%} \ No newline at end of file +{%- endmacro -%} From 7adba6909cb635c141fb222e5056f689a4cfa1e2 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:47:56 +0100 Subject: [PATCH 16/25] postgres stage add prepend_generated_by() --- macros/staging/postgres/stage.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/macros/staging/postgres/stage.sql b/macros/staging/postgres/stage.sql index f5cd9898..c59cc9c6 100644 --- 
a/macros/staging/postgres/stage.sql +++ b/macros/staging/postgres/stage.sql @@ -101,7 +101,6 @@ {%- set exclude_column_names = hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} {%- set all_columns = adapter.get_columns_in_relation( source_relation ) -%} - {%- set columns_without_excluded_columns = [] -%} {%- set final_columns_to_select = [] -%} @@ -184,6 +183,8 @@ {# Setting the ldts default datatype #} {% set ldts_default_dtype = datavault4dbt.timestamp_default_dtype() %} +{{ datavault4dbt.prepend_generated_by() }} + WITH {# Selecting everything that we need from the source relation. #} From 3070e0a4c6afceceb578e8e7e5e701f63136c7e5 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:48:02 +0100 Subject: [PATCH 17/25] redshift stage: implement new prejoin syntax --- macros/staging/redshift/stage.sql | 191 ++++++++++++++++-------------- 1 file changed, 99 insertions(+), 92 deletions(-) diff --git a/macros/staging/redshift/stage.sql b/macros/staging/redshift/stage.sql index 6c9238b0..b9861cb1 100644 --- a/macros/staging/redshift/stage.sql +++ b/macros/staging/redshift/stage.sql @@ -95,7 +95,7 @@ {# Getting the column names for all additional columns #} {%- set derived_column_names = datavault4dbt.extract_column_names(derived_columns) -%} {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} {%- set exclude_column_names = hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set 
source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} @@ -183,6 +183,8 @@ {# Setting the ldts default datatype #} {% set ldts_default_dtype = datavault4dbt.timestamp_default_dtype() %} +{{ datavault4dbt.prepend_generated_by() }} + WITH {# Selecting everything that we need from the source relation. #} @@ -255,6 +257,7 @@ missing_columns AS ( ), {%- endif -%} + {%- if datavault4dbt.is_something(prejoined_columns) %} {# Prejoining Business Keys of other source objects for Link purposes #} prejoined_columns AS ( @@ -264,15 +267,41 @@ prejoined_columns AS ( {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} {%- endif -%} - {#- prepare join statements -#} - {%- set prejoin_statements_list = [] -%} - {%- set processed_prejoin_hashes = [] -%} - {%- for col, vals in prejoined_columns.items() -%} + {# Iterate over each prejoin, doing logic checks and generating the select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} + + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} + + {# If passed, make sure there are as many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. 
Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} + + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} + + FROM {{ last_cte }} lcte - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} + + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -293,39 +322,22 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. 
#} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} - {%- endif -%} - - - {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - - {%- if not prejoin_hash in processed_prejoin_hashes %} - {%- do processed_prejoin_hashes.append(prejoin_hash) %} - {%- set prejoin_join_statement_tmp -%} - left join {{ relation }} as {{ prejoin_hash }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endset -%} - {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - -{# select the prejoined columns #} - ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte - - {{ prejoin_statements_list|join(' ')}} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} @@ -467,65 +479,61 @@ unknown_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, - {# Generating Ghost Records for 
all source columns, except the ldts, rsrc & edwSequence column and derived_columns #} + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} + {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.data_type, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{- datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = 
source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} - {{ log('pj_relation_columns: ' ~ pj_relation_columns, false ) }} - - {% for column in pj_relation_columns -%} - - {% if column.name|lower == vals['bk']|lower -%} - {{ log('column found? yes, for column :' ~ column.name , false) }} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.data_type, ghost_record_type='unknown', alias=col) }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} + + {%- for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{% endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} - {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} + {% for column_name, properties in derived_columns_with_datatypes_DICT.items() %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if 
datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -537,62 +545,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.data_type, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if 
datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.data_type, ghost_record_type='error', alias=col) -}} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} + {%- endfor -%} - {%- if not loop.last -%},{%- endif %} {% endfor -%} + {%- endif %} - {%- endif -%} - - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} From ef8b3750ae44f4955dabcb4b5ef7da4c54b59fa6 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:48:08 +0100 Subject: [PATCH 18/25] snowflake stage: implement new prejoin syntax --- macros/staging/snowflake/stage.sql | 188 +++++++++++++++-------------- 1 file changed, 97 insertions(+), 91 deletions(-) diff --git a/macros/staging/snowflake/stage.sql b/macros/staging/snowflake/stage.sql index ed4c9d22..14810ed4 100644 --- a/macros/staging/snowflake/stage.sql +++ b/macros/staging/snowflake/stage.sql @@ -96,7 +96,7 @@ {# Getting the column names for all additional 
columns #} {%- set derived_column_names = datavault4dbt.extract_column_names(derived_columns) -%} {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} {%- set exclude_column_names = hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} @@ -189,6 +189,8 @@ {# Setting the ldts default datatype #} {% set ldts_default_dtype = datavault4dbt.timestamp_default_dtype() %} +{{ datavault4dbt.prepend_generated_by() }} + WITH {# Selecting everything that we need from the source relation. #} @@ -273,15 +275,41 @@ prejoined_columns AS ( {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} {%- endif -%} - {#- prepare join statements -#} - {%- set prejoin_statements_list = [] -%} - {%- set processed_prejoin_hashes = [] -%} - {%- for col, vals in prejoined_columns.items() -%} + {# Iterate over each prejoin, doing logic checks and generating the select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} + + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} 
+ + {# If passed, make sure there are as many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} + + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} + + FROM {{ last_cte }} lcte + + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -302,39 +330,22 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. 
#} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} - {%- endif -%} - - - {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - - {%- if not prejoin_hash in processed_prejoin_hashes %} - {%- do processed_prejoin_hashes.append(prejoin_hash) %} - {%- set prejoin_join_statement_tmp -%} - left join {{ relation }} as {{ prejoin_hash }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endset -%} - {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - -{# select the prejoined columns #} - ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte - - {{ prejoin_statements_list|join(' ')}} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} @@ -408,65 +419,61 @@ unknown_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if 
columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{- datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = 
adapter.get_columns_in_relation( relation ) -%} - {{ log('pj_relation_columns: ' ~ pj_relation_columns, false ) }} - - {% for column in pj_relation_columns -%} - - {% if column.name|lower == vals['bk']|lower -%} - {{ log('column found? yes, for column :' ~ column.name , false) }} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=col) }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} + + {%- for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{% endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} - {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} + {% for column_name, properties in derived_columns_with_datatypes_DICT.items() %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if 
datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -478,62 +485,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally 
generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=col) -}} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} + {%- endfor -%} - {%- if not loop.last -%},{%- endif %} {% endfor -%} + {%- endif %} - {%- endif -%} - - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} From e8910462b546e1921234eaab1086296407fc290c Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:48:16 +0100 Subject: [PATCH 19/25] synapse stage: implement new prejoin syntax --- macros/staging/synapse/stage.sql | 184 ++++++++++++++++--------------- 1 file changed, 95 insertions(+), 89 deletions(-) diff --git a/macros/staging/synapse/stage.sql b/macros/staging/synapse/stage.sql index 88356508..71525cf0 100644 --- a/macros/staging/synapse/stage.sql +++ b/macros/staging/synapse/stage.sql @@ -179,6 +179,8 @@ {# Setting the ldts default 
datatype #} {% set ldts_default_dtype = datavault4dbt.timestamp_default_dtype() %} +{{ datavault4dbt.prepend_generated_by() }} + WITH {# Selecting everything that we need from the source relation. #} @@ -265,15 +267,41 @@ prejoined_columns AS ( {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} {%- endif -%} - {#- prepare join statements -#} - {%- set prejoin_statements_list = [] -%} - {%- set processed_prejoin_hashes = [] -%} - {%- for col, vals in prejoined_columns.items() -%} + {# Iterate over each prejoin, doing logic checks and generating the select-statements #} + {%- for prejoin in prejoined_columns -%} + {%- set prejoin_alias = 'pj_' + loop.index|string -%} + + {# If extract_columns and/or aliases are passed as string convert them to a list so they can be used as iterators later #} + {%- if not datavault4dbt.is_list(prejoin['extract_columns'])-%} + {%- do prejoin.update({'extract_columns': [prejoin['extract_columns']]}) -%} + {%- endif -%} + {%- if not datavault4dbt.is_list(prejoin['aliases']) and datavault4dbt.is_something(prejoin['aliases']) -%} + {%- do prejoin.update({'aliases': [prejoin['aliases']]}) -%} + {%- endif -%} + + {# If passed, make sure there are as many aliases as there are extract_columns, ensuring a 1:1 mapping #} + {%- if datavault4dbt.is_something(prejoin['aliases']) -%} + {%- if not prejoin['aliases']|length == prejoin['extract_columns']|length -%} + {%- do exceptions.raise_compiler_error("Prejoin aliases must have the same length as extract_columns. 
Got " + ~ prejoin['extract_columns']|length ~ " extract_column(s) and " ~ prejoin['aliases']|length ~ " aliase(s).") -%} + {%- endif -%} + {%- endif -%} + + {# Generate the columns for the SELECT-statement #} + {%- for column in prejoin['extract_columns'] %} + ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + {%- endfor -%} + {%- endfor %} + + FROM {{ last_cte }} lcte + + {# Iterate over prejoins and generate the join-statements #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- else -%} {%- set error_message -%} Prejoin error: Invalid target entity definition. Allowed are: @@ -294,39 +322,22 @@ prejoined_columns AS ( ref_column_name: join_columns_in_ref_model Got: - {{ col }}: {{ vals }} + {{ prejoin }} {%- endset -%} {%- do exceptions.raise_compiler_error(error_message) -%} {%- endif -%} -{# This sets a default value for the operator that connects multiple joining conditions. Only when it is not set by user. 
#} - {%- if 'operator' not in vals.keys() -%} + {%- if 'operator' not in prejoin.keys() -%} {%- set operator = 'AND' -%} {%- else -%} - {%- set operator = vals['operator'] -%} - {%- endif -%} - - - {%- set prejoin_hash = '"' ~ local_md5(relation~vals['this_column_name']~operator~vals['ref_column_name']) ~ '"' -%} - - {%- if not prejoin_hash in processed_prejoin_hashes %} - {%- do processed_prejoin_hashes.append(prejoin_hash) %} - {%- set prejoin_join_statement_tmp -%} - left join {{ relation }} as {{ prejoin_hash }} - on {{ datavault4dbt.multikey(columns=vals['this_column_name'], prefix=['lcte', prejoin_hash], condition='=', operator=operator, right_columns=vals['ref_column_name']) }} - - {% endset -%} - {%- do prejoin_statements_list.append(prejoin_join_statement_tmp) -%} + {%- set operator = prejoin['operator'] -%} {%- endif -%} - -{# select the prejoined columns #} - ,{{prejoin_hash}}.{{ vals['bk'] }} AS {{ col }} - {% endfor -%} - - FROM {{ last_cte }} lcte - - {{ prejoin_statements_list|join(' ')}} + {%- set prejoin_alias = 'pj_' + loop.index|string %} + + left join {{ relation }} as {{ prejoin_alias }} + on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + {%- endfor -%} {% set last_cte = "prejoined_columns" -%} {%- set final_columns_to_select = final_columns_to_select + prejoined_column_names -%} @@ -456,65 +467,61 @@ unknown_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ unknown_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format, beginning_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ unknown_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if 
columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', col_size=column.char_size) }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='unknown', col_size=column.char_size) }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(col), datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes#} - {% for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes #} + {%- for prejoin in prejoined_columns -%} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {%- set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = 
source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} - {{ log('pj_relation_columns: ' ~ pj_relation_columns, false ) }} - - {% for column in pj_relation_columns -%} - - {% if column.name|lower == vals['bk']|lower -%} - {{ log('column found? yes, for column :' ~ column.name , false) }} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=col) }} + {{ log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) }} + + {%- for column in pj_relation_columns -%} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} {%- endfor -%} - {%- if not loop.last %},{% endif %} {% endfor -%} {%- endif %} - {%- if datavault4dbt.is_something(derived_columns) -%}, - {# Additionally generating Ghost Records for Derived Columns #} + {%- if datavault4dbt.is_something(derived_columns) -%} + {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column_name), datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} - {%- if 
datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=unknown_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} @@ -526,62 +533,61 @@ error_values AS ( SELECT - {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }}, - '{{ error_value_rsrc }}' as {{ record_source_col_name }} + {{ datavault4dbt.string_to_timestamp(timestamp_format , end_of_all_times) }} as {{ load_datetime_col_name }} + ,'{{ error_value_rsrc }}' as {{ record_source_col_name }} - {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%}, + {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for Source Columns #} {%- for column in columns_without_excluded_columns %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', col_size=column.char_size) }} - {%- if not loop.last %},{% endif -%} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='error', col_size=column.char_size) }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(missing_columns) -%}, + {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} - {%- if not loop.last %},{% endif -%} + ,{{ 
datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(col), datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(prejoined_columns) -%}, - {# Additionally generating ghost records for the prejoined attributes #} - {%- for col, vals in prejoined_columns.items() %} + {%- if datavault4dbt.is_something(prejoined_columns) -%} + {# Additionally generating ghost records for the prejoined attributes#} + {% for prejoin in prejoined_columns %} - {%- if 'src_name' in vals.keys() or 'src_table' in vals.keys() -%} - {%- set relation = source(vals['src_name']|string, vals['src_table']) -%} - {%- elif 'ref_model' in vals.keys() -%} - {%- set relation = ref(vals['ref_model']) -%} + {%- if 'ref_model' in prejoin.keys() -%} + {% set relation = ref(prejoin['ref_model']) -%} + {%- elif 'src_name' in prejoin.keys() and 'src_table' in prejoin.keys() -%} + {%- set relation = source(prejoin['src_name']|string, prejoin['src_table']) -%} {%- endif -%} {%- set pj_relation_columns = adapter.get_columns_in_relation( relation ) -%} + {{- log('pj_relation_columns for '~relation~': ' ~ pj_relation_columns, false ) -}} {% for column in pj_relation_columns -%} - {% if column.name|lower == vals['bk']|lower -%} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=col) -}} + {%- if column.name|lower in prejoin['extract_columns']|map('lower') -%} + {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} + {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} + {{ log('column found? 
yes, for column: ' ~ column.name , false) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} {%- endif -%} + {%- endfor -%} - {%- if not loop.last -%},{%- endif %} {% endfor -%} + {%- endif %} - {%- endif -%} - - {%- if datavault4dbt.is_something(derived_columns) %}, + {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - {{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} - {%- if not loop.last %},{% endif %} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column_name), datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} - {%- if datavault4dbt.is_something(processed_hash_columns) -%}, + {%- if datavault4dbt.is_something(processed_hash_columns) -%} {%- for hash_column in processed_hash_columns %} - CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} - {%- if not loop.last %},{% endif %} + ,CAST({{ datavault4dbt.as_constant(column_str=error_key) }} as {{ hash_dtype }}) as {{ hash_column }} {%- endfor -%} {%- endif -%} From 71bfee3fefeac66cd9549f6e6419ce062cf85437 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Thu, 12 Dec 2024 08:26:53 +0100 Subject: [PATCH 20/25] synapse stage: remove column name escaping in ghost record macro call --- macros/staging/synapse/stage.sql | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/macros/staging/synapse/stage.sql b/macros/staging/synapse/stage.sql index 71525cf0..9234f9e7 100644 --- a/macros/staging/synapse/stage.sql +++ 
b/macros/staging/synapse/stage.sql @@ -473,7 +473,7 @@ unknown_values AS ( {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for all source columns, except the ldts, rsrc & edwSequence column #} {%- for column in columns_without_excluded_columns %} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='unknown', col_size=column.char_size) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', col_size=column.char_size) }} {%- endfor -%} {%- endif -%} @@ -481,7 +481,7 @@ unknown_values AS ( {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for missing columns #} {%- for col, dtype in missing_columns.items() %} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(col), datatype=dtype, ghost_record_type='unknown') }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} @@ -513,7 +513,7 @@ unknown_values AS ( {%- if datavault4dbt.is_something(derived_columns) -%} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column_name), datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} @@ -539,7 +539,7 @@ error_values AS ( {%- if columns_without_excluded_columns is defined and columns_without_excluded_columns| length > 0 -%} {# Generating Ghost Records for 
Source Columns #} {%- for column in columns_without_excluded_columns %} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='error', col_size=column.char_size) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', col_size=column.char_size) }} {%- endfor -%} {%- endif -%} @@ -547,7 +547,7 @@ error_values AS ( {%- if datavault4dbt.is_something(missing_columns) -%} {# Additionally generating ghost record for Missing columns #} {%- for col, dtype in missing_columns.items() %} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(col), datatype=dtype, ghost_record_type='error') }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=col, datatype=dtype, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} @@ -579,7 +579,7 @@ error_values AS ( {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column_name), datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} From e683e45f3ef20b748fd8a6e7e6685458be5c2eb5 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Thu, 12 Dec 2024 10:18:49 +0100 Subject: [PATCH 21/25] fabric stage fix escape column names --- macros/staging/fabric/stage.sql | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/macros/staging/fabric/stage.sql b/macros/staging/fabric/stage.sql index 14b511c6..19d55efb 100644 --- 
a/macros/staging/fabric/stage.sql +++ b/macros/staging/fabric/stage.sql @@ -288,7 +288,7 @@ prejoined_columns AS ( {# Generate the columns for the SELECT-statement #} {%- for column in prejoin['extract_columns'] %} - ,{{ prejoin_alias }}.{{ column }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ prejoin['aliases'][loop.index0] }} {% endif -%} + ,{{ prejoin_alias }}.{{ datavault4dbt.escape_column_names(column) }} {% if datavault4dbt.is_something(prejoin['aliases']) -%} AS {{ datavault4dbt.escape_column_names(prejoin['aliases'][loop.index0]) }} {% endif -%} {%- endfor -%} {%- endfor %} @@ -335,7 +335,7 @@ prejoined_columns AS ( {%- set prejoin_alias = 'pj_' + loop.index|string %} left join {{ relation }} as {{ prejoin_alias }} - on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} + on {{ datavault4dbt.multikey(columns=datavault4dbt.escape_column_names(prejoin['this_column_name']), prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=datavault4dbt.escape_column_names(prejoin['ref_column_name'])) }} {%- endfor -%} {% set last_cte = "prejoined_columns" -%} @@ -502,7 +502,7 @@ unknown_values AS ( {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} {{ log('column found? 
yes, for column: ' ~ column.name , false) }} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='unknown', alias=prejoin['aliases'][prejoin_col_index]) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='unknown', alias=datavault4dbt.escape_column_names(prejoin['aliases'][prejoin_col_index])) }} {%- endif -%} {%- endfor -%} @@ -568,7 +568,7 @@ error_values AS ( {%- set prejoin_extract_cols_lower = prejoin['extract_columns']|map('lower')|list -%} {%- set prejoin_col_index = prejoin_extract_cols_lower.index(column.name|lower) -%} {{ log('column found? yes, for column: ' ~ column.name , false) }} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column.name, datatype=column.dtype, ghost_record_type='error', alias=prejoin['aliases'][prejoin_col_index]) }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=datavault4dbt.escape_column_names(column.name), datatype=column.dtype, ghost_record_type='error', alias=datavault4dbt.escape_column_names(prejoin['aliases'][prejoin_col_index])) }} {%- endif -%} {%- endfor -%} From 3a86469d0e65bdfe7c0d877bee52e05082c87b45 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Thu, 12 Dec 2024 11:33:09 +0100 Subject: [PATCH 22/25] synapse, fabric stages: fix derived input columns --- macros/staging/fabric/stage.sql | 2 ++ macros/staging/synapse/stage.sql | 2 ++ 2 files changed, 4 insertions(+) diff --git a/macros/staging/fabric/stage.sql b/macros/staging/fabric/stage.sql index 19d55efb..ef94f2de 100644 --- a/macros/staging/fabric/stage.sql +++ b/macros/staging/fabric/stage.sql @@ -259,6 +259,8 @@ missing_columns AS ( {%- if datavault4dbt.is_something(prejoined_columns) %} {# Prejoining Business Keys of other source objects for Link purposes #} +{%- set final_columns_to_select = (final_columns_to_select + derived_input_columns) | 
unique | list -%} + prejoined_columns AS ( SELECT diff --git a/macros/staging/synapse/stage.sql b/macros/staging/synapse/stage.sql index 9234f9e7..a4b6f806 100644 --- a/macros/staging/synapse/stage.sql +++ b/macros/staging/synapse/stage.sql @@ -262,6 +262,8 @@ missing_columns AS ( {# Prejoining Business Keys of other source objects for Link purposes #} prejoined_columns AS ( +{%- set final_columns_to_select = (final_columns_to_select + derived_input_columns) | unique | list -%} + SELECT {% if final_columns_to_select | length > 0 -%} {{ datavault4dbt.print_list(datavault4dbt.prefix(columns=datavault4dbt.escape_column_names(final_columns_to_select), prefix_str='lcte').split(',')) }} From 57e21daeb9c13121dcd4a0605ac6ea12d8d16cfc Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Thu, 12 Dec 2024 11:58:02 +0100 Subject: [PATCH 23/25] oracle stage: include col_size to ghost records --- macros/staging/oracle/stage.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/macros/staging/oracle/stage.sql b/macros/staging/oracle/stage.sql index e34df521..1d4ba239 100644 --- a/macros/staging/oracle/stage.sql +++ b/macros/staging/oracle/stage.sql @@ -521,7 +521,7 @@ unknown_values AS ( {%- if datavault4dbt.is_something(derived_columns) -%} {# Additionally generating Ghost Records for Derived Columns #} {% for column_name, properties in derived_columns_with_datatypes_DICT.items() %} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='unknown') }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='unknown') }} {%- endfor -%} {%- endif -%} @@ -588,7 +588,7 @@ error_values AS ( {%- if datavault4dbt.is_something(derived_columns) %} {# Additionally generating Ghost Records for Derived Columns #} {%- for column_name, properties in 
derived_columns_with_datatypes_DICT.items() %} - ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, ghost_record_type='error') }} + ,{{ datavault4dbt.ghost_record_per_datatype(column_name=column_name, datatype=properties.datatype, col_size=properties.col_size, ghost_record_type='error') }} {%- endfor -%} {%- endif -%} From c8df4d3101677582564f9e70748a2afedeecf782 Mon Sep 17 00:00:00 2001 From: tkiehn <162969167+tkiehn@users.noreply.github.com> Date: Thu, 12 Dec 2024 13:53:36 +0100 Subject: [PATCH 24/25] synapse stage: fix prejoin_column_names --- macros/staging/synapse/stage.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/macros/staging/synapse/stage.sql b/macros/staging/synapse/stage.sql index a4b6f806..77c51daa 100644 --- a/macros/staging/synapse/stage.sql +++ b/macros/staging/synapse/stage.sql @@ -89,7 +89,7 @@ {# Getting the column names for all additional columns #} {%- set derived_column_names = datavault4dbt.extract_column_names(derived_columns) -%} {%- set hashed_column_names = datavault4dbt.extract_column_names(hashed_columns) -%} -{%- set prejoined_column_names = datavault4dbt.extract_column_names(prejoined_columns) -%} +{%- set prejoined_column_names = datavault4dbt.extract_prejoin_column_names(prejoined_columns) -%} {%- set missing_column_names = datavault4dbt.extract_column_names(missing_columns) -%} {%- set exclude_column_names = derived_column_names + hashed_column_names + prejoined_column_names + missing_column_names + ldts_rsrc_input_column_names %} {%- set source_and_derived_column_names = (all_source_columns + derived_column_names) | unique | list -%} From 75189a2c30988f4fe753526482084dc72cd5a363 Mon Sep 17 00:00:00 2001 From: Theo Kiehn <162969167+tkiehn@users.noreply.github.com> Date: Mon, 6 Jan 2025 10:43:28 +0100 Subject: [PATCH 25/25] Update oracle stage.sql, remove AS for join alias Naming join relations using "AS" is not allowed in oracle --- 
macros/staging/oracle/stage.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/macros/staging/oracle/stage.sql b/macros/staging/oracle/stage.sql index 1d4ba239..cae9e7ea 100644 --- a/macros/staging/oracle/stage.sql +++ b/macros/staging/oracle/stage.sql @@ -342,7 +342,7 @@ prejoined_columns AS ( {%- endif -%} {%- set prejoin_alias = 'pj_' + loop.index|string %} - left join {{ relation }} as {{ prejoin_alias }} + left join {{ relation }} {{ prejoin_alias }} on {{ datavault4dbt.multikey(columns=prejoin['this_column_name'], prefix=['lcte', prejoin_alias], condition='=', operator=operator, right_columns=prejoin['ref_column_name']) }} {%- endfor -%}